In [1]:
import numpy as np
import pandas as pd


train_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/train.csv')
test_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/test.csv')
sub_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/gender_submission.csv')

In [2]:
# Remove features that won't be used to make model
def delete_features(df):
    feature_list = ['PassengerId', 'Ticket', 'Cabin']
    df.drop(feature_list, axis = 1, inplace = True)


# Remove outlier data at Age
def remove_age(df):
    index = df[df['Age'] >= 80].index
    df = df.drop(index, axis = 0, inplace = False)


# Fill Null Value at train data
def fill_NaN_train(df):
    index_list = df[df['Age'].isna() == 1].index.tolist()
    for index in index_list:
        Pclass = df[df.index == index]['Pclass'].values.tolist()[0]
        if Pclass == 1:
            df['Age'][index] = df[df['Pclass'] == 1].loc[:,'Age'].mean()
        elif Pclass == 2:
            df['Age'][index] = df[df['Pclass'] == 2].loc[:,'Age'].mean()
        else:
            df['Age'][index] = df[df['Pclass'] == 3].loc[:,'Age'].mean()

    df['Embarked'] = df['Embarked'].fillna('C')

# Fill Null Value at train test
def fill_NaN_test(df):
    index_list = df[df['Age'].isna() == 1].index.tolist()
    for index in index_list:
        Pclass = df[df.index == index]['Pclass'].values.tolist()[0]
        if Pclass == 1:
            df['Age'][index] = df[df['Pclass'] == 1].loc[:,'Age'].mean()
        elif Pclass == 2:
            df['Age'][index] = df[df['Pclass'] == 2].loc[:,'Age'].mean()
        else:
            df['Age'][index] = df[df['Pclass'] == 3].loc[:,'Age'].mean()
            
    df['Fare'][152] = 28.230436


# Extract title from Name, remove titles that not exist in test data and do encoding at train data
def Name_Engineering_train(df):
    Title_list = list()
    for str in df['Name']:
        str1 = str.split(', ')[1]
        str2 = str1.split('.')[0]
        Title_list.append(str2)

    df['Title'] = Title_list
    drop_title = ['Mlle', 'Major', 'the Countess', 'Capt', 'Sir', 'Lady', 'Mme', 'Don', 'Jonkheer']
    drop_index = list()
    for title in drop_title:
        index = df[df['Title'] == title].index.tolist()
        drop_index.append(index)
    drop_index = sum(drop_index,[])
    df.drop(drop_index, axis = 0, inplace = True)
    
    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    new_title = label.fit_transform(df['Title'])
    df['Title'] = new_title
    df.drop('Name', axis = 1, inplace = True)

# Extract title from Name and do encoding at test data
def Name_Engineering_test(df):
    Title_list = list()
    for str in df['Name']:
        str1 = str.split(', ')[1]
        str2 = str1.split('.')[0]
        Title_list.append(str2)
        
    # Dona index = 414 change Dona to ZDona for last labeling (not in train set)
    Title_list[414] = 'ZDona'
    
    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    new_title = label.fit_transform(Title_list)
    df['Title'] = new_title
    df.drop('Name', axis = 1, inplace = True)


# Encoding the Sex feature
def Sex_Encoding(df):
    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    new_sex = label.fit_transform(df['Sex'])
    df['Sex'] = new_sex


# Transform continous data to 9 selections and Encoding
def Age_Engineering(df):
    def Age_Conversion(age):
        title = ''
        if age <= 5:
            title = 'Baby'
        elif age <= 16:
            title = 'Child'
        elif age <= 32:
            title = 'Young_Adult'
        elif age <= 48:
            title = 'Adult'
        elif age <= 64:
            title = 'Old_Adult'
        else:
            title = 'Senior'

        return title

    df['Age_selection'] = df['Age'].apply(lambda x : Age_Conversion(x))
    df.drop('Age', axis = 1, inplace = True)

    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    new_age = label.fit_transform(df['Age_selection'])
    df['Age_selection'] = new_age


# Combine SibSp feature and Parch feature to make Family new feature
def Family_Conversion(df):
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(['SibSp', 'Parch'], axis = 1, inplace = True)


# Do log conversion on Fare feature to reduce right skewness
def Fare_Log(df):
    df['Fare'] = np.log1p(df['Fare'])


# Encoding Embarked feature
def Embarked_Encoding(df):
    from sklearn.preprocessing import LabelEncoder
    label = LabelEncoder()
    new_embarked = label.fit_transform(df['Embarked'])
    df['Embarked'] = new_embarked

In [3]:
def Feature_Engineering_train(df):
    delete_features(df)
    remove_age(df)
    fill_NaN_train(df)
    Name_Engineering_train(df)
    Sex_Encoding(df)
    Age_Engineering(df)
    Family_Conversion(df)
    Fare_Log(df)
    Embarked_Encoding(df)

def Feature_Engineering_test(df):
    delete_features(df)
    fill_NaN_test(df)
    Name_Engineering_test(df)
    Sex_Encoding(df)
    Age_Engineering(df)
    Family_Conversion(df)
    Fare_Log(df)
    Embarked_Encoding(df)

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

Feature_Engineering_train(train_df)
Feature_Engineering_test(test_df)

In [5]:
Feature = train_df.drop('Survived', axis = 1, inplace = False)
Label = train_df['Survived']

**사용할 모델들** : Decision Tree, Random Forest, XGBoost, LightGBM, Logistic Regression

Hard Voting, Soft Voting 사용 비교 후 더 좋은 성능의 모델 선택

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

dt_model = DecisionTreeClassifier(random_state = 0)
rf_model = RandomForestClassifier(random_state = 0)
xgb_model = XGBClassifier(random_state = 0)
lgb_model = LGBMClassifier(random_state = 0)
lr_model = LogisticRegression()

In [7]:
# Cheking None-Tuning models performance
from sklearn.model_selection import cross_val_score

dt_score = cross_val_score(dt_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Decision Tree 정확도 : ", np.mean(dt_score))

rf_score = cross_val_score(rf_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Random Forest 정확도 : ", np.mean(rf_score))

xgb_score = cross_val_score(xgb_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("XGBoost 정확도 : ", np.mean(xgb_score))

lgb_score = cross_val_score(lgb_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("LightGBM 정확도 : ", np.mean(dt_score))

lr_score = cross_val_score(lr_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Logistic Regression 정확도 : ", np.mean(dt_score))

Decision Tree 정확도 :  0.8011363636363636
Random Forest 정확도 :  0.8056818181818182
XGBoost 정확도 :  0.8238636363636364
LightGBM 정확도 :  0.8011363636363636
Logistic Regression 정확도 :  0.8011363636363636


In [8]:
# Feature Selection - removed Embarked
drop_Feature = Feature.drop(['Embarked'], axis = 1, inplace = False)
drop_test = train_df.drop(['Embarked'], axis = 1, inplace = False)

def accuracy_model(model_list, name_list):
    for i, model in enumerate(model_list):
        score = cross_val_score(model, drop_Feature, Label, scoring = 'accuracy', cv = 5)
        print("{0} 정확도 : {1}".format(name_list[i], np.mean(score)))

model_list = [dt_model, rf_model, xgb_model, lgb_model, lr_model]
name_list = ['Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM', 'Logistic Regression']

accuracy_model(model_list, name_list)

Decision Tree 정확도 : 0.8
Random Forest 정확도 : 0.8147727272727273
XGBoost 정확도 : 0.8375
LightGBM 정확도 : 0.8284090909090909
Logistic Regression 정확도 : 0.7874999999999999


In [18]:
# Hyper Parameter tuning

random_seed = [0]

# DecisionTree model hyper parameter
dt_params = {'max_depth' : [ i for i in range(2,11) ],
             'criterion' : ['gini', 'entropy'],
             'random_state' : random_seed}

# RandomForest model hyper parameter
rf_params = {'n_estimators' : [50, 100, 150, 200],
             'max_depth' : [i for i in range(2,11)],
             'criterion' : ['gini', 'entropy'],
             'random_state' : random_seed}

# XGBoost model hyper parameter
xgb_params = {'max_depth' :[i for i in range(2,11)],
              'min_child_weight' : [2 * i for i in range(2,16)],
              'random_state' : random_seed}

# LightGBM model hyper parameter
lgb_params = {'max_depth' : [i for i in range(2,11)],
             'num_leaves' : [2 * i for i in range(2,21)],
             'random_state' : random_seed}

# Logistic Regression model hyper parameter
lr_params = {'penalty' : ['l1', 'l2'],
             'C' : [0.01, 0.1, 0.05, 1, 5, 10],
             'solver' : ['lbfgs', 'liblinear']}
             

In [11]:
from sklearn.model_selection import GridSearchCV

tuned_dt = GridSearchCV(dt_model, param_grid = dt_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_dt.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Decision Tree', tuned_dt.best_params_, tuned_dt.best_score_))

tuned_rf = GridSearchCV(rf_model, param_grid = rf_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_rf.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Random Forest', tuned_rf.best_params_, tuned_rf.best_score_))

tuned_xgb = GridSearchCV(xgb_model, param_grid = xgb_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_xgb.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('XGBoost', tuned_xgb.best_params_, tuned_xgb.best_score_))

tuned_lgb = GridSearchCV(lgb_model, param_grid = lgb_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_lgb.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('LightGBM', tuned_lgb.best_params_, tuned_lgb.best_score_))

tuned_lr = GridSearchCV(lr_model, param_grid = lr_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_lr.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Logistic Regression', tuned_lr.best_params_, tuned_lr.best_score_))

Decision Tree 최적 파라미터 : {'criterion': 'gini', 'max_depth': 5, 'random_state': 0}, 이때 정확도 : 0.8329545454545455
Random Forest 최적 파라미터 : {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 50, 'random_state': 0}, 이때 정확도 : 0.8306818181818182
XGBoost 최적 파라미터 : {'max_depth': 4, 'min_child_weight': 6, 'random_state': 0}, 이때 정확도 : 0.8511363636363637
LightGBM 최적 파라미터 : {'max_depth': 7, 'num_leaves': 14, 'random_state': 0}, 이때 정확도 : 0.8443181818181816
Logistic Regression 최적 파라미터 : {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}, 이때 정확도 : 0.7897727272727273


In [20]:
lr_params = {'penalty' : ['l1', 'l2'],
             'C' : [1,2,3,4,5,6,7,8,9,10],
             'solver' : ['lbfgs', 'liblinear']}

tuned_lr = GridSearchCV(lr_model, param_grid = lr_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_lr.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Logistic Regression', tuned_lr.best_params_, tuned_lr.best_score_))

Logistic Regression 최적 파라미터 : {'C': 2, 'penalty': 'l2', 'solver': 'lbfgs'}, 이때 정확도 : 0.7886363636363636


In [13]:
# Tuning learning_rate
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

xgb_lr = {'learning_rate' : hp.uniform('learning_rate', 0.01,0.2)}
lgb_lr = {'learning_rate' : hp.uniform('learning_rate', 0.01,0.2)}

In [14]:
from sklearn.model_selection import cross_val_score

def xgb_func(space):
    xgb_model = XGBClassifier(max_depth = 4, min_child_weight = 6, random_state = 0, learning_rate = space['learning_rate'])

    score = cross_val_score(xgb_model, drop_Feature, Label, cv = 5, scoring = 'accuracy')

    return {'loss' : -1 * np.mean(score), 'status' : STATUS_OK}

def lgb_func(space):
    lgb_model = LGBMClassifier(max_depth = 7, num_leaves = 14, random_state = 0, learning_rate = space['learning_rate'])

    score = cross_val_score(lgb_model, drop_Feature, Label, cv = 5, scoring = 'accuracy')

    return {'loss' : -1 * np.mean(score), 'status' : STATUS_OK}

In [15]:
trials1 = Trials()
trials2 = Trials()

best_xgb = fmin(fn = xgb_func, space = xgb_lr, algo = tpe.suggest, max_evals = 200, trials = trials1)
best_lgb = fmin(fn = lgb_func, space = lgb_lr, algo = tpe.suggest, max_evals = 200, trials = trials2)

100%|█████████████████████████████████████████████| 200/200 [02:20<00:00,  1.42trial/s, best loss: -0.8511363636363637]
100%|█████████████████████████████████████████████| 200/200 [03:02<00:00,  1.10trial/s, best loss: -0.8488636363636364]


In [16]:
print("최적 XGBoost learning_rate : ", best_xgb)
print("최적 LightGBM learning_rate : ", best_lgb)

최적 XGBoost learning_rate :  {'learning_rate': 0.19099427768421384}
최적 LightGBM learning_rate :  {'learning_rate': 0.1395954806287981}


In [26]:
# voting
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

dt_model = DecisionTreeClassifier(random_state = 0, max_depth = 5, criterion = 'gini')
rf_model = RandomForestClassifier(random_state = 0, criterion = 'entropy', max_depth = 5, n_estimators = 50)
xgb_model = XGBClassifier(random_state = 0, max_depth = 4, min_child_weight = 6, learning_rate = 0.19099427768421384)
lgb_model = LGBMClassifier(random_state = 0, max_depth = 7, num_leaves = 14, learning_rate = 0.1395954806287981)
#lr_model = LogisticRegression(C = 2, penalty = 'l2', solver = 'lbfgs')

# Hard Voting
hard_vote = VotingClassifier(estimators = [('DT', dt_model), ('RF', rf_model), ('XGB', xgb_model), ('LGBM', lgb_model)], 
                             voting = 'hard')
hard_score = cross_val_score(hard_vote, drop_Feature, Label, scoring = 'accuracy', cv = 5)
print("Hard Voting 정확도 : ", np.mean(hard_score))

# Soft Voting
soft_vote = VotingClassifier(estimators = [('DT', dt_model), ('RF', rf_model), ('XGB', xgb_model), ('LGBM', lgb_model)], 
                             voting = 'soft')
soft_score = cross_val_score(soft_vote, drop_Feature, Label, scoring = 'accuracy', cv = 5)
print("Soft Voting 정확도 : ", np.mean(soft_score))

Hard Voting 정확도 :  0.8477272727272727
Soft Voting 정확도 :  0.834090909090909
