## **Fourth Attempt**

In [1]:
import numpy as np
import pandas as pd


train_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/train.csv')
test_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/test.csv')
sub_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/gender_submission.csv')

In [2]:
# Remove features that won't be used to make model
def delete_features(df):
    feature_list = ['PassengerId', 'Ticket', 'Cabin']
    df.drop(feature_list, axis = 1, inplace = True)
    

# Fill Null Value at train data
def fill_NaN_train(df):
    index_list = df[df['Age'].isna() == 1].index.tolist()
    for index in index_list:
        Pclass = df[df.index == index]['Pclass'].values.tolist()[0]
        if Pclass == 1:
            df['Age'][index] = df[df['Pclass'] == 1].loc[:,'Age'].mean()
        elif Pclass == 2:
            df['Age'][index] = df[df['Pclass'] == 2].loc[:,'Age'].mean()
        else:
            df['Age'][index] = df[df['Pclass'] == 3].loc[:,'Age'].mean()

    df['Embarked'] = df['Embarked'].fillna('C')

# Fill Null Value at train test
def fill_NaN_test(df):
    index_list = df[df['Age'].isna() == 1].index.tolist()
    for index in index_list:
        Pclass = df[df.index == index]['Pclass'].values.tolist()[0]
        if Pclass == 1:
            df['Age'][index] = df[df['Pclass'] == 1].loc[:,'Age'].mean()
        elif Pclass == 2:
            df['Age'][index] = df[df['Pclass'] == 2].loc[:,'Age'].mean()
        else:
            df['Age'][index] = df[df['Pclass'] == 3].loc[:,'Age'].mean()
            
    df['Fare'][152] = 28.230436


# Extract title from Name
def Name_Engineering_train(df):
    Title_list = []
    for str in df['Name']:
        str1 = str.split(', ')[1]
        str2 = str1.split('.')[0]
        Title_list.append(str2)

    
    df['Title'] = Title_list
    
    list = ['Don', 'Rev', 'Dr', 'Mme', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess', 'Jonkheer', 'Ms']
    for ele in list:
        df['Title'] = df['Title'].replace(ele, 'None', inplace = False)
    
    df.drop('Name', axis = 1, inplace = True)

# Extract title from Name
def Name_Engineering_test(df):
    Title_list = []
    for str in df['Name']:
        str1 = str.split(', ')[1]
        str2 = str1.split('.')[0]
        Title_list.append(str2)

    df['Title'] = Title_list
    
    list = ['Dona', 'Rev', 'Dr', 'Col', 'Ms']
    for ele in list:
        df['Title'] = df['Title'].replace(ele, 'None', inplace = False)

    df.drop('Name', axis = 1, inplace = True)

# Transform continous data to 9 selections
def Age_Engineering(df):
    def Age_Conversion(x):
        str = ''
        if x < 13:
            str = 'Child'
        elif x < 35:
            str = 'Adult'
        elif x < 55:
            str = 'Old'
        else:
            str = 'Senior'
        return str

    df['Age'] = df['Age'].apply(lambda x : Age_Conversion(x))


# Combine SibSp feature and Parch feature to make Family new feature and make 4 selections
def Family_Engineering(df):
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(['SibSp', 'Parch'], axis = 1, inplace = True)
    
    def Family_Conversion(x):
        str = ''
        if x == 0:
            str = 'Alone'
        elif x <= 3:
            str = 'SmallFamily'
        elif x <=5:
            str = 'MediumFamily'
        else:
            str = 'BigFamily'

        return str

    df['Family'] = df['Family'].apply(lambda x : Family_Conversion(x))

# Transform continous data to 4 selections
def Fare_Engineering(df):
    def Fare_Conversion(x):
        str = ''
        if x < 7.910400:
            str = 'level1'
        elif x < 14.454200:
            str = 'level2'
        elif x < 31:
            str = 'level3'
        else:
            str = 'level4'
        return str
    
    df['Fare'] = df['Fare'].apply(lambda x : Fare_Conversion(x))

In [3]:
def Feature_Engineering_train(df):
    delete_features(df)
    fill_NaN_train(df)
    Name_Engineering_train(df)
    Age_Engineering(df)
    Family_Engineering(df)
    Fare_Engineering(df)

def Feature_Engineering_test(df):
    delete_features(df)
    fill_NaN_test(df)
    Name_Engineering_test(df)
    Age_Engineering(df)
    Family_Engineering(df)
    Fare_Engineering(df)

In [4]:
import warnings
warnings.filterwarnings(action='ignore')

Feature_Engineering_train(train_df)
Feature_Engineering_test(test_df)

from sklearn.preprocessing import OneHotEncoder

train_df = pd.get_dummies(train_df, columns = ['Title', 'Sex', 'Family', 'Age', 'Fare', 'Embarked'])
test_df = pd.get_dummies(test_df, columns = ['Title', 'Sex', 'Family', 'Age', 'Fare', 'Embarked'])                          

In [5]:
Feature = train_df.drop('Survived', axis = 1, inplace = False)
Label = train_df['Survived']

In [6]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

dt_model = DecisionTreeClassifier(random_state = 0)
rf_model = RandomForestClassifier(random_state = 0)
xgb_model = XGBClassifier(random_state = 0)
lgb_model = LGBMClassifier(random_state = 0)
lr_model = LogisticRegression()

In [7]:
# Cheking None-Tuning models performance
from sklearn.model_selection import cross_val_score

dt_score = cross_val_score(dt_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Decision Tree 정확도 : ", np.mean(dt_score))

rf_score = cross_val_score(rf_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Random Forest 정확도 : ", np.mean(rf_score))

xgb_score = cross_val_score(xgb_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("XGBoost 정확도 : ", np.mean(xgb_score))

lgb_score = cross_val_score(lgb_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("LightGBM 정확도 : ", np.mean(dt_score))

lr_score = cross_val_score(lr_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Logistic Regression 정확도 : ", np.mean(dt_score))

Decision Tree 정확도 :  0.8013746783001695
Random Forest 정확도 :  0.8193459293201932
XGBoost 정확도 :  0.8226978846274559
LightGBM 정확도 :  0.8013746783001695
Logistic Regression 정확도 :  0.8013746783001695


In [10]:
# Hyper Parameter tuning

random_seed = [0]

# DecisionTree model hyper parameter
dt_params = {'max_depth' : [ i for i in range(2,11) ],
             'min_samples_leaf' : [ 2 * i for i in range(4,11)],
             'criterion' : ['gini', 'entropy'],
             'random_state' : random_seed}

# RandomForest model hyper parameter
rf_params = {'n_estimators' : [50, 100, 125, 150, 175, 200, 250, 300],
             'max_depth' : [i for i in range(2,11)],
             'criterion' : ['gini', 'entropy'],
             'min_samples_leaf' : [ 2 * i for i in range(4,11)],
             'random_state' : random_seed}


from sklearn.model_selection import GridSearchCV

tuned_dt = GridSearchCV(dt_model, param_grid = dt_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_dt.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Decision Tree', tuned_dt.best_params_, tuned_dt.best_score_))

tuned_rf = GridSearchCV(rf_model, param_grid = rf_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_rf.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Random Forest', tuned_rf.best_params_, tuned_rf.best_score_))

Decision Tree 최적 파라미터 : {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 8, 'random_state': 0}, 이때 정확도 : 0.8260059004456719

Random Forest 최적 파라미터 : {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 8, 'n_estimators': 150, 'random_state': 0}, 이때 정확도 : 0.8226476680685466

In [56]:
# Hyper parameter tuning XGB with Bayesian hyperopt
from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

xgb_search_space = {'max_depth' : hp.quniform('max_depth', 2, 6, 1),
                    'min_child_weight' : hp.quniform('min_child_weight', 6, 20, 2),
                    'colsample_bytree' : hp.uniform('colsample_bytree',0.5, 1),
                    'n_estimators' : hp.quniform('n_estimators', 100, 400, 50)}

In [68]:
# XGB objective function
from sklearn.model_selection import cross_val_score

# caanot use early_stopping_rounds at voting
def xgb_objective_func(search_space):
    xgb_model = XGBClassifier(eval_metric = 'logloss', random_state = 0,
                              max_depth = int(search_space['max_depth']),
                              n_estimators = int(search_space['n_estimators']),
                              min_child_weight = int(search_space['min_child_weight']),
                              colsample_bytree = search_space['colsample_bytree'])
    
    score_list = cross_val_score(xgb_model, Feature, Label, cv = 5, scoring = 'accuracy')

    return {'loss' : -1 * np.mean(score_list), 'status' : STATUS_OK}

In [58]:
# Google Colab

#trials = Trials()

#xgb_best_params = fmin(fn = xgb_objective_func, space = xgb_search_space, algo = tpe.suggest, max_evals = 50, trials = trials)
#print("XGB 최적 파라미터 : ", xgb_best_params)

XGB 최적 파라미터 :  {'colsample_bytree': 0.7851036312778981, 'max_depth': 5.0, 'min_child_weight': 20.0, 'n_estimators': 400.0}

In [93]:
# Find optimal learning_rate for XGB
xgb_lr = {'learning_rate' : hp.uniform('learning_rate', 0.01, 0.3)}

In [94]:
from sklearn.model_selection import cross_val_score

def xgb_lr_func(search_space):
    xgb_model = XGBClassifier(random_state = 0,
                              max_depth = 5,
                              n_estimators = 400,
                              min_child_weight = 20,
                              colsample_bytree = 0.7851036312778981,
                              learning_rate = search_space['learning_rate'])

    score_list = cross_val_score(xgb_model, Feature, Label, scoring = 'accuracy', cv = 5)

    return {'loss' : -1 * np.mean(score_list), 'status' : STATUS_OK}

In [95]:
trials = Trials()

xgb_best_lr = fmin(fn = xgb_lr_func, space = xgb_lr, algo = tpe.suggest, max_evals = 100, trials = trials)
print("XGB 최적 learning_rate : ", xgb_best_lr)

100%|█████████████████████████████████████████████| 100/100 [04:03<00:00,  2.44s/trial, best loss: -0.8294143493817087]
XGB 최적 learning_rate :  {'learning_rate': 0.2839561561151697}


In [103]:
# Hyper parameter tuning LGB with Bayesian hyperopt

from hyperopt import hp, STATUS_OK, fmin, tpe, Trials

lgb_search_space = {'max_depth' : hp.quniform('max_depth', 2, 5, 1),
                    'num_leaves' : hp.quniform('num_leaves', 6, 20, 2),
                    'colsample_bytree' : hp.uniform('colsample_bytree',0.5, 1),
                    'n_estimators' : hp.quniform('n_estimators', 100, 400, 50),
                    'min_child_samples' : hp.quniform('min_child_samples', 10, 30, 2)}

In [104]:
# LGBM objective function
from sklearn.model_selection import cross_val_score

def lgb_objective_func(search_space):
    lgb_model = LGBMClassifier(random_state = 0,
                              max_depth = int(search_space['max_depth']),
                              n_estimators = int(search_space['n_estimators']),
                              min_child_samples = int(search_space['min_child_samples']),
                              colsample_bytree = search_space['colsample_bytree'],
                              num_leaves = int(search_space['num_leaves'])
                              )

    score_list = cross_val_score(lgb_model, Feature, Label , scoring = 'accuracy', cv = 5)

    return {'loss' : -1 * np.mean(score_list), 'status' : STATUS_OK}

In [105]:
trials = Trials()

lgb_best_params = fmin(fn = lgb_objective_func, space = lgb_search_space, algo = tpe.suggest, max_evals = 100, trials = trials)
print("LGB 최적 파라미터 : ", lgb_best_params)

100%|█████████████████████████████████████████████| 100/100 [02:23<00:00,  1.43s/trial, best loss: -0.8316427091833534]
LGB 최적 파라미터 :  {'colsample_bytree': 0.8557949199571184, 'max_depth': 5.0, 'min_child_samples': 12.0, 'n_estimators': 100.0, 'num_leaves': 18.0}


In [90]:
# Find optimal learning_rate for LGBM
lgb_lr = {'learning_rate' : hp.uniform('learning_rate', 0.01, 0.3)}

In [106]:
from sklearn.model_selection import cross_val_score

def lgb_lr_func(search_space):
    lgb_model = LGBMClassifier(
                              max_depth = 5,
                              n_estimators = 100,
                              min_child_samples = 12,
                              colsample_bytree = 0.8557949199571184,
                              num_leaves = 18,
                              learning_rate = search_space['learning_rate']
                              )

    score_list = cross_val_score(lgb_model, Feature, Label , scoring = 'accuracy', cv = 5)

    return {'loss' : -1 * np.mean(score_list), 'status' : STATUS_OK}

In [107]:
trials = Trials()

lgb_best_lr = fmin(fn = lgb_lr_func, space = lgb_lr, algo = tpe.suggest, max_evals = 100, trials = trials)
print("LGB 최적 learning_rate : ", lgb_best_lr)

100%|█████████████████████████████████████████████| 100/100 [01:52<00:00,  1.12s/trial, best loss: -0.8338899001945892]
LGB 최적 learning_rate :  {'learning_rate': 0.11482673793937354}


In [50]:
# Logistic Regression model hyper parameter
lr_params = {'penalty' : ['l1', 'l2'],
             'C' : [0.01, 0.1, 0.05, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 150, 200],
             'solver' : ['lbfgs', 'liblinear']}

In [53]:
from sklearn.model_selection import GridSearchCV

tuned_lr = GridSearchCV(lr_model, param_grid = lr_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_lr.fit(Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Logistic Regression', tuned_lr.best_params_, tuned_lr.best_score_))

Logistic Regression 최적 파라미터 : {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}, 이때 정확도 : 0.8271608813006088


In [110]:
# voting
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

dt_model = DecisionTreeClassifier(random_state = 0, max_depth = 4, min_samples_leaf = 8, criterion = 'gini')
rf_model = RandomForestClassifier(random_state = 0, criterion = 'gini', max_depth = 5, min_samples_leaf = 8, n_estimators = 150)
xgb_model = XGBClassifier(random_state = 0, max_depth = 5, min_child_weight = 20,
                         colsample_bytree = 0.7851036312778981, n_estimators = 400, learning_rate = 0.2839561561151697)
lgb_model = LGBMClassifier(random_state = 0, max_depth = 5, num_leaves = 18, min_child_samples = 12,
                           colsample_bytree = 0.8557949199571184, n_estimators = 100, learning_rate = 0.11482673793937354)
lr_model = LogisticRegression(C = 1, penalty = 'l2', solver = 'lbfgs')

# Hard Voting
hard_vote = VotingClassifier(estimators = [('DT', dt_model), ('RF', rf_model), ('XGB', xgb_model), ('LGBM', lgb_model), ('LR', lr_model)], 
                             voting = 'hard')
hard_score = cross_val_score(hard_vote, Feature, Label, scoring = 'accuracy', cv = 5)
print("Hard Voting 정확도 : ", np.mean(hard_score))

# Soft Voting
soft_vote = VotingClassifier(estimators = [('DT', dt_model), ('RF', rf_model), ('XGB', xgb_model), ('LGBM', lgb_model), ('LR', lr_model)], 
                             voting = 'soft')
soft_score = cross_val_score(soft_vote, Feature, Label, scoring = 'accuracy', cv = 5)
print("Soft Voting 정확도 : ", np.mean(soft_score))

Hard Voting 정확도 :  0.8271608813006089
Soft Voting 정확도 :  0.8249011361496453


In [113]:
hard_vote.fit(Feature, Label)
prediction = hard_vote.predict(test_df)

submission = pd.DataFrame({
"PassengerId":sub_df['PassengerId'],
"Survived": prediction
})
display(submission)

submission.to_csv('hard_submission.csv',index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [112]:
soft_vote.fit(Feature, Label)
prediction = soft_vote.predict(test_df)

submission = pd.DataFrame({
"PassengerId":sub_df['PassengerId'],
"Survived": prediction
})
display(submission)

submission.to_csv('soft_submission.csv',index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [4]:
# feature importance
import warnings
warnings.filterwarnings(action='ignore')

Feature_Engineering_train(train_df)
Feature_Engineering_test(test_df)

from sklearn.preprocessing import OneHotEncoder

train_df = pd.get_dummies(train_df, columns = ['Title', 'Sex', 'Family', 'Age', 'Fare'])
test_df = pd.get_dummies(test_df, columns = ['Title', 'Sex', 'Family', 'Age', 'Fare'])  

Feature = train_df.drop('Survived', axis = 1, inplace = False)
Label = train_df['Survived']

drop_Feature = Feature.drop(['Embarked'], axis = 1, inplace = False)
drop_test = test_df.drop(['Embarked'], axis = 1, inplace = False)

In [6]:
# voting
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

dt_model = DecisionTreeClassifier(random_state = 0, max_depth = 4, min_samples_leaf = 8, criterion = 'gini')
rf_model = RandomForestClassifier(random_state = 0, criterion = 'gini', max_depth = 5, min_samples_leaf = 8, n_estimators = 150)
xgb_model = XGBClassifier(random_state = 0, max_depth = 5, min_child_weight = 20,
                         colsample_bytree = 0.7851036312778981, n_estimators = 400, learning_rate = 0.2839561561151697)
lgb_model = LGBMClassifier(random_state = 0, max_depth = 5, num_leaves = 18, min_child_samples = 12,
                           colsample_bytree = 0.8557949199571184, n_estimators = 100, learning_rate = 0.11482673793937354)
lr_model = LogisticRegression(C = 1, penalty = 'l2', solver = 'lbfgs')

# Hard Voting
hard_vote = VotingClassifier(estimators = [('DT', dt_model), ('RF', rf_model), ('XGB', xgb_model), ('LGBM', lgb_model), ('LR', lr_model)], 
                             voting = 'hard')
hard_score = cross_val_score(hard_vote, drop_Feature, Label, scoring = 'accuracy', cv = 5)
print("Hard Voting 정확도 : ", np.mean(hard_score))

# Soft Voting
soft_vote = VotingClassifier(estimators = [('DT', dt_model), ('RF', rf_model), ('XGB', xgb_model), ('LGBM', lgb_model), ('LR', lr_model)], 
                             voting = 'soft')
soft_score = cross_val_score(soft_vote, drop_Feature, Label, scoring = 'accuracy', cv = 5)
print("Soft Voting 정확도 : ", np.mean(soft_score))

Hard Voting 정확도 :  0.8282719226664993
Soft Voting 정확도 :  0.8260247316552632


In [7]:
hard_vote.fit(drop_Feature, Label)
prediction = hard_vote.predict(drop_test)

submission = pd.DataFrame({
"PassengerId":sub_df['PassengerId'],
"Survived": prediction
})
display(submission)

submission.to_csv('hard_submission.csv',index=False)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


과적합을 잡기위해 데이터 분류와 파라미터 튜닝으로 검증 정확도를 약간 포기해보았지만, 큰 의미는 없었음

데이터 양이 너무 적어서 어쩔 수 없는 것으로 판단