## **Fourth Attempt**

In [2]:
import numpy as np
import pandas as pd


train_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/train.csv')
test_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/test.csv')
sub_df = pd.read_csv('C:/Users/user/Desktop/Study/Kaggle/Titanic/Data_File/gender_submission.csv')

In [3]:
# Remove features that won't be used to make model
def delete_features(df):
    feature_list = ['PassengerId', 'Ticket', 'Cabin']
    df.drop(feature_list, axis = 1, inplace = True)
    

# Fill Null Value at train data
def fill_NaN_train(df):
    index_list = df[df['Age'].isna() == 1].index.tolist()
    for index in index_list:
        Pclass = df[df.index == index]['Pclass'].values.tolist()[0]
        if Pclass == 1:
            df['Age'][index] = df[df['Pclass'] == 1].loc[:,'Age'].mean()
        elif Pclass == 2:
            df['Age'][index] = df[df['Pclass'] == 2].loc[:,'Age'].mean()
        else:
            df['Age'][index] = df[df['Pclass'] == 3].loc[:,'Age'].mean()

    df['Embarked'] = df['Embarked'].fillna('C')

# Fill Null Value at train test
def fill_NaN_test(df):
    index_list = df[df['Age'].isna() == 1].index.tolist()
    for index in index_list:
        Pclass = df[df.index == index]['Pclass'].values.tolist()[0]
        if Pclass == 1:
            df['Age'][index] = df[df['Pclass'] == 1].loc[:,'Age'].mean()
        elif Pclass == 2:
            df['Age'][index] = df[df['Pclass'] == 2].loc[:,'Age'].mean()
        else:
            df['Age'][index] = df[df['Pclass'] == 3].loc[:,'Age'].mean()
            
    df['Fare'][152] = 28.230436


# Extract title from Name
def Name_Engineering_train(df):
    Title_list = []
    for str in df['Name']:
        str1 = str.split(', ')[1]
        str2 = str1.split('.')[0]
        Title_list.append(str2)

    
    df['Title'] = Title_list
    
    list = ['Don', 'Rev', 'Dr', 'Mme', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess', 'Jonkheer', 'Ms']
    for ele in list:
        df['Title'] = df['Title'].replace(ele, 'None', inplace = False)
    
    df.drop('Name', axis = 1, inplace = True)

# Extract title from Name
def Name_Engineering_test(df):
    Title_list = []
    for str in df['Name']:
        str1 = str.split(', ')[1]
        str2 = str1.split('.')[0]
        Title_list.append(str2)

    df['Title'] = Title_list
    
    list = ['Dona', 'Rev', 'Dr', 'Col', 'Ms']
    for ele in list:
        df['Title'] = df['Title'].replace(ele, 'None', inplace = False)

    df.drop('Name', axis = 1, inplace = True)

# Transform continous data to 9 selections
def Age_Engineering(df):
    def Age_Conversion(x):
        str = ''
        if x < 13:
            str = 'Child'
        elif x < 35:
            str = 'Adult'
        elif x < 55:
            str = 'Old'
        else:
            str = 'Senior'
        return str

    df['Age'] = df['Age'].apply(lambda x : Age_Conversion(x))


# Combine SibSp feature and Parch feature to make Family new feature and make 4 selections
def Family_Engineering(df):
    df['Family'] = df['SibSp'] + df['Parch']
    df.drop(['SibSp', 'Parch'], axis = 1, inplace = True)
    
    def Family_Conversion(x):
        str = ''
        if x == 0:
            str = 'Alone'
        elif x <= 3:
            str = 'SmallFamily'
        elif x <=5:
            str = 'MediumFamily'
        else:
            str = 'BigFamily'

        return str

    df['Family'] = df['Family'].apply(lambda x : Family_Conversion(x))

# Transform continous data to 4 selections
def Fare_Engineering(df):
    def Fare_Conversion(x):
        str = ''
        if x < 7.910400:
            str = 'level1'
        elif x < 14.454200:
            str = 'level2'
        elif x < 31:
            str = 'level3'
        else:
            str = 'level4'
        return str
    
    df['Fare'] = df['Fare'].apply(lambda x : Fare_Conversion(x))

In [4]:
def Feature_Engineering_train(df):
    delete_features(df)
    fill_NaN_train(df)
    Name_Engineering_train(df)
    Age_Engineering(df)
    Family_Engineering(df)
    Fare_Engineering(df)

def Feature_Engineering_test(df):
    delete_features(df)
    fill_NaN_test(df)
    Name_Engineering_test(df)
    Age_Engineering(df)
    Family_Engineering(df)
    Fare_Engineering(df)

In [5]:
import warnings
warnings.filterwarnings(action='ignore')

Feature_Engineering_train(train_df)
Feature_Engineering_test(test_df)

from sklearn.preprocessing import OneHotEncoder

train_df = pd.get_dummies(train_df, columns = ['Title', 'Sex', 'Family', 'Age', 'Fare', 'Embarked'])
test_df = pd.get_dummies(test_df, columns = ['Title', 'Sex', 'Family', 'Age', 'Fare', 'Embarked'])                          

In [6]:
Feature = train_df.drop('Survived', axis = 1, inplace = False)
Label = train_df['Survived']

In [10]:
# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

dt_model = DecisionTreeClassifier(random_state = 0)
rf_model = RandomForestClassifier(random_state = 0)
xgb_model = XGBClassifier(random_state = 0)
lgb_model = LGBMClassifier(random_state = 0)
lr_model = LogisticRegression()

In [11]:
# Cheking None-Tuning models performance
from sklearn.model_selection import cross_val_score

dt_score = cross_val_score(dt_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Decision Tree 정확도 : ", np.mean(dt_score))

rf_score = cross_val_score(rf_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Random Forest 정확도 : ", np.mean(rf_score))

xgb_score = cross_val_score(xgb_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("XGBoost 정확도 : ", np.mean(xgb_score))

lgb_score = cross_val_score(lgb_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("LightGBM 정확도 : ", np.mean(dt_score))

lr_score = cross_val_score(lr_model, Feature, Label, scoring = 'accuracy', cv = 5)
print("Logistic Regression 정확도 : ", np.mean(dt_score))

Decision Tree 정확도 :  0.8013746783001695
Random Forest 정확도 :  0.8193459293201932
XGBoost 정확도 :  0.8226978846274559
LightGBM 정확도 :  0.8013746783001695
Logistic Regression 정확도 :  0.8013746783001695


In [12]:
# Hyper Parameter tuning

random_seed = [0]

# DecisionTree model hyper parameter
dt_params = {'max_depth' : [ i for i in range(2,11) ],
             'min_samples_split' : [ i for i in range(2,21)],
             'min_samples_leaf' : [i for i in range(1,21)],
             'criterion' : ['gini', 'entropy'],
             'random_state' : random_seed}

# RandomForest model hyper parameter
rf_params = {'n_estimators' : [50, 100, 125, 150, 175, 200, 250, 300],
             'max_depth' : [i for i in range(2,11)],
             'criterion' : ['gini', 'entropy'],
             'min_samples_leaf' : [ i for i in range(1,21)],
             'random_state' : random_seed}

# XGBoost model hyper parameter
xgb_params = {'n_estimators' : [100, 150, 200, 250, 300, 350, 400],
              'max_depth' :[i for i in range(2,11)],
              'min_child_weight' : [i for i in range(2,21)],
              'colsample_bytree' : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'subsample' : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'random_state' : random_seed}

# LightGBM model hyper parameter
lgb_params = {'n_estimators' : [100, 150, 200, 250, 300, 350, 400],
              'max_depth' : [i for i in range(2,11)],
              'num_leaves' : [2 * i for i in range(2,61)],
              'min_child_samples' : [ i for i in range(1,21)],
              'min_child_depth' : [ i for i in range(2,21)],
              'colsample_bytree' : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'subsample' : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
              'random_state' : random_seed}

# Logistic Regression model hyper parameter
lr_params = {'penalty' : ['l1', 'l2'],
             'C' : [0.01, 0.1, 0.05, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 150, 200],
             'solver' : ['lbfgs', 'liblinear']}

from sklearn.model_selection import GridSearchCV

tuned_dt = GridSearchCV(dt_model, param_grid = dt_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_dt.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Decision Tree', tuned_dt.best_params_, tuned_dt.best_score_))

tuned_rf = GridSearchCV(rf_model, param_grid = rf_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_rf.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Random Forest', tuned_rf.best_params_, tuned_rf.best_score_))

tuned_xgb = GridSearchCV(xgb_model, param_grid = xgb_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_xgb.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('XGBoost', tuned_xgb.best_params_, tuned_xgb.best_score_))

tuned_lgb = GridSearchCV(lgb_model, param_grid = lgb_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_lgb.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('LightGBM', tuned_lgb.best_params_, tuned_lgb.best_score_))

tuned_lr = GridSearchCV(lr_model, param_grid = lr_params, cv = 5, scoring = 'accuracy', refit = True)
tuned_lr.fit(drop_Feature, Label)
print("{0} 최적 파라미터 : {1}, 이때 정확도 : {2}".format('Logistic Regression', tuned_lr.best_params_, tuned_lr.best_score_))

Decision Tree 최적 파라미터 : {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 16, 'random_state': 0}, 이때 정확도 : 0.8406063649488418
Random Forest 최적 파라미터 : {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 2, 'n_estimators': 50, 'random_state': 0}, 이때 정확도 : 0.8316301550436256