In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler, MaxAbsScaler 
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

from sklearn.ensemble import HistGradientBoostingClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_curve, auc
from sklearn import set_config
set_config(display='diagram') # Để trực quan hóa pipeline

In [2]:
data_df = pd.read_csv('train.csv', index_col=0)
data_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Tách giá trị khỏi tập train
y_df = data_df["Survived"]
X_df = data_df.drop("Survived", axis=1)

In [4]:
#Family_Survival
class FamilySurvival(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y):
        #Kiểm tra bằng surname
        surname_col = X['Name'].str.extract('(.+),')
        X = X.assign(Surname = surname_col)
        Xy = X.assign(Survived=y)
        PassID_all_live = []
        PassID_one_die = []
        #Gom nhóm gia đình lại dựa vào tên và giá vé do cả gia đình đi thì sẽ có tổng chung 1 giá vé
        for grp, grp_df in Xy[['Survived','Name', 'Surname', 'Fare', 
                                   'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Surname', 'Fare']):
            if (len(grp_df) != 1):
                for PassID, row in grp_df.iterrows():
                    # Tìm xem 1 gia đình có bất kì người nào còn sống không
                    smax = grp_df.drop(PassID)['Survived'].max()
                    smin = grp_df.drop(PassID)['Survived'].min()
                    # Lưu lại tất cả các ID của gia đình nào còn sống toàn bộ vào biến PassID_all_live
                    if (smax==1.0): PassID_all_live.append(PassID)
                    # Nếu gia đình có bất kỳ người nào chết thì sẽ được lưu lại vào biến PassID_one_die
                    elif (smin==0.0): PassID_one_die.append(PassID)
        
        #Kiểm tra bằng ticket
        Xy = X.assign(Survived=y)
        #Gom nhóm nhóm lại dựa vào tên vé
        for grp, grp_df in Xy[['Survived','Name',
                                   'SibSp', 'Parch', 'Age', 'Cabin','Ticket']].groupby(['Ticket']):
            if (len(grp_df) != 1):
                for PassID, row in grp_df.iterrows():
                    # Tìm xem 1 nhóm có bất kì người nào còn sống không
                    smax = grp_df.drop(PassID)['Survived'].max()
                    smin = grp_df.drop(PassID)['Survived'].min()
                    # Lưu lại tất cả các ID của gia đình nào còn sống toàn bộ vào biến PassID_all_live
                    if (smax==1.0): PassID_all_live.append(PassID)
                    # Nếu gia đình có bất kỳ người nào chết thì sẽ được lưu lại vào biến PassID_one_die
                    elif (smin==0.0): PassID_one_die.append(PassID)
        #Đưa giá trị vào
        self.PassID_one_die = PassID_one_die
        self.PassID_all_live = PassID_all_live
        return self
    def transform(self,X):
        family_survival = pd.Series(0.5,index = X.index)
        family_survival[X.index.isin(self.PassID_one_die)] = 0
        family_survival[X.index.isin(self.PassID_all_live)] = 1
        X = X.assign(Family_Survival=family_survival)
        
        return X

In [5]:
#WomanBoy https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever
class WomanBoy(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y):        
        Xy = X.assign(Survived = y)
        title_col = Xy.Name.str.extract(r'([a-zA-z]+)\.', expand=False)
        surname_col = Xy.Name.str.extract('(.+),')
        Xy = Xy.assign(Surname = surname_col, Title = title_col)
        Xy['IsWomanOrChild'] = ((Xy.Title == 'Master') | (Xy.Sex == 'female'))
        Xy['Surname'] = Xy.Name.str.split(',').str[0]

        family = Xy.groupby(Xy.Surname).Survived
        Xy['FamilyTotalCount'] = family.transform(lambda s: s[Xy.IsWomanOrChild].fillna(0).count())
        Xy['FamilyTotalCount'] = Xy.mask(Xy.IsWomanOrChild, Xy.FamilyTotalCount - 1, axis=0).FamilyTotalCount
        Xy['FamilySurvivedCount'] = family.transform(lambda s: s[Xy.IsWomanOrChild].fillna(0).sum())
        Xy['FamilySurvivedCount'] = Xy.mask(Xy.IsWomanOrChild, Xy.FamilySurvivedCount - Xy.Survived.fillna(0), axis=0).FamilySurvivedCount
        Xy['FamilySurvivalRate'] = (Xy.FamilySurvivedCount / Xy.FamilyTotalCount.replace(0, np.nan))
        Xy['Alone'] = Xy.FamilyTotalCount == 0
        
        self.Family_Total_Count = Xy['FamilyTotalCount']
        self.Family_Survived_Count = Xy['FamilySurvivedCount']
        self.Alone = Xy['Alone']
        self.Family_Survival_Rate = Xy['FamilySurvivalRate']
        
        return self
    def transform(self,X):
        X['Alone'] = self.Alone
        X['FamilyTotalCount'] = self.Family_Total_Count    
        X['FamilySurvivedCount'] = self.Family_Survived_Count
        X['FamilySurvivalRate'] = self.Family_Survival_Rate
        return X

In [6]:
#Title
class TitleAndFillMissing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y):
        return self
    def transform(self,X):
        #Lấy cách xưng hô
        title_col = X.Name.str.extract(r'([a-zA-z]+)\.', expand=False)
        X['Title'] = title_col
        
        #Chuyển đổi 1 số cách xưng hô hiếm về những dạng common hơn để tránh việc over-fitting
        X['Title'] = X['Title'].replace(['Capt', 'Col', 'Rev', 'Dr', 'Major'], 'Others')
        X['Title'] = X['Title'].replace(['Countess', 'Lady', 'Sir', 'Jonkheer', 'Dona','Don'], 'Royal')
        X['Title'] = X['Title'].replace('Mlle', 'Miss')
        X['Title'] = X['Title'].replace('Ms', 'Miss')
        X['Title'] = X['Title'].replace('Mme', 'Mrs')
        X.loc[(X.Title=='Miss') & (X.Parch!=0), 'Title']="FemaleChild"

        
        #Nhóm lại thành 1 nhóm để điền tuổi thiếu vào        
        grouped = X.groupby(['Title','Pclass','Sex'])
        X.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))
        
        #Nhóm lại thành 1 nhóm để điền giá vé thiếu vào 
        grouped = X.groupby(['Pclass'])
        X.Fare = grouped.Fare.apply(lambda x: x.fillna(x.median()))
        
        #Điền Embarked thiếu bằng giá trị xuất hiện nhiều nhất
        X['Embarked'] = X['Embarked'].replace(np.nan,'S')
        
        return X

In [7]:
#Family
class Family(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y):
        return self
    def transform(self,X):
        X['FamilySize'] = X['Parch'] + X['SibSp'] + 1
        X.loc[(X['FamilySize']==1),'Family'] = 0
        X.loc[(X['FamilySize']<=4) & (X['FamilySize']>1),'Family'] = 1
        X.loc[(X['FamilySize']>4),'Family'] = 2
       
        X.drop(columns = ['Parch','SibSp'], axis = 1, inplace = True)
        
        return X

In [8]:
#Age
class Age(BaseEstimator, TransformerMixin):
    def __init__(self, age = 0):
        self.age = age #1,3,8,15
    def fit(self,X,y):#11
        return self
    def transform(self,X):
        X['Child'] = X.Age.apply(lambda x: 1 if x <=  11 else 0)
        X['AgeCut'] = pd.qcut(X['Age'],4)
        
        boy_child_col = X['Name'].str.contains('Master.').astype(int)
        X = X.assign(BoyChild=boy_child_col)
        X['GirlChild'] = X['Title'].apply(lambda x : 1 if x == 'FemaleChild' else 0)        
        X['Child'] = X['Child'] + X['BoyChild'] + X['GirlChild']
        X['Child'] = X.Child.apply(lambda x: 1 if x >= 1 else x)
        X.drop(columns = ['Age','BoyChild','GirlChild'], axis = 1, inplace = True)
        
        return X

In [9]:
#Fare
class Fare(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y):
        return self
    def transform(self,X):
        X['FareCut'] = pd.qcut(X['Fare'],4)
        
        X.drop(columns = ['Fare'], axis = 1, inplace = True)

        return X

In [10]:
#Dropper
class ColDropper(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self,X,y):
        return self
    def transform(self,X):

        X.drop(columns = ['Family_Survival','Name','Cabin','FamilySize'], axis = 1, inplace = True)
        
        return X

In [11]:
#Tạo pipeline
cate_cols = ['Embarked','Pclass','Title','Sex','Ticket','AgeCut','Alone']
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse = False, handle_unknown='ignore'))])
col_transformer = ColumnTransformer(transformers=[
    ('cat', cat_transformer, cate_cols)])
change_columns = Pipeline(steps = [
    ('fasu', FamilySurvival()),
    ('wb',WomanBoy()),
    ('timi', TitleAndFillMissing()),
    ('fami', Family()),
    ('age',Age()),
    ('drop', ColDropper())
])
preprocess_pipeline = Pipeline(steps=[
    ('change_col', change_columns),
    ('col_trans', col_transformer),
    ('std', StandardScaler()),
])

In [12]:
temp = X_df.copy()
check = change_columns.fit_transform(temp,y_df)
check[check['Child'] == 1]

Unnamed: 0_level_0,Pclass,Sex,Ticket,Fare,Embarked,Alone,FamilyTotalCount,FamilySurvivedCount,FamilySurvivalRate,Title,Family,Child,AgeCut
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8,3,male,349909,21.0750,S,False,3,0,0.0,Master,2.0,1,"(0.419, 21.0]"
11,3,female,PP 9549,16.7000,S,False,1,1,1.0,FemaleChild,1.0,1,"(0.419, 21.0]"
17,3,male,382652,29.1250,Q,False,4,0,0.0,Master,2.0,1,"(0.419, 21.0]"
25,3,female,349909,21.0750,S,False,3,0,0.0,FemaleChild,2.0,1,"(0.419, 21.0]"
44,2,female,SC/Paris 2123,41.5792,C,False,1,1,1.0,FemaleChild,1.0,1,"(0.419, 21.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,3,female,2678,15.2458,C,False,1,0,0.0,FemaleChild,1.0,1,"(0.419, 21.0]"
854,1,female,PC 17592,39.4000,S,True,0,0,,FemaleChild,1.0,1,"(0.419, 21.0]"
864,3,female,CA. 2343,69.5500,S,False,3,0,0.0,FemaleChild,2.0,1,"(0.419, 21.0]"
870,3,male,347742,11.1333,S,False,2,2,1.0,Master,1.0,1,"(0.419, 21.0]"


In [13]:
GB_pipeline = Pipeline(steps=[
    ('prep_pipeline',preprocess_pipeline),
    ('gradient_boosting',GradientBoostingClassifier(max_depth = 2))])

In [14]:
XGB_pipeline = Pipeline(steps=[
    ('prep_pipeline',preprocess_pipeline),
    ('gradient_boosting',XGBClassifier(max_depth = 2, eval_metric='logloss'))])

In [15]:
LGBM_pipeline = Pipeline(steps=[
    ('prep_pipeline',preprocess_pipeline),
    ('gradient_boosting',LGBMClassifier(max_depth = 3))])

In [16]:
test_real = pd.read_csv('submission.csv', index_col = 0)

In [17]:
NFOLD = 5
skf = StratifiedKFold(n_splits=NFOLD, shuffle=True)
test_df = pd.read_csv('test.csv', index_col=0)
probs_GB = pd.DataFrame(np.zeros((len(test_df), NFOLD * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, NFOLD + 1) for j in range(2)])
probs_XGB = pd.DataFrame(np.zeros((len(test_df), NFOLD * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, NFOLD + 1) for j in range(2)])
probs_LGBM = pd.DataFrame(np.zeros((len(test_df), NFOLD * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, NFOLD + 1) for j in range(2)])

GB = []
XGB = []
LGBM = []
# for age in range(20):
#     print(age)
#     probs_GB = pd.DataFrame(np.zeros((len(test_df), NFOLD * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, NFOLD + 1) for j in range(2)])
#     probs_XGB = pd.DataFrame(np.zeros((len(test_df), NFOLD * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, NFOLD + 1) for j in range(2)])
#     probs_LGBM = pd.DataFrame(np.zeros((len(test_df), NFOLD * 2)), columns=['Fold_{}_Prob_{}'.format(i, j) for i in range(1, NFOLD + 1) for j in range(2)])

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_df, y_df), 1):
    #print('Fold {}\n'.format(fold))
#     GB_pipeline.set_params(prep_pipeline__change_col__age__age = age)
#     XGB_pipeline.set_params(prep_pipeline__change_col__age__age = age)
#     LGBM_pipeline.set_params(prep_pipeline__change_col__age__age = age)
    #GB
    #print('Gradient Boosting\n')
#     GB_pipeline.fit(X_df.iloc[trn_idx], y_df.iloc[trn_idx])
#     probs_GB.loc[:, 'Fold_{}_Prob_0'.format(fold)] = GB_pipeline.predict_proba(test_df)[:, 0]
#     probs_GB.loc[:, 'Fold_{}_Prob_1'.format(fold)] = GB_pipeline.predict_proba(test_df)[:, 1]
#     val_fpr, val_tpr, val_thresholds = roc_curve(y_df.iloc[val_idx], GB_pipeline.predict_proba(X_df.iloc[val_idx])[:, 1])
#     val_auc_score = auc(val_fpr, val_tpr)
# #     print ('auc score val ' + str(fold) + ': ' + str(val_auc_score))
#     GB.append(val_auc_score)

#     #XGB
#     #print('Exterme Gradient Boosting\n')
#     XGB_pipeline.fit(X_df.iloc[trn_idx], y_df.iloc[trn_idx])
#     probs_XGB.loc[:, 'Fold_{}_Prob_0'.format(fold)] = XGB_pipeline.predict_proba(test_df)[:, 0]
#     probs_XGB.loc[:, 'Fold_{}_Prob_1'.format(fold)] = XGB_pipeline.predict_proba(test_df)[:, 1]
#     val_fpr, val_tpr, val_thresholds = roc_curve(y_df.iloc[val_idx], XGB_pipeline.predict_proba(X_df.iloc[val_idx])[:, 1])
#     val_auc_score = auc(val_fpr, val_tpr)
# #     print ('auc score val ' + str(fold) + ': ' + str(val_auc_score))
#     XGB.append(val_auc_score)

    #GB
    #print('Light GBM\n')
    LGBM_pipeline.fit(X_df.iloc[trn_idx], y_df.iloc[trn_idx])
    probs_LGBM.loc[:, 'Fold_{}_Prob_0'.format(fold)] = LGBM_pipeline.predict_proba(test_df)[:, 0]
    probs_LGBM.loc[:, 'Fold_{}_Prob_1'.format(fold)] = LGBM_pipeline.predict_proba(test_df)[:, 1]
    val_fpr, val_tpr, val_thresholds = roc_curve(y_df.iloc[val_idx], LGBM_pipeline.predict_proba(X_df.iloc[val_idx])[:, 1])
    val_auc_score = auc(val_fpr, val_tpr)
    print ('auc score val ' + str(fold) + ': ' + str(val_auc_score))
    LGBM.append(val_auc_score)



#             #---------------------------------------------------#
#     class_survived = [col for col in probs_GB.columns if col.endswith('Prob_1')]
#     class_unsurvived = [col for col in probs_GB.columns if col.endswith('Prob_0')]
#     probs_GB['1'] = probs_GB[class_survived].sum(axis=1) / NFOLD
#     probs_GB['0'] = probs_GB[class_unsurvived].sum(axis=1) / NFOLD
#     probs_GB['pred'] = 0
#     pos = probs_GB[probs_GB['1'] >= probs_GB['0']].index
#     probs_GB.loc[pos, 'pred'] = 1

#     y_pred_GB = probs_GB['pred'].astype(int)

#     class_survived = [col for col in probs_XGB.columns if col.endswith('Prob_1')]
#     class_unsurvived = [col for col in probs_XGB.columns if col.endswith('Prob_0')]
#     probs_XGB['1'] = probs_XGB[class_survived].sum(axis=1) / NFOLD
#     probs_XGB['0'] = probs_XGB[class_unsurvived].sum(axis=1) / NFOLD
#     probs_XGB['pred'] = 0
#     pos = probs_XGB[probs_XGB['1'] >= probs_XGB['0']].index
#     probs_XGB.loc[pos, 'pred'] = 1

#     y_pred_XGB = probs_XGB['pred'].astype(int)

#     class_survived = [col for col in probs_LGBM.columns if col.endswith('Prob_1')]
#     class_unsurvived = [col for col in probs_LGBM.columns if col.endswith('Prob_0')]
#     probs_LGBM['1'] = probs_LGBM[class_survived].sum(axis=1) / NFOLD
#     probs_LGBM['0'] = probs_LGBM[class_unsurvived].sum(axis=1) / NFOLD
#     probs_LGBM['pred'] = 0
#     pos = probs_LGBM[probs_LGBM['1'] >= probs_LGBM['0']].index
#     probs_LGBM.loc[pos, 'pred'] = 1

#     y_pred_LGBM = probs_LGBM['pred'].astype(int)

#     diff_val = 0
#     for idx in range(len(y_pred_GB)):
#         if(test_real.iloc[idx].values != y_pred_XGB[idx]):
#             diff_val += 1
#     diff = diff_val/len(y_pred_GB)
#     score = 1 - diff
#     print(score)

#     diff_val = 0
#     for idx in range(len(y_pred_XGB)):
#         if(test_real.iloc[idx].values != y_pred_XGB[idx]):
#             diff_val += 1
#     diff = diff_val/len(y_pred_XGB)
#     score = 1 - diff
#     print(score)

#     diff_val = 0
#     for idx in range(len(y_pred_LGBM)):
#         if(test_real.iloc[idx].values != y_pred_LGBM[idx]):
#             diff_val += 1
#     diff = diff_val/len(y_pred_LGBM)
#     score = 1 - diff
#     print(score)

In [18]:
print('Auc GB:' + str(sum(GB)/NFOLD))
print('Auc XGB:' + str(sum(XGB)/NFOLD))
print('Auc LGBM:' + str(sum(LGBM)/NFOLD))

Auc GB:0.866120202115005
Auc XGB:0.8574645271315184
Auc LGBM:0.8423283344958538


In [19]:
class_survived = [col for col in probs_GB.columns if col.endswith('Prob_1')]
class_unsurvived = [col for col in probs_GB.columns if col.endswith('Prob_0')]
probs_GB['1'] = probs_GB[class_survived].sum(axis=1) / NFOLD
probs_GB['0'] = probs_GB[class_unsurvived].sum(axis=1) / NFOLD
probs_GB['pred'] = 0
pos = probs_GB[probs_GB['1'] >= probs_GB['0']].index
probs_GB.loc[pos, 'pred'] = 1

y_pred_GB = probs_GB['pred'].astype(int)

In [20]:
class_survived = [col for col in probs_XGB.columns if col.endswith('Prob_1')]
class_unsurvived = [col for col in probs_XGB.columns if col.endswith('Prob_0')]
probs_XGB['1'] = probs_XGB[class_survived].sum(axis=1) / NFOLD
probs_XGB['0'] = probs_XGB[class_unsurvived].sum(axis=1) / NFOLD
probs_XGB['pred'] = 0
pos = probs_XGB[probs_XGB['1'] >= probs_XGB['0']].index
probs_XGB.loc[pos, 'pred'] = 1

y_pred_XGB = probs_XGB['pred'].astype(int)

In [21]:
class_survived = [col for col in probs_LGBM.columns if col.endswith('Prob_1')]
class_unsurvived = [col for col in probs_LGBM.columns if col.endswith('Prob_0')]
probs_LGBM['1'] = probs_LGBM[class_survived].sum(axis=1) / NFOLD
probs_LGBM['0'] = probs_LGBM[class_unsurvived].sum(axis=1) / NFOLD
probs_LGBM['pred'] = 0
pos = probs_LGBM[probs_LGBM['1'] >= probs_LGBM['0']].index
probs_LGBM.loc[pos, 'pred'] = 1

y_pred_LGBM = probs_LGBM['pred'].astype(int)

In [23]:
test_real = pd.read_csv('submission.csv', index_col = 0)

In [26]:
diff_val = 0
for idx in range(len(y_pred_LGBM)):
    if(test_real.iloc[idx].values != y_pred_LGBM[idx]):
        diff_val += 1
diff = diff_val/len(y_pred_LGBM)
score = 1 - diff
score

0.799043062200957

In [27]:
pass_ID = test_df.index.values
my_preds = pd.DataFrame({'PassengerId': pass_ID, 'Survived': y_pred_LGBM}, columns=['PassengerId','Survived']).set_index('PassengerId')
my_preds.to_csv('220122_5.csv')