In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

from sklearn.metrics import (accuracy_score, roc_auc_score, f1_score, precision_score, 
                             recall_score, confusion_matrix, log_loss, RocCurveDisplay,
                             PrecisionRecallDisplay, DetCurveDisplay, ConfusionMatrixDisplay, brier_score_loss)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier)

In [23]:
# Location of the training CSV (hosted on GitHub).
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/JamesLo94/schulich_data_science/main/0910/train.csv'

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(filepath_or_buffer=TRAIN_CSV_URL)

In [6]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1,Mr


In [24]:
# Store the string-valued Sex and Embarked columns as pandas categoricals.
df['Sex'] = df['Sex'].astype('category')
df['Embarked'] = df['Embarked'].astype('category')

# FamilySize counts the passenger (+1) together with SibSp and Parch;
# IsAlone is 1 exactly when FamilySize == 1.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

# Mapping that folds spelling variants and rare honorifics into a few groups.
# The regex below captures everything between the comma and the first period,
# so a name such as "Rothes, the Countess. of ..." yields 'the Countess'
# (with the article). Both spellings are mapped so that title is not left
# behind as its own singleton category.
TITLE_GROUPS = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Noble', 'Countess': 'Noble', 'the Countess': 'Noble',
    'Sir': 'Noble', "Don": 'Noble', 'Dona': 'Noble', 'Jonkheer': 'Noble',
    'Dr': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
    'Capt': 'Officer', 'Rev': 'Officer',
}

# Extract the honorific from Name, trim whitespace, apply the grouping
# (titles not listed in TITLE_GROUPS pass through unchanged), and store the
# result as a categorical.
df['Title'] = (
    df['Name']
    .str.extract(r",\s*([^.]*)\.", expand=False)
    .str.strip()
    .replace(TITLE_GROUPS)
    .astype('category')
)

In [25]:
# Target: the Survived column as integers.
y = df['Survived'].astype(int)

# Feature matrix: the model input columns, in this order.
feature_names = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'Fare', 'Embarked', 'FamilySize', 'IsAlone', 'Title',
]
x = df[feature_names]

In [26]:
# Columns routed to the numeric preprocessing branch.
num_cols = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'FamilySize',
    'Pclass',
]

# Columns routed to the categorical preprocessing branch.
cat_cols = [
    'Sex',
    'Embarked',
    'IsAlone',
    'Title',
]

In [27]:
# Numeric branch: fill missing values with the column median, then scale.
# with_mean=False scales the values without centring them.
numeric = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=False)),  # steps are (name, estimator) tuples
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from raising
# on categories that were not seen during fit.
categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column list and concatenate the results.
prep = ColumnTransformer(
    [('num', numeric, num_cols), ('cat', categorical, cat_cols)],
    sparse_threshold=0.3,
)

In [28]:
# Hold out 20% of the rows as a test set, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [29]:
# Bagging ensemble: 200 depth-3 decision trees, each fit on a sample drawn
# with replacement and sized at 80% of the training rows.
shallow_tree = DecisionTreeClassifier(max_depth=3)
bagged_trees = BaggingClassifier(
    estimator=shallow_tree,
    n_estimators=200,
    max_samples=0.8,
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

# Preprocessing followed by the bagging classifier.
bag = Pipeline(steps=[('prep', prep), ('bag', bagged_trees)])

# The model passed as `estimator` here can be swapped for whatever is needed; hyperparameter tuning can also be done here.

In [30]:
# AdaBoost ensemble: 300 boosting rounds over depth-2 decision trees with a
# learning rate of 0.5.
boost_stump = DecisionTreeClassifier(max_depth=2)
boosted_trees = AdaBoostClassifier(
    estimator=boost_stump,
    n_estimators=300,
    learning_rate=0.5,
    random_state=42,
)

# Preprocessing followed by the AdaBoost classifier.
ada = Pipeline(steps=[('prep', prep), ('clf', boosted_trees)])

In [31]:
def _make_forest():
    """Return a fresh, unfitted 400-tree random forest with unrestricted depth."""
    return RandomForestClassifier(
        n_estimators=400, max_depth=None, random_state=42, n_jobs=-1
    )


def _make_extra_trees():
    """Return a fresh, unfitted 400-tree extra-trees ensemble that uses bootstrap samples."""
    return ExtraTreesClassifier(
        n_estimators=400, max_depth=None, random_state=42, bootstrap=True, n_jobs=-1
    )


# Stand-alone random forest and extra-trees pipelines.
rf = Pipeline(steps=[('prep', prep), ('clf', _make_forest())])
et = Pipeline(steps=[('prep', prep), ('clf', _make_extra_trees())])

# Members of the soft-voting ensemble. Each one is a new estimator instance
# configured with the same hyperparameters as the stand-alone models.
vote_members = [
    ('bag', BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=3),
        n_estimators=200,
        max_samples=0.8,
        bootstrap=True,
        n_jobs=-1,
        random_state=42,
    )),
    ('ada', AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=2),
        n_estimators=300,
        learning_rate=0.5,
        random_state=42,
    )),
    ('rf', _make_forest()),
    ('et', _make_extra_trees()),
]

# voting='soft' combines the members' predicted probabilities.
vote = Pipeline(steps=[
    ('prep', prep),
    ('clf', VotingClassifier(estimators=vote_members, voting='soft')),
])

In [32]:
# Display name -> pipeline, in the order the models are evaluated.
models = {
    'Bagging': bag,
    'AdaBoost': ada,
    'RandomForest': rf,
    'ExtraTrees': et,
    'Voting': vote,
}

In [33]:
# Five shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each metric is keyed by its own scorer name, so the cross_validate result
# keys become 'test_<scorer name>'.
scorer_names = ('accuracy', 'roc_auc', 'f1', 'precision', 'recall', 'neg_log_loss')
scoring = {scorer: scorer for scorer in scorer_names}

In [34]:
def _fold_means(label, scores):
    """Build one summary row: the mean of every test metric across folds.

    `scores` is the dict returned by cross_validate. The log-loss entry is
    negated because the scorer reports it as 'neg_log_loss'.
    """
    return {
        'model': label,
        'acc_mean': scores['test_accuracy'].mean(),
        'auc_mean': scores['test_roc_auc'].mean(),
        'f1_mean': scores['test_f1'].mean(),
        'prec_mean': scores['test_precision'].mean(),
        'recall_mean': scores['test_recall'].mean(),
        'logloss_mean': -scores['test_neg_log_loss'].mean(),
    }


# Cross-validate every pipeline on the training split and collect one row each.
cv_rows = []
for name, pipe in models.items():
    res = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    cv_rows.append(_fold_means(name, res))

# Rank the models by mean ROC AUC, best first, and display the table.
cv_df = pd.DataFrame(cv_rows).sort_values("auc_mean", ascending=False)
cv_df

Unnamed: 0,model,acc_mean,auc_mean,f1_mean,prec_mean,recall_mean,logloss_mean
4,Voting,0.813277,0.878642,0.748843,0.775771,0.725253,0.429584
0,Bagging,0.816005,0.870244,0.746214,0.792714,0.706936,0.414278
2,RandomForest,0.797754,0.867506,0.733868,0.74012,0.728889,0.767524
3,ExtraTrees,0.799153,0.863415,0.736214,0.741041,0.73266,0.70286
1,AdaBoost,0.803368,0.846758,0.738679,0.753876,0.725387,0.646987
