In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\q\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [3]:
import xgboost as xgb

In [4]:
pip install lightgbm




You should consider upgrading via the 'c:\Users\q\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

import seaborn as sns
from sklearn.pipeline import Pipeline

# Load seaborn's Titanic sample (as a private copy) and separate features from the label.
df = sns.load_dataset('titanic').copy()

X = df.drop(columns=['survived'])
y = df['survived'].astype(int)

# Column lists consumed by the numeric and the categorical preprocessing branches.
num_feats = ['age', 'fare', 'sibsp', 'parch', 'pclass']
cat_feats = ['sex', 'embarked', 'class', 'who', 'adult_male', 'alone']

In [6]:
# Display the Titanic frame; the rendered output elides the middle rows.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [7]:
# Numeric branch: fill missing values with the column median, then rescale
# without centering (with_mean=False).
numeric_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler(with_mean=False)),
])

# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode into a sparse matrix; unseen levels are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])

# Route each feature list to its branch. Columns in neither list are discarded
# (remainder='drop').
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, num_feats),
        ('cat', categorical_pipe, cat_feats),
    ],
    remainder='drop',
)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [9]:
# Hyperparameters for the XGBoost candidate: depth-3 trees with row and column subsampling.
xgb_params = dict(
    n_estimators=800,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
)

# Hyperparameters for the LightGBM candidate.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect
# when bagging_freq is non-zero, and bagging_freq is not set here — confirm whether
# row subsampling was intended.
lgbm_params = dict(
    objective='binary',
    n_estimators=1500,
    learning_rate=0.1,
    num_leaves=63,
    max_bin=500,
    min_child_samples=10,
    min_child_weight=1e-4,
    min_split_gain=0.0,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    force_row_wise=True,
    class_weight=None,
    random_state=42,
    n_jobs=-1,
)

# Candidate classifiers keyed by display label. Dict order is the iteration order
# of any loop over `models`, so it is kept exactly as before.
models = {
    'LogReg': LogisticRegression(max_iter=2000, n_jobs=None),
    'RandomForest': RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_leaf=2,
        random_state=42,
    ),
    'GradientBoosting': GradientBoostingClassifier(
        loss='log_loss',
        learning_rate=0.1,
        n_estimators=500,
        subsample=0.8,
        random_state=42,
    ),
    'XGBoost': XGBClassifier(**xgb_params),
    'LightGBM': LGBMClassifier(**lgbm_params),
}

In [10]:
# Stratified 5-fold splitter, shuffled with a fixed seed so every model sees the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Result-key suffix -> scikit-learn scorer name. Log loss is requested as
# 'neg_log_loss', so its fold scores come back negated.
scoring = dict(
    logloss='neg_log_loss',
    accuracy='accuracy',
    roc_auc='roc_auc',
)

cv_results = []

for name, clf in models.items():
    # Preprocessing lives inside the pipeline, so it is refit on each training fold.
    pipe = Pipeline([('prep', preprocess), ('clf', clf)])

    cv = cross_validate(
        pipe,
        X,
        y,
        cv=skf,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )

    # Flip the sign of the mean so log loss is reported as a positive number.
    cv_results.append(dict(
        model=name,
        logloss_mean=-cv['test_logloss'].mean(),
        logloss_std=cv['test_logloss'].std(),
        roc_auc_mean=cv['test_roc_auc'].mean(),
        accuracy_mean=cv['test_accuracy'].mean(),
    ))

# One row per model, best (lowest) mean log loss first.
cv_table = pd.DataFrame(cv_results).sort_values(by='logloss_mean')
cv_table

Unnamed: 0,model,logloss_mean,logloss_std,roc_auc_mean,accuracy_mean
1,RandomForest,0.406291,0.024467,0.884683,0.841755
0,LogReg,0.423358,0.021075,0.865917,0.822666
3,XGBoost,0.468496,0.051014,0.873954,0.818166
2,GradientBoosting,0.483826,0.058477,0.875976,0.819277
4,LightGBM,1.21888,0.209317,0.85486,0.802436


In [11]:
df['survived'].mean()
# In probability terms, the baseline (default) survival rate is 38.38%.
# LightGBM's logloss_mean is too high (above 1), so the model may be failing.
# As a rule, the lower the log loss, the better.

0.3838383838383838

In [12]:
# Person 1 survived, so the true survival probability is 100%; if the model predicts 90%, the log loss for that row is -ln(0.9) ≈ 0.105 (roughly 10%).

In [13]:
# Credit dataset hosted on GitHub; it is fetched over the network each time the cell runs.
# Note that this rebinds `df`, so the Titanic frame is no longer reachable under that name.
credit_csv_url = 'https://raw.githubusercontent.com/JamesLo94/schulich_data_science/main/Data_Science/0910/credit_dataset.csv'
df = pd.read_csv(credit_csv_url)


In [14]:
# Drop the CSV's leading column (presumably a saved row index — confirm), keeping
# everything from 'ID' onward.
# The guard makes the cell safe to re-run: once 'ID' is the leading column the
# slice is skipped, so a second execution no longer strips a real feature column.
if df.columns[0] != 'ID':
    df = df.iloc[:, 1:]

In [15]:
# Print the column names, non-null counts and dtypes of the credit frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25134 entries, 0 to 25133
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              25134 non-null  int64  
 1   GENDER          25134 non-null  object 
 2   CAR             25134 non-null  object 
 3   REALITY         25134 non-null  object 
 4   NO_OF_CHILD     25134 non-null  int64  
 5   INCOME          25134 non-null  float64
 6   INCOME_TYPE     25134 non-null  object 
 7   EDUCATION_TYPE  25134 non-null  object 
 8   FAMILY_TYPE     25134 non-null  object 
 9   HOUSE_TYPE      25134 non-null  object 
 10  FLAG_MOBIL      25134 non-null  int64  
 11  WORK_PHONE      25134 non-null  int64  
 12  PHONE           25134 non-null  int64  
 13  E_MAIL          25134 non-null  int64  
 14  FAMILY SIZE     25134 non-null  float64
 15  BEGIN_MONTH     25134 non-null  int64  
 16  AGE             25134 non-null  int64  
 17  YEARS_EMPLOYED  25134 non-null 

In [16]:
# Mean of TARGET — the positive-class share, assuming TARGET is a 0/1 flag (TODO confirm).
df['TARGET'].mean()

0.01679000557014403

In [17]:
# Partition the credit frame by label: rows with TARGET == 1 and rows with TARGET == 0.
minority = df.loc[df['TARGET'].eq(1)]
majority = df.loc[df['TARGET'].eq(0)]

In [18]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\q\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [19]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE