In [25]:
import pandas as pd
import sklearn
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
%matplotlib inline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from utils import update_submission
import time
import tensorflow as tf

# RandomForest

1. Bagging (sampling with replacement)
   100 samples, randomly choose 60 out of 100, load them to Tree 1.
   Put the 60 back, resample 60, load the new 60 samples to Tree 2.
   ...
   Tree N.
2. max_features:
   Each split in a tree is chosen from a random subset of the features, containing at most max_features of them, so different trees end up splitting on different features.
When making predictions, all the trees will vote.

In [2]:
# Read the three CSV inputs from ./data in one pass (presumably the Kaggle Titanic
# files - confirm): training rows, test rows, and the sample submission.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/gender_submission.csv',
    )
)

In [3]:
def transform(df, option, embarked_ports=('C', 'Q', 'S')):
    """Turn a raw passenger frame into a numeric feature matrix.

    Steps, all applied to a copy so the caller's frame is left untouched:

    * index by ``PassengerId``;
    * fill missing ``Fare`` and ``Age`` with the mean of *this* frame
      (train and test are therefore imputed with their own means);
    * add ``t_count`` (number of rows sharing the same ``Ticket`` within this
      frame) and ``Fare_per_person`` (``Fare / t_count``);
    * encode ``Sex`` as 1 for ``'female'`` and 0 otherwise;
    * fill missing ``Embarked`` with ``'S'`` and one-hot encode it;
    * drop ``Name``, ``Ticket``, ``Cabin`` (and ``Survived`` in train mode).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``PassengerId``, ``Name``, ``Sex``,
        ``Age``, ``Ticket``, ``Fare``, ``Cabin`` and ``Embarked``; in train
        mode it must also contain ``Survived``.
    option : {"train", "test"}
        ``"train"`` additionally removes the ``Survived`` label column.
    embarked_ports : sequence of str, optional
        Ordered list of ``Embarked`` categories to emit as one-hot columns.
        Using a fixed list (instead of fitting an encoder on each frame)
        guarantees that every call yields the same column layout even when a
        port is absent from one of the frames. Values outside the list are
        encoded as an all-zero row.

    Returns
    -------
    numpy.ndarray
        ``len(embarked_ports)`` one-hot columns followed by the remaining
        columns of the frame in their original order.

    Raises
    ------
    AssertionError
        If ``option`` is neither ``"test"`` nor ``"train"``.
    """
    assert option in ["test","train"] , "Option must be test or train"

    # set_index returns a new frame, so the assignments below never touch `df`.
    features = df.set_index('PassengerId')

    features['Fare'] = features['Fare'].fillna(features['Fare'].mean())

    # Rows per ticket (counted through the non-null Name entries), broadcast
    # back onto every row of the group.
    features['t_count'] = features.groupby('Ticket')['Name'].transform('count')
    features['Fare_per_person'] = features['Fare'] / features['t_count']

    features['Age'] = features['Age'].fillna(features['Age'].mean())
    features['Sex'] = (features['Sex'] == 'female').astype(int)
    features['Embarked'] = features['Embarked'].fillna('S')

    features = features.drop(['Name', 'Ticket', 'Cabin'], axis=1)

    # One indicator column per known port, always in the same order.
    X_cat = np.column_stack(
        [(features['Embarked'] == port).to_numpy(dtype=float) for port in embarked_ports]
    )

    if option == "train":
        X_short = features.drop(['Survived', 'Embarked'], axis=1).values
    else:
        X_short = features.drop(['Embarked'], axis=1).values

    return np.hstack([X_cat, X_short])

In [4]:
# Labels come straight from the raw training frame; the feature matrices for
# both splits are produced by transform().
y_train_val = df_train['Survived']
X_train_val = transform(df_train, "train")
X_test = transform(df_test, "test")

In [7]:

# Exhaustive 5-fold grid search over the main RandomForest hyper-parameters
# (3 * 3 * 4 * 5 * 4 = 720 candidates, scored by accuracy).
# random_state pins the forest's bootstrap and feature sampling so that
# best_params_ / best_score_ are reproducible after a kernel restart.
# NOTE: the saved output of this cell lists min_samples_leaf as [1, 6, 11, 16],
# which does not match the grid below - it was produced by an earlier version
# of this cell. Re-run to refresh the recorded results.
rf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [50,150,250],
              'max_depth': np.arange(2,5),
              'min_samples_split': np.arange(10,30,5),
              'min_samples_leaf': np.arange(1,10,2),
              'max_features': np.arange(2,10,2)}
# refit=True retrains the best candidate on all of X_train_val, so `gs` can be
# used directly for prediction afterwards.
gs = GridSearchCV(rf, param_grid, scoring='accuracy', n_jobs=-1, refit=True, cv=5)
gs.fit(X_train_val,y_train_val)

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': array([2, 3, 4]),
                         'max_features': array([2, 4, 6, 8]),
                         'min_samples_leaf': array([ 1,  6, 11, 16]),
                         'min_samples_split': array([10, 15, 20, 25]),
                         'n_estimators': [50, 150, 250]},
             scoring='accuracy')

In [8]:
# Hyper-parameter combination that achieved the best mean cross-validated accuracy.
gs.best_params_

{'max_depth': 4,
 'max_features': 8,
 'min_samples_leaf': 6,
 'min_samples_split': 10,
 'n_estimators': 250}

In [10]:
# Mean cross-validated accuracy obtained by the best hyper-parameter combination.
gs.best_score_

0.815956311593748

In [15]:
# Predict on the same data the search was fitted on (the search was created with
# refit=True, so predict() uses the refitted best model). Anything computed from
# y_pred is therefore a training-set score, not a held-out estimate.
y_pred=gs.predict(X_train_val)

In [18]:
# Confusion matrix of the training-set predictions, via the directly imported helper.
cm = confusion_matrix(y_train_val, y_pred)

In [19]:
# Accuracy on the training set: the two diagonal cells over the total count.
(cm[0, 0] + cm[1, 1]) / cm.sum()

0.8428731762065096

In [None]:
# param_grid = {'n_estimators': [50, 100, 150, ...], # 3
#               'max_depth': np.arange(4,10), # 6
#               'min_samples_split': np.arange(10,20,2), # 5
#               'min_samples_leaf': np.arange(1,10,2), # 5
#               'max_features': ?????} # 3

In [None]:
# HW (Jan 12. 2021):
# 1. Fine-tune our RandomForestClassifier to get an accuracy > 82%.
# 2. read about (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) 
#    print the confusion matrix of your model on the whole train_val set
# 3. update the submission file with the function in utils.py, submit it to kaggle, compare the result with
# what we get locally (CV)
# 4. Read about https://www.youtube.com/watch?v=bfmFfD2RIcg

In [None]:
# 3*6*5*5*3 --> 50 mins

In [None]:
# Baseline: default GradientBoostingClassifier scored with cross_val_score's
# default cross-validation; the cell displays the mean score across folds.
gb = GradientBoostingClassifier()
gb_cv_scores = cross_val_score(gb, X_train_val, y_train_val)
gb_cv_scores.mean()

Type I error (false positive):
model says positive, actually negative (COVID)
Type II error (false negative):
model says negative, actually positive (COVID)

A Type II error is more harmful than a Type I error.

100 people, 50/50 (COVID/HC)

model 1 (simply classifies every person to have COVID):
accuracy: 50%
Type II: 0

model 2:
50 COVID: 30 correct, 20 wrong
50 HC: 50 correct, 0 wrong
accuracy: 80%
Type II: 20

model 3:
50 COVID: 45 correct, 5 wrong
50 HC: 25 correct, 25 wrong
accuracy: 70%
Type II: 5

confusion matrix (for model 1, which predicts everyone as positive):
           predicted 1  predicted 0
true 1        50           0
true 0        50           0

real life COST for mistakes:
(domain knowledge) 

COVID
Type II 100 bucks
Type I 1 buck
model 1 cost: 50 bucks
model 2 cost: 2000 bucks
model 3 cost: 525 bucks

FLU
Type II 5 bucks
Type I 1 buck
model 1 cost: 50 bucks
model 2 cost: 100 bucks
model 3 cost: 50 bucks