# CLASE 2: Árboles de decisión, métricas de evaluación

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

np.random.seed(42)

In [2]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

train_df.shape, test_df.shape

((891, 12), (418, 11))

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
used_columns = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']

In [5]:
y_train = train_df[['Survived']]
y_train.shape

(891, 1)

In [6]:
X_all = train_df[used_columns].append(test_df[used_columns])
X_all.shape

(1309, 8)

In [7]:
X_all.isna().sum()

Pclass      0
Name        0
Sex         0
SibSp       0
Parch       0
Ticket      0
Fare        1
Embarked    2
dtype: int64

In [8]:
X_all['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [9]:
X_all['Embarked'].fillna('S', inplace=True)
X_all['Fare'].fillna(X_all['Fare'].median(), inplace=True)

In [10]:
X_all['Title'] = X_all['Name'].str.extract(' ([A-Za-z]+)\.')
X_all['Title'] = X_all['Title'].replace(['Ms', 'Mlle'], 'Miss')
X_all['Title'] = X_all['Title'].replace(['Mme', 'Countess', 'Lady', 'Dona'], 'Mrs')
X_all['Title'] = X_all['Title'].replace(['Dr', 'Major', 'Col', 'Sir', 'Rev', 'Jonkheer', 'Capt', 'Don'], 'Mr')

In [11]:
# X_all = pd.concat([X_all, pd.get_dummies(X_all[['Sex', 'Embarked']])], axis=1)
X_all["Sex"] = X_all["Sex"].map({"male": 1, "female": 0}).astype(int)    
X_all["Embarked"] = X_all["Embarked"].map({"S": 1, "C": 2, "Q": 3}).astype(int)    
X_all['Title'] = X_all['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3}).astype(int)   

In [12]:
X_all['TicketNumber'] = X_all['Ticket'].str.split()
X_all['TicketNumber'] = X_all['TicketNumber'].str[-1]
X_all['TicketNumber'] = LabelEncoder().fit_transform(X_all['TicketNumber'])

In [13]:
X_all.head()

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Title,TicketNumber
0,3,"Braund, Mr. Owen Harris",1,1,0,A/5 21171,7.25,1,0,209
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,0,PC 17599,71.2833,2,2,166
2,3,"Heikkinen, Miss. Laina",0,0,0,STON/O2. 3101282,7.925,1,1,466
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,0,113803,53.1,1,2,67
4,3,"Allen, Mr. William Henry",1,0,0,373450,8.05,1,0,832


In [14]:
X_all.drop(['Name', 'Ticket'], axis=1, inplace=True)
X_all.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Title,TicketNumber
0,3,1,1,0,7.25,1,0,209
1,1,0,1,0,71.2833,2,2,166
2,3,0,0,0,7.925,1,1,466
3,1,0,1,0,53.1,1,2,67
4,3,1,0,0,8.05,1,0,832


In [15]:
X_all['FamilySize'] = X_all['SibSp'] + X_all['Parch'] + 1
X_all['IsAlone'] = X_all['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
# X_all['SquaredFare'] = X_all['Fare'] ** 2
X_all.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Title,TicketNumber,FamilySize,IsAlone
0,3,1,1,0,7.25,1,0,209,2,0
1,1,0,1,0,71.2833,2,2,166,2,0
2,3,0,0,0,7.925,1,1,466,1,1
3,1,0,1,0,53.1,1,2,67,2,0
4,3,1,0,0,8.05,1,0,832,1,1


In [16]:
X_train = X_all[0:y_train.shape[0]]
X_test = X_all[y_train.shape[0]:]
X_train.shape, y_train.shape, X_test.shape

((891, 10), (891, 1), (418, 10))

In [17]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [18]:
y_train = np.ravel(y_train)
# y_val = np.ravel(y_val)

### Decision tree

In [19]:
%%time
parameters = {
    "criterion": ["gini", "entropy"],
    "max_depth": [1, 2, 3, 5, 10, None], 
    "min_samples_split": [2, 3, 5, 10],
    "min_samples_leaf": [1, 5, 10, 20]
}

tree_model = GridSearchCV(DecisionTreeClassifier(), parameters, cv=5).fit(X_train, y_train)
print(accuracy_score(y_train, tree_model.predict(X_train)))
print(tree_model.best_score_)
# print(accuracy_score(y_val, tree_model.predict(X_val)))
print(tree_model.best_params_)
print(tree_model.best_estimator_)

0.9001122334455668
0.835016835016835
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
CPU times: user 4.16 s, sys: 0 ns, total: 4.16 s
Wall time: 4.17 s


### Random forest

In [20]:
%%time
parameters = {
    "n_estimators": [2, 4, 5, 8, 10, 15], 
    "criterion": ["gini", "entropy"],
    "max_features": ["auto", "log2"], 
    "max_depth": [1, 2, 3, 5, 10], 
    "min_samples_split": [2, 3, 5, 10],
    "min_samples_leaf": [1, 5, 10, 20]
}

forest_model = GridSearchCV(RandomForestClassifier(), parameters, cv=5).fit(X_train, y_train)
print(accuracy_score(y_train, forest_model.predict(X_train)))
print(forest_model.best_score_)
# print(accuracy_score(y_val, forest_model.predict(X_val)))
print(forest_model.best_params_)
print(forest_model.best_estimator_)

0.9382716049382716
0.8484848484848485
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 10}
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=3,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
CPU times: user 2min, sys: 68 ms, total: 2min
Wall time: 2min




### XGBoost

In [21]:
%%time
parameters = {
    'max_depth': [3, 4, 5, 6, 7, 8], 
    'n_estimators': [5, 10, 20, 50, 100],
    'learning_rate': np.linspace(0.02,0.16,8)
}

xgb_model = GridSearchCV(xgb.XGBClassifier(), parameters, cv=5).fit(X_train, y_train)
print(accuracy_score(y_train, xgb_model.predict(X_train)))
print(xgb_model.best_score_)
# print(accuracy_score(y_val, xgb_model.predict(X_val)))
print(xgb_model.best_params_)
print(xgb_model.best_estimator_)

0.9326599326599326
0.8552188552188552
{'learning_rate': 0.06, 'max_depth': 8, 'n_estimators': 100}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.06, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
CPU times: user 40.1 s, sys: 5.65 s, total: 45.7 s
Wall time: 46 s


### LightGBM

In [22]:
%%time
parameters = {'n_estimators': [5, 50, 100],
              'learning_rate': np.linspace(0.02,0.16,4),
              'num_leaves': [31, 61],
              'min_data_in_leaf': [20, 30, 40],
              'max_depth': range(3,8)
}

lgbm_model = GridSearchCV(lgbm.LGBMClassifier(), parameters, cv=5).fit(X_train, y_train)
print(accuracy_score(y_train, lgbm_model.predict(X_train)))
print(lgbm_model.best_score_)
# print(accuracy_score(y_val, lgbm_model.predict(X_val)))
print(lgbm_model.best_params_)
print(lgbm_model.best_estimator_)

0.9068462401795735
0.8462401795735129
{'learning_rate': 0.11333333333333334, 'max_depth': 7, 'min_data_in_leaf': 40, 'n_estimators': 100, 'num_leaves': 31}
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.11333333333333334,
               max_depth=7, min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=40, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_leaves=31, objective=None, random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)
CPU times: user 53.4 s, sys: 16.2 s, total: 1min 9s
Wall time: 1min 9s


### CatBoost

In [23]:
%%time
parameters = {'iterations': [10, 50, 100],
              'learning_rate': np.linspace(0.02,0.16,4),
              'depth': range(4,10)
}

cb_model = GridSearchCV(cb.CatBoostClassifier(verbose=False), parameters, cv=5).fit(X_train, y_train)
print(accuracy_score(y_train, cb_model.predict(X_train)))
print(cb_model.best_score_)
# print(accuracy_score(y_val, cb_model.predict(X_val)))
print(cb_model.best_params_)
print(cb_model.best_estimator_)

0.9113355780022446
0.8451178451178452
{'depth': 9, 'iterations': 100, 'learning_rate': 0.16}
<catboost.core.CatBoostClassifier object at 0x7f61f502a400>
CPU times: user 1min 10s, sys: 17.9 s, total: 1min 28s
Wall time: 57.9 s


In [24]:
submission = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'], 
        'Survived': tree_model.predict(X_test) 
    }
)
submission.to_csv("submission_tree.csv", index=False)

submission = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'], 
        'Survived': forest_model.predict(X_test)
    }
)
submission.to_csv("submission_forest.csv", index=False)

submission = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'], 
        'Survived': xgb_model.predict(X_test) 
    }
)
submission.to_csv("submission_xgb.csv", index=False)

submission = pd.DataFrame(
    { 
        'PassengerId': test_df['PassengerId'], 
        'Survived': xgb_model.predict(X_test) 
    }
)
submission.to_csv("submission_lgbm.csv", index=False)

submission = pd.DataFrame(
    { 
        'PassengerId': test_df['PassengerId'], 
        'Survived': cb_model.predict(X_test).astype(int)
    }
)
submission.to_csv("submission_cb.csv", index=False)

### Stacking

In [25]:
tree_test_pred = tree_model.predict(X_test)
forest_test_pred = forest_model.predict(X_test)
xgb_test_pred = xgb_model.predict(X_test)
lgbm_test_pred = lgbm_model.predict(X_test)
cb_test_pred = cb_model.predict(X_test)

mean_test_pred = np.round((tree_test_pred + forest_test_pred + xgb_test_pred + lgbm_test_pred + cb_test_pred) / 5)

submission = pd.DataFrame(
    { 
        'PassengerId': test_df['PassengerId'], 
        'Survived': mean_test_pred.astype(int)
    }
)
submission.to_csv("submission_mean.csv", index=False)

In [26]:
tree_train_pred = tree_model.predict(X_train)
forest_train_pred = forest_model.predict(X_train)
xgb_train_pred = xgb_model.predict(X_train)
lgbm_train_pred = lgbm_model.predict(X_train)
cb_train_pred = cb_model.predict(X_train)

In [27]:
base_pred = pd.DataFrame({
    'tree':tree_train_pred.ravel(), 
    'forest':forest_train_pred.ravel(), 
    'xgb':xgb_train_pred.ravel(), 
    'lgbm':lgbm_train_pred.ravel(),
    'cb': cb_train_pred.ravel()
})

test_pred = pd.DataFrame({
    'tree':tree_test_pred.ravel(), 
    'forest':forest_test_pred.ravel(), 
    'xgb':xgb_test_pred.ravel(), 
    'lgbm':lgbm_test_pred.ravel(),
    'cb': cb_test_pred.ravel()
})

In [28]:
%%time
parameters = {
    'max_depth': [3, 4, 5, 6, 7, 8], 
    'n_estimators': [5, 10, 20, 50, 100],
    'learning_rate': np.linspace(0.02,0.16,8)
}

final_model = GridSearchCV(xgb.XGBClassifier(), parameters, cv=5).fit(base_pred, y_train)
print(accuracy_score(y_train, final_model.predict(base_pred)))
print(final_model.best_score_)
# print(accuracy_score(y_val, xgb_model.predict(X_val)))
print(final_model.best_params_)
print(final_model.best_estimator_)

0.9438832772166106
0.9438832772166106
{'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 5}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.02, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=5, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
CPU times: user 21.3 s, sys: 3.3 s, total: 24.6 s
Wall time: 24.6 s


In [29]:
final_pred = final_model.predict(test_pred)

submission = pd.DataFrame(
    { 
        'PassengerId': test_df['PassengerId'], 
        'Survived': final_pred
    }
)
submission.to_csv("submission_final.csv", index=False)