# Titanic dataset with ensemble methods

In [1]:
!kaggle competitions download -c titanic

Downloading titanic.zip to /home/grzetan/PYTHON/KSIAZKA/07-Ensemble_and_RandomForest
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 1.00MB/s]


In [2]:
import os
import zipfile
import shutil

DATASET_PATH = './titanic'
ZIP_FILE = './titanic.zip'

if os.path.exists(DATASET_PATH):
    shutil.rmtree(DATASET_PATH)

def fetch_dataset(path = DATASET_PATH):
    file = zipfile.ZipFile(ZIP_FILE)
    file.extractall(path=path)
    file.close()
    if os.path.exists(ZIP_FILE):
        os.remove(ZIP_FILE)
    
fetch_dataset()

## Load data to memory

In [3]:
import pandas as pd

train = pd.read_csv(os.path.join(DATASET_PATH, 'train.csv'), index_col = 'PassengerId')
test = pd.read_csv(os.path.join(DATASET_PATH, 'test.csv'), index_col = 'PassengerId')

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
x_train = train.drop('Survived', axis=1)
y_train = train['Survived']

x_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
x_train.shape

(891, 10)

## Create val set

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

In [7]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 507 to 772
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Name      712 non-null    object 
 2   Sex       712 non-null    object 
 3   Age       564 non-null    float64
 4   SibSp     712 non-null    int64  
 5   Parch     712 non-null    int64  
 6   Ticket    712 non-null    object 
 7   Fare      712 non-null    float64
 8   Cabin     165 non-null    object 
 9   Embarked  711 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 61.2+ KB


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self,):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.drop(['Cabin', 'Name', 'Ticket'], axis=1)
        return X

In [9]:
dropper = ColumnDropper()

X = dropper.fit_transform(x_train)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 507 to 772
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       564 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  711 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 44.5+ KB


In [10]:
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(exclude=['object']).columns
x_train[cat_cols].value_counts()

Sex     Embarked
male    S           345
female  S           168
male    C            79
female  C            58
        Q            31
male    Q            30
dtype: int64

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', OneHotEncoder(), cat_cols)
])

prepared = pipeline.fit_transform(X)
prepared.shape


(712, 11)

## Prepare val set

In [12]:
X = dropper.transform(x_val)
prepared_val = pipeline.transform(X)
prepared_val.shape

(179, 11)

## Train LogicticRegression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score

logic = LogisticRegression(random_state=42)
logic.fit(prepared, y_train)
print(f1_score(logic.predict(prepared_val), y_val))
print(roc_auc_score(logic.predict(prepared_val), y_val))

0.7058823529411764
0.7746071133167907


## Train RandomForestClassifier

In [29]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators = 1000, max_depth=5, random_state=42)
forest.fit(prepared, y_train)
print(f1_score(forest.predict(prepared_val), y_val))
print(roc_auc_score(forest.predict(prepared_val), y_val))

0.7619047619047619
0.8517811704834606


## Train SVC

In [30]:
from sklearn.svm import SVC

svc = SVC(random_state=42)
svc.fit(prepared, y_train)
print(f1_score(svc.predict(prepared_val), y_val))
print(roc_auc_score(svc.predict(prepared_val), y_val))

0.7407407407407408
0.825750612745098


## Train DecisionTree

In [33]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=5, random_state=42)
tree.fit(prepared, y_train)
print(f1_score(tree.predict(prepared_val), y_val))
print(roc_auc_score(tree.predict(prepared_val), y_val))

0.7368421052631579
0.8069456427955134


## Create blender from all classifiers

In [37]:
import numpy as np

estimators = [tree, forest, svc, logic]

#Create train set for blender
blender_set = np.empty((prepared.shape[0], len(estimators)))

for i, estimator in enumerate(estimators):
    blender_set[:, i] = estimator.predict(prepared)
    
#Train blender
blender = SVC(random_state=42)
blender.fit(blender_set, y_train)

#Validate blender on val set
blender_val = np.empty((prepared_val.shape[0], len(estimators)))

for i, estimator in enumerate(estimators):
    blender_val[:, i] = estimator.predict(prepared_val)
    
preds = blender.predict(blender_val)
print(f1_score(preds, y_val))
print(roc_auc_score(preds, y_val))

0.7454545454545455
0.8233003893381252
