# Data Dictionary
1) sibsp    : # of siblings/spouses on board\
2) parch    : # of parents/children on board\
3) embarked : Port of embarkation (C : Cherbourg, Q : Queenstown, S : Southampto)

# Setup

In [273]:
from __future__ import annotations

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

DATA_LOCATION = Path('../data/')

# Loading Data

In [365]:
actual_labels = pd.read_csv(DATA_LOCATION / 'gender_submission.csv')

use_cols = [
    'PassengerId', 'Survived', 'Pclass', 
    'Sex', 'Age', 'SibSp', 'Parch', 
    'Fare', 'Embarked'
]

train_df = pd.read_csv(DATA_LOCATION / 'train.csv', usecols=use_cols)
test_df  = pd.read_csv(DATA_LOCATION / 'test.csv')


test_df_run = test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

large_data = pd.concat([test_df.merge(actual_labels, how='outer', on='PassengerId'), train_df])
large_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,,male,27.0,0,0,,13.0000,,S,0
887,888,1,,female,19.0,0,0,,30.0000,,S,1
888,889,3,,female,,1,2,,23.4500,,S,0
889,890,1,,male,26.0,0,0,,30.0000,,C,1


In [348]:
class FieldImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, field : str, strategy : str = 'mean') -> None :
        self.field = field
        self.strategy = strategy
        self.__field = field
        self.imputer = SimpleImputer(strategy=self.strategy)

    def fit(self, X : pd.DataFrame, y = None):
        return self
     
    def transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        X[self.__field] = self.imputer.fit_transform(X[self.__field].to_numpy().reshape(-1,1)).ravel()
        return X
        
    def fit_transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        self.fit(X, y)
        return self.transform(X, y)
    
    @staticmethod
    def build_steps(fields : list[str]) -> list[FieldImputer] :
        
        def _build_step(field : str) -> FieldImputer :
            return (f'{field}_Imputer', FieldImputer(field))
        
        return [_build_step(field) for field in fields]
    
class CategoricalEncoderOhe(BaseEstimator, TransformerMixin):
    
    def __init__(self, field : str) -> None :
        self.field = field
        self.__field = field
        self.ohe = OneHotEncoder()
    
    def fit(self, X : pd.DataFrame, y = None):
        return self
    
    def transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        
        field_data = X[self.__field].to_numpy().reshape(-1, 1)
        transformed = self.ohe.fit_transform(field_data).toarray()

        total_unique = np.unique(transformed, axis=0)

        columns = [f'{self.__field}_{i}' for i in range(len(total_unique))]

        X[columns] = pd.DataFrame(transformed)
        
        return X.drop(self.__field, axis=1)
    
    
    def fit_transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        self.fit(X, y)
        return self.transform(X, y)
    
    @staticmethod
    def build_steps(fields : list[str]) -> list[CategoricalEncoderOhe]:
        
        def _build_step(field : str) -> CategoricalEncoderOhe :
            
            return (f'{field}_ohe', CategoricalEncoderOhe(field))
            
        return [_build_step(field) for field in fields]
    
class RFWrapper(RandomForestClassifier):
    
    def fit_transform(self, X : np.ndarray, y : np.ndarray = None) -> np.ndarray :
        model = super().fit(X, y)
        return model.predict(X)
    
    def fit_predict(self, X : np.ndarray, y : np.ndarray = None) -> np.ndarray :
        return self.fit_transform(X, y)

class DataFrameToNumpy(BaseEstimator, TransformerMixin):
    
    def fit(self, X : pd.DataFrame, y = None):
        return self
    
    def transform(self, X : pd.DataFrame, y = None):
        return X.to_numpy()
    
    def fit_transform(self, X : pd.DataFrame, y = None):
        self.fit(X, y)
        return self.transform(X, y)
    
param_grid = {
    'clf__n_estimators' : [5, 10, 25, 50, 100, 200, 500, 1000],
    'clf__max_depth'    : [1, 5, 10, 25, 50, 100, 150, 200],
    'clf__random_state' : [42]
}

pipeline = Pipeline(
    steps = [
        ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')), # handles missing values in Embarked, which exist in Train but not Test
        *CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked']),
        *FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare']),
        ("pd_converter", DataFrameToNumpy()),
        ('standard_scaler', StandardScaler()),
    ]
)

clf_pipeline = Pipeline(
    steps= [('clf', RFWrapper())]
)

precleaned_train_data = pipeline.fit_transform(train_df.drop(['PassengerId', 'Survived'], axis=1), train_df['Survived'].values.ravel())


gs = GridSearchCV(estimator=clf_pipeline, param_grid=param_grid, cv=10, scoring='accuracy', verbose=2)
gs.fit(precleaned_train_data, train_df['Survived'].values.ravel())

Fitting 10 folds for each of 64 candidates, totalling 640 fits
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=10

In [349]:
print(gs.best_params_)
gs.best_score_

{'clf__max_depth': 10, 'clf__n_estimators': 50, 'clf__random_state': 42}


0.830561797752809

In [350]:
precleaned_test_data = pipeline.fit_transform(X=test_df_run.drop('PassengerId', axis=1))
gs.best_estimator_.score(precleaned_test_data, actual_labels.Survived)

0.8660287081339713

In [351]:
predicted = gs.best_estimator_.predict(precleaned_test_data)
output = pd.DataFrame({'PassengerId': test_df_run.PassengerId, 'Survived': predicted})
output.to_csv('submission.csv', index=False)

In [352]:
dd = pd.DataFrame({'PassengerId': test_df_run.PassengerId, 'Survived': predicted, 'Actual' : actual_labels.Survived})
(dd['Survived'] == dd['Actual']).value_counts()

True     362
False     56
Name: count, dtype: int64

In [369]:
X_train, X_test, y_train, y_test = train_test_split(
    large_data.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1), 
    large_data['Survived'], 
    stratify=large_data['Survived'],
    train_size=0.8,
    random_state=42
)

param_grid = {
    'clf__n_estimators' : [5, 10, 25, 50, 100, 200, 500, 1000],
    'clf__max_depth'    : [1, 5, 10, 25, 50, 100, 150, 200],
    'clf__random_state' : [42]
}

pipeline = Pipeline(
    steps = [
        ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')), # handles missing values in Embarked, which exist in Train but not Test
        *CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked']),
        *FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare']),
        ("pd_converter", DataFrameToNumpy()),
        ('standard_scaler', StandardScaler()),
    ]
)

clf_pipeline = Pipeline(
    steps= [('clf', RFWrapper())]
)

precleaned_train_data = pipeline.fit_transform(X_train.drop(['PassengerId'], axis=1), y_train)


gs = GridSearchCV(estimator=clf_pipeline, param_grid=param_grid, cv=10, scoring='accuracy', verbose=2)
gs.fit(precleaned_train_data, y_train)

Fitting 10 folds for each of 64 candidates, totalling 640 fits
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=5, clf__random_state=42; total time=   0.0s
[CV] END clf__max_depth=1, clf__n_estimators=10

In [370]:
print(gs.best_params_)
gs.best_score_

{'clf__max_depth': 5, 'clf__n_estimators': 500, 'clf__random_state': 42}


0.6943040293040292

In [379]:
Pipeline(
    steps = [
        ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')), # handles missing values in Embarked, which exist in Train but not Test
        *CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked']),
        *FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare']),
    ]
).fit_transform(X=X_test.drop(['PassengerId', 'Pclass_0', 'Pclass_1', 'Pclass_2'], axis=1)).info()

<class 'pandas.core.frame.DataFrame'>
Index: 262 entries, 546 to 294
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         262 non-null    float64
 1   SibSp       262 non-null    float64
 2   Parch       262 non-null    float64
 3   Fare        262 non-null    float64
 4   Pclass_0    111 non-null    float64
 5   Pclass_1    111 non-null    float64
 6   Pclass_2    111 non-null    float64
 7   Sex_0       111 non-null    float64
 8   Sex_1       111 non-null    float64
 9   Embarked_0  111 non-null    float64
 10  Embarked_1  111 non-null    float64
 11  Embarked_2  111 non-null    float64
dtypes: float64(12)
memory usage: 26.6 KB


In [375]:
precleaned_test_data = pipeline.fit_transform(X=X_test.drop(['PassengerId', 'Pclass_0', 'Pclass_1', 'Pclass_2'], axis=1))
gs.best_estimator_.score(precleaned_test_data, y_test)

ValueError: Input X contains NaN.
RFWrapper does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
predicted = gs.best_estimator_.predict(precleaned_test_data)

In [374]:
X_test.drop(['PassengerId', 'Pclass_0', 'Pclass_1', 'Pclass_2'], axis=1).info()

<class 'pandas.core.frame.DataFrame'>
Index: 262 entries, 546 to 294
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    262 non-null    int64  
 1   Sex       262 non-null    object 
 2   Age       212 non-null    float64
 3   SibSp     262 non-null    int64  
 4   Parch     262 non-null    int64  
 5   Fare      262 non-null    float64
 6   Embarked  262 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 16.4+ KB
