# Setup

In [447]:
from __future__ import annotations

import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from loguru import logger
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

# Directory holding the Titanic CSV files: the input CSVs are read from here and
# the submission file is written here. The path is relative, so it is resolved
# against the current working directory.
DATA_LOCATION = Path('../../data/titanic/')

# Helper Classes and Functions

In [468]:
class DataFrameEstimateTransformMixin(TransformerMixin, BaseEstimator):
    ''' 
        Provides some basic functionality for creating pipeline steps that handle 
        pandas dataframes.

        Subclasses must implement `transform`; `fit` is a no-op by default and
        `fit_transform` simply chains the two.

        The mixin is listed before BaseEstimator, as scikit-learn recommends,
        so the mixin's methods take precedence in the MRO.
    '''
    
    def fit(self, X : pd.DataFrame, y = None):
        ''' No-op that returns self; exists solely for compatibility with the estimator API '''
        return self
    
    def transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        ''' Must be implemented by subclasses; the base version always raises NotImplementedError '''
        raise NotImplementedError("Transform must be implemented")
    
    def fit_transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        ''' Fit on X (and y), then return the result of transforming that same X '''
        self.fit(X, y)
        return self.transform(X, y)

class FieldImputer(DataFrameEstimateTransformMixin):
    '''
        Wrapper around the sklearn SimpleImputer for working with DataFrame objects in a 
        data normalization pipeline.

        The imputation statistic is learned in `fit` and only applied in `transform`,
        so data transformed after fitting is filled with the statistic of the data
        the step was fitted on.

        Parameters
        ----------
        field : name of the DataFrame column to impute
        strategy : SimpleImputer strategy used to fill missing values
    '''
    
    def __init__(self, field : str, strategy : str = 'mean') -> None :
        # Only store the constructor arguments under their own names so that
        # get_params / set_params / clone see the real configuration.
        self.field = field
        self.strategy = strategy
        # Unfitted placeholder kept so the attribute always exists; `fit`
        # rebuilds it from the current `strategy`.
        self.imputer = SimpleImputer(strategy=self.strategy)

    def _column(self, X : pd.DataFrame) -> np.ndarray :
        ''' Return the target column as the 2-D (n_rows, 1) array SimpleImputer expects '''
        return X[self.field].to_numpy().reshape(-1, 1)

    def fit(self, X : pd.DataFrame, y = None) -> FieldImputer :
        ''' Learn the fill value for `field` from X '''
        # Rebuilt here rather than reused so a strategy changed through
        # set_params after construction is honoured.
        self.imputer = SimpleImputer(strategy=self.strategy).fit(self._column(X))
        return self

    def transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        ''' Return a copy of X with the missing values of `field` filled in '''
        logger.info(f"Imputing the {self.strategy} for NaN values in {self.field}")
        if not hasattr(self.imputer, 'statistics_'):
            # Backward compatibility: earlier versions fitted inside transform,
            # so an unfitted step still works, but it learns from this data.
            logger.warning(f"FieldImputer for {self.field} was not fitted; fitting on the data being transformed")
            self.fit(X, y)
        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        X[self.field] = self.imputer.transform(self._column(X)).ravel()
        return X

    @staticmethod
    def build_steps(fields : list[str], strategy : str = 'mean') -> list[tuple[str, FieldImputer]] :
        '''
            Build one named pipeline step per field.

            Returns a list of ('<field>_Imputer', FieldImputer) tuples that can be
            unpacked directly into a Pipeline's `steps`.
        '''
        return [(f'{field}_Imputer', FieldImputer(field, strategy)) for field in fields]
    
class CategoricalEncoderOhe(DataFrameEstimateTransformMixin):
    '''
        One-hot encodes a single categorical DataFrame column using the sklearn
        OneHotEncoder.

        The categories are learned in `fit`; `transform` replaces the column with
        one `<field>_<i>` column per learned category, so every frame transformed
        by a fitted step gets the same set of columns. Categories that were not
        seen during fitting are encoded as all zeros.

        Parameters
        ----------
        field : name of the DataFrame column to encode
    '''
    
    def __init__(self, field : str) -> None :
        # Only the constructor argument is stored under its own name so that
        # get_params / set_params / clone see the real configuration.
        self.field = field
        # Unfitted placeholder kept so the attribute always exists; `fit` rebuilds it.
        self.ohe = OneHotEncoder(handle_unknown='ignore')

    def _column(self, X : pd.DataFrame) -> np.ndarray :
        ''' Return the target column as the 2-D (n_rows, 1) array OneHotEncoder expects '''
        return X[self.field].to_numpy().reshape(-1, 1)

    def fit(self, X : pd.DataFrame, y = None) -> CategoricalEncoderOhe :
        ''' Learn the categories of `field` from X '''
        self.ohe = OneHotEncoder(handle_unknown='ignore').fit(self._column(X))
        return self
    
    def transform(self, X : pd.DataFrame, y = None) -> pd.DataFrame :
        ''' Return a new DataFrame in which `field` is replaced by its one-hot columns '''
        logger.info(f"One Hot Encoding categorical column {self.field}")
        if not hasattr(self.ohe, 'categories_'):
            # Backward compatibility: earlier versions fitted inside transform,
            # so an unfitted step still works, but it learns from this data.
            logger.warning(f"CategoricalEncoderOhe for {self.field} was not fitted; fitting on the data being transformed")
            self.fit(X, y)

        encoded = self.ohe.transform(self._column(X)).toarray()

        # One output column per encoder category. Taking the width of the encoded
        # matrix (rather than counting distinct rows) stays correct when the
        # transformed data lacks some of the fitted categories.
        new_columns = {
            f'{self.field}_{i}' : encoded[:, i] for i in range(encoded.shape[1])
        }

        # drop() returns a new frame, so the caller's DataFrame is not modified, and
        # assigning raw arrays is positional, so a non-default index on X cannot
        # misalign the encoded values.
        return X.drop(self.field, axis=1).assign(**new_columns)

    @staticmethod
    def build_steps(fields : list[str]) -> list[tuple[str, CategoricalEncoderOhe]]:
        '''
            Build one named pipeline step per field.

            Returns a list of ('<field>_ohe', CategoricalEncoderOhe) tuples that can
            be unpacked directly into a Pipeline's `steps`.
        '''
        return [(f'{field}_ohe', CategoricalEncoderOhe(field)) for field in fields]
    
class DataFrameToNumpy(DataFrameEstimateTransformMixin):
    ''' 
        Adapter step placed after a sequence of DataFrameEstimateTransformMixin
        steps: it converts the DataFrame into a numpy array so the data is
        compatible with the later steps in the pipeline.
    '''
    
    def transform(self, X : pd.DataFrame, y = None) -> np.ndarray :
        ''' Return the contents of X as a numpy array; y is ignored '''
        as_array = X.to_numpy()
        return as_array
    
class RFWrapper(RandomForestClassifier):
    '''
        RandomForestClassifier with fit_transform / fit_predict added, so it can be
        placed into Pipelines that expect fit_transform.
    '''
    
    def fit_transform(self, X : np.ndarray, y : np.ndarray = None) -> np.ndarray :
        ''' Fit the forest on (X, y) and return its predictions for that same X '''
        fitted = super().fit(X, y)
        predictions = fitted.predict(X)
        return predictions
    
    def fit_predict(self, X : np.ndarray, y : np.ndarray = None) -> np.ndarray :
        ''' Alias of fit_transform '''
        return self.fit_transform(X, y)

def predict_and_save_results(
    clf : RandomForestClassifier, 
    X_test : np.ndarray, 
    passenger_ids : pd.Series, 
    filename : str = 'submission.csv',
    output_dir : Path | None = None
    ) -> None :
    '''
        Predict on X_test and write the predictions to a CSV file.

        Parameters
        ----------
        clf : fitted estimator; only its `predict` method is used
        X_test : feature matrix passed to `clf.predict`
        passenger_ids : ids written to the `PassengerId` column, one per row of X_test
        filename : name of the CSV file to write
        output_dir : directory to write into; defaults to DATA_LOCATION when None

        The file has two columns, `PassengerId` and `Survived`, and no index column.
    '''
    # Resolve the target lazily so the module-level default is only needed when
    # the caller does not supply a directory.
    target_dir = DATA_LOCATION if output_dir is None else Path(output_dir)

    predictions = clf.predict(X_test)
    output = pd.DataFrame({'PassengerId' : passenger_ids, 'Survived' : predictions})
    output.to_csv(target_dir / filename, index=False)

# Main

In [469]:
# Columns read from train.csv; the remaining columns of that file are not loaded.
use_cols = [
    'PassengerId', 'Survived', 'Pclass', 
    'Sex', 'Age', 'SibSp', 'Parch', 
    'Fare', 'Embarked'
]

logger.info("Loading data")
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission rather
# than ground-truth labels, so the "Best Test Score" logged below would measure
# agreement with that baseline, not true accuracy - confirm.
actual_labels = pd.read_csv(DATA_LOCATION / 'gender_submission.csv')
train_df = pd.read_csv(DATA_LOCATION / 'train.csv', usecols=use_cols)
test_df  = pd.read_csv(DATA_LOCATION / 'test.csv').drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Training target, extracted once and reused for preprocessing and the grid search.
y_train = train_df['Survived'].to_numpy()

logger.info("Building normalization pipeline")
normalization_pipeline = Pipeline(
    steps = [
        ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')), # handles missing values in Embarked, which exist in Train but not Test
        *CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked']),
        *FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare']),
        ("pd_converter", DataFrameToNumpy()),
        ('standard_scaler', StandardScaler()),
    ]
)

logger.info("Building classifier pipeline")
clf_pipeline = Pipeline(
    steps= [('clf', RFWrapper())]
)

logger.info("Cleaning training data")
precleaned_train_data = normalization_pipeline.fit_transform(
    train_df.drop(['PassengerId', 'Survived'], axis=1), 
    y_train
)

logger.info("Cleaning testing data")
# transform (not fit_transform): the test set must be normalized with the pipeline
# fitted on the training data, otherwise the scaler is re-fitted on the test set and
# the two feature matrices end up on different scales.
precleaned_test_data = normalization_pipeline.transform(test_df.drop('PassengerId', axis=1))

logger.info("Building GridSearchCV")
param_grid = {
    'clf__n_estimators' : [5, 10, 25, 50, 100, 200, 500, 1000],
    'clf__max_depth'    : [1, 5, 10, 25, 50, 100, 150, 200],
    'clf__random_state' : [42]
}
gs = GridSearchCV(estimator=clf_pipeline, param_grid=param_grid, cv=10, scoring='accuracy')

logger.info("Fitting GridSerachCV")
gs.fit(precleaned_train_data, y_train)

logger.info(f'Best Train Params: {gs.best_params_}')
logger.info(f'Best Train Score : {gs.best_score_ : .2f}')
logger.info(f'Best Test Score  : {gs.best_estimator_.score(precleaned_test_data, actual_labels.Survived) : .2f}')

logger.info("Saving predictions")
predict_and_save_results(gs.best_estimator_, precleaned_test_data, test_df.PassengerId)

logger.info("Complete.")

[32m2024-01-06 16:08:50.631[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoading data[0m
[32m2024-01-06 16:08:50.637[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mBuilding normalization pipeline[0m
[32m2024-01-06 16:08:50.637[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mBuilding classifier pipeline[0m
[32m2024-01-06 16:08:50.638[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mCleaning training data[0m
[32m2024-01-06 16:08:50.639[0m | [1mINFO    [0m | [36m__main__[0m:[36mtransform[0m:[36m32[0m - [1mImputing the most_frequent for NaN values in Embarked[0m
[32m2024-01-06 16:08:50.640[0m | [1mINFO    [0m | [36m__main__[0m:[36mtransform[0m:[36m52[0m - [1mOne Hot Encoding categorical column Pclass[0m
[32m2024-01-06 16:08:50.642[0m | [1mINFO    [0m | [36m__main__[0m:[36mtransform[0m:[36m52[0m - [1mOne Hot Encoding categor