# Data Dictionary
1) sibsp    : # of siblings/spouses on board\
2) parch    : # of parents/children on board\
3) embarked : Port of embarkation (C : Cherbourg, Q : Queenstown, S : Southampton)

# Setup

In [None]:
from __future__ import annotations

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

DATA_LOCATION = Path('../../data/')

# Loading Data

In [None]:
# Labels for the test passengers, used further down to score predictions.
# NOTE(review): presumably `gender_submission.csv` is a sample submission
# rather than ground truth -- confirm before trusting scores computed from it.
actual_labels = pd.read_csv(DATA_LOCATION / 'gender_submission.csv')

# Columns kept from the training file (free-text columns are left out).
use_cols = [
    'PassengerId', 'Survived', 'Pclass',
    'Sex', 'Age', 'SibSp', 'Parch',
    'Fare', 'Embarked'
]

train_df = pd.read_csv(DATA_LOCATION / 'train.csv', usecols=use_cols)
test_df  = pd.read_csv(DATA_LOCATION / 'test.csv')

# Test features restricted to the same kind of columns as `train_df`
# (minus the label, which the test file does not carry).
test_df_run = test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Test rows (with their labels merged in) stacked on top of the train rows.
# `ignore_index=True` gives the combined frame a fresh 0..n-1 index; without
# it both inputs contribute their own 0-based labels and the result has
# duplicate row labels, which breaks any later label-based alignment.
large_data = pd.concat(
    [test_df.merge(actual_labels, how='outer', on='PassengerId'), train_df],
    ignore_index=True,
)

In [None]:
class FieldImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in one DataFrame column with a ``SimpleImputer``.

    The fill value is learned in :meth:`fit` and re-used in :meth:`transform`,
    so data transformed later (e.g. a held-out set) is imputed with the
    statistic learned from the fitting data. The input frame is never
    modified; a copy is returned.

    Parameters
    ----------
    field : str
        Name of the column to impute.
    strategy : str, default 'mean'
        Strategy forwarded to ``SimpleImputer``.
    """

    def __init__(self, field: str, strategy: str = 'mean') -> None:
        self.field = field
        self.strategy = strategy
        # Kept as a public attribute for backward compatibility; it is rebuilt
        # in `fit` so that `set_params(strategy=...)` is honoured.
        self.imputer = SimpleImputer(strategy=self.strategy)

    def _column(self, X: pd.DataFrame) -> np.ndarray:
        """Return the target column as the (n_rows, 1) array the imputer expects."""
        return X[self.field].to_numpy().reshape(-1, 1)

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the fill value for ``field`` from ``X`` and return ``self``."""
        self.imputer = SimpleImputer(strategy=self.strategy)
        self.imputer.fit(self._column(X))
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``X`` whose ``field`` column has missing values filled.

        Requires a prior call to :meth:`fit`; the underlying imputer raises if
        it has not been fitted.
        """
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        X[self.field] = self.imputer.transform(self._column(X)).ravel()
        return X

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return the imputed copy of ``X``."""
        return self.fit(X, y).transform(X, y)

    @staticmethod
    def build_steps(fields: list[str], strategy: str = 'mean') -> list[tuple[str, FieldImputer]]:
        """Build ``(name, FieldImputer)`` pipeline steps, one per column.

        Step names follow the pattern ``'<field>_Imputer'``. Every step uses
        the same ``strategy`` (default ``'mean'``).
        """
        return [(f'{field}_Imputer', FieldImputer(field, strategy)) for field in fields]
    
class CategoricalEncoderOhe(BaseEstimator, TransformerMixin):
    """One-hot encode a single DataFrame column.

    The column ``field`` is replaced by indicator columns named
    ``'<field>_0'``, ``'<field>_1'``, ... (one per category learned in
    :meth:`fit`), appended after the remaining columns. Categories are learned
    once in :meth:`fit`, so every frame transformed afterwards gets the same
    columns in the same order. The input frame is never modified.

    Parameters
    ----------
    field : str
        Name of the column to encode.
    """

    def __init__(self, field: str) -> None:
        self.field = field
        # `handle_unknown='ignore'`: a category that was not present when
        # fitting is encoded as all zeros instead of raising in `transform`.
        self.ohe = OneHotEncoder(handle_unknown='ignore')

    def _column(self, X: pd.DataFrame) -> np.ndarray:
        """Return the target column as the (n_rows, 1) array the encoder expects."""
        return X[self.field].to_numpy().reshape(-1, 1)

    def fit(self, X: pd.DataFrame, y=None):
        """Learn the categories of ``field`` from ``X`` and return ``self``."""
        self.ohe.fit(self._column(X))
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a new frame with ``field`` replaced by its indicator columns.

        Requires a prior call to :meth:`fit`; the underlying encoder raises if
        it has not been fitted.
        """
        encoded = self.ohe.transform(self._column(X)).toarray()
        columns = [f'{self.field}_{i}' for i in range(encoded.shape[1])]

        # Assign plain arrays column by column: this is positional, so the
        # encoded rows stay attached to the right rows of X whatever X's index
        # looks like (shuffled, non-contiguous or duplicated labels).
        indicator_columns = {name: encoded[:, i] for i, name in enumerate(columns)}
        return X.drop(self.field, axis=1).assign(**indicator_columns)

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Fit on ``X`` and return the encoded version of ``X``."""
        return self.fit(X, y).transform(X, y)

    @staticmethod
    def build_steps(fields: list[str]) -> list[tuple[str, CategoricalEncoderOhe]]:
        """Build ``(name, CategoricalEncoderOhe)`` pipeline steps, one per column.

        Step names follow the pattern ``'<field>_ohe'``.
        """
        return [(f'{field}_ohe', CategoricalEncoderOhe(field)) for field in fields]
    
class RFWrapper(RandomForestClassifier):
    """Random forest that also exposes ``fit_transform`` / ``fit_predict``.

    Both extra methods fit the forest and then return its predictions for the
    very data it was fitted on.
    """

    def fit_transform(self, X: np.ndarray, y: np.ndarray = None) -> np.ndarray:
        """Fit on ``(X, y)`` and return the predictions for ``X``."""
        return super().fit(X, y).predict(X)

    def fit_predict(self, X: np.ndarray, y: np.ndarray = None) -> np.ndarray:
        """Alias of :meth:`fit_transform`."""
        return self.fit_transform(X, y)

class DataFrameToNumpy(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that converts a DataFrame to a NumPy array."""

    def fit(self, X: pd.DataFrame, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return ``X.to_numpy()``."""
        return X.to_numpy()

    def fit_transform(self, X: pd.DataFrame, y=None):
        """Run :meth:`fit` followed by :meth:`transform` on ``X``."""
        return self.fit(X, y).transform(X, y)
    
# Hyper-parameter grid for the random forest; the `clf__` prefix routes each
# entry to the 'clf' step of `clf_pipeline`.
param_grid = dict(
    clf__n_estimators=[5, 10, 25, 50, 100, 200, 500, 1000],
    clf__max_depth=[1, 5, 10, 25, 50, 100, 150, 200],
    clf__random_state=[42],
)

# Preprocessing: impute Embarked, one-hot encode the categorical columns,
# impute the numeric columns, convert to NumPy and standardise.
preprocessing_steps = [
    # Embarked has missing values in Train but not in Test.
    ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')),
]
preprocessing_steps += CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked'])
preprocessing_steps += FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare'])
preprocessing_steps += [
    ("pd_converter", DataFrameToNumpy()),
    ('standard_scaler', StandardScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

# The classifier lives in its own pipeline so the grid search only re-fits
# the model, not the preprocessing.
clf_pipeline = Pipeline(steps=[('clf', RFWrapper())])

train_features = train_df.drop(['PassengerId', 'Survived'], axis=1)
train_labels = train_df['Survived'].values.ravel()

precleaned_train_data = pipeline.fit_transform(train_features, train_labels)

# 10-fold cross-validated search over `param_grid`, scored by accuracy.
gs = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    verbose=2,
)
gs.fit(precleaned_train_data, train_labels)

In [None]:
# Show the winning hyper-parameter combination and its mean cross-validated
# accuracy (bare expression: displayed as the cell's output in a notebook).
print(gs.best_params_)
gs.best_score_

In [None]:
# Preprocess the test features with the pipeline that was already fitted on
# the training data. Calling `fit_transform` here would re-fit the steps
# (notably the StandardScaler) on the test set, so the model would be scored
# on features scaled differently from the ones it was trained on.
precleaned_test_data = pipeline.transform(test_df_run.drop('PassengerId', axis=1))

# Accuracy of the best model against the reference labels.
# NOTE(review): assumes `actual_labels` rows are in the same order as
# `test_df_run` -- confirm.
gs.best_estimator_.score(precleaned_test_data, actual_labels.Survived)

In [None]:
# Predict the test passengers with the best model found by the grid search
# and write the result as a two-column CSV (no index column).
best_model = gs.best_estimator_
predicted = best_model.predict(precleaned_test_data)

submission_columns = {'PassengerId': test_df_run.PassengerId, 'Survived': predicted}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
# Put predictions next to the reference labels and count how many rows agree
# (True) and disagree (False).
# NOTE(review): relies on `test_df_run` and `actual_labels` sharing the same
# row order/index -- confirm.
dd = pd.DataFrame({'PassengerId': test_df_run.PassengerId, 'Survived': predicted, 'Actual' : actual_labels.Survived})
(dd['Survived'] == dd['Actual']).value_counts()

In [None]:
# Second experiment: pool all labelled rows, then carve out a stratified
# 80/20 train/test split with a fixed seed.
all_features = large_data.drop(['Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
all_labels = large_data['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_labels,
    stratify=all_labels,
    train_size=0.8,
    random_state=42,
)

# Same hyper-parameter grid as before; the `clf__` prefix targets the 'clf'
# step of `clf_pipeline`.
param_grid = dict(
    clf__n_estimators=[5, 10, 25, 50, 100, 200, 500, 1000],
    clf__max_depth=[1, 5, 10, 25, 50, 100, 150, 200],
    clf__random_state=[42],
)

# Fresh preprocessing pipeline: impute Embarked, one-hot encode the
# categorical columns, impute the numeric columns, convert to NumPy, scale.
preprocessing_steps = [
    # Embarked has missing values in Train but not in Test.
    ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')),
]
preprocessing_steps += CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked'])
preprocessing_steps += FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare'])
preprocessing_steps += [
    ("pd_converter", DataFrameToNumpy()),
    ('standard_scaler', StandardScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

clf_pipeline = Pipeline(steps=[('clf', RFWrapper())])

precleaned_train_data = pipeline.fit_transform(X_train.drop(['PassengerId'], axis=1), y_train)

# 10-fold cross-validated search over `param_grid`, scored by accuracy.
gs = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    verbose=2,
)
gs.fit(precleaned_train_data, y_train)

In [None]:
# Show the winning hyper-parameter combination of the second search and its
# mean cross-validated accuracy (bare expression: displayed as cell output).
print(gs.best_params_)
gs.best_score_

In [None]:
# Debug view: run only the DataFrame-producing preprocessing steps on the
# held-out features and print the resulting frame's summary.
# `errors='ignore'`: X_test as produced by the split has no Pclass_0/1/2
# columns, so a strict drop raises KeyError on a clean run. Presumably they
# were left behind by an earlier in-place transform of X_test; they are still
# removed when present.
Pipeline(
    steps = [
        ("embarked_imputer", FieldImputer(field='Embarked', strategy='most_frequent')), # handles missing values in Embarked, which exist in Train but not Test
        *CategoricalEncoderOhe.build_steps(['Pclass', 'Sex', 'Embarked']),
        *FieldImputer.build_steps(['Age', 'SibSp', 'Parch', 'Fare']),
    ]
).fit_transform(
    X=X_test.drop(['PassengerId', 'Pclass_0', 'Pclass_1', 'Pclass_2'], axis=1, errors='ignore')
).info()

In [None]:
# Preprocess the held-out features with the pipeline fitted on X_train.
# - `transform` (not `fit_transform`): re-fitting here would scale the
#   held-out set with its own statistics instead of the training ones.
# - `errors='ignore'`: X_test as produced by the split has no Pclass_0/1/2
#   columns, so a strict drop raises KeyError on a clean run.
precleaned_test_data = pipeline.transform(
    X_test.drop(['PassengerId', 'Pclass_0', 'Pclass_1', 'Pclass_2'], axis=1, errors='ignore')
)

# Accuracy of the best model on the held-out split.
gs.best_estimator_.score(precleaned_test_data, y_test)

In [None]:
# Predictions of the best model for the preprocessed held-out features.
predicted = gs.best_estimator_.predict(precleaned_test_data)

In [None]:
# Summary of the raw held-out features. `errors='ignore'` because X_test as
# produced by the split has no Pclass_0/1/2 columns, so a strict drop raises
# KeyError on a clean run.
X_test.drop(['PassengerId', 'Pclass_0', 'Pclass_1', 'Pclass_2'], axis=1, errors='ignore').info()