Imports and Constants

In [1]:
import sys, os, multiprocessing, csv, copy

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import numpy as np
import math

import pandas as pd

# One seed for everything: NumPy's global RNG here, and the `random_state`
# argument of every estimator created further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Switch for writing each model's predictions to its own CSV file.
ind_output = True

# dirname() strips the trailing slash, leaving "./CSVs/inputs" and "./CSVs/outputs".
INPUT_PATH = os.path.dirname("./CSVs/inputs/")
OUTPUT_PATH = os.path.dirname("./CSVs/outputs/")

In [33]:
# NOTE(review): this cell reads prediction files that are only written further
# down the notebook, so on a fresh kernel it relies on CSVs from an earlier run.
# Each variable is named after the file it actually loads.
rf_vals = pd.read_csv(os.path.join(OUTPUT_PATH, "Random_Forest.csv"))
sgd_vals = pd.read_csv(os.path.join(OUTPUT_PATH, "SGDClassifier.csv"))

if len(rf_vals) != len(sgd_vals):
    raise ValueError("prediction files must contain the same number of rows")

# Column 1 holds the predicted label; count the rows where the two models disagree.
# Comparing the raw arrays keeps the check positional, like the original index loop,
# without hard-coding the number of rows.
count = int((sgd_vals.iloc[:, 1].to_numpy() != rf_vals.iloc[:, 1].to_numpy()).sum())

print(count)

91


Get the data from the CSVs

In [2]:
# Raw Kaggle-style inputs: the training file carries the "Survived" label.
train = pd.read_csv(os.path.join(INPUT_PATH, "train.csv"))
test = pd.read_csv(os.path.join(INPUT_PATH, "test.csv"))

# Separate the label from the training features.
train_y = train["Survived"]
train_X = train.drop(columns="Survived")

# Number of training rows; used later to cut the combined data back apart.
marker = len(train_X)

# Training features stacked on top of the test rows, so both get identical preparation.
dataset = pd.concat([train_X, test])

In [None]:
# Quick look at the Fare column of the test set.
print(test.loc[:, "Fare"])

In [None]:
def countNaN(dataset):
    """Return how many items of `dataset` are NaN.

    `dataset` is iterated once and every item is passed to `math.isnan`, so
    each item must be something `math.isnan` accepts (a real number).
    """
    return sum(1 for value in dataset if math.isnan(value))

In [None]:
# Training-set size on the first line, number of rows with a missing Age on the second.
print(len(train_X), countNaN(train_X["Age"]), sep="\n")

Some data preparation

In [3]:
# Columns removed before modelling.
drop_attribs = ["Name", "Cabin", "PassengerId", "Ticket"]

class AttributeDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribs : list of str, default=drop_attribs
        Labels of the columns that `transform` removes.
    """
    def __init__(self, attribs = drop_attribs):
        self.attribs = attribs

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself.

        Returning `self` (rather than the data) is what the scikit-learn
        estimator API expects, so `fit(X).transform(X)` works.
        """
        return self

    def transform(self, X):
        """Return a new DataFrame without the configured columns; X is left untouched."""
        return X.drop(columns = self.attribs)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the data with the configured columns removed."""
        return self.fit(X, y).transform(X)

In [4]:
# Default hyper-parameter grid searched when fitting the Age regressor.
param_grid = [{'bootstrap':[True, False], 'n_estimators': [80, 90, 100, 110, 120], 'max_features': [3, 4]}]

class AgeEstimator(BaseEstimator, TransformerMixin):
    """Fill missing "Age" values with predictions from a random-forest regressor.

    `fit` trains the regressor on the rows whose Age is known, using every
    other column as a feature; `transform` predicts Age for the rows where it
    is missing. All other columns must therefore already be numeric and free
    of missing values when this transformer runs.

    Parameters
    ----------
    params : list of dict, default=param_grid
        Grid handed to `GridSearchCV` when selecting the regressor.

    Attributes
    ----------
    regressor_ : RandomForestRegressor
        Best estimator found by the grid search; set by `fit`.
    """
    def __init__(self, params = param_grid):
        self.params = params

    def fit(self, X, y=None):
        """Grid-search a RandomForestRegressor on the rows with a known Age.

        Returns the transformer itself, as the scikit-learn estimator API expects.
        """
        known = X.dropna(subset=["Age"])

        forest_reg = RandomForestRegressor(random_state=RANDOM_STATE)
        # Use the grid stored on the instance: the module-level `param_grid`
        # name is rebound later in the notebook to a grid meant for the classifier.
        grid_search = GridSearchCV(forest_reg, self.params, cv=5,
                                   scoring='neg_root_mean_squared_error',
                                   return_train_score=True)
        grid_search.fit(known.drop(columns=["Age"]), known["Age"])

        self.regressor_ = grid_search.best_estimator_
        return self

    def transform(self, X):
        """Return a copy of X in which every missing Age is replaced by a prediction."""
        missing = X["Age"].isnull()
        X_filled = X.copy()
        # predict() rejects an empty frame, so skip it when no Age is missing.
        if missing.any():
            features = X_filled.loc[missing].drop(columns=["Age"])
            X_filled.loc[missing, "Age"] = self.regressor_.predict(features)
        return X_filled

    def fit_transform(self, X, y=None):
        """Fit on X, then return X with its missing Age values filled in."""
        return self.fit(X, y).transform(X)

In [5]:
class Filler(BaseEstimator, TransformerMixin):
    """Fill missing values of every column except "Age" with that column's median.

    "Age" is deliberately skipped so that a later step can estimate it.

    Attributes
    ----------
    medians_ : dict
        Column label -> median observed during `fit`.
    """
    def fit(self, X, y=None):
        """Record the median of every column other than "Age" and return self."""
        self.medians_ = {col: X[col].median() for col in X if col != "Age"}
        return self

    def transform(self, X):
        """Return a new DataFrame with missing values replaced by the stored medians.

        A dict passed to `fillna` is applied per column, so columns without an
        entry ("Age") keep their missing values. X itself is not modified; the
        previous chained `X[col].fillna(..., inplace=True)` acted on a temporary
        selection, which pandas deprecates and ignores under Copy-on-Write.
        """
        return X.fillna(value=self.medians_)

    def fit_transform(self, X, y=None):
        """Learn the medians from X and return X with them filled in."""
        return self.fit(X, y).transform(X)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# The two categorical columns get their own encoders ...
sex_col = ["Sex"]
emb_col = ["Embarked"]

# ... and every remaining column of the test frame goes through the main pipeline.
main_cols = test.columns.tolist()
main_cols.remove("Sex")
main_cols.remove("Embarked")

# Drop unused columns, median-fill everything but Age, then estimate the missing ages.
main_pipeline = Pipeline(steps=[
    ("dropper", AttributeDropper()),
    ("filler", Filler()),
    ("age_estimator", AgeEstimator()),
])

# Output column order follows the order of the transformers listed here.
full_pipeline = ColumnTransformer(transformers=[
    ("sex", OrdinalEncoder(categories = [['male', 'female']]), sex_col),
    ("main", main_pipeline, main_cols),
    # Values outside S/C/Q are encoded as an all-zero row instead of raising.
    ("emb", OneHotEncoder(categories=[['S', 'C', 'Q']], handle_unknown='ignore'), emb_col),
])

In [7]:
med_fare = dataset["Fare"].median()

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: that chained form is deprecated in pandas and leaves the
# frame unchanged once Copy-on-Write is active.
# 'x' is a placeholder port that is not among the OneHotEncoder categories, so
# rows with an unknown port of embarkation are encoded as all zeros.
dataset["Embarked"] = dataset["Embarked"].fillna('x')
dataset["Fare"] = dataset["Fare"].fillna(med_fare)

# Fit the preparation pipeline on the combined train+test rows and transform them.
data_prep = full_pipeline.fit_transform(dataset)

In [17]:
# Spot-check a single prepared row. NOTE(review): the Age in the recorded output
# (23.64...) is fractional, so this row presumably had its Age filled in by
# AgeEstimator — confirm against the raw data.
data_prep[5]

array([ 0.        ,  3.        , 23.64243056,  0.        ,  0.        ,
        8.4583    ,  0.        ,  0.        ,  1.        ])

Here we split the prepared data back into two separate ndarrays: the training and test values.

In [18]:
# The first `marker` rows came from the training set; everything after is the test set.
train_X_prep, test_prep = data_prep[:marker], data_prep[marker:]

We'll need to have a variety of ensemble methods (these methods may require additional data transformation). Listed below are the algorithms we will use.

    Random Forest Classifier
    
    Support Vector Machine
    Gradient Descent Classifier


We will use hard voting initially, though it may be worth investigating the use of stacking.

In [19]:
# Hyper-parameter grid for the survival classifier.
param_grid = [{
    'bootstrap': [True, False],
    'n_estimators': [80, 90, 100, 110, 120],
    'max_features': [3, 4, 5, 6, 7, 8],
}]

# 5-fold grid search over the random forest, scored by F1.
forest_clf = RandomForestClassifier(random_state=RANDOM_STATE)
grid_search = GridSearchCV(
    forest_clf,
    param_grid,
    cv=5,
    scoring='f1',
    return_train_score=True,
)
grid_search.fit(train_X_prep, train_y)

# Keep the best model and use it to predict the test set.
RandomForest = grid_search.best_estimator_
forest_vals = RandomForest.predict(test_prep)

In [None]:
# Show the best estimator found by the grid search, including its chosen hyper-parameters.
grid_search.best_estimator_

Output Individual Scores

In [23]:
RandomForestFilename = os.path.join(OUTPUT_PATH, "Random_Forest(with age estimator).csv")

# Maps each output path to the predictions that belong in that file.
filenames = {RandomForestFilename: forest_vals}

if ind_output:
    # Take the ids from the test set itself rather than assuming 418 rows
    # numbered from 892, so the files stay correct if the input changes.
    passenger_ids = test["PassengerId"]

    for filename, predictions in filenames.items():
        if len(predictions) != len(passenger_ids):
            raise ValueError("expected exactly one prediction per test passenger")

        with open(filename, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile)

            # Header row, then one "id,prediction" row per passenger.
            csvwriter.writerow(["PassengerId","Survived"])
            csvwriter.writerows(zip(passenger_ids, predictions))

'''
scores: (accuracy)
    SGD classifier 0.73205
    Random Forest 0.76555
'''

'\nscores: (accuracy)\n    SGD classifier 0.73205\n    Random Forest 0.74880, bootstrap false 0.75598\n'

In [8]:
# Tiny scratch frame: column 'k' has two missing values, column 'a' has none.
df1 = pd.DataFrame({
    'a': list(range(1, 7)),
    'k': [10, math.nan, 30, 40, math.nan, 60],
})

# Number of missing values in each column.
print(df1.isna().sum())

a    0
k    2
dtype: int64
