Imports and Constants

In [11]:
#
import sys, os, multiprocessing, csv, copy

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.svm import SVC

import numpy as np
import math

import pandas as pd

# One seed value, used for NumPy's global RNG here and passed as
# `random_state` to the estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Flag kept from the original notebook; no visible cell reads it.
ind_output = True

# Folders (relative to the working directory) holding the input and output CSVs.
INPUT_PATH, OUTPUT_PATH = (
    os.path.join(".", "CSVs", leaf) for leaf in ("inputs", "outputs")
)

In [12]:
# Count how many predictions differ between two previously saved submissions.
# Both CSVs must already exist in OUTPUT_PATH; "lin_SVC.csv" is the file the
# "Output Scores" cell writes, so on a fresh kernel that cell has to run first.
vals1 = pd.read_csv(os.path.join(OUTPUT_PATH, "SVC.csv"))
vals2 = pd.read_csv(os.path.join(OUTPUT_PATH, "lin_SVC.csv"))

# Compare the second column position by position over the rows both files
# have, rather than assuming each file holds exactly 418 rows.
n_rows = min(len(vals1), len(vals2))
first = vals1.iloc[:n_rows, 1].to_numpy()
second = vals2.iloc[:n_rows, 1].to_numpy()
count = int((first != second).sum())

print(count)

41


Get the data from the CSVs

In [13]:
#
# Read both input tables from INPUT_PATH.
test = pd.read_csv(os.path.join(INPUT_PATH, "test.csv"))
train = pd.read_csv(os.path.join(INPUT_PATH, "train.csv"))

# Separate the "Survived" label from the training features.
train_y = train["Survived"]
train_X = train.drop(columns="Survived")

# Number of training rows, i.e. the row position where the test rows begin
# once the two frames are stacked below.
marker = train_X.shape[0]

# Training features followed by the test rows, re-indexed 0..n-1.
dataset = pd.concat([train_X, test], ignore_index=True)

Some data preparation

In [14]:
#
drop_attribs = ["Name", "Cabin", "PassengerId", "Ticket"]

class AttributeDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a configured list of columns from a DataFrame.

    Parameters
    ----------
    attribs : list of str, default=drop_attribs
        Column labels handed to ``DataFrame.drop(columns=...)``.
    """

    def __init__(self, attribs = drop_attribs):
        self.attribs = attribs

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so ``fit(X).transform(X)`` works."""
        return self

    def transform(self, X):
        """Return a new DataFrame equal to ``X`` without ``self.attribs``."""
        return X.drop(columns = self.attribs)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the transformed frame."""
        return self.fit(X, y).transform(X)

In [15]:
#
class AgeEstimator(BaseEstimator, TransformerMixin):
    """Impute missing ``Age`` values with a Lasso regression on the other columns.

    ``fit`` trains ``Lasso(alpha=0.1)`` on the rows whose ``Age`` is present,
    using every other column as a feature. ``transform`` predicts ``Age`` for
    the rows where it is missing and returns a filled copy of the input.

    Parameters
    ----------
    params : object, default=None
        Stored on the instance but not read by any method of this class.
        The default used to be the global ``param_grid``, which does not exist
        yet when this class is defined on a fresh kernel (NameError).

    Attributes
    ----------
    regr : Lasso
        The regressor fitted by ``fit``.
    """

    def __init__(self, params = None):
        self.params = params

    def fit(self, X, y=None):
        """Fit the age regressor on rows with a known ``Age``; return ``self``."""
        known = X.dropna(subset=['Age'])
        self.regr = Lasso(alpha = 0.1).fit(known.drop(columns=['Age']), known['Age'])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose missing ``Age`` entries are predicted."""
        filled = X.copy()
        missing = filled['Age'].isnull()
        # Skip prediction when nothing is missing: predict() rejects empty input.
        if missing.any():
            features = filled.loc[missing].drop(columns=['Age'])
            filled.loc[missing, 'Age'] = self.regr.predict(features)
        return filled

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``X`` with its missing ages imputed."""
        return self.fit(X, y).transform(X)

In [16]:
#
class Filler(BaseEstimator, TransformerMixin):
    """Fill missing values with the column median, except in a few columns.

    ``Age``, ``Sex`` and ``Embarked`` are left untouched. The medians are
    learned in ``fit`` and applied in ``transform`` to a new DataFrame, so the
    input frame is never modified. The previous chained
    ``X[col].fillna(..., inplace=True)`` is deprecated in recent pandas and
    has no effect once copy-on-write is enabled.

    Attributes
    ----------
    medians_ : dict
        Maps column name to the median computed in ``fit``.
    """

    # Columns whose missing values this transformer does not fill.
    SKIP_COLUMNS = ('Age', 'Sex', 'Embarked')

    def fit(self, X, y=None):
        """Compute the median of every non-skipped column; return ``self``."""
        medians = {col: X[col].median() for col in X if col not in self.SKIP_COLUMNS}
        # An all-missing column has a NaN median; filling with it would be a no-op.
        self.medians_ = {col: med for col, med in medians.items() if pd.notna(med)}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced by the medians."""
        return X.fillna(value=self.medians_)

    def fit_transform(self, X, y=None):
        """Learn the medians from ``X`` and return the filled frame."""
        return self.fit(X, y).transform(X)

In [17]:
#
class CustEncoder(BaseEstimator, TransformerMixin):
    """Replace the strings ``'male'`` and ``'female'`` with ``0`` and ``1``.

    The replacement is applied to every cell of the frame that equals one of
    the two strings, not to a single named column.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so ``fit(X).transform(X)`` works."""
        return self

    def transform(self, X):
        """Return a new DataFrame with 'male' -> 0 and 'female' -> 1."""
        encoded = X.mask(X == 'male', 0)
        return encoded.mask(encoded == 'female', 1)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the encoded frame."""
        return self.fit(X, y).transform(X)

In [18]:
#
class HotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``Pclass`` and ``Embarked`` into six indicator columns.

    The output has the indicator columns first, in the order ``class_f``,
    ``class_s``, ``class_t``, ``emb_s``, ``emb_c``, ``emb_q``. They are
    followed by every input column except ``Pclass`` and ``Embarked``.

    A row whose ``Pclass`` and ``Embarked`` both match none of the known values
    gets zeros in all six indicators. The earlier row-by-row implementation
    never created such a row, so the inner join dropped it silently.
    """

    # value -> indicator column name; dict order defines output column order
    CLASS_COLUMNS = {1: 'class_f', 2: 'class_s', 3: 'class_t'}
    EMBARKED_COLUMNS = {'S': 'emb_s', 'C': 'emb_c', 'Q': 'emb_q'}

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so ``fit(X).transform(X)`` works."""
        return self

    def transform(self, X):
        """Return ``X`` with ``Pclass``/``Embarked`` replaced by 0/1 indicators."""
        indicators = {}
        for source, mapping in (('Pclass', self.CLASS_COLUMNS),
                                ('Embarked', self.EMBARKED_COLUMNS)):
            for value, column in mapping.items():
                # Vectorised comparison; missing values compare unequal -> 0.
                indicators[column] = (X[source] == value).astype(float)
        onehot = pd.DataFrame(indicators, index=X.index)
        return pd.concat((onehot, X.drop(columns = ['Pclass', 'Embarked'])), axis = 1)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the encoded frame."""
        return self.fit(X, y).transform(X)

In [19]:
#
class FamilyCombiner(BaseEstimator, TransformerMixin):
    """Replace ``SibSp`` and ``Parch`` with their sum in a ``family_size`` column.

    ``family_size`` is appended as the last column. The input frame is not
    modified; a new DataFrame is returned.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so ``fit(X).transform(X)`` works."""
        return self

    def transform(self, X):
        """Return ``X`` without ``SibSp``/``Parch`` and with ``family_size`` added."""
        family_size = X['SibSp'] + X['Parch']
        return X.drop(columns = ['SibSp', 'Parch']).assign(family_size=family_size)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the transformed frame."""
        return self.fit(X, y).transform(X)

In [20]:
#
# These three imports repeat names already imported in the first cell.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Column-name lists built from test.csv; no other visible cell reads
# main_cols or emb_col.
main_cols = list(test.columns)
main_cols.remove('Embarked')
emb_col = ['Embarked']

# Preprocessing chain, applied in this order:
#   dropper     - remove the columns listed in drop_attribs
#   filler      - median-fill every column except Age, Sex and Embarked
#   cust_enc    - 'male' -> 0, 'female' -> 1
#   family_comb - SibSp + Parch -> family_size
#   hot_encoder - Pclass / Embarked -> indicator columns
#   age_est     - fill missing Age with a Lasso fitted on all other columns,
#                 which is why it comes after the encoding steps
#   scaler      - MinMaxScaler as the final step
pipeline = Pipeline([
    ('dropper', AttributeDropper()),
    ('filler', Filler()),
    ('cust_enc', CustEncoder()),
    ('family_comb', FamilyCombiner()),
    ('hot_encoder', HotEncoder()),
    ('age_est', AgeEstimator()),
    ('scaler', MinMaxScaler())
])

In [21]:
#
# Fit every pipeline step on the stacked train+test frame and keep the
# transformed result.
data_prep = pipeline.fit_transform(dataset)

Here we split the prepared data back into two separate ndarrays: the training values and the test values.

In [22]:
#
# Rows before `marker` came from train.csv; the rest came from test.csv.
train_X_prep, test_prep = data_prep[:marker], data_prep[marker:]

In [24]:
# Show the first five prepared training rows.
train_X_prep[:5]

array([[0.        , 0.        , 1.        , 1.        , 0.        ,
        0.        , 0.        , 0.27345609, 0.01415106, 0.1       ],
       [1.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 1.        , 0.473882  , 0.13913574, 0.1       ],
       [0.        , 0.        , 1.        , 1.        , 0.        ,
        0.        , 1.        , 0.32356257, 0.01546857, 0.        ],
       [1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 1.        , 0.43630214, 0.1036443 , 0.1       ],
       [0.        , 0.        , 1.        , 1.        , 0.        ,
        0.        , 0.        , 0.43630214, 0.01571255, 0.        ]])

In [None]:
# Random forest classifier, tuned with a 5-fold grid search scored by F1

# Grid: 2 bootstrap settings x 6 forest sizes x 5 max_features values.
param_grid = [
    {'bootstrap':[True, False], 'n_estimators': [90, 100, 110, 120, 150, 200], 'max_features': [4, 5, 6, 7, 8]}
]

forest_clf = RandomForestClassifier(random_state=RANDOM_STATE)
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           scoring='f1',
                           return_train_score=True)

grid_search.fit(train_X_prep, train_y)
# Keep the best estimator found by the search for the later cells.
random_forest = grid_search.best_estimator_
print(grid_search.best_params_)

# Class predictions for the prepared test rows.
forest_vals = random_forest.predict(test_prep)

In [None]:
# Class probabilities from the tuned random forest; print the first ten rows.
rf_prob = random_forest.predict_proba(test_prep)
print(rf_prob[:10])

In [33]:
# Support Vector Classifier (linear kernel)
# probability=True enables predict_proba, whose fitting step uses a random
# number generator. Passing random_state pins it, so the probabilities no longer
# depend on how much of NumPy's global RNG earlier cells consumed.
# NOTE(review): predict() is expected to be unaffected by this seed - confirm.
svc = SVC(probability = True, kernel = 'linear', random_state = RANDOM_STATE)
svc.fit(train_X_prep, train_y)
svc_vals = svc.predict(test_prep)

In [None]:
# Class probabilities from the fitted SVC; print the first ten rows.
svc_prob = svc.predict_proba(test_prep)
print(svc_prob[:10])

In [None]:
# Soft-voting ensemble of the tuned random forest and the SVC, fitted on the
# prepared training rows.
voting_clf = VotingClassifier(estimators = [("rf", random_forest), ("svc", svc)], voting = "soft")
voting_clf.fit(train_X_prep, train_y)

# Ensemble predictions for the prepared test rows.
voting_vals = voting_clf.predict(test_prep)

In [26]:
# AdaBoost over SVC base learners, tuned with a 5-fold grid search.
# The grid may only name parameters of the estimator being searched.
# 'max_features' is not an AdaBoostClassifier parameter and made
# GridSearchCV fail, so only 'n_estimators' is searched here.
param_grid = [
    {'n_estimators': [90, 100, 110, 120]}
]

adaboost_svc = AdaBoostClassifier(SVC(probability = True), random_state=RANDOM_STATE)

# Scoring is kept as in the original cell (negative RMSE on the 0/1 labels).
ABCsvc_search = GridSearchCV(adaboost_svc, param_grid, cv=5,
                             scoring='neg_root_mean_squared_error',
                             return_train_score=True)

ABCsvc_search.fit(train_X_prep, train_y)
ABCsvc_vals = ABCsvc_search.predict(test_prep)

In [31]:
#
# AdaBoost over random-forest base learners, tuned with a 5-fold grid search.
forest_clf = RandomForestClassifier(random_state=RANDOM_STATE)
adaboost_rf = AdaBoostClassifier(forest_clf, random_state=RANDOM_STATE)

# scikit-learn renamed AdaBoost's `base_estimator` parameter to `estimator`
# and later removed the old name. Use whichever name this installation
# exposes so the nested parameter keys below stay valid.
base_key = 'estimator' if 'estimator' in adaboost_rf.get_params(deep=False) else 'base_estimator'

# 5 x 4 x 3 x 4 = 240 candidates, each fitted 5 times: this search is slow.
param_grid = [{
    'learning_rate': [ .25, .5, .75, 1, 1.2],
    'n_estimators': [40, 50, 60, 70],
    base_key + '__n_estimators': [80, 100, 120],
    base_key + '__max_features': [4, 6, 7, 9]
}]

# Scoring is kept as in the original cell (negative RMSE on the 0/1 labels).
ABCrf_search = GridSearchCV(adaboost_rf, param_grid, cv=5,
                            scoring='neg_root_mean_squared_error',
                            return_train_score=True)

ABCrf_search.fit(train_X_prep, train_y)
ABCrf_vals = ABCrf_search.predict(test_prep)

KeyboardInterrupt: 

In [None]:
#
# Best parameter combination found by the AdaBoost + random forest search.
ABCrf_search.best_params_

In [27]:
# Best parameter combination found by the AdaBoost + SVC search.
ABCsvc_search.best_params_

{'learning_rate': 0.05}

Output Scores

In [34]:
# Write the chosen predictions as a two-column submission CSV.
filename = os.path.join(OUTPUT_PATH, "lin_SVC.csv")
vals = svc_vals

# Take the ids from test.csv instead of assuming 418 rows numbered from 892.
# Assumes `vals` is in the same row order as `test` - the predictions above
# are made on test_prep, which is sliced from the stacked frame in that order.
passenger_ids = test["PassengerId"]
if len(passenger_ids) != len(vals):
    raise ValueError(
        "expected one prediction per test row, got %d predictions for %d rows"
        % (len(vals), len(passenger_ids))
    )

with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # header row, then one "id,prediction" row per passenger
    csvwriter.writerow(["PassengerId","Survived"])
    csvwriter.writerows(zip(passenger_ids, vals))

'''
scores: (accuracy)
    SGD classifier 0.73205
    Random Forest 0.76555
    SVC 0.77751
'''

'\nscores: (accuracy)\n    SGD classifier 0.73205\n    Random Forest 0.76555\n    SVC 0.77751\n'

Some testing

In [155]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import AdaBoostRegressor

param_grid = [{'n_estimators': [100, 150, 200, 250, 300], 'max_features': [2, 3, 4, 5, 6, 7, 8]}]
class AgeEstimatorTest(BaseEstimator, TransformerMixin):
    """Experimental step that fits a Lasso age regressor and passes data through.

    ``fit`` trains ``Lasso(alpha=0.1)`` to predict ``Age`` from every other
    column of ``X``, so ``X`` must have no missing ``Age`` values. ``transform``
    returns its input unchanged; the fitted model is inspected via ``regr``.

    Parameters
    ----------
    params : object, default=param_grid
        Stored on the instance but not read by any method of this class.

    Attributes
    ----------
    regr : Lasso
        The regressor fitted by ``fit``.
    """

    def __init__(self, params = param_grid):
        self.params = params

    def fit(self, X, y=None):
        """Fit the age regressor on all rows of ``X``; return ``self``."""
        features = X.drop(columns=['Age'])
        self.regr = Lasso(alpha = 0.1).fit(features, X["Age"])
        return self

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

    def fit_transform(self, X, y=None):
        """Fit the regressor on ``X`` and return ``X`` unchanged."""
        self.fit(X, y)
        return X

In [156]:
# Two column lists to drop: the second also removes "Age".
drop_attribs1 = ["Name", "Cabin", "PassengerId", "Ticket"]
drop_attribs2 = ["Name", "Cabin", "PassengerId", "Ticket", "Age"]

# Pipeline 1 keeps Age and ends with AgeEstimatorTest, which fits the age
# regressor on its input.
test_pipeline1 = Pipeline([
    ('dropper', AttributeDropper(drop_attribs1)),
    ('filler', Filler()),
    ('cust_enc', CustEncoder()),
    ('family_comb', FamilyCombiner()),
    ('hot_encoder', HotEncoder()),
    ('age_est', AgeEstimatorTest())
])

# Pipeline 2 drops Age up front and has no age-estimation step; the same
# preparation steps otherwise.
test_pipeline2 = Pipeline([
    ('dropper', AttributeDropper(drop_attribs2)),
    ('filler', Filler()),
    ('cust_enc', CustEncoder()),
    ('family_comb', FamilyCombiner()),
    ('hot_encoder', HotEncoder())
])

In [157]:
# Keep only the rows of the stacked frame whose Age is present, then show
# how many there are.
test_set = dataset.dropna(subset = ["Age"])
len(test_set)

1046

In [164]:
# Shuffle the known-age rows and hold out everything after the first 820.
# random_state pins the shuffle, so re-running this cell (or running cells in
# a different order) yields the same split and the same scores below.
shuffle = test_set.sample(frac = 1, axis = 0, random_state = RANDOM_STATE)
sample = shuffle[:820]
s_test = shuffle[820:]

In [165]:
# Pipeline 1 prepares the fitting sample and fits the age regressor on it;
# pipeline 2 prepares the held-out rows with Age removed.
sample_prep = test_pipeline1.fit_transform(sample)
s_test_prep = test_pipeline2.fit_transform(s_test)

In [166]:
# True ages for the held-out rows and for the fitting sample.
age = s_test["Age"]
age2 = sample["Age"]

# The fitted AgeEstimatorTest step; its `regr` attribute is the Lasso model.
age_est = test_pipeline1.named_steps['age_est']

# Lasso.score on held-out rows (score) and on the fitting sample (score2).
score = age_est.regr.score(s_test_prep, age)
score2 = age_est.regr.score(sample_prep.drop(columns = "Age"), age2)
print(score, score2)

0.22767475464799547 0.23362727445848475
