In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell

In [2]:
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Draw all matplotlib figures with the 'dark_background' style sheet.
plt.style.use('dark_background')

### Pre-processing

In [3]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [4]:
def extract_numericals(df):
    """
    Extract the float columns of DataFrame `df`, impute and standardise them.

    Steps:
    1. Keep only the `float64` columns.
    2. Replace missing values with the per-column median.
    3. Standard-scale every column with `StandardScaler`
       (the scaler is fitted on `df` itself).

    Returns a new DataFrame with the same float column names and the same
    row index as `df`; `df` itself is not modified.
    """
    df_flt = df.select_dtypes(include='float64')
    df_flt = df_flt.fillna(value=df_flt.median())
    scaled = StandardScaler().fit_transform(df_flt.values)
    # Carry the input index over. Rebuilding the frame from the bare array
    # would otherwise give it a fresh 0..n-1 index, and any later
    # label-aligned assignment from `df` could silently misalign rows.
    return pd.DataFrame(scaled, columns=df_flt.columns, index=df_flt.index)

In [5]:
def one_hot_encoder(series):
    """
    One-hot encode the categorical pandas `series`.

    Returns a tuple `(classes, encoded)`:
    - `classes`: the sorted unique values of `series`, last one dropped.
    - `encoded`: float array of shape `(len(series), len(classes))`;
      column `i` is 1.0 where the row equals `classes[i]`, 0.0 elsewhere.

    The last category is dropped to avoid the dummy-variable trap: a row
    of all zeros stands for the dropped category.

    Raises AssertionError if `series` contains missing values.
    """
    # Stop if there are missing values
    assert np.sum(series.isnull()) == 0, 'series contains missing values'
    # Sorted unique categories plus the integer code of every row.
    # Done with NumPy instead of LabelEncoder + OneHotEncoder(sparse=False):
    # the `sparse` keyword was renamed to `sparse_output` and later removed,
    # so the old call fails on current scikit-learn releases.
    classes, codes = np.unique(np.asarray(series), return_inverse=True)
    # Row `c` of the identity matrix is the one-hot vector for code `c`.
    feat_oh = np.eye(len(classes))[codes]
    # Avoid dummy trap
    return classes[:-1], feat_oh[:, :-1]

In [6]:
def show_pearson(df):
    """
    Draw a heatmap of the correlation matrix of `df` and return the matrix.

    Only numeric and boolean columns are correlated. They are selected
    explicitly because recent pandas releases no longer drop non-numeric
    columns inside `DataFrame.corr()` and raise on string columns instead.
    """
    numeric_cols = df.select_dtypes(include=['number', 'bool'])
    corr_mat = numeric_cols.corr()
    sns.heatmap(corr_mat)
    return corr_mat

### Generate scores for various models
- Score is $F_1$

In [7]:
def compute_score(classifier, X_trn, y_trn, X_tst, y_tst):
    """
    Fit `classifier` on the training data and score it on both sets.

    `fit` is called on the object passed in (no copy is made).

    Returns a pair `(train_scores, test_scores)`; each element is the
    result of `precision_recall_fscore_support(..., average='binary')`
    for the corresponding set.
    """
    fitted = classifier.fit(X_trn, y_trn)
    # Predict on both sets first, then score them in the same order
    predictions = [fitted.predict(features) for features in (X_trn, X_tst)]
    trn_scores, tst_scores = (
        precision_recall_fscore_support(truth, predicted, average='binary')
        for truth, predicted in zip((y_trn, y_tst), predictions)
    )
    return trn_scores, tst_scores

### Clean data and utility loop

In [8]:
def process_df(df):
    """Return a processed, fully numeric version of DataFrame `df`.

    The result contains, in this order:
    - the float columns of `df`, median-imputed and standard-scaled
      (see `extract_numericals`);
    - one-hot columns for `Sex` and `Embarked` (last category dropped,
      missing `Embarked` values filled with the most frequent value);
    - `Cabin`: 1 where the cabin is missing, 0 otherwise;
    - the target `Survived`;
    - the integer columns `SibSp`, `Pclass` and `Parch`, copied as they are.

    `df` must have a monotonically increasing index (reset the index after
    shuffling); otherwise an AssertionError is raised.

    NOTE(review): scaling statistics, medians and the `Embarked` mode are all
    computed from `df` itself, so processing a train and a test set
    separately transforms them with different statistics.
    """
    # If shuffled, reset index.
    # `Index.is_monotonic` was removed in pandas 2.0;
    # `is_monotonic_increasing` is the equivalent, long-standing property.
    assert df.index.is_monotonic_increasing

    # Extract floating point quantities and fill missing values
    df_end = extract_numericals(df)

    def _add_one_hot(series):
        # Append one 0/1 integer column per encoded category of `series`.
        classes, encoded = one_hot_encoder(series)
        for ix, class_ in enumerate(classes):
            df_end[class_] = encoded[:, ix].astype('int64')

    # OneHotEncoding of `Sex`
    _add_one_hot(df['Sex'])

    # OneHotEncoding of `Embarked` (missing values -> most frequent value)
    _add_one_hot(df['Embarked'].fillna(value=df['Embarked'].mode()[0]))

    # All remaining columns are copied positionally (`.values`), so the rows
    # line up with `df_end` whatever index labels either frame carries.

    # Encode `Cabin` as binary category
    df_end['Cabin'] = df['Cabin'].isnull().astype('int64').values

    # Add target
    df_end['Survived'] = df['Survived'].values

    # Add integer quantities
    for column in ('SibSp', 'Pclass', 'Parch'):
        df_end[column] = df[column].values

    return df_end

In [9]:
def generate_score(df, classifier):
    """
    Infinite generator of scores for `classifier` on random splits of `df`.

    Each iteration draws a fresh shuffled 80% / 20% train/test split,
    runs `process_df` on each part separately, then yields the result of
    `compute_score` for that split.
    """
    def _design_and_target(part):
        # Re-index from 0, process, then separate features from the target
        processed = process_df(part.reset_index(drop=True))
        features = processed.drop(columns=['Survived']).values
        target = processed['Survived'].values
        return features, target

    while True:
        # 20% of the rows are held out for testing
        part_trn, part_tst = train_test_split(df, test_size = .2, shuffle=True)

        X_trn, y_trn = _design_and_target(part_trn)
        X_tst, y_tst = _design_and_target(part_tst)

        yield compute_score(classifier, X_trn, y_trn, X_tst, y_tst)

In [10]:
def evaluate_strategy(df, classifier, times):
    """Score `classifier` on `times` random train/test splits of `df`.

    The splits are drawn by `generate_score`, which reshuffles `df` for
    every sample.

    Returns a DataFrame with one row per split and the columns
    `precision_trn`, `recall_trn`, `fscore_trn`, `precision_tst`,
    `recall_tst`, `fscore_tst`.
    """
    # Create the generator once and pull `times` samples from it, rather
    # than building a new generator for every sample.
    score_stream = generate_score(df, classifier)
    res = [next(score_stream) for _ in range(times)]

    columns = {
        'precision_trn': [],
        'recall_trn': [],
        'fscore_trn': [],
        'precision_tst': [],
        'recall_tst': [],
        'fscore_tst': []
    }

    for res_trn, res_tst in res:
        # Each result is (precision, recall, fscore, support); support is unused
        for suffix, (precision, recall, fscore, _) in (('trn', res_trn), ('tst', res_tst)):
            columns['precision_' + suffix].append(precision)
            columns['recall_' + suffix].append(recall)
            columns['fscore_' + suffix].append(fscore)

    return pd.DataFrame(columns)

### Start!

In [11]:
# Load the raw data set from the working directory.
# Presumably the Titanic training file (TODO confirm); `process_df` reads the
# columns Sex, Embarked, Cabin, Survived, SibSp, Pclass and Parch from it.
df = pd.read_csv('train.csv')

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB

In [13]:
# Candidate models to compare; each is created with its default parameters.
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    MLPClassifier(),
    GaussianProcessClassifier(),
    AdaBoostClassifier(),
    GaussianNB()
]

In [16]:
import re

# Mean test-set F1 score per classifier, keyed by the classifier's class name
cf_f1_dict = {}

for classifier in classifiers:
    # Take the label from the class itself. Parsing `str(classifier)` with a
    # greedy regex returns the wrong text as soon as the repr contains a
    # nested "(" (e.g. an estimator passed as a parameter).
    classifier_name = type(classifier).__name__
    print(classifier_name)
    # 50 random train/test splits per classifier
    scores = evaluate_strategy(df, classifier, 50)
    summary = scores.describe()
    display(summary.loc[['mean', 'std'], ['fscore_trn', 'fscore_tst']])
    cf_f1_dict[classifier_name] = summary.loc['mean', 'fscore_tst']

DecisionTreeClassifier


Unnamed: 0,fscore_trn,fscore_tst
mean,0.980981,0.651422
std,0.004062,0.0466


RandomForestClassifier


Unnamed: 0,fscore_trn,fscore_tst
mean,0.961302,0.694817
std,0.006299,0.041422


LogisticRegression


Unnamed: 0,fscore_trn,fscore_tst
mean,0.738119,0.720366
std,0.010706,0.042421


SVC


Unnamed: 0,fscore_trn,fscore_tst
mean,0.767538,0.754315
std,0.01016,0.034582


KNeighborsClassifier


Unnamed: 0,fscore_trn,fscore_tst
mean,0.801199,0.720061
std,0.011106,0.033881


MLPClassifier




Unnamed: 0,fscore_trn,fscore_tst
mean,0.789461,0.73715
std,0.008642,0.037632


GaussianProcessClassifier


Unnamed: 0,fscore_trn,fscore_tst
mean,0.804557,0.734314
std,0.009668,0.033507


AdaBoostClassifier


Unnamed: 0,fscore_trn,fscore_tst
mean,0.787661,0.702726
std,0.011492,0.048798


GaussianNB


Unnamed: 0,fscore_trn,fscore_tst
mean,0.705216,0.695554
std,0.022022,0.046827


In [27]:
# Rank the classifiers by mean test-set F1, ascending (highest score last).
pd.Series(cf_f1_dict).sort_values()

DecisionTreeClassifier       0.651422
RandomForestClassifier       0.694817
GaussianNB                   0.695554
AdaBoostClassifier           0.702726
KNeighborsClassifier         0.720061
LogisticRegression           0.720366
GaussianProcessClassifier    0.734314
MLPClassifier                0.737150
SVC                          0.754315
dtype: float64