In [35]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell

In [36]:
# Echo the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
# Draw all matplotlib figures with the built-in dark theme
plt.style.use('dark_background')

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [38]:
def extract_numericals(df):
    """Select the ``float64`` columns of `df` and impute their gaps.

    Every missing entry is replaced by the median of its own column.
    A new frame is returned; `df` itself is not modified.
    """
    floats = df.select_dtypes(include='float64')
    column_medians = floats.median()
    return floats.fillna(value=column_medians)

In [39]:
def one_hot_encode(series):
    """Return a 0/1 indicator column for one category of `series`.

    The distinct values of `series` are sorted and the result marks, for each
    element, whether it equals the second-to-last of them. This is exactly the
    column the former ``LabelEncoder`` + ``OneHotEncoder`` pipeline kept via
    ``[:, -2]``, computed directly with NumPy so that it no longer depends on
    the ``OneHotEncoder(sparse=...)`` keyword, which differs between
    scikit-learn versions.

    For a two-category input such as ``['female', 'male']`` this is the full
    one-hot encoding with the redundant column dropped (dummy-variable trap):
    the first sorted category maps to 1.0 and the second to 0.0. With more than
    two categories only a single indicator is returned, so information is
    lost; the function is intended for binary features.

    Parameters
    ----------
    series : array-like
        One-dimensional categorical values. Assumed to contain no missing
        values (mixed str/NaN input cannot be sorted).

    Returns
    -------
    numpy.ndarray
        1-D ``float64`` array with the same length as `series`.

    Raises
    ------
    ValueError
        If `series` has fewer than two distinct values, in which case there is
        no second-to-last category to encode.
    """
    values = np.asarray(series)
    # Sorted distinct categories, the same ordering LabelEncoder assigns
    classes = np.unique(values)
    if classes.size < 2:
        raise ValueError(
            'one_hot_encode needs at least two distinct categories, got %d'
            % classes.size
        )
    # Indicator of the second-to-last sorted category
    return (values == classes[-2]).astype(np.float64)

In [40]:
def show_pearson(df):
    """Draw the correlation matrix of `df` as a heatmap and return it.

    The matrix is whatever ``DataFrame.corr()`` yields with its default
    arguments; it is handed to seaborn for plotting and then returned.
    """
    pearson = df.corr()
    sns.heatmap(pearson)
    return pearson

In [41]:
def score_logistic_regression(df, df_tst):
    """Train a logistic regression on `df` and score it on both frames.

    Every column except ``Survived`` is used as a feature and ``Survived`` is
    the target. The model is fitted on `df` only.

    Returns a pair ``(train_scores, test_scores)``; each element is the tuple
    produced by ``precision_recall_fscore_support(..., average='binary')``.
    """
    target = 'Survived'

    def split_xy(frame):
        # Feature matrix and label vector as plain arrays
        return frame.drop(columns=[target]).values, frame[target].values

    X_trn, y_trn = split_xy(df)
    X_tst, y_tst = split_xy(df_tst)

    # Fit on the training frame only
    model = LogisticRegression().fit(X_trn, y_trn)

    trn_scores = precision_recall_fscore_support(
        y_trn, model.predict(X_trn), average='binary')
    tst_scores = precision_recall_fscore_support(
        y_tst, model.predict(X_tst), average='binary')
    return trn_scores, tst_scores

In [42]:
def score_gender_strategy(df, df_tst):
    """Score the baseline that predicts ``Survived`` directly from ``Sex``.

    The ``Sex`` column is used verbatim as the prediction, so it must already
    be encoded as 0/1 with 1 marking the group predicted to survive (female,
    per the "male = death, female = alive" strategy).

    Returns a pair ``(train_scores, test_scores)``; each element is the tuple
    produced by ``precision_recall_fscore_support(..., average='binary')``.
    """
    def score(frame):
        # Target first, then the Sex column acting as the prediction
        return precision_recall_fscore_support(
            frame['Survived'].values, frame['Sex'].values, average='binary')

    return score(df), score(df_tst)

In [43]:
def process_df(df):
    """Return the model-ready frame built from a raw passenger frame.

    The result contains:

    * every ``float64`` column of `df`, with missing values imputed by
      ``extract_numericals``;
    * ``Sex`` as an ``int64`` 0/1 indicator obtained from ``one_hot_encode``;
    * ``Cabin`` as an ``int64`` flag: 1 when a cabin is recorded, 0 when the
      value is missing;
    * the ``Survived`` target, copied unchanged.

    `df` must provide the ``Sex``, ``Cabin`` and ``Survived`` columns. It is
    not modified; a new frame is returned.
    """
    # Float features with missing values filled in
    df_end = extract_numericals(df)
    # Binary indicator for `Sex`, stored as integers
    df_end['Sex'] = one_hot_encode(df['Sex'])
    df_end['Sex'] = df_end['Sex'].astype('int64')
    # Presence flag for `Cabin`. Testing for missing values directly is used
    # instead of ``str.replace(r'.*', '1')``: as a regex, '.*' also matches the
    # empty string at the end of the value on Python >= 3.7 (giving '11'), and
    # whether the pattern is treated as a regex at all depends on the pandas
    # version, so that approach does not reliably produce 0/1.
    df_end['Cabin'] = df['Cabin'].notna().astype('int64')
    # Add target
    df_end['Survived'] = df['Survived']
    return df_end

In [44]:
def generate_score(df, score_strategy):
    """Endless generator of `score_strategy` results on fresh splits of `df`.

    Each iteration shuffles `df` into an 80% training part and a 20% test
    part, runs both through ``process_df`` and yields
    ``score_strategy(processed_train, processed_test)``. The generator never
    terminates on its own; the consumer decides how many values to draw.
    """
    while True:
        # New shuffled 80/20 split on every iteration
        parts = train_test_split(df, test_size=0.2, shuffle=True)
        # Training part is processed first, then the test part
        trn_ready, tst_ready = (process_df(part) for part in parts)
        yield score_strategy(trn_ready, tst_ready)

In [45]:
def evaluate_strategy(df, score_strategy, times):
    """Evaluate `score_strategy` on `times` random splits of `df`.

    Scores are drawn from a single ``generate_score`` generator, which
    reshuffles `df` for every value it yields. Each drawn value is expected to
    be a pair ``(train_scores, test_scores)`` of 4-tuples
    ``(precision, recall, fscore, support)``; the support entry is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data passed on to ``generate_score``.
    score_strategy : callable
        Scoring function passed on to ``generate_score``.
    times : int
        Number of splits to evaluate.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``precision_trn``, ``recall_trn``,
        ``fscore_trn``, ``precision_tst``, ``recall_tst`` and ``fscore_tst``.
    """
    names = (
        'precision_trn', 'recall_trn', 'fscore_trn',
        'precision_tst', 'recall_tst', 'fscore_tst',
    )
    columns = {name: [] for name in names}

    # One generator serves all draws instead of building a new one per sample
    scorer = generate_score(df, score_strategy)
    for _ in range(times):
        (p_trn, r_trn, f_trn, _sup_trn), (p_tst, r_tst, f_tst, _sup_tst) = next(scorer)
        row = (p_trn, r_trn, f_trn, p_tst, r_tst, f_tst)
        for name, value in zip(names, row):
            columns[name].append(value)

    return pd.DataFrame(columns)

In [46]:
# Load the raw data set; 'train.csv' is resolved relative to the working directory
df = pd.read_csv('train.csv')

In [50]:
# Score each strategy on 500 independent random 80/20 splits of `df`.
# NOTE(review): no random seed is set anywhere in this notebook, so the splits
# (and every number derived from them) change on each run -- consider seeding.
lr_scores = evaluate_strategy(df, score_logistic_regression, 500)
gen_scores = evaluate_strategy(df, score_gender_strategy, 500)

In [52]:
# Summary statistics over the 500 runs: logistic regression first, then the
# gender baseline. Both tables are shown only because ast_node_interactivity
# was set to 'all' earlier; by default only the last expression is rendered.
lr_scores.describe()
gen_scores.describe()

Unnamed: 0,fscore_trn,fscore_tst,precision_trn,precision_tst,recall_trn,recall_tst
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.720541,0.715801,0.729997,0.727707,0.711448,0.707255
std,0.010638,0.038217,0.011869,0.047408,0.013187,0.053433
min,0.688588,0.587302,0.697417,0.6,0.670455,0.536232
25%,0.713494,0.692913,0.721756,0.695468,0.702602,0.671588
50%,0.720887,0.719087,0.730038,0.727834,0.711679,0.710145
75%,0.727928,0.742857,0.737672,0.760721,0.719858,0.739859
max,0.750929,0.806202,0.768061,0.875,0.748227,0.862069


Unnamed: 0,fscore_trn,fscore_tst,precision_trn,precision_tst,recall_trn,recall_tst
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.709158,0.713663,0.740818,0.746856,0.680241,0.685644
std,0.010423,0.040968,0.012595,0.049724,0.01302,0.05159
min,0.678501,0.590164,0.708,0.6,0.638298,0.53125
25%,0.702703,0.6875,0.732283,0.714286,0.672646,0.652778
50%,0.709302,0.714286,0.740741,0.746479,0.680147,0.685714
75%,0.715909,0.739834,0.74902,0.78125,0.688406,0.716267
max,0.737828,0.818792,0.786611,0.875,0.719557,0.883333
