In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
from sklearn.metrics import roc_auc_score,accuracy_score

In [3]:
from scipy.special import cbrt
import re

In [4]:
# Move into the pickles directory so the bare filenames opened below resolve there.
# NOTE(review): the path is relative to the current working directory, so re-running
# this cell (or running cells out of order) moves somewhere else.
os.chdir('../pickles')

In [5]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(path,'rb') as handle:
        return pickle.load(handle)


# Preprocessing artifacts, read from the current working directory.
numerical = _load_pickle('numerical_final.pickle')          # names of the numerical columns
categorical = _load_pickle('categorical.pickle')            # names of the categorical columns
conversion_dict = _load_pickle('conversion_dict.pickle')    # per-column category encoding used for the dummies
scale = _load_pickle('scale.pickle')                        # fitted object whose .transform feeds the PCA
imputation_cols = _load_pickle('imputation_cols.pickle')    # feature columns passed to the imputation model
dummy_cols = _load_pickle('dummies_final.pickle')           # dummy column names kept in the final frame
pca = _load_pickle('pca.pickle')                            # fitted object whose .transform builds the PCA inputs

In [7]:
# Open in binary mode: pickle.load needs a bytes stream, and the default text mode
# fails on Python 3. The with-block also closes the file once it has been read.
with open('model_columns.pickle','rb') as handle:
    model_cols = pickle.load(handle)

In [8]:
# Move into the data directory; the CSV files read below are opened by bare filename.
os.chdir('../output_data')

In [9]:
numerical

['Fare', 'SibSp', 'Parch']

In [10]:
categorical

['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']

In [11]:
# Raw validation set; the first row of the file is used as the header.
df_val = pd.read_csv('validation.csv',header=0)

In [12]:
# Reduce every cabin value to its first alphabetic character. Missing cabins become
# the text 'nan' under astype(str) and therefore collapse to 'n'.
cabin_text = df_val['Cabin'].astype(str)
df_val['Cabin'] = cabin_text.apply(lambda text: [ch for ch in text if ch.isalpha()][0])

In [13]:
# Keep only the alphabetic characters of each ticket; purely numeric tickets become ''.
df_val['Ticket'] = df_val['Ticket'].apply(lambda ticket: ''.join(filter(str.isalpha, ticket)))

In [14]:
def get_title(name):
    """Return the title embedded in a passenger name, or "" when none is found.

    The title is the first run of ASCII letters that follows a space and is
    immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence
    # (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

In [15]:
# Turn the class number into a text label such as '3 class'.
df_val['Pclass'] = df_val['Pclass'].astype(str) + ' class'

In [16]:
# Derive the Title feature from each passenger's name.
df_val['Title'] = df_val['Name'].map(get_title)

In [17]:
# Name is no longer needed once Title has been extracted.
df_val = df_val.drop(columns=['Name'])

In [18]:
# Keep the model inputs plus the target and the passenger key, in that order.
selected_columns = numerical + categorical + ['Survived', 'PassengerId']
df_val = df_val[selected_columns]

In [19]:
# Index the validation rows by passenger id.
df_val = df_val.set_index('PassengerId')

In [20]:
# Training set, used further down to build x_train / y_train.
df_train = pd.read_csv('train_v05.csv',header=0)

In [21]:
# Index the training rows by passenger id.
df_train = df_train.set_index('PassengerId')

In [22]:
# Training set for the PCA-based models, used further down to build x_train_pca / y_train_pca.
df_pca = pd.read_csv('train_pca.csv',header=0)

In [23]:
# Index the PCA training rows by passenger id.
df_pca = df_pca.set_index('PassengerId')

In [24]:
# Missing values in every categorical column are replaced by the label 'Unknown'.
values = dict.fromkeys(categorical, 'Unknown')
df_val = df_val.fillna(value=values)

Unnamed: 0_level_0,Fare,SibSp,Parch,Pclass,Sex,Ticket,Cabin,Embarked,Title,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
710,15.2458,1,1,3 class,male,,n,C,Master,1
440,10.5000,0,0,2 class,male,CA,n,S,Mr,0
841,7.9250,0,0,3 class,male,SOTONO,n,S,Mr,0
721,33.0000,0,1,2 class,female,,n,S,Miss,1
40,11.2417,1,0,3 class,female,,n,C,Miss,1
291,78.8500,0,0,1 class,female,,n,S,Miss,1
301,7.7500,0,0,3 class,female,,n,Q,Miss,1
334,18.0000,2,0,3 class,male,,n,S,Mr,0
209,7.7500,0,0,3 class,female,,n,Q,Miss,1
137,26.2833,0,2,1 class,female,,D,S,Miss,1


In [25]:
# Replace each categorical column by numeric dummy columns built from `conversion_dict`,
# then drop the original column.
# NOTE(review): conversion_dict[col] appears to have a different layout per branch: a
# sequence of category labels when it has <=2 entries, and a sequence of (label, code)
# pairs otherwise, where `code` is an int whose decimal digits are read as bits. TODO confirm.
for col in categorical:
    if len(conversion_dict[col])<=2:
        # One indicator column: 1 where the value equals the first listed category, else 0.
        category = conversion_dict[col][0]
        df_val[col+'_dum_'+str(category)] = 0
        df_val.loc[df_val[col]==category,col+'_dum_'+str(category)]=1
    else:
        total_categories = len(conversion_dict[col])
        # Number of dummy columns = number of binary digits needed to write total_categories.
        dummies = len(str(int(bin(total_categories)[2:],10)))
        # Working copy of each category's code; its digits are consumed one per dummy column.
        bin_conv=[]
        for i in range(total_categories):
            bin_conv.append(conversion_dict[col][i][1])
        for j in range(dummies):
            # Rows whose value matches no listed category keep this default of 0.
            df_val[col+'_dum_'+str(j)]=0
            for i,cat in enumerate([conv[0] for conv in conversion_dict[col]]):
                # Column j receives the current last decimal digit of the category's code ...
                df_val.loc[df_val[col]==cat,col+'_dum_'+str(j)]=bin_conv[i]%10
                # ... and the code is shifted right by one digit, ready for column j+1.
                bin_conv[i]=bin_conv[i]//10
    df_val.drop(col,axis=1,inplace=True)
    print(col+' done')   

Pclass done
Sex done
Ticket done
Cabin done
Embarked done
Title done


In [26]:
# Named transforms, looked up by the label stored in the EDD table's 'Status' column.
transform_dict = {
    'log': np.log,
    'sqr': lambda value: value**2,
    'sqrt': np.sqrt,
    'exp': np.exp,
    'cube': lambda value: value**3,
    'cuberoot': cbrt,
}

In [27]:
# Move into the Statistics directory, where the EDD table read below lives.
os.chdir('../Statistics')

In [28]:
# EDD table; its 'Var' and 'Status' columns select the transform applied to each numerical column.
edd = pd.read_csv('edd_v05.csv',header=0)

In [29]:
def transform(x,function):
    """Return `function(x)`; a `None` input yields NaN without calling `function`."""
    if x is None:
        return np.nan
    return function(x)

In [30]:
# Apply to each numerical column the transform recorded for it in the EDD table, if any.
for col in numerical:
    # Read the label from the non-null rows only. The previous lookup used the first
    # row for the variable regardless of nulls, so a leading missing Status would have
    # been used as a dictionary key.
    statuses = edd.loc[(edd['Var']==col)&(edd['Status'].notnull()),'Status']
    if statuses.empty:
        continue
    function = transform_dict[statuses.values[0]]
    df_val[col] = df_val[col].apply(lambda x: transform(x,function))

In [31]:
# Move into the directory holding the '<column>_impute.pickle' models loaded below.
os.chdir('../Imputation_models')

In [32]:
# Fill the missing values of each listed column with predictions from its saved model.
for col in ['Fare']:
    # The with-block closes the pickle file instead of leaking the handle.
    # NOTE: pickle.load can execute arbitrary code; only load trusted model files.
    with open(col+'_impute.pickle','rb') as handle:
        model = pickle.load(handle)
    missing = df_val[col].isnull()
    if missing.any():
        # Predict only for the rows whose value is missing, from the imputation features.
        df_val.loc[missing,col] = np.array(model.predict(np.array(df_val.loc[missing,imputation_cols])))
    del model
    print(col+' imputed')

Fare imputed


In [33]:
# Missing SibSp / Parch values are treated as 0.
values = {count_col: 0 for count_col in ['SibSp','Parch']}
df_val = df_val.fillna(value=values)

Unnamed: 0_level_0,Fare,SibSp,Parch,Survived,Pclass_dum_3 class,Sex_dum_male,Ticket_dum_0,Ticket_dum_1,Ticket_dum_2,Ticket_dum_3,...,Cabin_dum_1,Cabin_dum_2,Cabin_dum_3,Embarked_dum_0,Embarked_dum_1,Embarked_dum_2,Title_dum_0,Title_dum_1,Title_dum_2,Title_dum_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
710,15.2458,1,1,1,1,1,0,0,0,0,...,0,0,0,1,0,0,1,1,0,0
440,10.5000,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
841,7.9250,0,0,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
721,33.0000,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
40,11.2417,1,0,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
291,78.8500,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
301,7.7500,0,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
334,18.0000,2,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209,7.7500,0,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
137,26.2833,0,2,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [34]:
# Final validation frame: numerical columns, the retained dummies, then the target.
final_columns = numerical + dummy_cols + ['Survived']
df_val = df_val[final_columns]

# Input Arrays

In [35]:
# Validation features (restricted to the model's columns) and labels as arrays.
x_val = np.array(df_val[model_cols])
y_val = np.array(df_val['Survived'])

In [36]:
# Inputs for the PCA-based models: scale first, then project; the labels are unchanged.
x_val_pca = pca.transform(scale.transform(x_val))
y_val_pca = y_val

In [37]:
# Training features (restricted to the model's columns) and labels as arrays.
x_train = np.array(df_train[model_cols])
y_train = np.array(df_train['Survived'])

In [38]:
# PCA training inputs: every column except the target is a feature.
x_train_pca = np.array(df_pca.drop(columns=['Survived']))
y_train_pca = np.array(df_pca['Survived'])

# Inferential Models

In [39]:
# Move into the directory holding the inferential models' pickle files.
os.chdir('../Models/Inferential_models')

In [40]:
# Fitted models trained on the original feature columns, keyed by the name used in the results.
# Each file is opened in a with-block so its handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
inferential_models = {}
for model_key, filename in [('LR_inferential','LR.pickle'),
                            ('RF_inferential','RF.pickle'),
                            ('GBM_inferential','GBM.pickle')]:
    with open(filename,'rb') as handle:
        inferential_models[model_key] = pickle.load(handle)

# Predictive Models

In [41]:
# Move into the sibling directory holding the PCA-based models' pickle files.
os.chdir('../pca_models')

In [42]:
# Fitted models trained on the PCA inputs, keyed by the name used in the results.
# Each file is opened in a with-block so its handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
predictive_models = {}
for model_key, filename in [('LR_predictive','LR.pickle'),
                            ('RF_predictive','RF.pickle'),
                            ('GBM_predictive','GBM.pickle')]:
    with open(filename,'rb') as handle:
        predictive_models[model_key] = pickle.load(handle)

# Valuation

In [43]:
def model_valuation(model_name,x_train,y_train,x_val,y_val,inferential=True):
    """Score one fitted model on the training and validation data.

    Scores are taken from the second column of the model's `predict_proba`.
    The classification cutoff is chosen on the training data only: thresholds
    0.00, 0.01, ..., 1.00 are tried and the lowest one reaching the highest
    training accuracy is kept. The same cutoff is then applied to validation.

    Parameters
    ----------
    model_name : key into `inferential_models` when `inferential` is true,
        otherwise into `predictive_models`.
    x_train, y_train : training features and 0/1 labels.
    x_val, y_val : validation features and 0/1 labels.
    inferential : selects which model dictionary `model_name` is looked up in.

    Returns
    -------
    list
        [model_name, roc_train, roc_val, cutoff, acc_train, acc_val]

    Raises
    ------
    KeyError
        If `model_name` is not present in the selected dictionary.
    """
    registry = inferential_models if inferential else predictive_models
    model = registry[model_name]
    prob_train = np.asarray(model.predict_proba(x_train)[:,1])
    prob_val = np.asarray(model.predict_proba(x_val)[:,1])

    def labels_at(prob,cut):
        # Class 1 where the score reaches the cutoff, class 0 elsewhere.
        return (prob>=cut).astype(int)

    # Bind a default so `cutoff` exists even if no threshold scores above zero
    # accuracy (that case previously raised UnboundLocalError).
    cutoff = 0.0
    max_acc = 0
    for i in range(101):
        cut = .01*i
        acc = accuracy_score(y_train,labels_at(prob_train,cut))
        # Strict '>' keeps the lowest cutoff among ties.
        if acc>max_acc:
            max_acc = acc
            cutoff = cut

    acc_train = accuracy_score(y_train,labels_at(prob_train,cutoff))
    acc_val = accuracy_score(y_val,labels_at(prob_val,cutoff))
    roc_train = roc_auc_score(y_train,prob_train)
    roc_val = roc_auc_score(y_val,prob_val)
    return [model_name,roc_train,roc_val,cutoff,acc_train,acc_val]

In [44]:
# Results table; one row of metrics is added per evaluated model or ensemble.
df_eval = pd.DataFrame()

In [45]:
# Evaluate every inferential model on the original feature arrays.
# DataFrame.append was removed in pandas 2.0, so collect the rows and concatenate once.
inferential_rows = [model_valuation(key,x_train,y_train,x_val,y_val)
                    for key in inferential_models.keys()]
df_eval = pd.concat([df_eval,pd.DataFrame(inferential_rows)])

In [46]:
# Evaluate every predictive model on the PCA inputs.
# DataFrame.append was removed in pandas 2.0, so collect the rows and concatenate once.
predictive_rows = [model_valuation(key,x_train_pca,y_train_pca,x_val_pca,y_val_pca,False)
                   for key in predictive_models.keys()]
df_eval = pd.concat([df_eval,pd.DataFrame(predictive_rows)])

In [47]:
df_eval

Unnamed: 0,0,1,2,3,4,5
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
0,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
0,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
0,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
0,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
0,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642


In [48]:
# Name the columns in the same order as the list returned by model_valuation.
df_eval.columns=['model_name','roc_train','roc_val','cutoff','acc_train','acc_val']

In [49]:
df_eval

Unnamed: 0,model_name,roc_train,roc_val,cutoff,acc_train,acc_val
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
0,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
0,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
0,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
0,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
0,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642


In [50]:
def ensemble_eval(model_list,x_train,y_train,x_val,y_val,inferential = True):
    """Score a weighted-average ensemble of already evaluated models.

    Each member's score (second column of `predict_proba`) is weighted by that
    member's `roc_train` value stored in the global `df_eval`, and the weighted
    sum is divided by the total weight. The cutoff is chosen on the training
    data only: thresholds 0.00, 0.01, ..., 1.00 are tried and the lowest one
    reaching the highest training accuracy is kept.

    Parameters
    ----------
    model_list : names of the member models; each must be a key of
        `inferential_models` (or `predictive_models` when `inferential` is
        false) and must already have a row in `df_eval`.
    x_train, y_train : training features and 0/1 labels.
    x_val, y_val : validation features and 0/1 labels.
    inferential : selects which model dictionary the names are looked up in.

    Returns
    -------
    pandas.DataFrame
        One row with columns model_name, roc_train, roc_val, cutoff, acc_train,
        acc_val; model_name is 'ensemble(<names joined by commas>)'.

    Raises
    ------
    ValueError
        If `model_list` is empty.
    KeyError
        If a name is missing from the selected model dictionary.
    """
    if len(model_list)==0:
        # Fail with a clear message instead of an opaque 0/0 further down.
        raise ValueError('model_list must name at least one model')

    registry = inferential_models if inferential else predictive_models
    models = [(model_name,registry[model_name]) for model_name in model_list]

    # Look each member's weight up once; it is reused for both data sets.
    weights = [df_eval.loc[df_eval['model_name']==name,'roc_train'].values[0] for name,_ in models]
    total_weight = sum(weights)

    def blend(x):
        # Weighted average of the members' scores for the rows of x.
        return sum([fitted.predict_proba(x)[:,1]*weight
                    for (_,fitted),weight in zip(models,weights)])/total_weight

    prob_train = blend(x_train)
    prob_val = blend(x_val)

    def labels_at(prob,cut):
        # Class 1 where the score reaches the cutoff, class 0 elsewhere.
        return (prob>=cut).astype(int)

    # Bind a default so `cutoff` exists even if no threshold scores above zero
    # accuracy (that case previously raised UnboundLocalError).
    cutoff = 0.0
    max_acc=0
    for i in range(101):
        cut = .01*i
        acc = accuracy_score(y_train,labels_at(prob_train,cut))
        # Strict '>' keeps the lowest cutoff among ties.
        if acc>max_acc:
            max_acc = acc
            cutoff = cut

    acc_train = accuracy_score(y_train,labels_at(prob_train,cutoff))
    acc_val = accuracy_score(y_val,labels_at(prob_val,cutoff))
    roc_train = roc_auc_score(y_train,prob_train)
    roc_val = roc_auc_score(y_val,prob_val)
    return pd.DataFrame([['ensemble('+','.join([model[0] for model in models])+')',roc_train,roc_val,cutoff,acc_train,acc_val]],columns=['model_name','roc_train','roc_val','cutoff','acc_train','acc_val'])

In [51]:
# Evaluate every unordered pair of inferential models as a two-member ensemble.
# DataFrame.append was removed in pandas 2.0, so collect the frames and concatenate once.
models = list(inferential_models.keys())
pair_frames = [ensemble_eval([models[i],models[j]],x_train,y_train,x_val,y_val)
               for i in range(len(models))
               for j in range(i+1,len(models))]
df_eval = pd.concat([df_eval]+pair_frames)

In [52]:
df_eval

Unnamed: 0,model_name,roc_train,roc_val,cutoff,acc_train,acc_val
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
0,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
0,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
0,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
0,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
0,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642
0,"ensemble(RF_inferential,GBM_inferential)",0.965451,0.871557,0.46,0.938202,0.815642
0,"ensemble(RF_inferential,LR_inferential)",0.85499,0.86036,0.4,0.807584,0.787709
0,"ensemble(GBM_inferential,LR_inferential)",0.97557,0.879794,0.42,0.938202,0.810056


In [53]:
# Ensemble of all models currently listed in `models`.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_eval = pd.concat([df_eval,ensemble_eval(models,x_train,y_train,x_val,y_val)])

In [54]:
df_eval

Unnamed: 0,model_name,roc_train,roc_val,cutoff,acc_train,acc_val
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
0,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
0,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
0,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
0,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
0,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642
0,"ensemble(RF_inferential,GBM_inferential)",0.965451,0.871557,0.46,0.938202,0.815642
0,"ensemble(RF_inferential,LR_inferential)",0.85499,0.86036,0.4,0.807584,0.787709
0,"ensemble(GBM_inferential,LR_inferential)",0.97557,0.879794,0.42,0.938202,0.810056
0,"ensemble(RF_inferential,GBM_inferential,LR_inf...",0.95809,0.883655,0.41,0.933989,0.821229


In [55]:
# Evaluate every unordered pair of predictive models as a two-member ensemble on the PCA inputs.
# DataFrame.append was removed in pandas 2.0, so collect the frames and concatenate once.
models = list(predictive_models.keys())
pair_frames = [ensemble_eval([models[i],models[j]],x_train_pca,y_train_pca,x_val_pca,y_val_pca,False)
               for i in range(len(models))
               for j in range(i+1,len(models))]
df_eval = pd.concat([df_eval]+pair_frames)

In [56]:
df_eval

Unnamed: 0,model_name,roc_train,roc_val,cutoff,acc_train,acc_val
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
0,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
0,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
0,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
0,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
0,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642
0,"ensemble(RF_inferential,GBM_inferential)",0.965451,0.871557,0.46,0.938202,0.815642
0,"ensemble(RF_inferential,LR_inferential)",0.85499,0.86036,0.4,0.807584,0.787709
0,"ensemble(GBM_inferential,LR_inferential)",0.97557,0.879794,0.42,0.938202,0.810056
0,"ensemble(RF_inferential,GBM_inferential,LR_inf...",0.95809,0.883655,0.41,0.933989,0.821229


In [57]:
# Ensemble of all models currently listed in `models`, evaluated on the PCA inputs.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_eval = pd.concat([df_eval,ensemble_eval(models,x_train_pca,y_train_pca,x_val_pca,y_val_pca,False)])

In [58]:
df_eval

Unnamed: 0,model_name,roc_train,roc_val,cutoff,acc_train,acc_val
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
0,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
0,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
0,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
0,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
0,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642
0,"ensemble(RF_inferential,GBM_inferential)",0.965451,0.871557,0.46,0.938202,0.815642
0,"ensemble(RF_inferential,LR_inferential)",0.85499,0.86036,0.4,0.807584,0.787709
0,"ensemble(GBM_inferential,LR_inferential)",0.97557,0.879794,0.42,0.938202,0.810056
0,"ensemble(RF_inferential,GBM_inferential,LR_inf...",0.95809,0.883655,0.41,0.933989,0.821229


In [59]:
# Renumber the rows 0..n-1; the old index is kept as a column named 'index'.
df_eval = df_eval.reset_index()

In [60]:
# Discard the old index column left behind by reset_index.
df_eval = df_eval.drop(columns=['index'])

In [61]:
df_eval

Unnamed: 0,model_name,roc_train,roc_val,cutoff,acc_train,acc_val
0,RF_inferential,0.852889,0.847362,0.47,0.799157,0.787709
1,GBM_inferential,0.984276,0.869755,0.45,0.938202,0.810056
2,LR_inferential,0.847687,0.86435,0.37,0.801966,0.793296
3,RF_predictive,0.86231,0.827864,0.57,0.825843,0.798883
4,GBM_predictive,0.882866,0.866731,0.51,0.839888,0.793296
5,LR_predictive,0.859411,0.859202,0.45,0.83427,0.815642
6,"ensemble(RF_inferential,GBM_inferential)",0.965451,0.871557,0.46,0.938202,0.815642
7,"ensemble(RF_inferential,LR_inferential)",0.85499,0.86036,0.4,0.807584,0.787709
8,"ensemble(GBM_inferential,LR_inferential)",0.97557,0.879794,0.42,0.938202,0.810056
9,"ensemble(RF_inferential,GBM_inferential,LR_inf...",0.95809,0.883655,0.41,0.933989,0.821229


In [62]:
# Rank the models by validation ROC AUC, best first.
df_eval = df_eval.sort_values(by='roc_val',ascending=False)

In [63]:
# Move two levels up and into Statistics, where the evaluation table is written.
os.chdir('../../Statistics')

In [64]:
# Persist the ranked evaluation table; the row index is not written.
df_eval.to_csv('model_evaluation.csv',index=False)

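# Conclusion: ensemble(GBM_predictive,LR_predictive) comes out to be the best model

Note: the result tables displayed above contain no predictive-ensemble rows (the highest `roc_val` shown is the three-model inferential ensemble), so this ranking should be confirmed by re-running the notebook from a fresh kernel.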