In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
import os
print(os.listdir("../input"))

import warnings
warnings.filterwarnings('ignore')

## Brief Introduction
First, the 2 datasets are imported. The training one contains survival value while the testing one does not.  <br>
The aim here is to predict the survival of passengers in the test dataset using the features given.

In [None]:
# Read the training and test sets from csv
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

# Stack train on top of test for EDA and feature engineering.
# Columns that exist in only one of the two files are NaN for the other file's rows.
full = pd.concat([df_train, df_test], axis = 0, sort=True)

# Index by PassengerId while keeping it as a regular column as well
full = full.set_index('PassengerId', drop = False)

# The training rows are the first len(df_train) rows of the stacked frame
# (derived from the data instead of hard-coding the row count)
train = full.iloc[:len(df_train)]

# Display Data
display(full.head(3))
print(f"Dataset contains {full.shape[0]} records, with {full.shape[1]} variables.")
print(f"Variables:{list(full.columns)}")

## Missing Values
Missing values are found in Age, Cabin, Embarked and Fare (Survived is also missing, but only for the test rows, where it is the value to predict). <br>
As too many values are missing in Cabin, this feature may not be useful for predicting survival. <br>
Age can be an important factor and could be inferred from other features, e.g., Title, Parch and the family each passenger belonged to.

In [None]:
# Count the missing entries in every column
missing_per_column = full.isna().sum()
display(missing_per_column)

## Exploring the Data - Distributions

In [None]:
# Descriptive statistics for every column; include="all" also summarises
# the non-numeric columns instead of only the numeric ones
full.describe(include="all")

In [None]:
# EDA - Distributions
categorical_var = ['Pclass','Sex','SibSp','Parch','Embarked', 'Survived']
continuous_var = ['Age','Fare']

# 4x3 grid: rows 0-1 hold the six categorical count plots,
# rows 2 and 3 are each replaced below by one full-width histogram.
fig, axs = plt.subplots(4,3, figsize = (15,12))
for i,key in enumerate(categorical_var):
    sns.countplot(data = full, x = key, ax = axs[i//3,i%3], color='teal')

# Remove the empty placeholder axes of rows 2-3 before drawing over them.
# Current Matplotlib keeps overlapping axes, which would leave their frames
# and ticks visible underneath the histograms.
for placeholder in axs[2:].ravel():
    placeholder.remove()

# Plot Age (missing ages are skipped)
plt.subplot2grid((4,3),(2,0),rowspan=1,colspan=3);
plt.hist(full.Age.dropna(), bins=range(0,80,1), color='slategrey' );
plt.xlabel('Age');

# Plot Fare (missing fares are skipped)
plt.subplot2grid((4,3),(3,0),rowspan=1,colspan=3);
plt.hist(full.Fare.dropna(), bins=100, color='slategrey');
plt.xlabel('Fare');

# Mean of Survived ignores the NaN rows, i.e. rows without a label
print(f"survived: {full.Survived.mean()*100:.2f}%")

## EDA - Relationships between features and survival

In [None]:

# Plot all categorical features with Survival rate
# Overall survival rate of the labelled rows, drawn as a dashed reference line
baseline_survival = full.Survived.mean()

f, axs = plt.subplots(3,5, sharey=True, figsize=(18,9))
for i,key in enumerate(categorical_var[:-1]): # except feature Survived
    ax = axs[i%3, i//3]  # fill the two left-hand columns top to bottom
    sns.barplot(data = full, x= key, y='Survived', ax = ax, color='teal');
    ax.axhline(y=baseline_survival, color='k', linestyle='--')

# Columns 2-4 are reserved for the heatmap: drop their placeholder axes first,
# because current Matplotlib does not remove overlapping axes automatically.
for placeholder in axs[:, 2:].ravel():
    placeholder.remove()

# Plot Correlation
# Restrict to numeric columns explicitly: DataFrame.corr() raises on the
# string columns in pandas >= 2.0, while older versions silently dropped them.
corr = full.select_dtypes(include='number').corr()
plt.subplot2grid((3,5),(0,2),rowspan=3,colspan=3);
cmap =sns.diverging_palette( 220 , 10 , as_cmap = True )
sns.heatmap(corr, cmap = cmap,square=True, cbar_kws={ 'shrink' : .9 }, annot = True, annot_kws = { 'fontsize' : 12 });


Sex seems to have strong predictive power, which makes sense given the "Women and Children First" instruction for deciding who could board the lifeboats. <br>
Pclass and Fare also seem significant. Higher-class passengers lived and spent most of their time near the deck and were therefore closer to the lifeboats. <br>
It is surprising to find no correlation between Age and Survived. Their relationship may not be linear.

In [None]:
# Age histograms split by survival outcome, one panel per (Pclass, Sex) pair
age_bins = np.arange(0,80,4)
facet = sns.FacetGrid(full, row = 'Pclass', col = 'Sex', hue = 'Survived', aspect=2)
facet.map(plt.hist, 'Age', histtype='step', bins = age_bins)
facet.add_legend();

The "Women first" pattern is clearly visible. <br>
The child survival advantage seems to apply to those younger than 12. <br>
The survival rate is much higher for people in 1st and 2nd class. Children and women in these two classes have a much higher survival rate (100% in some age ranges) than those in 3rd class (around 50%).

In [None]:
# Cut Age into 10 equal-frequency bins (the column keeps its 'quartile' name)
full['Age_quartile'] = pd.qcut(full.Age, q=10)

# Survival rate per age bin, split by sex, against the overall rate (dashed line)
plt.figure(figsize = (10,5))
sns.barplot(x = 'Age_quartile', y = 'Survived', hue = 'Sex', data = full);
plt.axhline(y=0.3838, color='k', linestyle='--')
plt.xticks(rotation = 30);
plt.title('Across All Classes');



In [None]:
# 1 for passengers aged 14 or younger, 0 otherwise (a missing age compares False, so it gives 0)
full['Child'] = full['Age'].le(14).astype(int)

There is an age advantage for boys younger than 14. <br>
It may exist for fathers as well (note the trough for males aged 14-25, who were unlikely to have children).

## Feature Engineering
Another interesting relationship to look at is the one between Survival, Parch and SibSp. It is not difficult to imagine that people from the same family or group stayed together when in danger; if any of them survived, the other members of the group likely had a better chance of surviving too, and vice versa.

In [None]:
import re 

# function to parse surname of the passengers
def parse_surname(s):
    """Return the surname part of a passenger name.

    The name is expected in the form 'Surname, Title. Given names'; everything
    before the first comma is taken as the surname, with surrounding
    whitespace removed. Unlike a ``\\w+`` match, this keeps surnames that
    contain apostrophes, hyphens or spaces intact (e.g. "O'Brien",
    "Frolicher-Stehli", "Vander Planke").

    Raises:
        ValueError: if the name contains no comma.
    """
    surname, comma, _ = s.partition(',')
    if not comma:
        raise ValueError(f"cannot parse surname, no comma in name: {s!r}")
    return surname.strip()

### Identifying the Families by Surname
First, parse the surnames of the passengers. Those from the same family should share a surname. <br>
Passengers are grouped by surname and the number of occurrences of each surname is calculated.

In [None]:
# Per-passenger family bookkeeping; columns are aligned on the index of `full`
family = pd.DataFrame()

# Surname of every passenger, parsed from the Name column
family['Surname'] = full.Name.map(parse_surname)

# surname -> number of passengers carrying it
surname_count_dict = family['Surname'].value_counts().to_dict()
# surname -> integer code, numbered in order of first appearance
surname_code_dict = {name: i for i, name in enumerate(family['Surname'].unique())}

family['SurnameCode'] = family['Surname'].map(surname_code_dict)
family['SurnameSize'] = family['Surname'].map(surname_count_dict)
# Family size as recorded in the data: the passenger plus Parch plus SibSp
family['FamilySize'] = 1 + full.Parch + full.SibSp

# Example of a common surname
display(full[family.Surname == 'Smith'])

However, some common surnames may be shared by people from different families. This is addressed by the following function. <br>
To judge whether passengers are likely to be in the same family, the function checks their ticket codes. <br>
It decides whether people with the same surname are from the same family by checking how similar their tickets are. <br>Those with exactly the same ticket, or with ticket numbers close to each other, are grouped together.

In [None]:
def tick2fam_gen(df):
    """
    Assign a family code to every ticket, giving similar tickets the same code.

    Two tickets are considered similar when they have the same length (at
    least 4 characters), the 4th character from the end of each is a digit,
    they are identical except for the last two characters, and those last two
    characters are numbers that differ by at most 10.

    Input: DataFrame with a 'Ticket' column of strings
    Return: dict mapping each ticket string to its family code (a string).
            The dict also contains the initial placeholder entry '000000' -> '0'.

    Tickets that are too short or whose suffix is not numeric are never
    treated as similar to another ticket (previously they could raise
    IndexError / ValueError); each such ticket string gets its own code.
    """
    def _similar(a, b):
        # Compare two ticket strings using the rule described in the docstring
        if len(a) != len(b) or len(a) < 4:
            return False
        if not (a[-4].isdigit() and b[-4].isdigit()):
            return False
        if a[:-2] != b[:-2]:
            return False
        if not (a[-2:].isdecimal() and b[-2:].isdecimal()):
            return False
        return abs(int(a[-2:]) - int(b[-2:])) <= 10

    # Placeholder entry kept from the original implementation; its code is a
    # string like all other codes so callers can concatenate it safely.
    dict_tick2fam = {'000000': '0'}
    fam_counter = 0

    for chk_key in df['Ticket']:
        # Snapshot the keys: the dict is modified inside the loop
        for key in list(dict_tick2fam):
            if _similar(chk_key, key):
                dict_tick2fam[chk_key] = dict_tick2fam[key]
                break
        else:
            # No known ticket is similar: start a new family
            fam_counter += 1
            dict_tick2fam[chk_key] = str(fam_counter)

    return dict_tick2fam

In [None]:
# Family columns next to the raw passenger fields; used for the regrouping and for inspection
family_infer = pd.concat([family, full[['Parch','SibSp','Age','Name','Pclass','Ticket','Embarked','Survived']]], axis = 1)

# Surnames carried by more passengers than the recorded family size:
# these may cover more than one family and need to be checked
oversized = family['FamilySize'] < family['SurnameSize']
chk_surname = family_infer[oversized].Surname.unique()

# Corrected family label, initialised to the plain surname
family['SurnameAdj'] = family['Surname']

for s in chk_surname:
    family_regroup = family_infer[family.Surname == s]  # everyone with this surname
    fam_code_dict = tick2fam_gen(family_regroup)        # ticket -> family code within the surname
    ticket_counts = family_regroup.Ticket.value_counts()

    for idx in family_regroup.index:
        curr_ticket = full.loc[idx, 'Ticket']

        if family_regroup.loc[idx, 'FamilySize'] != 1:
            tag = '-fam'      # travelling with recorded family members
        elif ticket_counts[curr_ticket] > 1:
            tag = '-hidfam'   # shares surname and ticket although Parch and SibSp record no relatives
        else:
            tag = '-single'   # travelling alone

        family.loc[idx, 'SurnameAdj'] = family.loc[idx, 'Surname'] + tag + fam_code_dict[curr_ticket]

display(family[family.Surname == 'Smith'])

After Adjusting the surnames of families, group these true families together again. The no. of families here should increase.

In [None]:
# Assign codes to families
# The results go into the Family_* dicts; previously these were declared but
# the loop wrote into the surname_* dicts instead, leaving the Family_* dicts
# empty and mixing family labels into the surname lookups.
# adjusted family label -> number of members
Family_count_dict = family['SurnameAdj'].value_counts().to_dict()
# adjusted family label -> integer code, numbered in order of first appearance
Family_code_dict = {name: i for i, name in enumerate(family['SurnameAdj'].unique())}

family['FamilyCode'] = family['SurnameAdj'].map(Family_code_dict)
# NOTE: this overwrites the Parch/SibSp-based FamilySize with the size of the regrouped family
family['FamilySize'] = family['SurnameAdj'].map(Family_count_dict)

print(f"No. of Family Before Regrouping: {len(family.SurnameCode.unique())}")
print(f"No. of Family After Regrouping: {len(family.FamilyCode.unique())}")

### Identify Roommates by Ticket
People who share the same ticket can be family members as well as friends traveling together. They are expected to have stayed together during the incident.

In [None]:
# Groups are passengers holding the same ticket code (could be friends or family)
group = family[['FamilyCode','FamilySize']].copy()

# ticket -> number of passengers holding it
ticket_count = full.Ticket.value_counts().to_dict()
# ticket -> integer code, numbered in order of first appearance
ticket_code = {ticket: i for i, ticket in enumerate(full.Ticket.unique())}

group['Ticket_code'] = full.Ticket.map(ticket_code)
group['Ticket_size'] = full.Ticket.map(ticket_count)

print(f"No. of Tickets Identified: {len(group['Ticket_code'].unique())}")
display(full[full.Ticket.isin(['A/4 48871', 'A/4 48873'])])

### Combining Friends and Families as Groups
Finally, the families and friend groups are combined together.  <br>
People who share either the same room or same family are grouped together.

In [None]:
def ChainCombineGroups(df, colA, colB):
    '''
    Chain together all rows that share a label in colA or in colB.

    Two rows end up in the same group when they are connected through any
    sequence of rows in which neighbours share a colA label or a colB label.
    Labels are only compared within the same column, and missing labels never
    link rows.

    input:
    df - DataFrame
    colA - Key for Col
    colB - Key for Col
    output:
    integer numpy array with one group label per row of df, in row order.
    Groups are numbered 0, 1, 2, ... in order of the first row belonging to them.

    Implemented with a union-find structure over row positions, so it does a
    single pass per column, works with a non-unique index and returns an empty
    array for an empty DataFrame.
    '''
    n_rows = len(df)
    parent = list(range(n_rows))

    def _find(pos):
        # Root of the set containing row position `pos` (with path halving)
        while parent[pos] != pos:
            parent[pos] = parent[parent[pos]]
            pos = parent[pos]
        return pos

    for col in (colA, colB):
        first_row_with = {}  # label -> position of the first row carrying it
        for pos, label in enumerate(df[col].tolist()):
            if pd.isna(label):
                continue
            if label not in first_row_with:
                first_row_with[label] = pos
                continue
            root_a, root_b = _find(first_row_with[label]), _find(pos)
            if root_a != root_b:
                # Keep the smaller position as the root of the merged set
                parent[max(root_a, root_b)] = min(root_a, root_b)

    # Number the sets in order of their first row
    group_of_root = {}
    labels = np.empty(n_rows, dtype=int)
    for pos in range(n_rows):
        labels[pos] = group_of_root.setdefault(_find(pos), len(group_of_root))

    return labels

In [None]:
# Final group number: passengers linked by family code and/or ticket code
group['Group_code'] = ChainCombineGroups(group, 'FamilyCode', 'Ticket_code')

# Number of distinct families, ticket groups and combined groups
n_family = family.FamilyCode.nunique()
n_ticket = group.Ticket_code.nunique()
n_combined = group.Group_code.nunique()
print(f"Family: {n_family}")
print(f"Group: {n_ticket}")
print(f"Combined: {n_combined}")
group.head()

In [None]:
print('An example of grouping the both friends and family under a same group.')
# Ticket, surname and the various codes side by side, filtered to one combined group
example_view = pd.concat([full['Ticket'],
                          family[['Surname','FamilyCode']],
                          group[['Ticket_code','Group_code']]], axis = 1)
display(example_view[group['Group_code'] == 458])

### Limitations:
The above function fails to join some families back together, especially those who had both different ticket numbers and different surnames. <br> 
Examples are female siblings who were married and took different surnames, <br>
and families who bought tickets with codes of low similarity, which is more likely for those in 1st class.

In [None]:
# Size of every combined group
group_code = group['Group_code'].unique()
# group code -> number of passengers in that group
group_count = group['Group_code'].value_counts().to_dict()
group['Group_size'] = group.Group_code.map(group_count)

### Survival of the Group
Finally, what we wanted to know in the first place is whether the other members of a passenger's family/friends group survived. Having a surviving friend or family member should be a good predictor of whether the passenger survived.

In [None]:
# Grouping columns together with the survival label and the surname-based grouping
extra_columns = [full.Survived, family[['SurnameCode','SurnameSize']]]
group_corr_test = pd.concat([group] + extra_columns, axis = 1)

In [None]:
# 1 where the passenger is known to have survived / died, 0 otherwise (including unlabelled rows)
own_survived = (group_corr_test.Survived == 1).astype(int)
own_dead = (group_corr_test.Survived == 0).astype(int)

# (grouping code column, grouping size column); the combined group must stay last
# because the two columns written to `group` keep the values of the last iteration
for param in [('SurnameCode','SurnameSize'),
              ('FamilyCode','FamilySize'),
              ('Ticket_code','Ticket_size'),
              ('Group_code','Group_size')]:
    code_col, size_col = param
    size = group_corr_test[size_col]
    survived_in_gp = group_corr_test.groupby(code_col).Survived

    # Known survivors per group, and per passenger excluding the passenger itself
    n_member_survived_by_gp = survived_in_gp.sum()
    n_mem_survived = group_corr_test[code_col].map(n_member_survived_by_gp)
    n_mem_survived_adj = n_mem_survived - own_survived

    # Same for the known dead (labelled members minus survivors)
    n_member_dead_by_gp = survived_in_gp.count() - survived_in_gp.sum()
    n_mem_dead = group_corr_test[code_col].map(n_member_dead_by_gp)
    n_mem_dead_adj = n_mem_dead - own_dead

    # Share of the group whose outcome is not counted above
    unknown_factor = (size - n_mem_survived_adj - n_mem_dead_adj)/size
    confidence = 1 - unknown_factor

    # (survivors - dead) relative to group size, in [-1, 1], weighted by the confidence
    key = 'Confidence_member_survived'+'_'+code_col
    ratio = (1/size) * (n_mem_survived_adj - n_mem_dead_adj)
    group_corr_test[key] = confidence * ratio

    group['Ratio_member_survived'] = ratio
    group['Confidence_member_survived'] = confidence * ratio

# Correlation of the four confidence columns (the last four columns) with Survived
corr_with_survived = group_corr_test.corr().Survived.iloc[-4:]
plt.barh(corr_with_survived.index, corr_with_survived)
plt.xlabel('Correlation with Survived');

## Data Engineering - Simplifying the Ticket format
Tickets may also provide information on where the passengers were located on the ship, which may be vital for survival.
Here, I group the tickets by their leading letters. Ticket prefixes that occur 10 times or fewer are ignored.

In [None]:
def parse_ticket(str1):
    """Reduce a ticket string to a short prefix label.

    - If the ticket contains whitespace followed by a digit, the text before
      the last such position is taken as the prefix part and its letter/digit
      groups are concatenated without separators (e.g. 'A/5 21171' -> 'A5',
      'SC/PARIS 2131' -> 'SCPARIS'). If that part has no upper-case letter,
      an empty string is returned.
    - Otherwise the first run of upper-case letters is returned (e.g. 'LINE').
    - Tickets with neither (e.g. purely numeric ones) give 'XXX'.

    The prefix pattern has four capture groups; the former lookup of a fifth
    group (on the wrong match object) could only raise and has been removed.
    """
    m = re.search(r'(.*)(\s\d|\s\d{4,7}$)',str1)
    if not m:
        s = re.search(r'[A-Z]+',str1)
        return s.group(0) if s else 'XXX'

    n = re.search(r'([A-Z]+)[^A-Z0-9]*([A-Z]+)*[^A-Z0-9]*([A-Z0-9]*)[^A-Z]*([A-Z]*)*', m.group(1))
    if not n or not n.group(1):
        return ''

    new_str = n.group(1)
    if n.group(2):
        new_str += n.group(2)
    if n.group(3):
        new_str += n.group(3)
        # the trailing letter group is only appended after a non-empty third group
        if n.group(4):
            new_str += n.group(4)
    return new_str

In [None]:
# Ticket prefix of every passenger next to the survival label
ticket = pd.DataFrame(full.Survived)
ticket['Ticket'] = full.Ticket.map(parse_ticket)

# prefix -> number of passengers with that prefix
d = ticket.Ticket.value_counts().to_dict()
ticket['Ticket_count'] = ticket['Ticket'].map(d)

# Survival rate per prefix, only for prefixes seen more than 10 times
frequent = ticket[ticket['Ticket_count'] > 10]
plt.figure(figsize = (12,6))
sns.barplot(x = 'Ticket', y = 'Survived', data = frequent)
plt.axhline(y=0.3838, color='k', linestyle='--');

Tickets with the most Predictive power: A5, PC

In [None]:
# Indicator columns (0/1) for the two selected ticket prefixes
for prefix in ('A5', 'PC'):
    ticket[prefix] = ticket['Ticket'].eq(prefix).astype(int)

## Adjusting Fare according to Ticket Size
The Fare value is distorted because the Fare feature in the original dataset is the total amount paid for one ticket, i.e., number of persons * base rate of the ticket. To get a more accurate fare per individual, the fare is divided by the number of persons holding that ticket.

In [None]:
# Fare Adjustment: ticket fare divided by the number of passengers on that ticket
full['Fare_adj'] = full.Fare/group.Ticket_size

# Histogram of the fare before (top) and after (bottom) the adjustment
fig, axs = plt.subplots(2,figsize = (14,10))
panels = [('Fare', 'Before Adjustment'), ('Fare_adj', 'After Adjustment')]
for ax, (column, heading) in zip(axs, panels):
    ax.hist(full[column].dropna(), bins=80);
    ax.set_title(heading);
    ax.set_xlabel(column);

After adjustment, the 3 Pclass are more clearly shown by the 3 peaks of Adjusted Fare.

## Handling Missing Values

### Missing Fare
As mentioned before, there are missing values in Age, Fare, Embarked and Cabin. <br>
Here, I dealt with the Age and Fare only.

In [None]:
# Handle missing Fare
# Rows without a fare (the mask is taken before anything is filled)
missing_fare = full.Fare.isnull()

# Pclass -> mean adjusted (per-person) fare of that class
fare_dict_nan = dict(full.groupby('Pclass').Fare_adj.mean())

# Fill each missing row with the mean of the passenger's own Pclass instead of
# a hard-coded class 3. As before, the same per-person value is written to
# both Fare and Fare_adj.
fare_fill = full.loc[missing_fare, 'Pclass'].map(fare_dict_nan)
full.loc[missing_fare, 'Fare'] = fare_fill
full.loc[missing_fare, 'Fare_adj'] = fare_fill

Missing Fare is filled with the mean value of the Fare of their respective PClass.

### Missing Age
Age is an important factor for survival prediction, since children are more likely to be saved. Instead of filling the NaN with general mean age, we may try to guess Passengers' ages according to their title.

In [None]:
# Parse Titles from Names
# `title` starts empty; its columns are added by later statements
title = pd.DataFrame()

def parse_name(str):
    """Return the title (honorific) contained in a passenger name.

    The title is the one or two words between ', ' and the following '.',
    e.g. 'Mr' in 'Braund, Mr. Owen Harris' or 'the Countess' in
    'Rothes, the Countess. of (...)'.

    Raises:
        ValueError: if the name does not contain such a title.
    """
    # Raw string: '\w' and '\.' are invalid escapes in a normal string literal
    m = re.search(r', (\w+ *\w*)\.', str)
    if m is None:
        raise ValueError(f"cannot parse title from name: {str!r}")
    return m.group(1)
    
# Title of every passenger, then the distinct titles found
title['Title'] = full['Name'].map(parse_name)
title['Title'].unique()

In [None]:
# Simplify title groups: raw title -> simplified title group
Title_Dictionary = {"Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "the Countess":"Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Mrs",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty"
                    }

# Values without an entry in the dictionary are kept as they are. This makes
# the cell safe to re-run: already simplified values such as 'Officer' are not
# keys of the dictionary and would otherwise be turned into NaN.
title.Title = title.Title.map(Title_Dictionary).fillna(title.Title)

In [None]:
# Plot the distribution of Age by Title
# Add Age by index-aligned assignment; DataFrame.join raises on an
# overlapping column name when this cell is executed a second time.
title['Age'] = full.Age
plt.figure(figsize = (14,6))
sns.violinplot(data = title, x = 'Title', y = 'Age');

Passengers with the title 'Master' are likely to be children, so their missing ages can be inferred as the mean age of 'Master'. <br>
Passengers with the title 'Miss' seem to comprise both children and adults; the following is an attempt to infer their age from the other given features. <br>
However, the age of females is relatively unimportant here, since all females, regardless of age, had high priority to board the lifeboats.

In [None]:
# title group -> mean known age of the passengers in that group
title_age_dict = {t: title.loc[title.Title == t, 'Age'].mean()
                  for t in title.Title.unique()}

# Age_infer is Age with the missing values replaced by the mean age of the passenger's title group
idx = full.Age.isnull()
full['Age_infer'] = full['Age']
full.loc[idx, 'Age_infer'] = title.loc[idx, 'Title'].map(title_age_dict)

# Recompute the child flag (age 14 or younger) from the completed ages
full['Child'] = full['Age_infer'].le(14).astype(int)

## Data Transformation
MinMaxScaler is used for the continuous variables and one-hot encoding for the categorical ones.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Select features as predictors (all frames share the index of `full`)
features = pd.concat([full[['Pclass','Sex','Child','Fare_adj','Parch','SibSp']],
                      group[['Group_size','Confidence_member_survived']],
                      title['Title'],
                      ticket[['A5','PC']]], axis = 1)

# MinMax Transform the continuous variables
scalar = MinMaxScaler()

continuous = ['Fare_adj','Group_size']
# Work on a real copy so that `features` keeps the untransformed values
features_minmax_transformed = features.copy()
features_minmax_transformed[continuous] = scalar.fit_transform(features_minmax_transformed[continuous])

# Transform Sex labels into binary code (male = 1, anything else = 0)
features_minmax_transformed.Sex = features_minmax_transformed.Sex.apply(lambda x: 1 if x == 'male' else 0)

# One-hot Encoding of the remaining non-numeric columns
features_final = pd.get_dummies(features_minmax_transformed)

encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Separate Train Data and Test Data: the training rows come first in the
# stacked frame, so the boundary is the length of the training file
n_train_rows = len(df_train)
features_final_train = features_final.iloc[:n_train_rows]
features_final_test = features_final.iloc[n_train_rows:]

## Model Training and Selection


In [None]:
# Split the labelled rows into a training part and a held-out validation part
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# 20% of the rows are held out; random_state fixes the shuffle
split_parts = train_test_split(features_final_train, train.Survived,
                               test_size = 0.2, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Create Model Training Pipeline
from sklearn.metrics import accuracy_score
from time import time

def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    Fit a learner on the first `sample_size` training rows and report its accuracy.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: number of leading rows of the training set used for fitting
       - X_train: features training set
       - y_train: survival labels of the training set
       - X_test: features testing set
       - y_test: survival labels of the testing set

    returns:
       dict with 'acc_train' (accuracy on the whole of X_train, not only the
       rows used for fitting) and 'acc_test' (accuracy on X_test)
    '''
    # Fit on the leading slice only
    fitted = learner.fit(X_train[:sample_size], y_train[:sample_size])

    # Predict the held-out set first, then the complete training set
    test_pred = fitted.predict(X_test)
    train_pred = fitted.predict(X_train)

    results = {
        'acc_train': accuracy_score(y_train, train_pred),
        'acc_test': accuracy_score(y_test, test_pred),
    }

    # Progress line with the held-out accuracy
    print("{} trained on {} samples. Acc: {:.4f}".format(fitted.__class__.__name__, sample_size, results['acc_test']))

    return results

In [None]:
# Import the three supervised learning models from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier,RandomForestClassifier

# Initialize the three models
clf_A = GradientBoostingClassifier(random_state = 0)
clf_B = LogisticRegression(random_state= 0)
clf_C = RandomForestClassifier(random_state= 0)

# Sample sizes for 100%, 50% and 10% of the training data.
# Note: despite their names, samples_10 is one half and samples_1 is one tenth.
n_train = len(y_train)
samples_100 = n_train
samples_10 = n_train // 2
samples_1 = n_train // 10

# Collect results on the learners: model name -> {run number -> accuracy dict}
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {
        i: train_predict(clf, samples, X_train, y_train, X_test, y_test)
        for i, samples in enumerate([samples_1, samples_10, samples_100])
    }

In [None]:
# Reshaping the Results for plotting
# Runs 0, 1, 2 were trained on one tenth, one half and all of the training
# rows, so the columns are labelled 10% / 50% / 100% (they were previously
# mislabelled as 1% / 10% / 100%).
size_labels = {0:'10% of train', 1:'50% of train', 2:'100% of train'}

# One small frame per model (rows: acc_train / acc_test), concatenated once
per_model = []
for model_name, runs in results.items():
    temp = pd.DataFrame(runs).rename(columns=size_labels)
    temp['model'] = model_name
    per_model.append(temp)
df = pd.concat(per_model, axis = 0)
df_plot = df.reset_index().melt(id_vars=['index','model'])

# Plotting the results: one panel per metric, bars grouped by model and sample size
fig, axs = plt.subplots(1,2,figsize = (14,4))
for i,key in enumerate(df_plot['index'].unique()[:2]):
    ax = axs[i%2]
    sns.barplot(data = df_plot[df_plot['index'] == key], x = 'model', y = 'value',
                hue = 'variable', ax = ax)
    ax.set_ylim([0,1])
    ax.set_title(key)
    ax.legend(ncol=3, loc="lower right", frameon=True, fontsize = 'small')



## Model Selection and model tuning
RandomForestClassifier seemed to have the best out of the box accuracy score and with room for improvement as seen in acc_train.
Model tuning is performed using GridSearchCV to improve generalizability of the model.

In [None]:
# GridSearchCV lives in sklearn.model_selection (already used by this notebook
# for train_test_split); the old sklearn.grid_search module was removed in
# scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

clf = RandomForestClassifier(random_state = 0, oob_score = True)

# Each list holds the selected value; the trailing comment shows the values that were searched
parameters = {'criterion' :['gini'],
             'n_estimators' : [400], #[100,200,400]
             'max_depth':[6], #[3,4,5,6]
             'min_samples_leaf': [5], # [2,4,6]
              'max_leaf_nodes': [10], # [8,10,12]
              'min_impurity_decrease': [0], # [0,0.001,0.005]
              'max_features' : [1] # [1,2,3]
             }

scorer = make_scorer(accuracy_score)

# 10-fold cross-validated search over the grid, scored by accuracy
grid_obj = GridSearchCV(clf, parameters, scoring = scorer, cv = 10)

grid_fit = grid_obj.fit(X_train,y_train)

best_clf = grid_fit.best_estimator_

# Compare the default forest with the tuned one on the held-out set
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# The out-of-bag score is computed from the training data, not the testing data
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("Oob score on training data: {:.4f}".format(clf.oob_score_))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final oob score on the training data: {:.4f}".format(best_clf.oob_score_))
print("\nBest Parameters\n------")
best_clf

In [None]:
# Plot feature importance of the tuned model, smallest at the bottom
importances = best_clf.feature_importances_
idx = np.argsort(importances)
bar_positions = range(len(importances))

plt.figure(figsize = (12,8))
plt.barh(bar_positions, importances[idx])
plt.yticks(bar_positions, features_final_train.columns[idx]);
plt.title('Feature Importance');

In [None]:
# Output for Kaggle competition
final_predict = best_clf.predict(features_final_test)

# The test passengers are the rows after the training rows in the stacked
# frame (boundary taken from the training file instead of a hard-coded 891)
test_rows = full.iloc[len(df_train):]
prediction = pd.DataFrame(test_rows.PassengerId)
prediction['Survived'] = final_predict.astype('int')

# PassengerId is a regular column of `prediction`, so the index is not written
prediction.to_csv('predict.csv',index = False)