In [39]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import metrics

In [40]:
# Read the training and test sets from the CSV files next to the notebook.
X, test = [pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')]

In [41]:
# Swap the Embarked column for one indicator column per distinct value,
# appended after the remaining columns.
encoded_embarked = pd.get_dummies(X['Embarked'])
X = pd.concat([X.drop('Embarked', axis=1), encoded_embarked], axis=1)

In [44]:
def extract_titles(df):
    """Replace the ``Name`` column with one indicator column per title.

    The title is the text between the first comma and the following period
    of ``Name`` (``'Braund, Mr. Owen Harris'`` -> ``'Mr'``). Rare titles are
    folded into ``Mr`` / ``Miss`` / ``Mrs`` before encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``Name`` column, or an already extracted ``Title``
        column.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Name`` / ``Title`` and with the title indicator
        columns appended. The input frame is not modified. A frame with
        neither column is returned as an unchanged copy.
    """
    title_aliases = {
        # male honorifics -> 'Mr'
        'Jonkheer': 'Mr', 'Sir': 'Mr', 'Capt': 'Mr', 'Col': 'Mr',
        'Don': 'Mr', 'Dr': 'Mr', 'Rev': 'Mr', 'Major': 'Mr',
        # -> 'Miss'
        'Ms': 'Miss', 'Lady': 'Miss', 'Mlle': 'Miss', 'the Countess': 'Miss',
        # -> 'Mrs'
        'Mme': 'Mrs',
    }

    if 'Name' in df:
        # `.str[i]` yields NaN instead of raising when a name has no comma.
        titles = (df['Name'].str.split(',').str[1]
                  .str.strip().str.split('.').str[0])
        titles = titles.replace(title_aliases)
    elif 'Title' in df:
        titles = df['Title']
    else:
        # Nothing to derive titles from.
        return df.copy()

    encoded_titles = pd.get_dummies(titles)
    remaining = df.drop(['Name', 'Title'], axis=1, errors='ignore')
    return pd.concat([remaining, encoded_titles], axis=1)

# Display the encoded frame; the return value is not assigned, so X is not rebound here.
extract_titles(X)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S,Master,Miss,Mr,Mrs
0,1,0,3,male,22.0,1,0,A/5 21171,7.2500,,0,0,1,0,0,1,0
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,1,0,0,0,0,0,1
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1,0,1,0,0
3,4,1,1,female,35.0,1,0,113803,53.1000,C123,0,0,1,0,0,0,1
4,5,0,3,male,35.0,0,0,373450,8.0500,,0,0,1,0,0,1,0
5,6,0,3,male,,0,0,330877,8.4583,,0,1,0,0,0,1,0
6,7,0,1,male,54.0,0,0,17463,51.8625,E46,0,0,1,0,0,1,0
7,8,0,3,male,2.0,3,1,349909,21.0750,,0,0,1,1,0,0,0
8,9,1,3,female,27.0,0,2,347742,11.1333,,0,0,1,0,0,0,1
9,10,1,2,female,14.0,1,0,237736,30.0708,,1,0,0,0,0,0,1


In [29]:
# Preview the first rows of X.
X.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,0,0,1


In [None]:
# Preview the first two rows of the test set.
test.head(2)

In [None]:
# Fraction of missing values per column, showing only columns that have any.
missing_counts = X.isnull().sum().astype(float)
null = missing_counts / X.shape[0]
null[null != 0]

In [None]:
# Cabin is ~77% missing: too sparse to reconstruct, so there is no point in keeping it.

In [None]:
# Passengers per embarkation port. The parenthesised form prints the same
# text on Python 2 and also parses on Python 3.
print(X['Embarked'].value_counts())

In [None]:
# 'S' is by far the most common port, so impute missing Embarked values as 'S'.

In [None]:
# Distinct ticket values next to the frame shape. The %-format reproduces the
# space-separated Python 2 output and also parses on Python 3.
print('%s %s' % (X['Ticket'].nunique(), X.shape))

In [None]:
#Basically unique, so drop

In [None]:
# Drop the columns that will not be used as features.
X.drop(['Cabin', 'PassengerId', 'Ticket'], inplace=True, axis=1)
X['Embarked'] = X['Embarked'].fillna('S')
# Replace gender with a binary code (this has to happen anyway). Assign the
# result back: an inplace replace on a column selection may act on a copy.
X['Sex'] = X['Sex'].replace({'male': 0, 'female': 1})
X.head(3)

In [None]:
#Next to fill in age
X['Age'].hist()
# Mean and median age; prints the same text on Python 2 and parses on Python 3.
print('%s %s' % (X['Age'].mean(), X['Age'].median()))
#TO DO find a better way to reconstruct Age
#For now will use median due to resistance to skew

In [None]:
# Show the relationship between age and passenger class:
# one density curve per class, in order of first appearance.
for passenger_class in X['Pclass'].unique():
    class_ages = X.loc[X['Pclass'] == passenger_class, 'Age']
    sns.kdeplot(class_ages, label=passenger_class)

In [None]:
# Mean age per passenger class.
X.groupby('Pclass')['Age'].mean()

In [None]:
# Reconstruct Age based on Pclass: each missing age becomes the mean age of
# that passenger's class. The per-class means are computed once and aligned
# row by row, instead of re-running the groupby for every class.
X['Age'] = X['Age'].fillna(X.groupby('Pclass')['Age'].transform('mean'))

In [None]:
# Side-by-side histograms of Pclass for those who perished and those who survived.
for panel, survived_flag, heading in ((121, 0, 'Perished'), (122, 1, 'Survived')):
    plt.subplot(panel)
    plt.hist(X[X['Survived'] == survived_flag]['Pclass'])
    plt.title(heading)

# Survival rate per passenger class.
X.groupby('Pclass')['Survived'].mean()

In [None]:
#Useful information in Pclass

In [None]:
# Side-by-side histograms of Age for those who perished and those who survived.
for panel, survived_flag, heading in ((121, 0, 'Perished'), (122, 1, 'Survived')):
    plt.subplot(panel)
    plt.hist(X[X['Survived'] == survived_flag]['Age'])
    plt.title(heading)

In [None]:
#An idea to try later, infant/elderly
# The combined `X['Age'] < 7 or X['Age'] > 65` flag was removed: `or` between
# two Series raises ValueError, and the column was overwritten on the next
# line anyway. Each age group gets its own boolean column instead.
X['infant'] = X['Age'] <= 7
X['Geriatric'] = X['Age'] >= 65
#More survived below 10, more died above 65

In [None]:
# One-hot encode Embarked to see whether it is worth keeping:
# one 0/1 column per port, then drop the original column.
for port in ('S', 'C', 'Q'):
    flag_column = 'Embarked_' + port
    X[flag_column] = 0
    X.loc[X['Embarked'] == port, flag_column] = 1

X.drop('Embarked', axis=1, inplace=True)

In [None]:
# Select the indicator columns with a list: a tuple of keys on a GroupBy is
# rejected by current pandas, while a list works on old and new versions.
embarked_flags = ['Embarked_S', 'Embarked_C', 'Embarked_Q']
X.groupby('Survived')[embarked_flags].agg(np.array(['value_counts'])).plot.bar()
#Can probably drop Embarked_Q


In [None]:
# Pairwise correlations of the numeric and boolean columns. Text columns are
# excluded explicitly because current pandas raises on them in corr().
X.select_dtypes(include=['number', 'bool']).corr()
#Because why not

In [None]:
#Using data from the name column
# Parse the title (text between the comma and the following period) once,
# then derive both the distinct values and their frequencies from it.
name_titles = (X['Name'].str.split(',').apply(lambda parts: parts[1])
               .str.strip().str.split('.').apply(lambda parts: parts[0]))
Titles = name_titles.unique()
Title_counts = name_titles.value_counts()
Title_counts

In [None]:
#Reset for flag age

# Reload both data sets from disk so the helpers below start from raw data.
X, test = [pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')]

def remove_nan(df):
    """Fill missing ``Embarked`` and ``Age`` values in place.

    Missing ``Embarked`` entries become ``'S'``. Missing ``Age`` entries
    become the mean age of the rows sharing the same ``Pclass``. Rows whose
    ``Pclass`` is itself missing keep a missing age instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Embarked``, ``Pclass`` and ``Age`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, for chaining.
    """
    df['Embarked'] = df['Embarked'].fillna('S')
    # One groupby pass: per-class mean age aligned to every row.
    class_mean_age = df.groupby('Pclass')['Age'].transform('mean')
    df['Age'] = df['Age'].fillna(class_mean_age)
    return df

def remove_features(df):
    """Return a copy of ``df`` without the Cabin, PassengerId, Ticket and Name columns."""
    unused_columns = ['Cabin', 'PassengerId', 'Ticket', 'Name']
    return df.drop(unused_columns, axis=1)

def extract_titles(df):
    """Add a ``Title`` column parsed from ``Name`` and return the same frame.

    The title is the stripped text between the first comma and the following
    period. A frame without a ``Name`` column is returned unchanged.
    """
    # To do: fold rare titles such as Mlle / Ms, then one-hot encode.
    if 'Name' not in df:
        return df

    after_comma = df['Name'].str.split(',').apply(lambda parts: parts[1])
    before_period = after_comma.str.strip().str.split('.').apply(lambda parts: parts[0])
    df['Title'] = before_period
    return df
    
def one_hot_encode_embarked(df):
    """One-hot encode ``Embarked`` and binary-encode ``Sex``.

    Adds the 0/1 columns ``Embarked_S``, ``Embarked_C`` and ``Embarked_Q``,
    maps ``Sex`` from ``'male'`` / ``'female'`` to ``0`` / ``1`` and drops
    the original ``Embarked`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Embarked`` and ``Sex`` columns. The new columns and the
        encoded ``Sex`` are written onto this frame before the drop.

    Returns
    -------
    pandas.DataFrame
        A new frame without the ``Embarked`` column.
    """
    for port in ('S', 'C', 'Q'):
        df['Embarked_' + port] = (df['Embarked'] == port).astype('int64')
    # Assign the result back: an inplace replace on a column selection may
    # act on a temporary copy and leave the frame unchanged.
    df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1})
    return df.drop('Embarked', axis=1)

def flag_age(df):
    """Add boolean ``Infant`` (Age <= 7) and ``Geriatric`` (Age >= 65) columns.

    The columns are written onto ``df`` itself, which is also returned.
    """
    age = df['Age']
    df['Infant'] = age.le(7)
    df['Geriatric'] = age.ge(65)
    return df

In [None]:
# Run the cleaning steps in order: impute, drop unused columns,
# encode Embarked/Sex, then add the age flags.
X = (remove_nan(X)
     .pipe(remove_features)
     .pipe(one_hot_encode_embarked)
     .pipe(flag_age))
X.head()

In [None]:
# Share of rows per Infant flag value. The parenthesised form prints the same
# text on Python 2 and also parses on Python 3.
print(X['Infant'].value_counts()/X.shape[0])

In [None]:
# Share of rows per Geriatric flag value. The parenthesised form prints the
# same text on Python 2 and also parses on Python 3.
print(X['Geriatric'].value_counts()/X.shape[0])

In [None]:
def remove_features(df):
    """Return a copy of ``df`` without Cabin, PassengerId and Ticket.

    ``Name`` is kept in this version so titles can still be extracted from it.
    """
    unused_columns = ['Cabin', 'PassengerId', 'Ticket']
    return df.drop(unused_columns, axis=1)

# Start again from the raw training data: the X left by the earlier pipeline
# cell no longer has the Embarked, Cabin, PassengerId, Ticket or Name columns
# these helpers need, so re-running them on it would fail.
X = pd.read_csv('./train.csv')
X = extract_titles(one_hot_encode_embarked(remove_features(remove_nan(X))))
X.head()

In [None]:
# Distinct values in the Title column.
X['Title'].unique()

In [None]:
# Number of rows per title.
X['Title'].value_counts()

In [None]:
# Fold the rare titles into the three common ones.
title_groups = {
    'Miss': ['Ms', 'Lady', 'Mlle', 'the Countess'],
    'Mrs': ['Mme'],
    'Mr': ['Jonkheer', 'Sir', 'Capt', 'Col', 'Major', 'Don', 'Dr', 'Rev'],
}
for common_title, rare_titles in title_groups.items():
    X.loc[X['Title'].isin(rare_titles), 'Title'] = common_title

In [None]:
# Re-check the number of rows per title.
X['Title'].value_counts()

Next, `extract_titles` is updated to apply this title folding.

In [None]:
def extract_titles(df):
    """Add a normalised ``Title`` column parsed from ``Name``.

    The title is the stripped text between the first comma and the following
    period of ``Name``. Rare titles are folded into ``Mr`` / ``Miss`` /
    ``Mrs``. All work is done on the frame passed in, not on the notebook's
    global ``X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place. Returned unchanged when it has
        no ``Name`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    title_aliases = {
        'Ms': 'Miss', 'Lady': 'Miss', 'Mlle': 'Miss', 'the Countess': 'Miss',
        'Mme': 'Mrs',
        'Jonkheer': 'Mr', 'Sir': 'Mr', 'Capt': 'Mr', 'Col': 'Mr',
        'Major': 'Mr', 'Don': 'Mr', 'Dr': 'Mr', 'Rev': 'Mr',
    }
    if 'Name' in df:
        split_names = df['Name'].str.split(',')
        titles = split_names.apply(lambda x: x[1]).str.strip().str.split('.').apply(lambda x: x[0])
        df['Title'] = titles.replace(title_aliases)
    return df

In [None]:
# One 0/1 indicator column per remaining title.
for title in ('Mr', 'Miss', 'Mrs', 'Master'):
    X[title] = 0
    X.loc[X['Title'] == title, title] = 1

Final Extract_Titles

In [None]:
def extract_titles(df):
    """Add a normalised ``Title`` column and its 0/1 indicator columns.

    The title is the stripped text between the first comma and the following
    period of ``Name``. Rare titles are folded into ``Mr`` / ``Miss`` /
    ``Mrs``, then the integer columns ``Mr``, ``Miss``, ``Mrs`` and
    ``Master`` are added. All work is done on the frame passed in, not on
    the notebook's global ``X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place. Returned unchanged when it has
        no ``Name`` column.

    Returns
    -------
    pandas.DataFrame
        The same frame object.
    """
    title_aliases = {
        'Ms': 'Miss', 'Lady': 'Miss', 'Mlle': 'Miss', 'the Countess': 'Miss',
        'Mme': 'Mrs',
        'Jonkheer': 'Mr', 'Sir': 'Mr', 'Capt': 'Mr', 'Col': 'Mr',
        'Major': 'Mr', 'Don': 'Mr', 'Dr': 'Mr', 'Rev': 'Mr',
    }
    if 'Name' in df:
        split_names = df['Name'].str.split(',')
        titles = split_names.apply(lambda x: x[1]).str.strip().str.split('.').apply(lambda x: x[0])
        df['Title'] = titles.replace(title_aliases)

        # Any title outside these four gets 0 in every indicator column.
        for title in ('Mr', 'Miss', 'Mrs', 'Master'):
            df[title] = (df['Title'] == title).astype('int64')
    return df

In [None]:
# Evenly spaced values from 0 up to (not including) 50 in steps of 2.5.
np.arange(0, 50, 2.5)

In [None]:
# Only a cell's final expression is rendered, so this first summary is not displayed.
test.isnull().sum()

# The missing fare in the test set needs imputing: use the median fare.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

# Missing values per column after the imputation.
test.isnull().sum()