In [222]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

d = pd.read_csv('data/test.csv')
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
p_id = test_data['PassengerId']
data = pd.concat([train_data, test_data])
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


# Another attemp at improving the accuracy

This attemp fo similar data cleaning as the previous attemp. However, we introduce new columns and also standardizing almost all the column.

In [223]:
data.drop('PassengerId', axis=1, inplace=True)
survived = data['Survived'].dropna()
data['Survived'].fillna(-1, inplace=True)

In this attemp we include all the cleaning inside one function and included comment to explain what we do in each step

In [224]:
def preprocess_data(data):
    ## preprocessing the Cabin column by filling in the most common cabin
    data['Cabin'].fillna('U0', inplace=True)
    
    ## Adding a new column which contains the first letter of Cabin 
    data['CabinSection'] = LabelEncoder().fit_transform(data['Cabin'].map(lambda x: x[0]))
    
    ## This create a new column that measure the distance using the number in the cabin column
    data['CabinDistance'] = data['Cabin'].map(lambda x: x[1:])
    data['CabinDistance'] = data['CabinDistance'].map(lambda x: x.split(' ')[0])
    data['CabinDistance'].where(data['CabinDistance'] != '', '0', inplace=True)
    data['CabinDistance'] = data['CabinDistance'].map(lambda x: int(x))
    
    ## labeling the sex column with a 1 and 0 label
    data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
    
    ## filling NaN value in Embarked column with S and encode the column
    data['Embarked'].fillna('S', inplace=True)
    data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])
    
    ## spliting the name by tittle and encode the column
    data['Name'] = data['Name'].map(lambda x: x.split(',')[1].split('.')[0])
    data['Name'] = LabelEncoder().fit_transform(data['Name'])
    
    ## filling in the NaN value in Fare column with -1 and change them to the median of each 
    ## corresponding pclass.
    data['Fare'].fillna(-1, inplace=True)
    medians = dict()
    for pclass in data['Pclass'].unique():
        median = data.Fare[(data["Fare"] != -1) & (data['Pclass'] == pclass)].median()
        medians[pclass] = median
    for index, row in data.iterrows():
        if row['Fare'] == -1:
            data.loc[index, 'Fare'] = medians[row['Pclass']]
    
    
    ## filling in the NaN value in Age column with -1 and change them to the median of each
    ## correspoding title.
    data['Age'].fillna(-1, inplace=True)
    medians = dict()
    for title in data['Name'].unique():
        median = data.Age[(data["Age"] != -1) & (data['Name'] == title)].median()
        medians[title] = median
    for index, row in data.iterrows():
        if row['Age'] == -1:
            data.loc[index, 'Age'] = medians[row['Name']]
            
    
    
    ## processing a second age based on number of sibling, spouse, parent, children
    for index, row in data.iterrows():
        ticket = row['Ticket']
        sibsp = row['SibSp']
        parch = row['Parch']

        if sibsp > 0 or parch > 0:
            ages = list()
            for index2, row2 in data[data['Ticket'] == ticket].iterrows():
                ages.append(row2['Age'])
            data.loc[index, 'SecondAge'] = min(ages)

        else:
            data.loc[index, 'SecondAge'] = row['Age']
            
    
    
    ## process the titles by encode them.
    died = ('Don', 'Rev', 'Capt', 'Jonkheer')
    survived = ('Mme', 'Ms', 'Lady', 'Sir', 'Mlle', 'the Countess')
    data['TitleDied'] = data['Name'].apply(lambda x: int(x in died))
    data['TitleSurvived'] = data['Name'].apply(lambda x: int(x in survived))

    for title in ('Mr', 'Mrs', 'Miss', 'Master', 'Dr', 'Major', 'Col'):
        data['Title_{}'.format(title)] = data['Name'].apply(lambda x: int(x == title))

    
    ## encode pclass.
    data = pd.concat([data, pd.get_dummies(data['Pclass']).rename(columns=lambda x: 'Pclass' + str(x))], axis=1)
    
    ## standardizing columns
    data['CabinDistance'] = StandardScaler().fit_transform(data['CabinDistance'].values.reshape(-1, 1))
    data['Age'] = StandardScaler().fit_transform(data['Age'].values.reshape(-1, 1))
    data['Fare'] = StandardScaler().fit_transform(data['Fare'].values.reshape(-1, 1))
    data['SecondAge'] = StandardScaler().fit_transform(data['SecondAge'].values.reshape(-1, 1))
    
    data.drop('Name', axis=1, inplace=True)
    data.drop('Pclass', axis=1, inplace=True)
    data.drop('Cabin', axis=1, inplace=True)
    data.drop('Ticket', axis=1, inplace=True)
    
    return data

In [225]:
processed_data = preprocess_data(data)

training_data = processed_data[data['Survived'] != -1]
testing_data = processed_data[data['Survived'] == -1]

training_data.drop('Survived', axis=1, inplace=True)
testing_data.drop('Survived', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Finally, we do what we previously done before which is trying mutiple models to see which fit best and GradientBoostingClassifier come out as the best which gave a 80.3% on the leaderboard.

In [226]:

models = [
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(),
    GradientBoostingClassifier(n_estimators=100),
]

i=1
for model in models:
    model.fit(training_data, survived)
    prediction = model.predict(testing_data).astype(int)
    submission = pd.DataFrame({
        "PassengerId": d["PassengerId"],
        "Survived": prediction
    })
    submission.to_csv('submission{}.csv'.format(i), index=False, columns=["Survived", "PassengerId"])

    i += 1