In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline

d = pd.read_csv('data/test.csv')
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
p_id = test_data['PassengerId']
data = pd.concat([train_data, test_data])
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


# Another attemp at improving the accuracy

This attemp fo similar data cleaning as the previous attemp. However, we introduce new columns and also standardizing almost all the column.

In [38]:
data.drop('PassengerId', axis=1, inplace=True)
training_survived = data['Survived'].dropna()
data['Survived'].fillna(-1, inplace=True)

In this attemp we include all the cleaning inside one function and included comment to explain what we do in each step

In [40]:
## preprocessing the Cabin column by filling in the most common cabin
data['Cabin'].fillna('U0', inplace=True)

## Adding a new column which contains the first letter of Cabin 
data['CabinSection'] = LabelEncoder().fit_transform(data['Cabin'].map(lambda x: x[0]))

## This create a new column that measure the distance using the number in the cabin column
data['CabinDistance'] = data['Cabin'].map(lambda x: x[1:])
data['CabinDistance'] = data['CabinDistance'].map(lambda x: x.split(' ')[0])
data['CabinDistance'].where(data['CabinDistance'] != '', '0', inplace=True)
data['CabinDistance'] = data['CabinDistance'].map(lambda x: int(x))
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,CabinSection,CabinDistance
0,22.0,U0,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,8,0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,2,85
2,26.0,U0,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,8,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803,2,123
4,35.0,U0,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450,8,0


Here filling in the NaN value for the Cabin column. After this we create 2 new columns which is the section of the cabin (the first letter of the cabin letter) and then we calculate the distance between cabins. This help us explain whether some cabins is closer to the rescuse boat compared to other and also the distance of each cabin compare to cabins within the same section.

In [29]:
## labeling the sex column with a 1 and 0 label
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

## filling NaN value in Embarked column with S and encode the column
data['Embarked'].fillna('S', inplace=True)
data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])

## spliting the name by tittle and encode the column
data['Name'] = data['Name'].map(lambda x: x.split(',')[1].split('.')[0])
data['Name'] = LabelEncoder().fit_transform(data['Name'])
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,CabinSection,CabinDistance
0,22.0,U0,2,7.25,12,0,3,1,1,0.0,A/5 21171,8,0
1,38.0,C85,0,71.2833,13,0,1,0,1,1.0,PC 17599,2,85
2,26.0,U0,2,7.925,9,0,3,0,0,1.0,STON/O2. 3101282,8,0
3,35.0,C123,2,53.1,13,0,1,0,1,1.0,113803,2,123
4,35.0,U0,2,8.05,12,0,3,1,0,0.0,373450,8,0


Here we trying to filling in Nan value in Emarked column and also encode the sex and embarked value with numerical value. For the name column, we're trying to extract the title from it and encode it.

In [30]:
## filling in the NaN value in Fare column with -1 and change them to the median of each 
## corresponding pclass.
data['Fare'].fillna(-1, inplace=True)
medians = dict()
for pclass in data['Pclass'].unique():
    median = data.Fare[(data["Fare"] != -1) & (data['Pclass'] == pclass)].median()
    medians[pclass] = median
for index, row in data.iterrows():
    if row['Fare'] == -1:
        data.loc[index, 'Fare'] = medians[row['Pclass']]
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,CabinSection,CabinDistance
0,22.0,U0,2,7.25,12,0,3,1,1,0.0,A/5 21171,8,0
1,38.0,C85,0,71.2833,13,0,1,0,1,1.0,PC 17599,2,85
2,26.0,U0,2,7.925,9,0,3,0,0,1.0,STON/O2. 3101282,8,0
3,35.0,C123,2,53.1,13,0,1,0,1,1.0,113803,2,123
4,35.0,U0,2,8.05,12,0,3,1,0,0.0,373450,8,0


For the Fare column, we filling in the NaN value with median of each rows group by pclass. This give us a better estimate of the fare by using the class of the ticket for each observation.

In [31]:
## filling in the NaN value in Age column with -1 and change them to the median of each
## correspoding title.
data['Age'].fillna(-1, inplace=True)
medians = dict()
for title in data['Name'].unique():
    median = data.Age[(data["Age"] != -1) & (data['Name'] == title)].median()
    medians[title] = median
for index, row in data.iterrows():
    if row['Age'] == -1:
        data.loc[index, 'Age'] = medians[row['Name']]
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,CabinSection,CabinDistance
0,22.0,U0,2,7.25,12,0,3,1,1,0.0,A/5 21171,8,0
1,38.0,C85,0,71.2833,13,0,1,0,1,1.0,PC 17599,2,85
2,26.0,U0,2,7.925,9,0,3,0,0,1.0,STON/O2. 3101282,8,0
3,35.0,C123,2,53.1,13,0,1,0,1,1.0,113803,2,123
4,35.0,U0,2,8.05,12,0,3,1,0,0.0,373450,8,0


We're doing the same thing for age. Estimating the NaN age by using the median of each rows group by title.

In [32]:
## processing a second age based on number of sibling, spouse, parent, children
for index, row in data.iterrows():
    ticket = row['Ticket']
    sibsp = row['SibSp']
    parch = row['Parch']

    if sibsp > 0 or parch > 0:
        ages = list()
        for index2, row2 in data[data['Ticket'] == ticket].iterrows():
            ages.append(row2['Age'])
        data.loc[index, 'SecondAge'] = min(ages)

    else:
        data.loc[index, 'SecondAge'] = row['Age']
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,CabinSection,CabinDistance,SecondAge
0,22.0,U0,2,7.25,12,0,3,1,1,0.0,A/5 21171,8,0,34.5
1,38.0,C85,0,71.2833,13,0,1,0,1,1.0,PC 17599,2,85,47.0
2,26.0,U0,2,7.925,9,0,3,0,0,1.0,STON/O2. 3101282,8,0,62.0
3,35.0,C123,2,53.1,13,0,1,0,1,1.0,113803,2,123,27.0
4,35.0,U0,2,8.05,12,0,3,1,0,0.0,373450,8,0,2.0


Here we're creating a new column for age. This column give another age for the youngest member for each other family member. We know that children and people with family tend to survive compare to adult and people traveling alone. Therefore, this feature could help us differentiate that.

In [33]:
## process the titles by encode them.
died = ('Don', 'Rev', 'Capt', 'Jonkheer')
survived = ('Mme', 'Ms', 'Lady', 'Sir', 'Mlle', 'the Countess')
data['TitleDied'] = data['Name'].apply(lambda x: int(x in died))
data['TitleSurvived'] = data['Name'].apply(lambda x: int(x in survived))

for title in ('Mr', 'Mrs', 'Miss', 'Master', 'Dr', 'Major', 'Col'):
    data['Title_{}'.format(title)] = data['Name'].apply(lambda x: int(x == title))
data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,...,SecondAge,TitleDied,TitleSurvived,Title_Mr,Title_Mrs,Title_Miss,Title_Master,Title_Dr,Title_Major,Title_Col
0,22.0,U0,2,7.25,12,0,3,1,1,0.0,...,34.5,0,0,0,0,0,0,0,0,0
1,38.0,C85,0,71.2833,13,0,1,0,1,1.0,...,47.0,0,0,0,0,0,0,0,0,0
2,26.0,U0,2,7.925,9,0,3,0,0,1.0,...,62.0,0,0,0,0,0,0,0,0,0
3,35.0,C123,2,53.1,13,0,1,0,1,1.0,...,27.0,0,0,0,0,0,0,0,0,0
4,35.0,U0,2,8.05,12,0,3,1,0,0.0,...,2.0,0,0,0,0,0,0,0,0,0


Finally, we encode the tittle column, but since we notice there are some titles which died or survived 100% of the time so we could separate them as a new column. 

In [34]:
## encode pclass.
data = pd.concat([data, pd.get_dummies(data['Pclass']).rename(columns=lambda x: 'Pclass' + str(x))], axis=1)

## standardizing columns
data['CabinDistance'] = StandardScaler().fit_transform(data['CabinDistance'].values.reshape(-1, 1))
data['Age'] = StandardScaler().fit_transform(data['Age'].values.reshape(-1, 1))
data['Fare'] = StandardScaler().fit_transform(data['Fare'].values.reshape(-1, 1))
data['SecondAge'] = StandardScaler().fit_transform(data['SecondAge'].values.reshape(-1, 1))

data.drop('Name', axis=1, inplace=True)
data.drop('Pclass', axis=1, inplace=True)
data.drop('Cabin', axis=1, inplace=True)
data.drop('Ticket', axis=1, inplace=True)
data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Sex,SibSp,Survived,CabinSection,CabinDistance,SecondAge,...,Title_Mr,Title_Mrs,Title_Miss,Title_Master,Title_Dr,Title_Major,Title_Col,Pclass1,Pclass2,Pclass3
0,-0.581155,2,-0.503176,0,1,1,0.0,8,-0.412097,0.687414,...,0,0,0,0,0,0,0,0,0,1
1,0.70553,0,0.734809,0,0,1,1.0,2,2.887792,1.664975,...,0,0,0,0,0,0,0,1,0,0
2,-0.259484,2,-0.490126,0,0,0,1.0,8,-0.412097,2.838048,...,0,0,0,0,0,0,0,0,0,1
3,0.464276,2,0.383263,0,0,1,1.0,2,4.363037,0.100878,...,0,0,0,0,0,0,0,1,0,0
4,0.464276,2,-0.487709,0,1,0,0.0,8,-0.412097,-1.854244,...,0,0,0,0,0,0,0,0,0,1


In [None]:
Here, we standardizing and dropping u

In [35]:
training_data = data[data['Survived'] != -1]
testing_data = data[data['Survived'] == -1]

training_data.drop('Survived', axis=1, inplace=True)
testing_data.drop('Survived', axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 21 columns):
Age              891 non-null float64
Embarked         891 non-null int64
Fare             891 non-null float64
Parch            891 non-null int64
Sex              891 non-null int64
SibSp            891 non-null int64
CabinSection     891 non-null int64
CabinDistance    891 non-null float64
SecondAge        891 non-null float64
TitleDied        891 non-null int64
TitleSurvived    891 non-null int64
Title_Mr         891 non-null int64
Title_Mrs        891 non-null int64
Title_Miss       891 non-null int64
Title_Master     891 non-null int64
Title_Dr         891 non-null int64
Title_Major      891 non-null int64
Title_Col        891 non-null int64
Pclass1          891 non-null uint8
Pclass2          891 non-null uint8
Pclass3          891 non-null uint8
dtypes: float64(4), int64(14), uint8(3)
memory usage: 134.9 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Finally, we do what we previously done before which is trying mutiple models to see which fit best and GradientBoostingClassifier come out as the best which gave a 80.3% on the leaderboard.

In [36]:

models = [
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(),
    GradientBoostingClassifier(n_estimators=100),
]

i=1
for model in models:
    model.fit(training_data, training_survived)
    prediction = model.predict(testing_data).astype(int)
    submission = pd.DataFrame({
        "PassengerId": d["PassengerId"],
        "Survived": prediction
    })
    submission.to_csv('submission{}.csv'.format(i), index=False, columns=["Survived", "PassengerId"])

    i += 1

