Import all packages and list input files

In [260]:
import numpy as np
import pandas as pd

In [261]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier

In [262]:
import os
for dirname, _, filenames in os.walk('/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Read data

In [263]:

pd.set_option('display.max_rows', None)

train_data = pd.read_csv("./input/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [264]:
test_data = pd.read_csv("./input/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Drop categories that dont matter. PassengerId still needed for submission

In [265]:
train_data = train_data.drop(['PassengerId'], axis=1)
train_data = train_data.drop(['Ticket'], axis=1)
test_data = test_data.drop(['Ticket'], axis=1)

In [266]:
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


Check correlations and look for null data

In [267]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [268]:
train_data.corr(method='pearson')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [269]:
train_data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [270]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Put median value at missing Fare value

In [271]:
test_data[test_data["Fare"].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,,,S


In [272]:
test_data.loc[152,'Fare'] = 32.204208 # This is the median found from test_data.describe() 

In [273]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             0
Cabin          327
Embarked         0
dtype: int64

Fix null values in Embarked. Find pattern in Cabin vs Embarked and use this to fill out the null values

In [274]:
train_data[train_data["Embarked"].isnull()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,B28,
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,B28,


In [275]:
train_data[['B2' in str(x) for x in train_data['Cabin']]]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
61,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,B28,
540,1,1,"Crosby, Miss. Harriet R",female,36.0,0,2,71.0,B22,S
690,1,1,"Dick, Mr. Albert Adrian",male,31.0,1,0,57.0,B20,S
745,0,1,"Crosby, Capt. Edward Gifford",male,70.0,1,1,71.0,B22,S
781,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17.0,1,0,57.0,B20,S
829,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,B28,


In [276]:
test_data[['B2' in str(x) for x in test_data['Cabin']]]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
305,1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabe...",female,64.0,1,1,26.55,B26,S
390,1282,1,"Payne, Mr. Vivian Ponsonby",male,23.0,0,0,93.5,B24,S


In [277]:
train_data.loc[61, 'Embarked'] = 'S'

In [278]:
train_data.loc[829, 'Embarked'] = 'S'

In [279]:
train_data.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

Replace text values with numbers for Sex

In [280]:
combine = [train_data, test_data]

In [281]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,,S


In [282]:
train_data.corr(method='pearson')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225
Fare,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0


Replace text values with numbers for Embarked

In [283]:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,C85,1
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,C123,0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,,0


In [284]:
train_data.corr(method='pearson')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.106811
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,0.045702
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.116569
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.010171
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.059961
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.078665
Fare,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.062142
Embarked,0.106811,0.045702,0.116569,0.010171,-0.059961,-0.078665,0.062142,1.0


Create a new category named Title. Extract titles from Name, change text values to numbers, and then remove the Name column

In [285]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')



title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train_data = train_data.drop(['Name'], axis=1)
test_data = test_data.drop(['Name'], axis=1)

train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,0,22.0,1,0,7.25,,0,1
1,1,1,1,38.0,1,0,71.2833,C85,1,3
2,1,3,1,26.0,0,0,7.925,,0,2
3,1,1,1,35.0,1,0,53.1,C123,0,3
4,0,3,0,35.0,0,0,8.05,,0,1


In [286]:
train_data = train_data.drop(['Cabin'], axis=1)

train_data.head()

train_data.corr(method='pearson')

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
Survived,1.0,-0.338481,0.543351,-0.077221,-0.035322,0.081629,0.257307,0.106811,0.407753
Pclass,-0.338481,1.0,-0.1319,-0.369226,0.083081,0.018443,-0.5495,0.045702,-0.173929
Sex,0.543351,-0.1319,1.0,-0.093254,0.114631,0.245489,0.182333,0.116569,0.502713
Age,-0.077221,-0.369226,-0.093254,1.0,-0.308247,-0.189119,0.096067,0.010171,-0.104766
SibSp,-0.035322,0.083081,0.114631,-0.308247,1.0,0.414838,0.159651,-0.059961,0.269623
Parch,0.081629,0.018443,0.245489,-0.189119,0.414838,1.0,0.216225,-0.078665,0.315784
Fare,0.257307,-0.5495,0.182333,0.096067,0.159651,0.216225,1.0,0.062142,0.13631
Embarked,0.106811,0.045702,0.116569,0.010171,-0.059961,-0.078665,0.062142,1.0,0.0454
Title,0.407753,-0.173929,0.502713,-0.104766,0.269623,0.315784,0.13631,0.0454,1.0


In [287]:
test_data = test_data.drop(['Cabin'], axis=1)

test_data.head()

test_data.corr(method='pearson')

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
PassengerId,1.0,-0.026751,-0.023245,-0.034102,0.003818,0.04308,0.008278,-0.049863,0.006531
Pclass,-0.026751,1.0,-0.108615,-0.492143,0.001087,0.018721,-0.576745,0.031096,-0.119826
Sex,-0.023245,-0.108615,1.0,-1.3e-05,0.099943,0.15912,0.191493,0.126779,0.565539
Age,-0.034102,-0.492143,-1.3e-05,1.0,-0.091587,-0.061249,0.334662,0.113664,-0.064884
SibSp,0.003818,0.001087,0.099943,-0.091587,1.0,0.306895,0.17156,-0.100603,0.269295
Parch,0.04308,0.018721,0.15912,-0.061249,0.306895,1.0,0.230059,-0.125164,0.302061
Fare,0.008278,-0.576745,0.191493,0.334662,0.17156,0.230059,1.0,0.053658,0.217577
Embarked,-0.049863,0.031096,0.126779,0.113664,-0.100603,-0.125164,0.053658,1.0,0.031726
Title,0.006531,-0.119826,0.565539,-0.064884,0.269295,0.302061,0.217577,0.031726,1.0


Run our algorihm and predict the test data. Save output

In [289]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Embarked"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('./output/my_submission.csv', index=False)
print("Your submission was successfully saved!")



Your submission was successfully saved!
