In [93]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [94]:
# Read the training and test splits from the working directory, then preview
# the first rows of the training split.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [95]:
# DataFrame.size is the total number of cells (rows x columns), not the row
# count; use train.shape to see the two dimensions separately.
train.size

10692

In [96]:
# Count the missing entries in each column of the raw training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [97]:
def clean_data(df, reference=None):
    """Return a model-ready copy of a passenger frame.

    The steps are: drop the identifier / free-text columns, encode ``Sex``
    and ``Embarked`` as numbers, fill missing ``Age`` and ``Fare`` values
    with the per-``Pclass`` mean taken from ``reference``, and finally drop
    every row that still contains a missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns PassengerId, Name, Ticket, Cabin, Sex,
        Embarked, Pclass, Age and Fare. It is left unmodified.
    reference : pandas.DataFrame, optional
        Frame whose ``Pclass``, ``Age`` and ``Fare`` columns supply the
        imputation means. When omitted, the notebook-level ``train`` frame
        is used, so existing ``clean_data(frame)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns and without missing values.
        Rows may have been removed, so the result keeps the index labels of
        the surviving input rows.

    Raises
    ------
    KeyError
        If one of the columns to drop is absent from ``df`` (for example
        when the frame has already been cleaned).
    """
    if reference is None:
        # Backward-compatible default: fall back to the notebook's training frame.
        reference = train

    # drop() returns a new frame, so the caller's data is never touched.
    # Name and Cabin are discarded for now; family links from Name or the
    # cabin location could become features later.
    cleaned = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Values outside these mappings (including missing ones) become NaN and
    # are removed by the final dropna().
    cleaned['Sex'] = cleaned['Sex'].map({'male': 1, 'female': 0})
    cleaned['Embarked'] = cleaned['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    for column in ('Age', 'Fare'):
        # Select the column before aggregating so text columns in the
        # reference frame are never averaged.
        class_means = reference.groupby('Pclass')[column].mean()
        cleaned[column] = cleaned[column].fillna(cleaned['Pclass'].map(class_means))

    return cleaned.dropna()

In [98]:
# Replace the raw training frame with its cleaned version and confirm that no
# missing values remain. Re-running this cell on the already-cleaned frame
# fails because the dropped columns are gone; reload the CSV first.
train = train.pipe(clean_data)
train.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [99]:
# Preview the cleaned frame. Embarked is shown as float (2.0, 0.0, ...) because
# mapping the rows with a missing port produced NaN before those rows were dropped.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2.0
1,1,1,0,38.0,1,0,71.2833,0.0
2,1,3,0,26.0,0,0,7.925,2.0
3,1,1,0,35.0,1,0,53.1,2.0
4,0,3,1,35.0,0,0,8.05,2.0


In [100]:
# Split the cleaned frame into the feature matrix and the target column.
train_xs = train.drop('Survived', axis=1)
train_ys = train['Survived']

# Forest settings gathered in one place; random_state fixes the result.
forest_params = dict(
    n_estimators=100,
    criterion='entropy',
    max_depth=10,
    bootstrap=False,
    random_state=0,
)
clf = RandomForestClassifier(**forest_params)
clf.fit(train_xs, train_ys)

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=10,
                       random_state=0)

In [101]:
# Mean accuracy on the same rows the forest was fitted on, so this is a
# training-set figure, not a held-out estimate of performance.
clf.score(train_xs,train_ys)

0.9448818897637795

In [102]:
# Set the passenger ids aside first: clean_data drops the PassengerId column.
test_id = test.loc[:, 'PassengerId']
test = test.pipe(clean_data)
test.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [103]:
# Build the submission directly from the cleaned test frame instead of
# overwriting a template file. clean_data may drop rows, so the ids are
# selected by the surviving index labels to keep every prediction paired with
# its own passenger rather than relying on positional alignment.
example = pd.DataFrame({
    'PassengerId': test_id.loc[test.index].to_numpy(),
    'Survived': clf.predict(test),
})
example.to_csv("submission.csv", index = False)