In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [3]:
titanic_df = pd.read_csv('train.csv')

In [4]:
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
gender_df = titanic_df[['Sex', 'PassengerId', 'Survived']]

In [6]:
gender_df.groupby(by = ['Sex', 'Survived']).size()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
dtype: int64

Percentage of females survived: 74.20%; Percentage of males survived: 18.89%

In [7]:
class_df = titanic_df[['Pclass', 'PassengerId', 'Survived']]

In [8]:
class_df.groupby(by = ['Pclass', 'Survived']).size()

Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
dtype: int64

Percentage of 1st class people survived: 62.96%;
Percentage of 2nd class people survived: 47.28%;
Percentage of 3rd class people survived: 24.23%

In [9]:
age_df = titanic_df[['Age', 'PassengerId', 'Survived']].copy()

In [10]:
age_df.loc[age_df['Age'] < 10, 'Age'] = 1
age_df.loc[(age_df['Age'] >= 10) & (age_df['Age'] < 50), 'Age'] = 2
age_df.loc[age_df['Age'] >= 50, 'Age'] = 3

In [11]:
age_df.groupby(by = ['Age', 'Survived']).size()

Age  Survived
1.0  0            24
     1            38
2.0  0           353
     1           225
3.0  0            47
     1            27
dtype: int64

Children < 10, percentage survived: 61.29%; 10 <= Teenage and adult < 50, percentage survived: 38.92%; 50 <= Old, percentage survived: 36.48%

In [12]:
station_df = titanic_df[['Embarked', 'PassengerId', 'Survived']]

In [13]:
station_df.groupby(by = ['Embarked', 'Survived']).size()

Embarked  Survived
C         0            75
          1            93
Q         0            47
          1            30
S         0           427
          1           217
dtype: int64

Percentage of people emabarked at C survived: 55.35%; Q: 38.96%; S: 33.69%

In [14]:
sibsp_df = titanic_df[['SibSp', 'PassengerId', 'Survived']]

In [15]:
sibsp_df.groupby(by = ['SibSp', 'Survived']).size()

SibSp  Survived
0      0           398
       1           210
1      0            97
       1           112
2      0            15
       1            13
3      0            12
       1             4
4      0            15
       1             3
5      0             5
8      0             7
dtype: int64

Percentage of people survived with no siblings or spouses: 34.53%; Percentage of people survived with either 1 or more siblings or spouses: 49.11%

In [16]:
parch_df = titanic_df[['Parch', 'PassengerId', 'Survived']]

In [17]:
parch_df.groupby(by = ['Parch', 'Survived']).size()

Parch  Survived
0      0           445
       1           233
1      0            53
       1            65
2      0            40
       1            40
3      0             2
       1             3
4      0             4
5      0             4
       1             1
6      0             1
dtype: int64

Percentage of people survived with no parents or children: 34.36%; Percentage of people survived with either 1 or more siblings or spouses: 53.05%

In [18]:
coeff_df = titanic_df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

In [19]:
coeff_df.loc[coeff_df['Sex'] == 'male', 'Sex'] = 0
coeff_df.loc[coeff_df['Sex'] == 'female', 'Sex'] = 1
coeff_df['Sex'] = coeff_df['Sex'].apply(lambda x: float(x))

coeff_df.loc[coeff_df['Age'] < 10, 'Age'] = 1
coeff_df.loc[(coeff_df['Age'] >= 10) & (coeff_df['Age'] < 50), 'Age'] = 2
coeff_df.loc[coeff_df['Age'] >= 50, 'Age'] = 3
             
coeff_df.loc[coeff_df['Embarked'] == 'C', 'Embarked'] = 0
coeff_df.loc[coeff_df['Embarked'] == 'Q', 'Embarked'] = 1
coeff_df.loc[coeff_df['Embarked'] == 'S', 'Embarked'] = 2
coeff_df['Embarked'] = coeff_df['Embarked'].apply(lambda x: float(x))

In [20]:
coeff_df.dropna(axis=0, how='any', inplace=True)

In [22]:
coeff_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
Survived,1.0,-0.356462,0.536762,-0.108173,-0.015523,0.095265,0.2661,-0.181979
Pclass,-0.356462,1.0,-0.150826,-0.284215,0.065187,0.023666,-0.552893,0.244145
Sex,0.536762,-0.150826,1.0,-0.087224,0.106296,0.249543,0.182457,-0.109639
Age,-0.108173,-0.284215,-0.087224,1.0,-0.3493,-0.267547,0.065584,-0.052033
SibSp,-0.015523,0.065187,0.106296,-0.3493,1.0,0.383338,0.13986,0.033064
Parch,0.095265,0.023666,0.249543,-0.267547,0.383338,1.0,0.206624,0.011803
Fare,0.2661,-0.552893,0.182457,0.065584,0.13986,0.206624,1.0,-0.28351
Embarked,-0.181979,0.244145,-0.109639,-0.052033,0.033064,0.011803,-0.28351,1.0


In [47]:
# null values- age (177 records) and embarked (62, 830)
base_df = titanic_df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()
base_df['Embarked'].fillna('S', inplace=True)
base_df['Age'].fillna(base_df['Age'].mean(), inplace=True)

In [48]:
base_df.loc[base_df['Sex'] == 'male', 'Sex'] = 0
base_df.loc[base_df['Sex'] == 'female', 'Sex'] = 1
base_df['Sex'] = base_df['Sex'].apply(lambda x: float(x))

base_df.loc[base_df['Age'] < 10, 'Age'] = 1
base_df.loc[(base_df['Age'] >= 10) & (coeff_df['Age'] < 50), 'Age'] = 2
base_df.loc[base_df['Age'] >= 50, 'Age'] = 3
             
base_df.loc[base_df['Embarked'] == 'C', 'Embarked'] = 0
base_df.loc[base_df['Embarked'] == 'Q', 'Embarked'] = 1
base_df.loc[base_df['Embarked'] == 'S', 'Embarked'] = 2
base_df['Embarked'] = base_df['Embarked'].apply(lambda x: float(x))

In [49]:
X_train = base_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()
Y_train = base_df['Survived']

In [50]:
clf = RandomForestClassifier(max_depth=50)
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
titanic_test_df = pd.read_csv('test.csv')

In [53]:
test_df = titanic_test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

In [54]:
test_df.loc[test_df['Sex'] == 'male', 'Sex'] = 0
test_df.loc[test_df['Sex'] == 'female', 'Sex'] = 1
test_df['Sex'] = test_df['Sex'].apply(lambda x: float(x))

test_df.loc[test_df['Age'] < 10, 'Age'] = 1
test_df.loc[(test_df['Age'] >= 10) & (test_df['Age'] < 50), 'Age'] = 2
test_df.loc[test_df['Age'] >= 50, 'Age'] = 3
             
test_df.loc[test_df['Embarked'] == 'C', 'Embarked'] = 0
test_df.loc[test_df['Embarked'] == 'Q', 'Embarked'] = 1
test_df.loc[test_df['Embarked'] == 'S', 'Embarked'] = 2
test_df['Embarked'] = test_df['Embarked'].apply(lambda x: float(x))

In [59]:
test_df['Embarked'].fillna('S', inplace=True)
test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)
test_df['Fare'].fillna(0, inplace=True)

In [60]:
len(test_df)

418

In [61]:
#test_df.dropna(axis=0, how='any', inplace=True)
X_test = test_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

In [62]:
pred = clf.predict(X_test)

In [63]:
len(pred)

418

In [71]:
np.savetxt('titanic_submission_rf.csv', np.c_[titanic_test_df['PassengerId'],pred], delimiter=',', 
           header = 'PassengerId,Survived', comments = '', fmt='%d')

Accuracy on Kaggle: .74163