In [197]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

#Drop features we are not going to use
#train = train.drop(['Name', 'Ticket', 'Cabin', 'Embarked'],axis=1)
#test = test.drop(['Name', 'Ticket', 'Cabin', 'Embarked'],axis=1)

train = train.drop(['Name', 'Ticket', 'Cabin'],axis=1)
test = test.drop(['Name', 'Ticket', 'Cabin'],axis=1)

#Look at the first 3 rows of our training data
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S


In [198]:
#Fill in missing values with the most common value 'S'
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

#Fill in the missing value in fare
test['Fare'] = test['Fare'].fillna(0)

#Fill in missing age values with the median value for age
train['Age'] = train['Age'].fillna(df['Age'].median())
test['Age'] = test['Age'].fillna(df['Age'].median())

#Convert ['male','female'] to [1,0] so that our decision tree can be built

for df in [train,test]:
    df['Sex_binary']=df['Sex'].map({'male':1,'female':0})
    df['Embarked_no']=df['Embarked'].map({'S':1, 'C':2, 'Q':3})
    
#Create new value that measures size of family
train['Fsize'] = train['SibSp'] + train['Parch']
test['Fsize'] = test['SibSp'] + test['Parch']

train = train.drop(['SibSp', 'Parch'],axis=1)
test = test.drop(['SibSp', 'Parch'],axis=1)

#Select feature column names and target variable we are going to use for training
features = ['Pclass','Age','Sex_binary', 'Fsize', 'Fare', 'Embarked_no']
target = 'Survived'

#Look at the first 3 rows (we have over 800 total rows) of our training data.; 
#This is input which our classifier will use as an input.
train[features].head(3)


Unnamed: 0,Pclass,Age,Sex_binary,Fsize,Fare,Embarked_no
0,3,22.0,1,1,7.25,1
1,1,38.0,0,1,71.2833,2
2,3,26.0,0,0,7.925,1


In [199]:
#Display first 3 target variables
train[target].head(3).values

array([0, 1, 1], dtype=int64)

In [200]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

max_features = [1, 2, 3, 4, 5, 6]
max_depth = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]

#Feature tuning
X, X_test, y, y_test = train_test_split(train[features], train[target])
min1 = 999

for i in max_features:
    for j in max_depth:
        clf = RandomForestClassifier(max_features = i, max_depth = j, n_estimators = 500, random_state = 0)
        clf.fit(X, y)
        predictions = clf.predict(X_test)
        mse = mean_squared_error(predictions, y_test)
        if mse<min1:
            min1 = mse
            minf = i
            mind = j
print(min1, minf, mind)

#Create classifier object with default hyperparameters
clf = RandomForestClassifier(max_depth = mind, max_features = minf, n_estimators = 500, random_state = 0)  

#Fit our classifier using the training features and the training target values
clf.fit(train[features],train[target]) 

0.13004484304932734 1 8


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [201]:
#Make predictions using the features from the test data set
predictions = clf.predict(test[features])

#Display our predictions - they are either 0 or 1 for each training instance 
#depending on whether our algorithm believes the person survived or not.
predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [202]:
from sklearn.metrics import accuracy_score
for feature in zip(features, clf.feature_importances_):
    print(feature)

('Pclass', 0.11372794143078921)
('Age', 0.18299359839952273)
('Sex_binary', 0.3316474630400099)
('Fsize', 0.10116086650303321)
('Fare', 0.23386998527838368)
('Embarked_no', 0.03660014534826124)


In [203]:
#Create a DataFrame with the passengers ids and our prediction regarding whether they survived or not
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

#Visualize the first 5 rows
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [204]:
#This is saved in the same directory as your notebook
filename = 'Titanic Predictions 14.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Titanic Predictions 14.csv
