In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# Part One: Defining function 'WrangleData'

In [44]:
def WrangleData(df):
    drop_columns = ['Name', 'Ticket', 'Fare', 'Cabin', 'Parch', 'Embarked', 'SibSp']
    df = df.drop(drop_columns, axis = 1)
    df = df.dropna()
    def f(x):
        return x == 'female'
    df['Sexint'] = f(df['Sex']).astype(int)
    df = df.drop('Sex', axis = 1)
    return df

In [45]:
df = pd.read_csv('train.csv')
#df.sample(5)
trainingSet = WrangleData(df)

In [46]:
trainingSet.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Age,Sexint
count,714.0,714.0,714.0,714.0,714.0
mean,448.582633,0.406162,2.236695,29.699118,0.365546
std,259.119524,0.49146,0.83825,14.526497,0.481921
min,1.0,0.0,1.0,0.42,0.0
25%,222.25,0.0,1.0,20.125,0.0
50%,445.0,0.0,2.0,28.0,0.0
75%,677.75,1.0,3.0,38.0,1.0
max,891.0,1.0,3.0,80.0,1.0


In [47]:
training_sample = trainingSet.iloc[:,2:]
training_success = trainingSet.iloc[:,1:2]
training_success.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [48]:
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(training_sample, training_success.values.ravel(), test_size = 0.22, random_state = 0)

randomforest = RandomForestClassifier()
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_val)
score = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Random Forest Score: " + str(score))


Random Forest Score: 77.85


In [49]:
# View a list of the features and their importance scores
list(zip(x_train, randomforest.feature_importances_))



[('Pclass', 0.18245453771093903),
 ('Age', 0.46881218823892662),
 ('Sexint', 0.34873327405013443)]

In [50]:
# Crosstab: Anything on the diagonals was predicted correctly, and anything off-diagonal was incorrect
pd.crosstab(y_val, y_pred, rownames=['Actual Survivors'], colnames=['Predicted Survivors'])

Predicted Survivors,0,1
Actual Survivors,Unnamed: 1_level_1,Unnamed: 2_level_1
0,71,18
1,17,52


In [51]:
df = pd.read_csv('test.csv')
testSet = WrangleData(df)
testSet.describe(include='all')

Unnamed: 0,PassengerId,Pclass,Age,Sexint
count,332.0,332.0,332.0,332.0
mean,1100.063253,2.144578,30.27259,0.38253
std,122.763173,0.846283,14.181209,0.486739
min,892.0,1.0,0.17,0.0
25%,992.75,1.0,21.0,0.0
50%,1099.5,2.0,27.0,0.0
75%,1210.25,3.0,39.0,1.0
max,1307.0,3.0,76.0,1.0


In [53]:
predictionSet = testSet.filter(items=['Pclass', 'Age', 'Sexint'])
test_pred = randomforest.predict(predictionSet)
test_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0,

In [55]:
testSet['Survived'] = test_pred

In [56]:
testSet.sample(5)

Unnamed: 0,PassengerId,Pclass,Age,Sexint
131,1023,1,53.0,0
324,1216,1,39.0,1
162,1054,2,26.0,1
348,1240,2,24.0,0
90,982,3,22.0,1


In [74]:
masterSet = df.filter(items=['PassengerId'])
masterSet['Survived'] = np.nan
testSet2 = testSet.filter(items=['PassengerId', 'Survived'])
results = testSet2.combine_first(masterSet)
results = results.fillna(0)
results.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.0
std,120.810458,0.0
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,0.0
max,1309.0,0.0


In [78]:
results.PassengerId = results.PassengerId.astype(int)
results.Survived = results.Survived.astype(int)
results.to_csv('results.csv', index=False)