In [2]:
import pandas as pd
import numpy as np
from algorithm.simpleRF import random_forest, predict_rf

In [3]:
# Load the Kaggle Titanic training file (path is relative to the notebook's working directory)
train_csv_path = 'titanic/train.csv'
df = pd.read_csv(train_csv_path)

In [4]:
# Preview the first rows to sanity-check how the CSV was parsed
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(891, 12)

In [6]:
# Simple cleaning: fill the gaps in the two columns handled here.
# Age      -> column mean, rounded to the nearest whole number
# Embarked -> most frequent value (first entry of value_counts())
age_fill = np.round(df['Age'].mean())
embarked_fill = df['Embarked'].value_counts().index[0]
df['Age'] = df['Age'].fillna(age_fill)
df['Embarked'] = df['Embarked'].fillna(embarked_fill)

In [7]:
# List the available column names before picking the model features
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
features = ['Pclass','Sex','Age','SibSp','Parch', 'Fare', 'Embarked']

# Shuffle and split dataset: first 90% of the shuffled rows train, the rest test.
# The shuffled rows are bound to a new name instead of overwriting `df`:
# reassigning `df` made this cell non-idempotent, because re-running it would
# shuffle the already-shuffled frame and silently yield a different split.
nb_train = int(np.floor(0.9 * len(df)))
df_shuffled = df.sample(frac=1, random_state=217)  # fixed seed -> same permutation on every run

# Explicit positional slicing (.iloc) so the split never depends on index labels.
X_train = df_shuffled[features].iloc[:nb_train]
y_train = df_shuffled['Survived'].iloc[:nb_train].values
X_test = df_shuffled[features].iloc[nb_train:]
y_test = df_shuffled['Survived'].iloc[nb_train:].values

# Every row must land in exactly one of the two partitions.
assert len(X_train) + len(X_test) == len(df)
assert len(y_train) == len(X_train) and len(y_test) == len(X_test)

In [9]:
# Settings forwarded by keyword to random_forest() in the fitting cell.
# NOTE(review): the per-line meanings are inferred from the names only —
# confirm against algorithm.simpleRF.
n_estimators = 100       # presumably the number of trees in the forest
max_features = 3         # presumably how many features are considered per split
max_depth = 10           # presumably the depth limit of each tree
min_samples_split = 2    # presumably the minimum samples required to split a node

In [10]:
# Train the forest on the training partition with the settings chosen earlier.
# The "OOB estimate" line shown under this cell is emitted during this call.
model = random_forest(
    X_train,
    y_train,
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)

OOB estimate: 0.31


In [12]:
# Inference: predict the held-out rows and report the fraction that match y_test
preds = predict_rf(model, X_test)
n_correct = sum(preds == y_test)
acc = n_correct / len(y_test)
report = "Testing accuracy: {}".format(np.round(acc, 3))
print(report)

Testing accuracy: 0.689


### Archive

In [None]:
# NOTE(review): everything in this cell is commented out and never executes.
# It is a script-style copy of the load / clean / split / fit / score cells
# earlier in this notebook. TODO: delete it, or move it into a standalone .py
# file if a command-line entry point is still wanted.
# def main():
#     # Kaggle Titanic dataset
#     df = pd.read_csv('titanic/train.csv')
#     # df.head()
#     print("processing df...")
#     df.loc[df['Age'].isnull(),'Age'] = np.round(df['Age'].mean())
#     df.loc[df['Embarked'].isnull(),'Embarked'] = df['Embarked'].value_counts().index[0]
    
#     print("read_features...")
#     features = ['Pclass','Sex','Age','SibSp','Parch', 'Fare', 'Embarked']
#     nb_train = int(np.floor(0.9 * len(df)))
#     df = df.sample(frac=1, random_state=217)
#     X_train = df[features][:nb_train]
#     y_train = df['Survived'][:nb_train].values
#     X_test = df[features][nb_train:]
#     y_test = df['Survived'][nb_train:].values

#     # n_estimators = 100
#     # max_features = 3
#     # max_depth = 10
#     # min_samples_split = 2
#     print("into model...")
#     model = random_forest(X_train, y_train, n_estimators=100, max_features=3, max_depth=10, min_samples_split=2)

#     preds = predict_rf(model, X_test)
#     acc = sum(preds == y_test) / len(y_test)
#     print("Testing accuracy: {}".format(np.round(acc,3)))

# if __name__=='__main__':
#     main()