# Model Training

In [2]:
import numpy as np
import pandas as pd

## Read in the cleaned data

In [90]:
train_df = pd.read_csv('./data/cleaned_data/train_cleaned.csv')
test_df = pd.read_csv("./data/cleaned_data/test_cleaned.csv")

test_label = pd.read_csv("./data/cleaned_data/test.csv")

In [54]:
train_df.head()

Unnamed: 0,Survived,Pclass,Fare,AgeGroup,FareBuckets,FamilySize,Sex_female,Sex_male,EmbarkedImputed_C,EmbarkedImputed_Q,...,Floor_E,Floor_F,Floor_G,Floor_Ground Floor,Floor_T,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Other Titles
0,0,3,7.25,2,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
1,1,1,71.2833,2,2,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1,3,7.925,2,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,1,1,53.1,2,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,3,8.05,2,1,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0


In [55]:
test_df.head()

Unnamed: 0,Pclass,Fare,AgeGroup,FareBuckets,FamilySize,Sex_female,Sex_male,EmbarkedImputed_C,EmbarkedImputed_Q,EmbarkedImputed_S,...,Floor_E,Floor_F,Floor_G,Floor_Ground Floor,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Other Titles,Floor_T
0,3,7.8292,2,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
1,3,7.0,2,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,2,9.6875,2,1,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
3,3,8.6625,2,1,0,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,0
4,3,12.2875,2,1,1,1,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0


We have read in the data and it's good practice to just look at your data to make sure that it has been imported correctly. head() shows the first five rows of the dataset and the opposite is tail() which shows the last five rows in the dataset.

In [56]:
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 24), (891,), (418, 24))

In [57]:
X_train.head()

Unnamed: 0,Pclass,Fare,AgeGroup,FareBuckets,FamilySize,Sex_female,Sex_male,EmbarkedImputed_C,EmbarkedImputed_Q,EmbarkedImputed_S,...,Floor_E,Floor_F,Floor_G,Floor_Ground Floor,Floor_T,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Other Titles
0,3,7.25,2,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,0,0
1,1,71.2833,2,2,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,7.925,2,0,0,1,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
3,1,53.1,2,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,3,8.05,2,1,0,0,1,0,0,1,...,0,0,0,1,0,0,0,1,0,0


In [58]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [60]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

93.6

In [59]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

74.97

In [94]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

93.6

In [62]:
models = pd.DataFrame({
    'Model': ['Random Forest', 'Perceptron', 'Decision Tree'],
    'Score': [acc_random_forest, acc_perceptron, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Random Forest,93.6
2,Decision Tree,93.6
1,Perceptron,74.97


## Cross-validation

### Decision tree w/ cross-validation

In [64]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(DecisionTreeClassifier(), X_train, Y_train, cv=10)

msg = "%s: %0.2f (+/- %0.2f)" % ('Decision tree', scores.mean(), scores.std())
print (msg)

Decision tree: 0.80 (+/- 0.04)


### Random forest w/ cross-validation

In [65]:
scores = cross_val_score(RandomForestClassifier(n_estimators=100), X_train, Y_train, cv=10)

msg = "%s: %0.2f (+/- %0.2f)" % ('Random forest', scores.mean(), scores.std())
print (msg)

Random forest: 0.79 (+/- 0.03)


### Perceptron w/ cross-validation

In [66]:
scores = cross_val_score(Perceptron(), X_train, Y_train, cv=10)

msg = "%s: %0.2f (+/- %0.2f)" % ('Perceptron', scores.mean(), scores.std())
print (msg)

Perceptron: 0.73 (+/- 0.06)


In [None]:
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
# submission.to_csv('../output/submission.csv', index=False)

## Neural network w/ multi-layer perceptron

In [67]:
from sklearn.neural_network import MLPClassifier

In [76]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [81]:
from sklearn.neural_network import MLPClassifier

# clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
#                      hidden_layer_sizes=(50,50), random_state=1)
clf = MLPClassifier()
clf.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(alpha=1e-05, hidden_layer_sizes=(50, 50), random_state=1,
              solver='lbfgs')

In [83]:
clf.score(X_train, Y_train)

0.9034792368125701

In [86]:
Y_pred = clf.predict(X_test)
Y_pred

array([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,

In [95]:
submission = pd.DataFrame({
        "PassengerId": test_label["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('./output/submission.csv', index=False)