# MLP and Random Forest comparison
See the README for a description of this notebook.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Read the training CSV (expected in the working directory) into a DataFrame
# and show its first five rows as the cell output.

train_data = pd.read_csv(filepath_or_buffer="train.csv")
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Read the test CSV (expected in the working directory) into a DataFrame
# and show its first five rows as the cell output.

test_data = pd.read_csv(filepath_or_buffer="test.csv")
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Count the missing values in each column of the test data and print the tally.

missing_per_column = test_data.isna().sum()
print(missing_per_column)

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


# Split the data for training and testing

In [9]:
# Imports for this cell: train_test_split builds the hold-out split and
# SimpleImputer fills in missing values.

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# set the target series

y = train_data["Survived"]

# set the feature dataframes; get_dummies one-hot encodes the non-numeric
# columns (here "Sex") and leaves the numeric ones unchanged

features = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]
X = pd.get_dummies(train_data[features])

# Align the unlabelled frame to the training columns (same set, same order) so
# both arrays share one layout even if a category is absent from one file.
# Dummy columns that exist only in the training data are added as all zeros.
X_unknown = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Impute missing values (the test data has gaps in Age and Fare). The imputer
# is fitted ONCE on the training features and then only applied to the
# unlabelled data, so both are filled with the same statistics. Re-fitting on
# X_unknown would fill it with values the model never saw during training.
# NOTE(review): the imputer is fitted before the split below, so the hold-out
# rows also contribute to the fill values; fit on X_train only if a strictly
# leakage-free validation score is required.

my_imputer = SimpleImputer()
X = my_imputer.fit_transform(X)
X_unknown = my_imputer.transform(X_unknown)

# split the labelled data into train and hold-out test sets (33% held out,
# fixed seed so the split is reproducible)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)

# Test an MLP classifier

In [10]:
# MLP classifier used for the alpha sweep in this cell

from sklearn.neural_network import MLPClassifier

# Starting values; they are only replaced when a candidate scores strictly
# higher than the best accuracy seen so far.

optimal_alpha = 1
optimal_accuracy = 0

# Candidate alphas: 0.1 * 1, 0.1 * 2, ..., 0.1 * 20

candidate_alphas = [0.1 * (k + 1) for k in range(20)]

for alpha in candidate_alphas:
    # build a two-hidden-layer network with this alpha and fit it on the
    # training split

    model = MLPClassifier(
        hidden_layer_sizes=[50, 50],
        alpha=alpha,
        activation='relu',
        solver='adam',
        random_state=1,
    )
    model.fit(X_train, y_train)

    # accuracy on the held-out split

    model_accuracy = model.score(X_test, y_test)

    # keep the first alpha that achieves the highest accuracy

    if model_accuracy > optimal_accuracy:
        optimal_accuracy, optimal_alpha = model_accuracy, alpha

# report the winning alpha and its accuracy
alpha_report = 'The optimal alpha found was {}'.format(optimal_alpha)
accuracy_report = 'The test accuracy was {}\n'.format(optimal_accuracy)
print(alpha_report)
print(accuracy_report)



The optimal alpha found was 0.6000000000000001
The test accuracy was 0.8067796610169492



# Test a random forest classifier

In [11]:
# random forest classifier used for the estimator-count sweep in this cell

from sklearn.ensemble import RandomForestClassifier

# Starting values; they are only replaced when a candidate scores strictly
# higher than the best accuracy seen so far.

optimal_estimators = 1
optimal_accuracy = 0

# Candidate forest sizes: 10, 20, ..., 200

for n_trees in range(10, 201, 10):
    # build a depth-limited forest of this size and fit it on the training split

    model = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=5,
        random_state=1,
    )
    model.fit(X_train, y_train)

    # accuracy on the held-out split

    model_accuracy = model.score(X_test, y_test)

    # keep the first forest size that achieves the highest accuracy

    if model_accuracy > optimal_accuracy:
        optimal_accuracy, optimal_estimators = model_accuracy, n_trees

# report the winning forest size and its accuracy
estimators_report = 'The optimal number of estimators found was {}'.format(optimal_estimators)
accuracy_report = 'The test accuracy was {}\n'.format(optimal_accuracy)
print(estimators_report)
print(accuracy_report)

The optimal number of estimators found was 110
The test accuracy was 0.8372881355932204



# Make predictions on test_data with best performing classifier

In [13]:
# Define the final classifier and fit it on all labelled rows. The forest size
# comes from the estimator search (optimal_estimators) instead of a hand-copied
# literal, so this cell stays in sync if the data or the search range changes.

model = RandomForestClassifier(n_estimators=optimal_estimators, max_depth=5, random_state=1).fit(X, y)

# make predictions for the unlabelled passengers
predictions = model.predict(X_unknown)

# collect the predictions in a dataframe alongside the passenger IDs

output = pd.DataFrame({'PassengerId': test_data["PassengerId"], 'Survived': predictions})

# save to a csv file (without the dataframe index column)

output.to_csv('my_submission.csv', index=False)