In [1]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
file_path = 'train.csv'
data = pd.read_csv(file_path)
#I will have to remove the name field as this is irrelevant, the ticket field as this contains both letters and 
#numbers and the cabin field as the number of missing entries is high. Fares have been rounded.
data.Fare = data.Fare.astype(int)
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = data[features]
y = data.Survived

In [3]:
#Split data into train and test segments
train_X, valid_X, train_y, valid_y = train_test_split(X, y, random_state = 0)

In [4]:
#I have resued my function to determine accuracy here, using the optimised settings from before
def accuracy(train_X, valid_X, train_y, valid_y):
    model = RandomForestRegressor(max_leaf_nodes = 17, random_state = 1)
    model.fit(train_X, train_y)
    predicted_values = model.predict(valid_X)

    count = 0
    for item in predicted_values:
        item = round(item, 0)
        predicted_values[count] = item
        count+=1

    real_values = []
    for item in valid_y:
        real_values.append(item)

    count = 0
    correct_predictions = 0
    for item in predicted_values:
        comparator = real_values[count]
        if item == comparator:
            correct_predictions+=1
        count+=1
    
    accuracy = (correct_predictions/count)*100
    return(accuracy)

In [5]:
#First we remove any categorical data - this will be sorted later and isn't missing many entries
num_X_train = train_X.drop(['Sex', 'Embarked'], axis=1)
num_X_valid = valid_X.drop(['Sex', 'Embarked'], axis=1)

imputer = SimpleImputer(strategy = 'most_frequent')
imputed_X_train = pd.DataFrame(imputer.fit_transform(num_X_train))
imputed_X_valid = pd.DataFrame(imputer.transform(num_X_valid))

# Imputation removed column names; put them back
imputed_X_train.columns = num_X_train.columns
imputed_X_valid.columns = num_X_valid.columns

In [6]:
# Get list of categorical variables
s = (train_X.dtypes == 'object')
object_columns = list(s[s].index)

# Make copy to avoid changing original data (when imputing)
train_X_plus = train_X[object_columns].copy()
valid_X_plus = valid_X[object_columns].copy()

#Here I am using for loops to imitiate the imputation for categorical data. This cannot be done using the imputer
#itself as this only works for numerical data.
S = 0
C = 0
Q = 0
for item in train_X_plus['Embarked']:
    if item == 'S':
        S+=1
    if item == 'C':
        C+=1
    if item == 'Q':
        Q+=1
for item in valid_X_plus['Embarked']:
    if item == 'S':
        S+=1
    if item == 'C':
        C+=1
    if item == 'Q':
        Q+=1
print(S,C,Q)

male = 0
female = 0

for item in train_X_plus['Sex']:
    if item == 'male':
        male+=1
    if item == 'female':
        female+=1
for item in valid_X_plus['Sex']:
    if item == 'male':
        male+=1
    if item == 'female':
        female+=1
print(male,female)

644 168 77
577 314


In [7]:
#Fill in any gaps because we used the most frequent imputer earlier I will imitate the behaviour here

train_X_plus['Embarked'] = train_X_plus['Embarked'].fillna(value = 'S', axis = 0)
valid_X_plus['Embarked'] = valid_X_plus['Embarked'].fillna(value = 'S', axis = 0)
train_X_plus['Sex'] = train_X_plus['Sex'].fillna(value = 'male', axis = 0)
valid_X_plus['Sex'] = valid_X_plus['Sex'].fillna(value = 'male', axis = 0)

Now I will be able to use one hot encoding - similar to how before I manually used 1s and 0s to replace male and female values.

In [9]:
# Apply one-hot encoder to each column with categorical data
OH_cols_train = pd.get_dummies(train_X_plus)
OH_cols_valid = pd.get_dummies(valid_X_plus)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([imputed_X_train.reset_index(), OH_cols_train.reset_index()], axis=1)
OH_X_valid = pd.concat([imputed_X_valid.reset_index(), OH_cols_valid.reset_index()], axis=1)

In [10]:
print('Accuracy using modal imputation and one hot encoding = ', accuracy(OH_X_train, OH_X_valid, train_y, valid_y), '%')

Accuracy using modal imputation and one hot encoding =  81.61434977578476 %


This model accuracy is lower than before, however now that I have been able to preprocess data, I have been able to use more of the available data. This will allow me to build my complex model - because the preprocessing is consistent and can be used for the official validation data.