In [302]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Functions

In [248]:
def scaler(X):
    """
    Scales data X using StandardScaler
    """
    standard_scaler = StandardScaler()
    standard_scaler.fit(X)
    X_scaled = standard_scaler.transform(X)
    return X_scaled

In [249]:
def clean_data(df_X):
    """
    Cleans data and returns cleaned data
    """
    
    df_X_clean = df_X.copy(deep = True)
    sex_dic = {'male' : 0, 'female' : 1}
    embarked_dic = {'S':0, 'C':1, 'Q':2}
    #remove irrelivant/difficult rows and change strings to ints, one hot encoding is NOT used. 
    df_X_clean = df_X_clean.drop(columns = ['Name','Cabin','Ticket','PassengerId'], inplace = False)\
                                   .replace({'Sex': sex_dic})\
                                   .replace({'Embarked': embarked_dic})
    df_X_clean = df_X_clean.fillna(df_Xdata_clean.mean())
    return df_X_clean                                            
    
    

# Import and clean data

In [259]:
df_data = pd.read_csv('train.csv') # Import training data
df_valid = pd.read_csv('test.csv') # Import tesimng for submission

print(f'Input data shape: {df_data.shape}')
print(f'Input data shape: {df_valid.shape}')

Input data shape: (891, 12)
Input data shape: (418, 11)


Extract the suvival from the data frame for X,Y and train test split 

In [260]:
#df_Xdata = df_data[df_data.columns.difference(['Survived'])] # Remove the survived column form the data, but pandas automatically reorders columns in aplhabetical order, very annoying
df_Xdata = df_data.drop(columns = ['Survived'])
Ydata = df_data['Survived'] # We feed the data frame a list as an index so it returns a dataframe and not a string. 

In [261]:
df_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [262]:
X_train, X_test, Y_train, Y_test = train_test_split(df_Xdata, Ydata, random_state = 42, test_size = 0.3)

Examining the columns we cam remove the names by hand, since we already have gender and fare and ticket. Further we can change male\female to 0\1. and 

In [263]:
clean_data(df_Xdata)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.000000,1,0,7.2500,0.0
1,1,1,38.000000,1,0,71.2833,1.0
2,3,1,26.000000,0,0,7.9250,0.0
3,1,1,35.000000,1,0,53.1000,0.0
4,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...
886,2,0,27.000000,0,0,13.0000,0.0
887,1,1,19.000000,0,0,30.0000,0.0
888,3,1,29.699118,1,2,23.4500,0.0
889,1,0,26.000000,0,0,30.0000,1.0


Now prepare cleaned and scaled data for training

In [354]:
X_train_clean =  pd.DataFrame(scaler(clean_data(X_train)))
X_test_clean  =  pd.DataFrame(scaler(clean_data(X_test)))
X_train_clean

Unnamed: 0,0,1,2,3,4,5,6
0,-1.637881,-0.720772,-1.946947,-0.474161,1.998853,0.980998,-0.555945
1,0.803267,-0.720772,0.027243,-0.474161,-0.479327,-0.469634,-0.555945
2,0.803267,1.387401,-2.177405,0.348687,0.759763,-0.406136,-0.555945
3,-0.417307,-0.720772,0.511273,0.348687,1.998853,-0.080232,-0.555945
4,-0.417307,-0.720772,1.049008,0.348687,0.759763,-0.109651,-0.555945
...,...,...,...,...,...,...,...
618,0.803267,1.387401,-0.641018,-0.474161,-0.479327,-0.474455,-0.555945
619,-1.637881,-0.720772,0.027243,-0.474161,-0.479327,-0.016489,-0.555945
620,0.803267,-0.720772,0.895370,1.171535,-0.479327,-0.347787,-0.555945
621,-1.637881,1.387401,-1.178753,0.348687,1.998853,1.729074,-0.555945


# Random forest

In [349]:
rfc = RandomForestClassifier(n_estimators=50)
rfc.fit(X_train_clean, Y_train)

#looking at random forest predictions
pred_rfc = rfc.predict(X_test_clean)
print('Report from Random Forest:')
print(classification_report(Y_test, pred_rfc))

Report from Random Forest:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       157
           1       0.80      0.64      0.71       111

    accuracy                           0.78       268
   macro avg       0.79      0.76      0.77       268
weighted avg       0.79      0.78      0.78       268



In [347]:
confusion_matrix(Y_test,pred_rfc)

array([[138,  19],
       [ 38,  73]], dtype=int64)

# SVM

In [305]:
svc = SVC()
svc.fit(X_train_clean, Y_train)
pred_svc = svc.predict(X_test_clean)

In [306]:
print('Report from SVC:')
print(classification_report(Y_test, pred_svc))

Report from SVC:
              precision    recall  f1-score   support

           0       0.79      0.93      0.86       157
           1       0.87      0.66      0.75       111

    accuracy                           0.82       268
   macro avg       0.83      0.79      0.80       268
weighted avg       0.82      0.82      0.81       268



In [307]:
confusion_matrix(Y_test,pred_svc)

array([[146,  11],
       [ 38,  73]], dtype=int64)

Grid search for better parameters

In [335]:
param_grid = {'C':[1,25,50,100,150], 'gamma':[0.0175,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.5,1]}
grid = GridSearchCV(SVC(), param_grid, verbose=1)
grid.fit(X_train_clean, Y_train) 

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 25, 50, 100, 150],
                         'gamma': [0.0175, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07,
                                   0.08, 0.09, 0.1, 0.5, 1]},
             verbose=1)

In [336]:
print(grid.best_estimator_) # Best parametrs in the grid search, both lie within the grid and not at the boundry 

SVC(C=50, gamma=0.02)


In [None]:
svc_best = grid.best_estimator_
svc_best.fit(X_train_clean, Y_train)
pred_svc = svc_best.predict(X_test_clean)
print(classification_report(Y_test, pred_svc))
print(confusion_matrix(Y_test, pred_svc))

              precision    recall  f1-score   support

           0       0.78      0.94      0.85       157
           1       0.88      0.63      0.73       111

    accuracy                           0.81       268
   macro avg       0.83      0.78      0.79       268
weighted avg       0.82      0.81      0.80       268

[[147  10]
 [ 41  70]]


# Make prediction with best model: SVM

In [359]:
best_model = svc_best
X_valid_clean =  pd.DataFrame(scaler(clean_data(df_valid)))
prediction = best_model.predict(X_valid_clean)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [360]:
output = pd.DataFrame({'PassengerId': df_valid.PassengerId, 'Survived': prediction})
output.to_csv('submission.csv', index=False)