In [19]:
'''Task: Use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#load training Dataset
df = pd.read_csv("Datasets\\train.csv")

In [20]:
#data preprocessing
meanAge = df['Age'].mean()

#replace null values with NaN values
#we can then fill NaN values
df.replace(' ',np.nan)

#set inplace = True to commit changes to Dataframe 
df['Age'].fillna(value=meanAge,inplace=True)
df.replace(to_replace="male", value=0,inplace=True)
df.replace(to_replace="female", value=1,inplace=True)

print(df['Cabin'].isna().sum())
#drop cabin column due to too many missing values (687 nulls out of 891 rows.)
df.drop('Cabin',axis=1,inplace = True)

#names of the passengers can be dropped too
df.drop('Name',axis=1,inplace = True)
df.drop('PassengerId',axis=1, inplace= True)
df.drop('Ticket',axis=1, inplace= True)

df.head()

687


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.925,S
3,1,1,1,35.0,1,0,53.1,S
4,0,3,0,35.0,0,0,8.05,S


In [21]:
#one hot encoding the 'Embarked' feature

dataset = pd.get_dummies(df, columns = ['Embarked'])
print(dataset)

#set dependant and independant variables

#dependant variables
x = dataset.iloc[:,1:]

#independant variable
y = dataset.iloc[:,0]


     Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Embarked_C  \
0           0       3    0  22.000000      1      0   7.2500           0   
1           1       1    1  38.000000      1      0  71.2833           1   
2           1       3    1  26.000000      0      0   7.9250           0   
3           1       1    1  35.000000      1      0  53.1000           0   
4           0       3    0  35.000000      0      0   8.0500           0   
..        ...     ...  ...        ...    ...    ...      ...         ...   
886         0       2    0  27.000000      0      0  13.0000           0   
887         1       1    1  19.000000      0      0  30.0000           0   
888         0       3    1  29.699118      1      2  23.4500           0   
889         1       1    0  26.000000      0      0  30.0000           1   
890         0       3    0  32.000000      0      0   7.7500           0   

     Embarked_Q  Embarked_S  
0             0           1  
1             0           0

In [22]:
#split dataset into training set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

print(x_train)
print(x_test)
print(y_train)
print(y_test)

     Pclass  Sex        Age  SibSp  Parch     Fare  Embarked_C  Embarked_Q  \
140       3    1  29.699118      0      2  15.2458           1           0   
439       2    0  31.000000      0      0  10.5000           0           0   
817       2    0  31.000000      1      1  37.0042           1           0   
378       3    0  20.000000      0      0   4.0125           1           0   
491       3    0  21.000000      0      0   7.2500           0           0   
..      ...  ...        ...    ...    ...      ...         ...         ...   
835       1    1  39.000000      1      1  83.1583           1           0   
192       3    1  19.000000      1      0   7.8542           0           0   
629       3    0  29.699118      0      0   7.7333           0           1   
559       3    1  36.000000      1      0  17.4000           0           0   
684       2    0  60.000000      1      1  39.0000           0           0   

     Embarked_S  
140           0  
439           1  
817      

In [23]:
#multiple linear regression

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train,y_train)

y_pred = regressor.predict(x_test)
y_pred = regressor.predict(x_test)
regressorScore = round(regressor.score(x_train,y_train)*100 , 2)
print(regressorScore)

39.01


In [24]:
#decision tree regressor
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(x_train,y_train)

y_pred = dtr.predict(x_test)
DTRScore = round(dtr.score(x_train,y_train)*100 , 2)
print(DTRScore)

95.53


In [25]:
#random forest algorithm

from sklearn.ensemble import RandomForestRegressor
#use 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)

fitX=rf.fit(x_train,y_train)

y_pred = rf.predict(x_test)
RFScore = round(rf.score(x_train,y_train)*100 , 2)
print(RFScore)




88.51


In [26]:
#support vector regression

from sklearn.svm import SVR
svr = SVR(kernel="linear")
svr.fit(x_train,y_train)
y_pred = svr.predict(x_test)
SVRScore = round(svr.score(x_train,y_train)*100 , 2)
print(SVRScore)


24.09


In [27]:
# KNN

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3) 
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
knnScore = round(knn.score(x_train, y_train) * 100, 2)
print(knnScore)

82.72


In [28]:
results = pd.DataFrame({
    'Model': ['SVR','Random Forest','Decision Tree','Linear Regression','KNN'],
    'Score': [SVRScore,RFScore,DTRScore,regressorScore,knnScore]
})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
95.53,Decision Tree
88.51,Random Forest
82.72,KNN
39.01,Linear Regression
24.09,SVR
