In [1]:
# Packages
import pandas as pd

In [2]:
# Reading data
df1 = pd.read_csv("TitanicData.csv", sep = ";")
df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#**Data Cleaning**

In [3]:
# Dropping irrelevant features
df1.drop(["PassengerId", "Name", "Ticket"], inplace = True, axis = 1)

In [4]:
df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [5]:
# Checking for null values
df1.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

Since the features "Age" and "Cabin" has a signifcant number of null values, we will drop it, and we will fill "Embarked" with the median value

In [6]:
df1.drop(["Age", "Cabin"], inplace = True, axis = 1)

In [7]:
df1

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,male,1,0,7.2500,S
1,1,1,female,1,0,71.2833,C
2,1,3,female,0,0,7.9250,S
3,1,1,female,1,0,53.1000,S
4,0,3,male,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,S
887,1,1,female,0,0,30.0000,S
888,0,3,female,1,2,23.4500,S
889,1,1,male,0,0,30.0000,C


In [8]:
# Encoding the relevant features
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df1["Sex"] = le.fit_transform(df1["Sex"])
df1["Embarked"] = le.fit_transform(df1["Embarked"])

df1                              

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,1,1,0,7.2500,2
1,1,1,0,1,0,71.2833,0
2,1,3,0,0,0,7.9250,2
3,1,1,0,1,0,53.1000,2
4,0,3,1,0,0,8.0500,2
...,...,...,...,...,...,...,...
886,0,2,1,0,0,13.0000,2
887,1,1,0,0,0,30.0000,2
888,0,3,0,1,2,23.4500,2
889,1,1,1,0,0,30.0000,0


In [9]:
# Converting the relevant feature data types
df1.dtypes

Survived      int64
Pclass        int64
Sex           int64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
dtype: object

In [13]:
df1["Survived"] = df1["Survived"].astype("category")
df1["Pclass"] = df1["Pclass"].astype("category")
df1["Sex"] = df1["Sex"].astype("category")
df1["SibSp"] = df1["SibSp"].astype("category")
df1["Parch"] = df1["Parch"].astype("category")
df1["Embarked"] = df1["Embarked"].astype("int64")

df1.dtypes

Survived    category
Pclass      category
Sex         category
SibSp       category
Parch       category
Fare         float64
Embarked       int64
dtype: object

In [14]:
# Filling in the missing values
median_Embarked = df1["Embarked"].median()
df1["Embarked"].fillna(median_Embarked)

0      2
1      0
2      2
3      2
4      2
      ..
886    2
887    2
888    2
889    0
890    1
Name: Embarked, Length: 891, dtype: int64

In [15]:
# Now converting the feature "Embarked" to categorical type
df1["Embarked"] = df1["Embarked"].astype("category")

df1.dtypes

Survived    category
Pclass      category
Sex         category
SibSp       category
Parch       category
Fare         float64
Embarked    category
dtype: object

In [16]:
# Checking unique values forthe feature "Embarked"
df1["Embarked"].unique()

[2, 0, 1, 3]
Categories (4, int64): [0, 1, 2, 3]

In [17]:
df1

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked
0,0,3,1,1,0,7.2500,2
1,1,1,0,1,0,71.2833,0
2,1,3,0,0,0,7.9250,2
3,1,1,0,1,0,53.1000,2
4,0,3,1,0,0,8.0500,2
...,...,...,...,...,...,...,...
886,0,2,1,0,0,13.0000,2
887,1,1,0,0,0,30.0000,2
888,0,3,0,1,2,23.4500,2
889,1,1,1,0,0,30.0000,0


#**Model Building**

In [19]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

X = df1[["Pclass", "Sex", "SibSp", "Parch", "Fare", "Embarked"]]
y = df1[["Survived"]]

X_train, X_test, y_train, y_test = train_test_split(X, y)

1. Multivariate Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_model.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.7802690582959642

2. Support Vector Machine

In [21]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_model.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.6322869955156951

3. Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_model.score(X_test, y_test)

  after removing the cwd from sys.path.


0.757847533632287

4. Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

gaussiannb_model = GaussianNB()
multinomialnb_model = MultinomialNB()

4.1 Gaussian Naive Bayes

In [25]:
gaussiannb_model.fit(X_train, y_train)
gaussiannb_model.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.757847533632287

4.2 Multinomial Naive Bayes

In [26]:
multinomialnb_model.fit(X_train, y_train)
multinomialnb_model.score(X_test, y_test)

  y = column_or_1d(y, warn=True)


0.6547085201793722

**We conclude that the best model for the data is Multivariate Logistic Regression**