In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [48]:
df = pd.read_csv("titanic-passengers.csv")

In [49]:
# displaying the first five rows of the dataframe
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [50]:
df.isnull().sum()



PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Handling the missing values

In [51]:
df = df.drop(columns='Cabin', axis=1)

Replacing the missing values in the “Age” column with the mean value

In [52]:
df['Age'].fillna(df['Age'].mean(), inplace=True)



Finding the mode value of the “Embarked” column as it will have occurred the maximum number of times

In [53]:
print(df['Embarked'].mode())

0    S
dtype: object


Replacing the missing values in the “Embarked” column with mode value

In [54]:
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Transformation into a categorical column.

In [55]:
df.replace({'Sex':{'male':0,'female':1}, 'Embarked':{'S':0,'C':1,'Q':2}}, inplace=True)



In [56]:
 df= df.drop(columns = ['PassengerId','Name','Ticket'],axis=1)

In [57]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.000000,1,0,7.2500,0
1,1,1,1,38.000000,1,0,71.2833,1
2,1,3,1,26.000000,0,0,7.9250,0
3,1,1,1,35.000000,1,0,53.1000,0
4,0,3,0,35.000000,0,0,8.0500,0
...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,0
887,1,1,1,19.000000,0,0,30.0000,0
888,0,3,1,29.699118,1,2,23.4500,0
889,1,1,0,26.000000,0,0,30.0000,1


# Let’s split the data into the target and feature variables.

In [58]:
X = df.drop(columns = ['Survived'],axis=1)
Y = df['Survived']

In [59]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.000000,1,0,7.2500,0
1,1,1,38.000000,1,0,71.2833,1
2,3,1,26.000000,0,0,7.9250,0
3,1,1,35.000000,1,0,53.1000,0
4,3,0,35.000000,0,0,8.0500,0
...,...,...,...,...,...,...,...
886,2,0,27.000000,0,0,13.0000,0
887,1,1,19.000000,0,0,30.0000,0
888,3,1,29.699118,1,2,23.4500,0
889,1,0,26.000000,0,0,30.0000,1


Now, we will be splitting the data into four variables, namely, X_train, Y_train, X_test, Y_test.

In [60]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.25, random_state=2)



In [61]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
199,2,1,24.000000,0,0,13.0000,0
129,3,0,45.000000,0,0,6.9750,0
90,3,0,29.000000,0,0,8.0500,0
230,1,1,35.000000,1,0,83.4750,0
126,3,0,29.699118,0,0,7.7500,2
...,...,...,...,...,...,...,...
534,3,1,30.000000,0,0,8.6625,0
584,3,0,29.699118,0,0,8.7125,1
493,1,0,71.000000,0,0,49.5042,1
527,1,0,29.699118,0,0,221.7792,0


# Logistic Regression

In [62]:
model = LogisticRegression()

In [63]:
model.fit(X_train, Y_train)



LogisticRegression()

# Checking the Accuracy

In [64]:
X_train_prediction = model.predict(X_train)



In [65]:
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.8068862275449101


Now, Let’s try it again with X_test and Y_test:

In [66]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.7892376681614349
