# Objective

Given the information about a set of passengers, design a Logistic regression model to predict if a person is likely to survive.

# Import libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

### Import dataset

In [2]:
raw_data = pd.read_csv('train_and_test2.csv')
raw_data

Unnamed: 0,Passengerid,Age,Fare,Sex,sibsp,zero,zero.1,zero.2,zero.3,zero.4,...,zero.12,zero.13,zero.14,Pclass,zero.15,zero.16,Embarked,zero.17,zero.18,2urvived
0,1,22.0,7.2500,0,1,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1,2,38.0,71.2833,1,1,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,1
2,3,26.0,7.9250,1,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,1
3,4,35.0,53.1000,1,1,0,0,0,0,0,...,0,0,0,1,0,0,2.0,0,0,1
4,5,35.0,8.0500,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,28.0,8.0500,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1305,1306,39.0,108.9000,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0.0,0,0,0
1306,1307,38.5,7.2500,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0
1307,1308,28.0,8.0500,0,0,0,0,0,0,0,...,0,0,0,3,0,0,2.0,0,0,0


# Taking important features

In [40]:
data =raw_data[['2urvived','Age','Fare','Sex','sibsp','Embarked']]

In [41]:
data.head()

Unnamed: 0,2urvived,Age,Fare,Sex,sibsp,Embarked
0,0,22.0,7.25,0,1,2.0
1,1,38.0,71.2833,1,1,0.0
2,1,26.0,7.925,1,0,2.0
3,1,35.0,53.1,1,1,2.0
4,0,35.0,8.05,0,0,2.0


In [42]:
data.describe()

Unnamed: 0,2urvived,Age,Fare,Sex,sibsp,Embarked
count,1309.0,1309.0,1309.0,1309.0,1309.0,1307.0
mean,0.261268,29.503186,33.281086,0.355997,0.498854,1.492731
std,0.439494,12.905241,51.7415,0.478997,1.041658,0.814626
min,0.0,0.17,0.0,0.0,0.0,0.0
25%,0.0,22.0,7.8958,0.0,0.0,1.0
50%,0.0,28.0,14.4542,0.0,0.0,2.0
75%,1.0,35.0,31.275,1.0,1.0,2.0
max,1.0,80.0,512.3292,1.0,8.0,2.0


In [43]:
data=data.dropna()
data.describe()

Unnamed: 0,2urvived,Age,Fare,Sex,sibsp,Embarked
count,1307.0,1307.0,1307.0,1307.0,1307.0,1307.0
mean,0.260138,29.471821,33.209595,0.355011,0.499617,1.492731
std,0.438877,12.881592,51.748768,0.4787,1.042273,0.814626
min,0.0,0.17,0.0,0.0,0.0,0.0
25%,0.0,22.0,7.8958,0.0,0.0,1.0
50%,0.0,28.0,14.4542,0.0,0.0,2.0
75%,1.0,35.0,31.275,1.0,1.0,2.0
max,1.0,80.0,512.3292,1.0,8.0,2.0


# Declaring variables

In [44]:
x1=data[['Age','Fare','Sex','sibsp','Embarked']]
y=data['2urvived']

# Scaling

In [45]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x1)

StandardScaler()

In [46]:
x1_scaled=scaler.transform(x1)

# Split

In [47]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x1_scaled, y, test_size=0.2, random_state=365)

In [48]:
x_train

array([[-0.81324028, -0.48935435, -0.74189971, -0.47953739,  0.62293984],
       [-0.34728105, -0.48878987,  1.34789107,  0.48027176,  0.62293984],
       [ 2.44847436,  4.43011751, -0.74189971,  0.48027176, -1.83311559],
       ...,
       [-0.65792053, -0.41967966, -0.74189971,  0.48027176,  0.62293984],
       [-1.43451926,  1.67779117, -0.74189971,  0.48027176,  0.62293984],
       [-0.34728105, -0.50232194, -0.74189971, -0.47953739, -1.83311559]])

# Logistic Regression

In [49]:
x_scaled = sm.add_constant(x1_scaled)
reg_logit = sm.Logit(y,x_scaled)
results_logit = reg_logit.fit()
results_logit.summary2()

Optimization terminated successfully.
         Current function value: 0.483049
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.157
Dependent Variable:,2urvived,AIC:,1274.689
Date:,2021-09-29 14:41,BIC:,1305.742
No. Observations:,1307,Log-Likelihood:,-631.34
Df Model:,5,LL-Null:,-749.17
Df Residuals:,1301,LLR p-value:,6.5499e-49
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-1.2558,0.0753,-16.6830,0.0000,-1.4034,-1.1083
x1,-0.2101,0.0741,-2.8364,0.0046,-0.3553,-0.0649
x2,0.2813,0.0709,3.9663,0.0001,0.1423,0.4203
x3,0.8846,0.0683,12.9468,0.0000,0.7507,1.0185
x4,-0.2475,0.0840,-2.9458,0.0032,-0.4121,-0.0828
x5,-0.0650,0.0703,-0.9241,0.3554,-0.2029,0.0729


# Function to predict and calculate accuracy

In [54]:
def confusion_matrix(data,actual_values,model):
        
        #Predict the values using the Logit model
        pred_values = model.predict(data)
        # Specify the bins 
        bins=np.array([0,0.5,1])
        cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
        # Calculate the accuracy
        accuracy = (cm[0,0]+cm[1,1])/cm.sum()
        return cm, accuracy

# Predicting trained data

In [51]:
confusion_matrix(x_scaled,y,results_logit)

(array([[880.,  87.],
        [222., 118.]]),
 0.7635807192042846)

# Adding constant to test dataset 

In [52]:
x_test_f=sm.add_constant(x_test)

# Prediction and accuracy of test dataset

In [53]:
confusion_matrix(x_test_f,y_test,results_logit)

(array([[186.,  14.],
        [ 41.,  21.]]),
 0.7900763358778626)

# Conclusion

WE CAN CONCLUDE THAT THE ACCURACY OF OUR MODEL IS 79%