# Titanic Survival using Logistic Regression

In [1]:
#Import libraries

import pandas as pd
import numpy as np
import math

In [2]:
#load data

df = pd.read_csv('titanic.csv')
print(df.shape)
df.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,31.0,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## Pre processing

### (a) Missing values:
The features "boat", "body", "home.dest" and "cabin" have many missing values, so, droping them from the dataframe.

### (b) Irrelevant features:
Feature "tickets" is a combination of alphabets and digits. Also, name of passenges seems to be an irrelevant feature, removing that as well.

In [3]:
df = df.drop(['boat', 'body','cabin', 'home.dest', 'name','ticket'], axis=1) 
print(df.shape)
df.head()

(1309, 8)


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,31.0,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


### (c) Missing Data Imputation:
Features like "age", "fare" and "embarked" seems to be an important information but it has got some missing values. Therefore, we'll use fillna.() function to fill in the null values with previous ones.

In [4]:
df.isnull().sum()

pclass        0
survived      0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [5]:
df['age'] = df['age'].fillna(method ='pad')
df['fare'] = df['fare'].fillna(method ='pad')
df['embarked'] = df['embarked'].fillna(method ='pad')

In [6]:
df.isnull().sum()

pclass      0
survived    0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

### (d) Create numeric values:
Features like "sex" and "embarked" contains values in the form of strings that cannot be used in numeric calculations. So, converting them into numeric values to get useful features.

In [7]:
df.loc[df['sex']=='female','sex']=0
df.loc[df['sex']=='male','sex']=1

df.loc[df['embarked']=='S','embarked']=0
df.loc[df['embarked']=='C','embarked']=1
df.loc[df['embarked']=='Q','embarked']=2

### (e) Adding new features:
Features like "age" and "fare" contains float values and rest are integers. So, convertig these to integers for easy calculations. 

In [8]:
df['age_new']=df['age'].astype(int)
df['fare_new']=df['fare'].astype(int)

df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked,age_new,fare_new
0,1,1,0,29.0,0,0,211.3375,0,29,211
1,1,1,1,31.0,1,2,151.55,0,31,151
2,1,0,0,2.0,1,2,151.55,0,2,151
3,1,0,1,30.0,1,2,151.55,0,30,151
4,1,0,0,25.0,1,2,151.55,0,25,151


## Model Building

In [9]:
# Separate X and y data

cols = [0,2,4,5,7,8,9]
x = df[df.columns[cols]]
y = df[df.columns[1]]

In [10]:
def add_intercept(X):
    intercept = np.ones((X.shape[0], 1))
    return np.concatenate((intercept, X), axis=1)

X = add_intercept(x)

In [11]:
#Split the data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(916, 8)
(393, 8)
(916,)
(393,)


In [12]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [13]:
def cost(h, y):
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

In [14]:
def gradient_descent(X, y, lr, num_iter):   
    # weights initialization
    theta = np.zeros(X.shape[1])
        
    for i in range(num_iter):
        z = np.dot(X, theta)
        h = sigmoid(z)
        gradient = np.dot(X.T, (h - y)) / y.size
        theta -= lr * gradient
            
        loss = cost(h, y)
                
        if i % 10000 == 0:
            print(f'cost: {loss} \t', f'weights: {theta} \t')
    return theta

In [15]:
def train(x, y, learning_rate,iterations):
    theta = gradient_descent(x, y, learning_rate, iterations)
    return theta

In [16]:
#train the model

theta = train(X_train, y_train, learning_rate = 0.005, iterations = 100000)
print("\n\nFinal weights :", theta)

cost: 0.693147180559936 	 weights: [-6.27729258e-04 -2.12336245e-03 -1.00436681e-03 -4.06659389e-04
 -2.72925764e-05 -7.09606987e-05 -2.01200873e-02  1.30322052e-02] 	
cost: 0.5200970463937097 	 weights: [ 7.03855280e-01 -1.93286540e-01 -1.91093660e+00 -3.26299144e-01
  7.41864454e-02  2.81189230e-01  1.05817629e-03  2.92560312e-02] 	
cost: 0.5052605881900989 	 weights: [ 1.39507508 -0.32150281 -2.19526166 -0.34362297  0.0349963   0.24006376
 -0.00372451  0.02680955] 	
cost: 0.49869492053926434 	 weights: [ 1.95410656 -0.45087108 -2.29370497 -0.35785921  0.02225422  0.22728977
 -0.00846948  0.02602934] 	
cost: 0.4945629509074 	 weights: [ 2.40321482 -0.55765856 -2.35972369 -0.36963938  0.01498014  0.22083744
 -0.01245498  0.02547289] 	
cost: 0.4917855195671382 	 weights: [ 2.76579244 -0.64385405 -2.41339473 -0.37942608  0.00948678  0.21590622
 -0.01572631  0.02498853] 	
cost: 0.4898856923336641 	 weights: [ 3.06032233 -0.71362364 -2.45841526 -0.38755904  0.00508358  0.21175049
 -0.0184

## Predictions

In [17]:
def predict_prob(X, theta):
    final_pred = sigmoid(np.dot(X, theta))
    return final_pred.round()

predictions = predict_prob(X_test, theta)

## Evaluation metrices:

In [18]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(predictions, y_test)
print("Confusion matrix : \n", matrix)

from sklearn.metrics import classification_report
report = classification_report(predictions, y_test)
print("\nClassification report : \n", report)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,predictions)
print('Accuracy = {:0.2f}%'.format(accuracy*100))

Confusion matrix : 
 [[216  58]
 [ 20  99]]

Classification report : 
               precision    recall  f1-score   support

         0.0       0.92      0.79      0.85       274
         1.0       0.63      0.83      0.72       119

   micro avg       0.80      0.80      0.80       393
   macro avg       0.77      0.81      0.78       393
weighted avg       0.83      0.80      0.81       393

Accuracy = 80.15%
