In [1]:
#importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#reading the input file, decoded as ISO-8859-1 (codec name spelled with plain ASCII hyphens)
titanic_data = pd.read_csv('TITANIC.csv', encoding='ISO-8859-1')

In [3]:
#count the missing entries in every column
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
#creating function to replace missing ages by median of ages
def imput_age(cols):
    """Return one passenger's age, filling it in when it is missing.

    Parameters
    ----------
    cols : sequence
        Holds ``Age`` first and ``Pclass`` second, e.g. one row of
        ``titanic_data[["Age", "Pclass"]]`` as handed over by
        ``DataFrame.apply(..., axis=1)``. Extra trailing items are ignored.

    Returns
    -------
    The age unchanged when it is present; otherwise the median ``Age`` of
    the rows in the module-level ``titanic_data`` that share the same
    ``Pclass``, truncated to an ``int``.
    """
    # Unpack by position rather than ``cols[0]`` / ``cols[1]``: integer keys
    # on a label-indexed Series are deprecated as positional access in pandas.
    age, pclass, *_ = cols
    if pd.isnull(age):
        same_class = titanic_data["Pclass"] == pclass
        return int(titanic_data.loc[same_class, "Age"].median())
    return age

In [5]:
#fill each missing Age by running imput_age over the (Age, Pclass) pair of every row
titanic_data["Age"] = (
    titanic_data.loc[:, ["Age", "Pclass"]]
    .apply(imput_age, axis="columns")
)

In [6]:
#dropping the Cabin column by name; errors="ignore" keeps the cell safe to re-run
#(a positional drop would remove whichever column currently sits at that index)
titanic_data = titanic_data.drop(columns=["Cabin"], errors="ignore")

In [7]:
#impute Embarked with the most common value found by visual analysis
most_common_value = 'S'

#a single vectorized fillna covers the whole column, no loop is needed
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(most_common_value)

In [8]:
#re-count the missing entries per column after imputation
titanic_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [9]:
#one-hot encode the categorical features; drop_first=True omits the first level of each
sex, embark, pclass = (
    pd.get_dummies(titanic_data[column], drop_first=True)
    for column in ('Sex', 'Embarked', 'Pclass')
)

In [10]:
#remove the identifier/text columns and the categoricals that were one-hot encoded above
titanic_data.drop(
    columns=['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket', 'Pclass'],
    inplace=True,
)

In [11]:
#preview the first five remaining rows
titanic_data.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,1,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,0,35.0,0,0,8.05


In [12]:
#append the dummy-variable columns to the dataset, side by side
titanic_data = pd.concat(
    objs=[titanic_data, sex, embark, pclass],
    axis="columns",
)

In [13]:
#splitting the dataset into train and test sets

#shuffling the rows; the fixed random_state makes the shuffle reproducible across runs
shuffle_df = titanic_data.sample(frac=1, random_state=42)

#defining a size for the train set (70% of the rows)
train_size = int(0.70 * len(titanic_data))

#splitting: the first train_size shuffled rows are for training, the rest for testing
train_set = shuffle_df[:train_size]
test_set = shuffle_df[train_size:]

In [14]:
#separate dependent [target] and independent [features] variables
#NOTE(review): both come from the full titanic_data frame, not from train_set -- confirm this is intended
y = titanic_data.loc[:, 'Survived']
x = titanic_data.drop(columns=['Survived'])

In [15]:
#defining sigmoid and loss functions [y_hat is the prediction]
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise to z."""
    exp_neg_z = np.exp(-z)
    return 1.0 / (1 + exp_neg_z)

def loss(y, y_hat):
    """Binary cross-entropy (log loss) averaged over all samples.

    Parameters
    ----------
    y : array-like
        True labels, 0 or 1.
    y_hat : array-like
        Predicted probabilities, same shape as ``y``.

    Returns
    -------
    float
        ``-mean(y*log(y_hat) + (1-y)*log(1-y_hat))``.
    """
    # Keep probabilities strictly inside (0, 1) so log() never sees 0.
    eps = 1e-15
    y_hat = np.clip(y_hat, eps, 1 - eps)
    # Both terms are added: each sample contributes the log-probability
    # assigned to its true class.
    return -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
    

In [16]:
#gradients used by gradient descent
def gradients(x, y, y_hat):
    """Return the loss gradients with respect to the weights and the bias.

    x is the batch of inputs (one row per example), y the true labels and
    y_hat the predictions for that batch. The result is the pair (dw, db).
    """
    # scale factor 1/m, where m is the number of examples in the batch
    scale = 1 / x.shape[0]

    # prediction error, shared by both gradients
    error = y_hat - y

    grad_w = scale * np.dot(x.T, error)
    grad_b = scale * np.sum(error)
    return grad_w, grad_b

In [17]:
#to plot the linear decision boundary over the first two features
def plot_decision_boundary(X, w, b, labels=None):
    """Plot the samples and the line ``w[0]*x1 + w[1]*x2 + b = 0``.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix; only columns 0 and 1 are drawn.
    w : array-like
        Weights; only the first two entries are used.
    b : float
        Bias term.
    labels : array-like, optional
        0/1 class of every row of ``X``. When omitted, the module-level
        ``y`` is used.
    """
    X = np.asarray(X)
    if labels is None:
        labels = y
    labels = np.asarray(labels).ravel()
    w = np.asarray(w, dtype=float).ravel()

    # End points of the boundary line, taken from the range of feature 1.
    x1 = np.array([X[:, 0].min(), X[:, 0].max()])
    m = -w[0]/w[1]
    c = -b/w[1]
    x2 = m*x1 + c

    #plotting
    plt.figure(figsize=(10,8))
    plt.plot(X[:, 0][labels==0], X[:, 1][labels==0], "g^")
    plt.plot(X[:, 0][labels==1], X[:, 1][labels==1], "bs")
    plt.xlim([-2, 2])
    plt.ylim([0, 2.2])
    plt.xlabel("feature 1")
    plt.ylabel("feature 2")
    plt.title('Decision Boundary')
    plt.plot(x1, x2, 'y-')

In [18]:
#further normalization of input to fit for logistic regression
def normalize(x):
    """Standardize every feature column: subtract its mean, divide by its std.

    Parameters
    ----------
    x : DataFrame or ndarray
        One row per example, one column per feature.

    Returns
    -------
    Same kind of object as ``x`` with each column centred and scaled.
    The std is whatever ``x.std(axis=0)`` yields for that container
    (pandas defaults to the sample std, NumPy to the population std).
    """
    mean = x.mean(axis=0)
    std = x.std(axis=0)

    # A constant column has zero spread; dividing by 1 instead maps it to
    # zeros rather than NaN.
    safe_std = np.where(std == 0, 1.0, std)

    # One pass is enough: repeating the standardization once per feature
    # would not change the result.
    return (x - mean) / safe_std

In [19]:
#defining the train function with initializing weights and bias
def train(x, y, bs, epochs, lr):
    """Fit logistic-regression parameters with mini-batch gradient descent.

    Parameters
    ----------
    x : DataFrame or 2-D ndarray, shape (m, n)
        Features; they are passed through ``normalize`` before training.
    y : array-like of length m
        0/1 labels (Series, ndarray or list).
    bs : int
        Gradient descent batch size.
    epochs : int
        Number of passes over the data.
    lr : float
        Learning rate.

    Returns
    -------
    w : ndarray, shape (n, 1)
        Learned weights.
    b : float
        Learned bias.
    losses : list
        Value of ``loss`` on the full data after every epoch.
    """
    #m - number of training examples
    #n - number of features
    m, n = x.shape

    #initializing weights and bias to zeros
    w = np.zeros((n,1))
    b = 0

    #reshaping y into a column; np.asarray accepts a Series, ndarray or list alike
    y = np.asarray(y).reshape(m,1)

    #normalizing the inputs once and converting them to a plain float array,
    #so every batch below is a cheap positional slice
    x = np.asarray(normalize(x), dtype=float)

    #empty list to store losses
    losses = []

    #Training loop
    for epoch in range(epochs):
        for i in range((m-1)//bs + 1):

            #Defining the batch: consecutive rows, in their stored order
            start_i = i*bs
            end_i = start_i + bs
            xb = x[start_i:end_i]
            yb = y[start_i:end_i]

            #Calculating prediction
            y_hat = sigmoid(np.dot(xb, w) + b)

            #Getting the gradients of loss w.r.t parameters
            dw, db = gradients(xb, yb, y_hat)

            #Updating the parameters
            w -= lr*dw
            b -= lr*db

        #appending the loss on the full data to the list
        y_hat = sigmoid(np.dot(x, w) + b)
        l = loss(y, y_hat)
        losses.append(l)

    return w, b, losses

In [20]:
#prediction function
def predict(x):
    """Predict a 0/1 class for every row of ``x``.

    Uses the module-level weights ``w`` and bias ``b``. The inputs are
    passed through ``normalize`` first, so they are scaled with the
    statistics of ``x`` itself.

    Returns
    -------
    ndarray of int, one entry per row of ``x``.
    """
    # Normalizing the inputs.
    x = normalize(x)

    # Calculating predictions/y_hat.
    preds = sigmoid(np.dot(x, w) + b)

    # if y_hat > 0.5  --> class 1
    # if y_hat <= 0.5 --> class 0
    return (np.asarray(preds) > 0.5).astype(int).ravel()

In [21]:
#Training: fit weights w and bias b; l receives the loss recorded after each epoch
w, b, l = train(x=x, y=y, bs=100, epochs=1000, lr=0.01)

In [22]:
#accuracy metric
def accuracy(y, y_hat):
    """Fraction of entries where the prediction y_hat equals the label y."""
    n_correct = np.sum(y == y_hat)
    return n_correct / len(y)

In [23]:
#accuracy in percent, measured on the same x and y that were passed to train
100 * accuracy(y, y_hat=predict(x))

80.92031425364759