# Logistic Regression Implementation from scratch

In [211]:
#Getting all the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

### Exploring our dataset

We have a total of 11 input columns and 1 output column which represents whether the person has a Cardiovascular disease or not.
We have roughly 150 entries for each classification so it is a balanced dataset. No more processing is required on it.

In [212]:
#loading the dataset
heart = pd.read_csv("heart.csv")

#shuffling the dataset 
heart = heart.sample(frac=1).reset_index(drop=True)

#Seperating the input and output variables
X = heart[["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]]
Y = heart["target"]

#Splitting the dataset in to test and training samples
sample_x = X[:250]
sample_y = Y[:250].values
sample_y = sample_y.reshape(sample_y.shape[0],-1)
test_x = X[250:]
test_y = Y[250:].values
test_y = test_y.reshape(test_y.shape[0],-1)

In [213]:
sample_x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,34,1,3,118,182,0,0,174,0,0.0,2,0,2
1,42,0,0,102,265,0,0,122,0,0.6,1,0,2
2,62,0,0,150,244,0,1,154,1,1.4,1,0,2
3,56,1,3,120,193,0,0,162,0,1.9,1,0,3
4,50,1,0,150,243,0,0,128,0,2.6,1,0,3


In [214]:
print(sample_x.shape)
print(sample_y.shape)

(250, 13)
(250, 1)


Defining helper functions for the aglorithm

In [215]:
def get_sigmoid(inp):
    
    #simply returning sigmoid our values
    return (1/(1+np.exp(-inp))) - 0.00000001
    

In [216]:
def get_loss(y_hat, y, n):
    
    #simply returning the loss calculated using our loss function
    return -(1/n)*(np.sum(y*(np.log(y_hat)) + (1-y)*np.log(1-y_hat)))
    

In [217]:
def update_params(n, X, y_hat, y, alpha, w_old):
  
    #looping to update every parameter
    diff = y_hat-y
    update = (1/n)*np.dot(X.T,diff)
   
    #applying the update equation
    w_new = w_old-(alpha)*(update.T)
    
    #return the updated weights
    return w_new

In [218]:
def train_logistic_regression(X,y, alpha = 0.01, epochs = 10000):
    
    #first we get the number of examples present in our dataset
    n = len(X)
    
    #normalise data using min_max scaling
    scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    #Randomly initialising weights of our model
    w = np.random.rand(1,X.shape[1])
    print("Original weights =",w)
    
    #setting an initial value for loss (a high value)
    #loss = 1000
    for i in range(epochs):
        
        dot = np.dot(X, w.T)
        y_hat = get_sigmoid(dot)
        
        #calculate the loss against the predicted outputs
        loss = get_loss(y_hat, y, n)
        
        print("Loss ================>",loss)
        
        #updating the weights based on the loss
        w = update_params(n, X, y_hat, y, alpha, w)
    
    #we return the predictions made on the train dataset as well as the final weights.
    return w, y_hat

Lets train our built model

In [219]:
%%time
weights, out = train_logistic_regression(sample_x,sample_y)

Original weights = [[0.31129175 0.51786169 0.43548714 0.13574612 0.41867452 0.86717353
  0.25890865 0.44446465 0.86472381 0.75065537 0.62682723 0.67233518
  0.48020475]]














Wall time: 1.35 s


Since our dataset was a relatively small one so it only took 1.35 s to train with 1000 epochs

In [220]:
weights

array([[-0.63954008, -1.07365887,  1.98337025, -0.46291947, -0.09634861,
         0.19209945,  0.79558596,  1.598035  , -0.86770595, -0.71620467,
         1.81328009, -2.03994124, -1.02228925]])

In [221]:
def predict_logistic_regression(weights, x_test):
    
    #doing the neccesary calculations before applying sigmoid funciton
    X = x_test.values
    #normalise data using min_max scaling
    scaler = MinMaxScaler()
    scaler.fit(X)
    X = scaler.transform(X)
    
    #calculate the predictions
    product = np.dot(X,weights.T)
    y_preds = get_sigmoid(product)
    
    #return the predictions with applied thresholds
    return [1 if i >= 0.5 else 0 for i in y_preds]

In [222]:
y_pred = predict_logistic_regression(weights, test_x)

In [223]:
'''
funciton to calculate the percentage accuracy from the given predictions
'''

def get_accuracy(y_pred, y):
    
    assert len(y_pred) == len(y)
    n = len(y)
    Y = y
    
    count = 0
    for i in range(n):
        if y_pred[i] == Y[i]:
            count+=1
     
    return print("Accuracy:", round(((count/n)*100),2), "%")

In [224]:
get_accuracy(y_pred, test_y)

Accuracy: 75.47 %


An accuracy of **75.47%** is reasonable, we can increase the accuracy by training the using different hyperparameters *(learning_rate & epochs)*.

Lets see how well we have performed compared to the built-in function from Sk-Learn library



# Logistic Regression from SK-Learn

In [233]:
#loading new dataset
heart = pd.read_csv("heart.csv")
heart = heart.sample(frac=1).reset_index(drop=True)


In [234]:
X = heart[["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]]
Y = heart["target"]
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [235]:
sample_x = X[:250]
sample_y = Y[:250].values
test_x = X[250:]
test_y = Y[250:].values

In [236]:
 from sklearn.linear_model import LogisticRegression

In [237]:
clf = LogisticRegression().fit(X, Y)

In [238]:
logit_preds = clf.predict(test_x)

In [239]:
get_accuracy(logit_preds, test_y)

Accuracy: 79.25 %


It yields an accuracy of **79.25%** hence our model has performed quite well in the first try.