# Logistic Regression Implementation from scratch

In [1]:
#Getting all the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

### Exploring our dataset

In [2]:
# Load the cardio dataset; the file uses ';' as its field separator instead of ','
cardio = pd.read_csv("cardio_train.csv", sep = ';')

The dataset has 11 input columns (plus a row `id`) and 1 output column, `cardio`, which indicates whether the person has cardiovascular disease.
There are roughly 35000 entries for each class, so the dataset is balanced and needs no further processing for class balance.

In [3]:
# Preview the first rows of the loaded frame
cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# Keep at most the first 11000 rows of each class so both labels contribute equally
sample_0 = cardio.loc[cardio["cardio"] == 0].iloc[:11000]
sample_1 = cardio.loc[cardio["cardio"] == 1].iloc[:11000]

In [5]:
# Stack the two class subsets into one frame (class 0 rows first, then class 1 rows)
sample = pd.concat([sample_0,sample_1])

In [6]:
# Shuffle the rows so the two classes are interleaved before the positional
# train/test split. The frame is rebuilt from the per-class subsets so that
# re-running this cell is idempotent, and a fixed random_state makes the
# shuffle (and therefore every downstream result) reproducible.
sample = (
    pd.concat([sample_0, sample_1])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
sample.head(20)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,14472,18796,1,165,69.0,140,90,2,1,0,0,1,0
1,9475,22588,2,170,65.0,120,80,1,1,0,0,1,0
2,20586,18489,1,157,61.0,120,80,1,1,0,0,1,0
3,15288,23476,1,171,120.0,150,100,2,2,0,0,0,1
4,15469,23492,2,171,73.0,120,80,1,1,0,0,0,0
5,983,19687,1,150,55.0,110,70,1,1,0,0,1,0
6,13608,17999,2,166,76.0,130,80,1,1,0,0,1,1
7,185,16889,1,156,60.0,120,80,1,1,0,0,1,0
8,15714,23253,1,155,67.0,120,80,3,2,0,0,1,1
9,8625,18240,1,160,50.0,110,80,1,1,0,0,1,0


In [7]:
#Splitting the input and output columns
# Ten feature columns are selected; "id" and "height" are left out.
# NOTE(review): "height" is present in the dataset but not used here — confirm this is intentional.
x = sample[["age","gender","weight", "ap_hi","ap_lo","cholesterol","gluc","smoke","alco","active"]]
# Binary target column
y = sample["cardio"]

In [8]:
# Preview the selected feature columns
x.head()

Unnamed: 0,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18796,1,69.0,140,90,2,1,0,0,1
1,22588,2,65.0,120,80,1,1,0,0,1
2,18489,1,61.0,120,80,1,1,0,0,1
3,23476,1,120.0,150,100,2,2,0,0,0
4,23492,2,73.0,120,80,1,1,0,0,0


In [9]:
# Positional split of the shuffled data: the first 20000 rows are used for
# training and every remaining row is held out for testing
train_x, test_x = x.iloc[:20000], x.iloc[20000:]
train_y, test_y = y.iloc[:20000], y.iloc[20000:]

In [10]:
# Convert the training data to float32 NumPy arrays; the labels are turned
# into a column vector so they line up with the model's (n, 1) predictions
X = train_x.values.astype(np.float32)
Y = train_y.values.astype(np.float32)[:, np.newaxis]

In [11]:
# Sanity-check the array shapes: labels first, then features
print(Y.shape, X.shape, sep="\n")

(20000, 1)
(20000, 10)


Defining helper functions for the algorithm

In [12]:
def get_sigmoid(inp, eps=1e-8):
    """Return the element-wise logistic sigmoid of ``inp``, kept strictly inside (0, 1).

    Parameters
    ----------
    inp : scalar or array-like
        Raw model scores (logits).
    eps : float, optional
        Margin kept away from 0 and 1 so that ``log(p)`` and ``log(1 - p)``
        stay finite in the loss.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Probabilities with the same shape as ``inp``, each in ``[eps, 1 - eps]``.
    """
    z = np.asarray(inp, dtype=np.float64)

    # Only ever exponentiate a non-positive number, so np.exp cannot overflow
    # no matter how large |z| is.
    decay = np.exp(-np.abs(z))
    probs = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # Clip on both sides instead of subtracting a constant: a plain subtraction
    # pushes saturated negative scores below zero, which makes log() return NaN.
    return np.clip(probs, eps, 1.0 - eps)
    

In [13]:
def get_loss(y_hat, y, n, eps=1e-12):
    """Return the mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    y_hat : array-like
        Predicted probabilities.
    y : array-like
        True labels (0 or 1).
    n : int
        Number of examples; the summed loss is divided by this value.
    eps : float, optional
        Predictions are clipped to ``[eps, 1 - eps]`` before taking logs so a
        saturated prediction gives a large finite loss instead of inf/NaN.

    Returns
    -------
    float
        ``(1/n) * sum(-y*log(y_hat) - (1-y)*log(1-y_hat))``.
    """
    y_hat = np.clip(np.asarray(y_hat, dtype=np.float64), eps, 1.0 - eps)
    y = np.asarray(y, dtype=np.float64)

    # Guard against silent (n,) vs (n, 1) broadcasting, which would produce an
    # (n, n) matrix and a wildly wrong loss.
    if y.size == y_hat.size:
        y = y.reshape(y_hat.shape)

    return (1/n)*(np.sum(-y*(np.log(y_hat)) - (1-y)*np.log(1-y_hat)))
    

In [14]:
def update_params(n, X, y_hat, y, alpha, w_old):
    """Perform one batch gradient-descent step on the weights.

    All weights are updated at once with matrix arithmetic: the gradient is
    ``(1/n) * (y_hat - y)^T X`` and the new weights are
    ``w_old - alpha * gradient``.
    """
    # Prediction error for every example
    residual = y_hat - y

    # Average gradient of the loss with respect to each weight
    gradient = (1/n) * (residual.T @ X)

    # Step against the gradient, scaled by the learning rate
    return w_old - alpha * gradient

In [15]:
def train_logistic_regression(X,y, alpha = 0.0001, epochs = 25000, log_every = 1000, seed = None):
    """Fit logistic-regression weights with batch gradient descent.

    The features are min-max scaled inside this function before training.
    The model has no intercept term: the prediction is ``sigmoid(X_scaled @ w.T)``.

    Parameters
    ----------
    X : array-like of shape (n, d)
        Training features.
    y : array-like of length n
        Binary labels; reshaped internally to a column vector.
    alpha : float
        Learning rate.
    epochs : int
        Number of gradient-descent steps.
    log_every : int or None
        Print the loss every ``log_every`` epochs (and once after the last
        step). Use 0 or None to silence loss logging.
    seed : int or None
        Seed for the random weight initialisation. With None the global NumPy
        random state is used, as before.

    Returns
    -------
    (w, y_hat)
        ``w`` has shape (1, d); ``y_hat`` holds the predicted probabilities of
        the *returned* weights on the scaled training data, shape (n, 1).

    Notes
    -----
    The fitted scaler is not returned, so callers that want to scale new data
    the same way must fit their own scaler on the same training features.
    """
    #first we get the number of examples present in our dataset
    n = len(X)

    #labels as a column so they line up with the (n, 1) predictions instead of
    #broadcasting to an (n, n) matrix
    y = np.asarray(y).reshape(-1, 1)

    #normalise data using min_max scaling
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    #Randomly initialising weights of our model (optionally reproducible)
    rng = np.random if seed is None else np.random.RandomState(seed)
    w = rng.rand(1,X.shape[1])
    print("Original weights =",w)

    for i in range(epochs):
        y_hat = get_sigmoid(np.dot(X, w.T))

        #the loss is only needed for reporting, so compute it only when it is
        #printed rather than on every epoch
        if log_every and i % log_every == 0:
            print("Loss ================>",get_loss(y_hat, y, n))

        #updating the weights based on the prediction error
        w = update_params(n, X, y_hat, y, alpha, w)

    #predictions of the final weights; this also keeps y_hat defined when epochs == 0
    y_hat = get_sigmoid(np.dot(X, w.T))
    if log_every:
        print("Loss ================>",get_loss(y_hat, y, n))

    return w, y_hat

In [17]:
%%time
# Train on the float32 training arrays with the default learning rate and epoch count;
# `weights` is the learned weight row and `out` the predicted probabilities on the training data
weights, out = train_logistic_regression(X,Y)

Original weights = [[0.17467056 0.50280197 0.67611796 0.48841772 0.40160956 0.51317784
  0.17093419 0.99168994 0.9059328  0.16534312]]




















































































































































































































Wall time: 1min 13s


In [18]:
# One weight per input feature, stored as a single row
weights.shape

(1, 10)

In [19]:
def predict_logistic_regression(weights, x_test, scaler=None, threshold=0.5):
    """Predict binary class labels for ``x_test`` with the given weights.

    Parameters
    ----------
    weights : numpy.ndarray of shape (1, d)
        Learned weights.
    x_test : pandas.DataFrame or array-like of shape (m, d)
        Features to classify.
    scaler : fitted scaler with a ``transform`` method, optional
        Scaler fitted on the *training* features. Passing it makes the test
        data go through the same scaling the weights were trained on. When
        omitted, a new MinMaxScaler is fitted on ``x_test`` itself (the
        previous behaviour), which scales test data differently from training
        data.
    threshold : float, optional
        Probability at or above which the positive class is predicted.

    Returns
    -------
    list of int
        One 0/1 label per row of ``x_test``.
    """
    #accept DataFrames as well as plain arrays
    X = np.asarray(x_test, dtype=np.float64)

    #normalise data using min_max scaling
    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(X)
    X = scaler.transform(X)

    #doing the necessary calculations before applying the sigmoid function
    product = np.dot(X,weights.T)
    y_probs = get_sigmoid(product)

    #vectorised thresholding; flatten the (m, 1) column into m labels
    return (np.ravel(y_probs) >= threshold).astype(int).tolist()

In [20]:
# Predict 0/1 labels for the held-out rows
y_pred = predict_logistic_regression(weights, test_x)

In [21]:
def get_accuracy(y_pred, y):
    """Print and return the percentage of predictions that match the labels.

    Parameters
    ----------
    y_pred : sequence or array of class labels
        Predicted labels; an (n, 1) column is flattened to length n.
    y : pandas.Series, sequence or array
        True labels, compared by position (any pandas index is ignored).

    Returns
    -------
    float
        Accuracy in percent, rounded to two decimals.

    Raises
    ------
    AssertionError
        If the two inputs do not contain the same number of labels.
    """
    assert len(y_pred) == len(y)
    n = len(y)

    predicted = np.asarray(y_pred).ravel()
    actual = np.asarray(y).ravel()
    assert predicted.shape == actual.shape

    #vectorised count of matching positions
    count = int(np.sum(predicted == actual))
    accuracy = round(((count/n)*100),2)

    print("Accuracy:", accuracy, "%")
    #return the number so callers can use it, instead of print()'s None
    return accuracy

In [22]:
# Accuracy of the predicted labels on the held-out rows
get_accuracy(y_pred, test_y)

Accuracy: 51.7 %


In [None]:
# `out` holds predicted probabilities, not class labels, so threshold it at 0.5
# (and flatten the (n, 1) column) before comparing against the true labels
get_accuracy((out >= 0.5).astype(int).ravel(), train_y)

In [None]:
# Inspect only the first few training probabilities instead of dumping the whole array
out[:5]

In [None]:
# Inspect only the first few training labels instead of dumping the whole series
train_y.head()