# Logistic Regression Implementation from scratch

In [1]:
#Getting all the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing

### Exploring our dataset

In [2]:
cardio = pd.read_csv("cardio_train.csv", sep = ';')

We have a total of 11 input columns and 1 output column which represents whether the person has a Cardiovascular disease or not.
We have roughly 35000 entries for each classification so it is a balanced dataset. No more processing is required on it.

In [3]:
cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
sample_0 = cardio[cardio["cardio"] == 0][:10000]
sample_1 = cardio[cardio["cardio"] == 1][:10000]

In [5]:
sample = pd.concat([sample_0,sample_1])

In [6]:
sample = sample.sample(frac=1).reset_index(drop=True)
sample.head(20)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,15553,22627,2,180,108.0,120,80,2,2,0,0,1,0
1,18373,22617,1,162,75.0,180,80,3,3,0,0,0,1
2,8175,22783,2,172,78.0,120,80,1,1,0,0,1,0
3,23986,21911,2,173,85.0,130,90,1,1,0,0,0,1
4,3543,16491,1,165,82.0,120,80,1,3,1,0,1,0
5,9599,17568,2,165,75.0,120,80,1,1,1,0,1,0
6,27576,22525,2,175,88.0,120,80,2,1,1,1,1,0
7,3352,20994,2,186,105.0,140,10000,1,1,0,0,1,1
8,16022,21289,1,163,60.0,120,80,1,1,0,0,0,1
9,27643,18940,1,156,57.0,120,80,1,1,0,0,1,0


In [7]:
#Slitting the input and output columns
x = sample[["age","gender","weight", "ap_hi","ap_lo","cholesterol","gluc","smoke","alco","active"]]
y = sample["cardio"]

In [8]:
x.head()

Unnamed: 0,age,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,22627,2,108.0,120,80,2,2,0,0,1
1,22617,1,75.0,180,80,3,3,0,0,0
2,22783,2,78.0,120,80,1,1,0,0,1
3,21911,2,85.0,130,90,1,1,0,0,0
4,16491,1,82.0,120,80,1,3,1,0,1


In [9]:
X = x.values
Y = y.values
X = np.float32(X)
Y = np.float32(Y) 

In [10]:
print(Y.shape)
print(X.shape)

(20000,)
(20000, 10)


Defining helper functions for the aglorithm

In [11]:
def get_sigmoid(inp):
    
    #simply returning sigmoid our values
    return (1/(1+np.exp(-inp)))
    

In [12]:
def get_loss(y_hat, y, n):
    
    #simply returning the loss calculated using our loss function
    return (1/n)*(np.sum(-y*(np.log10(y_hat)) - (1-y)*np.log10(1-y_hat)))
    

In [29]:
def update_params(n, X, y_hat, y, alpha, w_old):
    
    #looping to update every parameter
    update = (1/n)*np.dot((y_hat-y).T,X)
    print(update.shape)
    #applying the update equation
    w_new = w_old-(alpha)*(update.T)
    
    #return the updated weights
    return w_new

In [32]:
def train_logistic_regression(X,y, alpha = 0.01):
    #first we get the number of examples present in our dataset
    n = len(X)
    #normalise data
    X = preprocessing.normalize(X)
    
    #Randomly initialising weights of our model
    w = np.random.rand(1,X.shape[1])
    print("Original weights =",w)
    #setting an initial value for loss (a high value)
    loss = 1000
    #while loss > 0.01:
    
    dot = np.dot(X, w.T)
    y_hat = get_sigmoid(dot)
    print("y_hat predicted:", y_hat.shape)
    #calculate the loss against the predicted outputs
    l = get_loss(y_hat, y, n)
    
    #updating the weights based on the loss
    w = update_params(n, X, y_hat, y, alpha, w)
    print("New weights =",w)
    
    return l

In [31]:
out = train_logistic_regression(X,Y)

Original weights = [[0.43013827 0.27772099 0.41610357 0.33137083 0.37572188 0.94126688
  0.13857501 0.14792712 0.89225182 0.63791958]]
(20000, 10)


ValueError: operands could not be broadcast together with shapes (1,10) (10,20000) 

In [22]:
out

6445.141127279917

In [None]:
out