# Logistic Regression Implementation from scratch

In [1]:
#Getting all the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

### Exploring our dataset

In [84]:
#cardio = pd.read_csv("cardio_train.csv", sep = ';')

The dataset has 11 input columns and 1 output column, which indicates whether the person has a cardiovascular disease or not.
There are roughly 35,000 entries for each class, so the dataset is balanced and no further processing is required.

In [85]:
#cardio.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [86]:
#sample_0 = cardio[cardio["cardio"] == 0][:11000]
#sample_1 = cardio[cardio["cardio"] == 1][:11000]

In [87]:
#sample = pd.concat([sample_0,sample_1])

In [88]:
#sample = sample.sample(frac=1).reset_index(drop=True)
#sample.head(20)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,13859,21328,1,156,55.0,120,80,1,1,0,0,0,1
1,20656,18250,1,159,75.0,120,70,1,1,0,0,0,0
2,10088,19712,1,153,95.0,120,80,2,1,0,0,1,1
3,21404,19613,1,160,70.0,140,100,1,1,0,0,1,1
4,18606,18015,1,159,67.0,120,80,1,1,0,0,1,0
5,29706,18183,1,165,60.0,120,70,1,1,0,0,1,1
6,24310,20182,1,159,70.0,124,76,2,3,0,0,1,0
7,30464,20499,1,159,70.0,120,80,1,1,0,0,1,1
8,5394,16558,1,153,70.0,170,80,2,1,0,0,0,0
9,2785,20338,1,156,73.0,110,80,1,1,0,0,1,0


In [135]:
#Splitting the input and output columns
#x = sample[["age","gender","weight", "ap_hi","ap_lo","cholesterol","gluc","smoke","alco","active"]]
#y = sample["cardio"]

# Fixed seed so the shuffle -- and therefore the train/test split and every
# number reported further down -- is identical on each Restart & Run All.
SEED = 42
# Number of shuffled rows used for training; the remaining rows are held out for testing.
TRAIN_SIZE = 250

heart = pd.read_csv("heart.csv")
heart = heart.sample(frac=1, random_state=SEED).reset_index(drop=True)
X = heart[["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]]
Y = heart["target"]
sample_x = X[:TRAIN_SIZE]
# Labels are reshaped into a column so they line up element-wise with the
# column of probabilities produced during training.
sample_y = Y[:TRAIN_SIZE].values.reshape(-1, 1)
test_x = X[TRAIN_SIZE:]
test_y = Y[TRAIN_SIZE:].values.reshape(-1, 1)

In [107]:
#x.head()

In [108]:
#splitting dataset into test and train samples
#train_x = x[:20000]
#train_y = y[:20000]
#test_x = x[20000:]
#test_y = y[20000:]

In [109]:
#X = train_x.values
#Y = train_y.values
#X = np.float32(X)
#Y = np.float32(Y) 
#Y = Y.reshape(Y.shape[0],-1)

In [111]:
# Sanity check of the training split: feature table shape first, then label array shape
print(sample_x.shape)
print(sample_y.shape)

(250, 13)
(250, 1)


Defining helper functions for the algorithm

In [12]:
def get_sigmoid(inp):
    """Return the logistic sigmoid of ``inp``, kept strictly inside (0, 1).

    Parameters
    ----------
    inp : scalar or array-like
        Raw linear scores; converted with ``np.asarray(..., dtype=float)``.

    Returns
    -------
    numpy value with the same shape as ``inp``; every entry lies in
    ``[1e-8, 1 - 1e-8]`` so that both ``log(p)`` and ``log(1 - p)`` are finite.
    """
    margin = 1e-8
    z = np.asarray(inp, dtype=float)

    # exp(-|z|) is always in (0, 1], so neither branch below can overflow,
    # unlike a direct exp(-z) for very negative z.
    decay = np.exp(-np.abs(z))
    sig = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # Clip on both sides instead of subtracting a constant: a subtraction
    # pushes a saturated 0 below zero, which makes log() return nan.
    return np.clip(sig, margin, 1.0 - margin)
    

In [149]:
def get_loss(y_hat, y, n):
    """Return the mean binary cross-entropy between predictions and labels.

    Parameters
    ----------
    y_hat : array-like
        Predicted probabilities.
    y : array-like
        True labels; combined element-wise with ``y_hat``.
    n : int
        Number of examples the summed loss is divided by.

    Returns
    -------
    float
        ``-(1/n) * sum(y*log(p) + (1-y)*log(1-p))`` with ``p`` clipped away
        from 0 and 1.
    """
    # A prediction of exactly 0 or 1 would give log(0) = -inf (and 0 * -inf = nan),
    # so keep probabilities a hair inside the open interval (0, 1).
    tiny = 1e-15
    p = np.clip(y_hat, tiny, 1 - tiny)

    return -(1/n)*(np.sum(y*(np.log(p)) + (1-y)*np.log(1-p)))
    

In [171]:
def update_params(n, X, y_hat, y, alpha, w_old):
    """Apply one batch gradient-descent step to the weights.

    Parameters
    ----------
    n : number of examples the gradient is averaged over
    X : feature matrix (its transpose is multiplied with the residuals)
    y_hat : current predictions
    y : true labels
    alpha : learning rate
    w_old : current weights

    Returns
    -------
    The weights after the step: ``w_old - alpha * gradient.T``.
    """
    # Averaged gradient of the loss: X^T (y_hat - y) / n, computed for all
    # parameters at once (no explicit loop).
    gradient = (1/n)*np.dot(X.T, y_hat-y)

    # The gradient is transposed so it can be subtracted from w_old.
    step = (alpha)*(gradient.T)
    return w_old-step

In [201]:
def train_logistic_regression(X,y, alpha = 0.01, epochs = 10000, verbose = True):
    """Fit logistic-regression weights with batch gradient descent.

    The features are min-max scaled (scaler fitted on ``X`` itself), weights
    are drawn from ``np.random.rand`` and then updated ``epochs`` times.

    Parameters
    ----------
    X : feature table accepted by ``MinMaxScaler``; ``len(X)`` is the
        number of examples.
    y : labels, one per example. Reshaped to a column internally, so both
        ``(n,)`` and ``(n, 1)`` inputs work.
    alpha : learning rate passed to ``update_params``.
    epochs : number of gradient-descent steps.
    verbose : when True (default) print the initial weights and the loss of
        every epoch; pass False to keep long runs from flooding the output.

    Returns
    -------
    (w, y_hat) : ``w`` has shape ``(1, n_features)``; ``y_hat`` holds the
        predicted probabilities for the scaled training data computed with
        the returned ``w``.
    """
    #first we get the number of examples present in our dataset
    n = len(X)

    # Force labels into a column: a 1-D label vector would otherwise broadcast
    # against the (n, 1) predictions into an (n, n) matrix without any error.
    y = np.asarray(y).reshape(-1, 1)

    #normalise data using min_max scaling
    X = MinMaxScaler().fit_transform(X)

    #Randomly initialising weights of our model
    w = np.random.rand(1,X.shape[1])
    if verbose:
        print("Original weights =",w)

    for _ in range(epochs):

        y_hat = get_sigmoid(np.dot(X, w.T))

        #calculate the loss against the predicted outputs
        loss = get_loss(y_hat, y, n)

        if verbose:
            print("Loss ================>",loss)

        #updating the weights based on the loss
        w = update_params(n, X, y_hat, y, alpha, w)

    # Recompute the predictions with the final weights: inside the loop y_hat
    # is produced *before* the update, so it would lag one step behind w
    # (and would not exist at all when epochs == 0).
    y_hat = get_sigmoid(np.dot(X, w.T))

    return w, y_hat

In [202]:
%%time
# Train on the training split; `weights` are the learned parameters and `out`
# is the second value returned by the training function (its predictions).
weights, out = train_logistic_regression(sample_x,sample_y)

Original weights = [[0.70899551 0.6980396  0.77091479 0.34237205 0.95753236 0.58468123
  0.24457395 0.77523914 0.0126027  0.15031552 0.56783131 0.14663492
  0.94254937]]














Wall time: 1.34 s


In [203]:
weights

array([[-0.19038687, -1.10448619,  2.33946919, -0.33596155,  0.09174613,
        -0.08223067,  0.81949708,  1.4348011 , -0.85755601, -1.35889781,
         1.57345486, -1.91690004, -1.29763529]])

In [204]:
def predict_logistic_regression(weights, x_test, scaler = None, verbose = True):
    """Predict hard 0/1 labels for ``x_test`` with trained weights.

    Parameters
    ----------
    weights : weight array as returned by training (one row of coefficients).
    x_test : feature table (DataFrame or array) to classify.
    scaler : optional, already-fitted scaler exposing ``transform``
        (e.g. ``MinMaxScaler().fit(training_features)``). Supplying it scales
        the test rows with the *training* minima/maxima, which is what the
        weights were learned on. When omitted, a new ``MinMaxScaler`` is
        fitted on ``x_test`` itself (the previous behaviour), which scales
        test data differently from the training data.
    verbose : when True (default) print the predicted probabilities.

    Returns
    -------
    list of int : 1 where the predicted probability is >= 0.5, else 0.
    """
    if scaler is None:
        # Legacy fallback: min/max are taken from the data being predicted.
        X = MinMaxScaler().fit_transform(np.asarray(x_test))
    else:
        # Reuse the training-time scaling so features mean the same thing
        # to the model at train and test time.
        X = scaler.transform(x_test)

    #linear scores followed by the sigmoid give the class-1 probabilities
    y_preds = get_sigmoid(np.dot(X,weights.T))

    if verbose:
        print(y_preds)

    return [1 if prob >= 0.5 else 0 for prob in y_preds]

In [205]:
# Hard 0/1 predictions for the held-out rows (the call also prints the raw probabilities)
y_pred = predict_logistic_regression(weights, test_x)

[[0.03381087]
 [0.6737428 ]
 [0.96270592]
 [0.03538196]
 [0.4139697 ]
 [0.92339969]
 [0.30252469]
 [0.89617118]
 [0.84799294]
 [0.77655926]
 [0.08893795]
 [0.72831077]
 [0.73675276]
 [0.49822079]
 [0.43796471]
 [0.70506016]
 [0.97005277]
 [0.90545725]
 [0.02788201]
 [0.90375049]
 [0.64205366]
 [0.02762479]
 [0.59264462]
 [0.83566023]
 [0.95814523]
 [0.59008532]
 [0.65760395]
 [0.44709999]
 [0.19036239]
 [0.94343326]
 [0.23088247]
 [0.94785469]
 [0.54320865]
 [0.37436294]
 [0.88219048]
 [0.08657577]
 [0.90621327]
 [0.04724417]
 [0.50169946]
 [0.48527344]
 [0.90982321]
 [0.7996972 ]
 [0.15422638]
 [0.33962929]
 [0.48602653]
 [0.69595034]
 [0.21213957]
 [0.67170708]
 [0.70600236]
 [0.85301697]
 [0.11108269]
 [0.73966711]
 [0.07467136]]


In [206]:
def get_accuracy(y_pred, y):
    """Print the percentage of positions at which ``y_pred`` equals ``y``.

    Both arguments are indexed positionally with ``0 .. len(y) - 1`` and must
    have the same length (checked with an assertion). The result is rounded
    to two decimals and printed; the function itself returns ``None``.
    """
    assert len(y_pred) == len(y)
    total = len(y)

    # Count the matching positions one index at a time.
    hits = sum(1 for idx in range(total) if y_pred[idx] == y[idx])

    print("Accuracy:", round(((hits/total)*100),2), "%")

In [207]:
# Accuracy of the from-scratch model on the held-out test rows
get_accuracy(y_pred, test_y)

Accuracy: 79.25 %


In [208]:
# Accuracy on the training rows: threshold the values returned by training at 0.5 first
get_accuracy([1 if i >= 0.5 else 0 for i in out], sample_y)

Accuracy: 84.8 %


In [198]:
y_pred

[0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0]

0        0
1        0
2        1
3        0
4        1
        ..
19995    1
19996    0
19997    1
19998    1
19999    0
Name: cardio, Length: 20000, dtype: int64

# Logistic Regression from SK-Learn

In [64]:
#loading the heart dataset again for the scikit-learn comparison
heart = pd.read_csv("heart.csv")
# Seed the shuffle so the split (and the reported accuracy) is reproducible on re-run.
heart = heart.sample(frac=1, random_state=42).reset_index(drop=True)


In [65]:
X = heart[["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal"]]
Y = heart["target"]
# Fit the scaler on the training rows only (the first 250, matching the split
# in the next cell) so no statistics from the held-out rows leak into the
# preprocessing; then apply that same scaling to every row.
scaler = MinMaxScaler()
scaler.fit(X[:250])
X = scaler.transform(X)

In [75]:
# The first 250 shuffled rows are used for training, the remainder for testing.
sample_x, test_x = X[:250], X[250:]
# Labels are taken as plain NumPy arrays.
sample_y, test_y = Y[:250].values, Y[250:].values

In [76]:
 from sklearn.linear_model import LogisticRegression

In [93]:
# Fit on the training split only: fitting on all of X/Y would let the model
# see the very rows it is evaluated on, inflating the test accuracy.
clf = LogisticRegression().fit(sample_x, sample_y)

In [94]:
# Class predictions of the scikit-learn model for the held-out rows
logit_preds = clf.predict(test_x)

In [98]:
# test_y was built with .values, so it is already a NumPy array and has no
# .values attribute of its own; pass it directly.
get_accuracy(logit_preds, test_y)

Accuracy: 55.1 %


In [96]:
logit_preds

array([1., 1., 1., ..., 0., 1., 0.], dtype=float32)

In [97]:
test_y

20000    0
20001    1
20002    1
20003    0
20004    0
        ..
21995    1
21996    0
21997    0
21998    1
21999    1
Name: cardio, Length: 2000, dtype: int64