In [645]:
import numpy as np
import pandas as pd

# Importing the dataset to a Pandas Dataframe

In [646]:
diabatesdf = pd.read_csv("diabetes.csv")
diabatesdf.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Split the dataset into Features and outcomes

In [647]:
diabetes_X=diabatesdf.iloc[:,:-1]
diabetes_Y=diabatesdf.iloc[:,-1:]

# Splitting the data into Train, Validation and Test Data and Normalizing it.

In [648]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


X_train,X_val_test,Y_train,Y_val_test=train_test_split(diabetes_X,diabetes_Y,train_size=0.6)
scaler = preprocessing.StandardScaler().fit(X_train)
X_val,X_test,Y_val,Y_test=train_test_split(X_val_test,Y_val_test,train_size=0.5)
X_train=scaler.transform(X_train)
X_val=scaler.transform(X_val)
X_test=scaler.transform(X_test)

# Logestic Regression

In [649]:


#This function is used to calculate the sigmoid value 
def sigmoid_function(result):
    return 1/(1+np.exp(-result))
#Based on threshhold value this function will return either 1 or Zero
def set_threshhold(value):
    return 1 if value>0.3 else 0;

#This function is used to calculate the accuracy of the predicted values over actual values
def calculate_accuracy(y, y_hat):
    accuracy = np.sum(y == y_hat) / len(y)
    return accuracy
#The logestic regression is performed here. It takes epoch and learning rate as paramaters and performs regression
def perform_logestic_regression(epoch,learningrate):
    #setting the seed to a constant value so that initial weights are equal at all times
    np.random.seed(2311)
    
#weights are asigned at random before the first iteration 
    weights=np.random.randn(X_train.shape[1])
    
#in y=mx+b initializing b to zero initially which will be modified later on
    intercept=0
    
#an array of loss values  
    loss_array=[]

#Iterate through all the tuples in the train set to adjust weights by calculating loss function epoch number of times
    for i in range(1,epoch):
    #Retrieving Y value using the features and assigned weights
        Y_interm=X_train.dot(weights)+intercept
    #Normalizing the response using the sigmoid function to [0.0,1.0]
        Y_interm=sigmoid_function(Y_interm)
        no_of_rows=X_train.shape[0]
    #Calculate loss Function
        loss = (1/no_of_rows)*(-np.sum(np.multiply(np.log(Y_interm), Y_train.to_numpy()[0]) + np.multiply((1 - Y_train.to_numpy()[0]), np.log(1 - Y_interm))))
        loss_array.append(loss)
    #Calculate the gradient using the gradient function 1/m(Y-Y`)*X_Train
        gradient=(1/no_of_rows)*(np.dot(X_train.T,Y_interm-Y_train['Outcome'].to_numpy()))
    #Calculate gradient for intercept using 1/m(sum(Y'-Y))
        dz=Y_interm-Y_train['Outcome'].to_numpy()
        intercept_gradient = (1 / no_of_rows) * np.sum(dz)
    #Updating the weights and intercept based on the gradient function
        weights=weights-learningrate*gradient
        intercept=intercept-learningrate*intercept_gradient
    #Returning weights and the intercept 
    return weights,intercept


In [650]:
#Logestic Regression modeling happenshere 
weights,intercept= perform_logestic_regression(170,0.10)

#Calculating accuracy for train data
y_interm=pd.DataFrame(np.dot(X_train,weights)+intercept).apply(sigmoid_function)[0]
y_interm=y_interm.apply(set_threshhold)

print("The accuracy of validation  ",calculate_accuracy(Y_train['Outcome'].to_numpy(),y_interm.to_numpy()))



The accuracy of validation   0.7304347826086957


In [651]:
#Validating the Model

y_interm=pd.DataFrame(np.dot(X_val,weights)+intercept).apply(sigmoid_function)[0]
y_interm=y_interm.apply(set_threshhold)

print("Accuracy of the test set is :",calculate_accuracy(Y_val['Outcome'].to_numpy(),y_interm.to_numpy()))

Accuracy of the test set is : 0.7337662337662337


In [652]:
#Testing the Model

y_interm=pd.DataFrame(np.dot(X_test,weights)+intercept).apply(sigmoid_function)[0]
y_interm=y_interm.apply(set_threshhold)

print("Accuracy of the test set is :",calculate_accuracy(Y_test['Outcome'].to_numpy(),y_interm.to_numpy()))



Accuracy of the test set is : 0.7532467532467533
