### Step-1:Importing Necessary Libraries

In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

###  Y_hat = 1/(1+e^-z) 
###  Z = w.X + b

#### Gradient Descent:
Gradient Descent is an optimization algorithm used for minimizing the loss function in various machine learning algorithms. It is used for updating the parameters of the learning model.



### w  =  w - α*dw
### b  =  b - α*db
### dw = 1/m * (Y_hat - Y)*X
### db = 1/m * (Y_hat - Y)





Y_hat --> predicted value

X --> Input Variable

w --> weight

b --> bias

### Step-2:Making Logistic Regression Algorithm

In [2]:
class Logistic_R():


  # declaring learning rate & number of iterations (Hyperparametes)
  def __init__(self, learning_rate, no_of_iterations):

    self.learning_rate = learning_rate
    self.no_of_iterations = no_of_iterations



  # fit function to train the model with dataset
  def fit(self, X, Y):

    # number of data points in the dataset (number of rows)  -->  m
    # number of input features in the dataset (number of columns)  --> n
    self.m, self.n = X.shape


    #initiating weight & bias value

    self.w = np.zeros(self.n)
    
    self.b = 0

    self.X = X

    self.Y = Y


    # implementing Gradient Descent for Optimization

    for i in range(self.no_of_iterations):
      self.update_weights()



  def update_weights(self):

    # Y_hat formula (sigmoid function)

    Y_hat = 1 / (1 + np.exp( - (self.X.dot(self.w) + self.b ) ))    


    # derivaties

    dw = (1/self.m)*np.dot(self.X.T, (Y_hat - self.Y))

    db = (1/self.m)*np.sum(Y_hat - self.Y)


    # updating the weights & bias using gradient descent

    self.w = self.w - self.learning_rate * dw

    self.b = self.b - self.learning_rate * db


  # Sigmoid Equation & Decision Boundary

  def predict(self, X):

    Y_pred = 1 / (1 + np.exp( - (X.dot(self.w) + self.b ) )) 
    Y_pred = np.where( Y_pred > 0.5, 1, 0)
    return Y_pred

### Step-3: Data Collection and Basic Analysis

In [3]:
df=pd.read_csv('diabetes.csv')

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35,,33.6,0.627,50,1
1,1,85.0,66,29,,26.6,0.351,31,0
2,8,,64,0,,23.3,0.672,32,1
3,1,,66,23,94.0,28.1,0.167,21,0
4,0,,40,35,168.0,43.1,,33,1


In [5]:
df=df.dropna()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 759 entries, 9 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               759 non-null    int64  
 1   Glucose                   759 non-null    float64
 2   BloodPressure             759 non-null    int64  
 3   SkinThickness             759 non-null    object 
 4   Insulin                   759 non-null    float64
 5   BMI                       759 non-null    float64
 6   DiabetesPedigreeFunction  759 non-null    float64
 7   Age                       759 non-null    int64  
 8   Outcome                   759 non-null    int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 59.3+ KB


In [7]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
9,8,125.0,96,no info,0.0,0.0,0.232,54,1
10,4,110.0,92,no info,0.0,37.6,0.191,30,0
11,10,168.0,74,no info,0.0,38.0,0.537,34,1
12,10,139.0,80,no info,0.0,27.1,1.441,57,0
13,1,189.0,60,23,846.0,30.1,0.398,59,1


In [8]:
df['SkinThickness']=df['SkinThickness'].replace('no info',0)

In [9]:
df['SkinThickness'].unique()

array([0, '23', '19', '0', '47', '38', '30', '41', '35', '33', '26', '15',
       '36', '11', '31', '37', '42', '25', '18', '24', '39', '27', '32',
       '21', '34', '10', '60', '13', '20', '22', '28', '29', '54', '40',
       '51', '56', '14', '17', '50', '44', '12', '46', '16', '7', '52',
       '43', '45', '48', '8', '49', '63', '99'], dtype=object)

In [10]:
df['SkinThickness']=df['SkinThickness'].astype('int64')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 759 entries, 9 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               759 non-null    int64  
 1   Glucose                   759 non-null    float64
 2   BloodPressure             759 non-null    int64  
 3   SkinThickness             759 non-null    int64  
 4   Insulin                   759 non-null    float64
 5   BMI                       759 non-null    float64
 6   DiabetesPedigreeFunction  759 non-null    float64
 7   Age                       759 non-null    int64  
 8   Outcome                   759 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 59.3 KB


In [12]:
df.shape

(759, 9)

In [13]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,759.0,759.0,759.0,759.0,759.0,759.0,759.0,759.0,759.0
mean,3.843215,120.815547,69.263505,20.517787,79.56917,32.006851,0.471087,33.233202,0.346509
std,3.370746,31.855797,19.263117,15.943895,114.468959,7.90576,0.325669,11.779063,0.476171
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,63.0,0.0,0.0,27.3,0.2445,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.375,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.5,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [14]:
df['Outcome'].value_counts()

Outcome
0    496
1    263
Name: count, dtype: int64

In [15]:
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.290323,110.050403,68.318548,19.717742,69.157258,30.315524,0.43148,31.217742
1,4.885932,141.117871,71.045627,22.026616,99.205323,35.196578,0.545783,37.034221


### Step-4:Feature Engineering

In [16]:
X=df.drop(columns=['Outcome'],axis=1)
Y=df['Outcome']

In [17]:
scaler=StandardScaler()
X=scaler.fit_transform(X)

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=43)

### Step-5: Training the Model 

In [19]:
logistic=Logistic_R(learning_rate=0.1, no_of_iterations=100)

In [20]:
logistic.fit(X_train,y_train)

### Step-6:Prediction and Evaluation of the Model 

In [21]:
y_pred=logistic.predict(X_test)

In [22]:
acc=accuracy_score(y_test,y_pred)
print("The Accuracy:",round(acc,2))

The Accuracy: 0.77


In [23]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.82       137
           1       0.74      0.64      0.69        91

    accuracy                           0.77       228
   macro avg       0.76      0.75      0.75       228
weighted avg       0.77      0.77      0.76       228



In [24]:
confusion_matrix(y_test,y_pred)

array([[117,  20],
       [ 33,  58]], dtype=int64)