<a href="https://colab.research.google.com/github/MiroxDot/Projects/blob/main/Diabetes_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **GitHub Repository**

In [3]:
!git clone https://github.com/XiaoxueRan/hku_phys3151_2022

fatal: destination path 'hku_phys3151_2022' already exists and is not an empty directory.


In [4]:
# Import necessary libraries
import pandas as pd
from pandas import DataFrame

import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import numpy as np
from scipy.optimize import fmin_tnc

class LogisticRegressionUsingGD:
  """Binary logistic regression trained with SciPy's truncated Newton (TNC) solver.

  Despite the "GD" in the name, `fit` does not run a hand-written gradient-descent
  loop: it hands the cost and its analytic gradient to `scipy.optimize.fmin_tnc`.

  The feature matrix is used exactly as given. No intercept column is added here, so
  callers who want a bias term must include a column of ones themselves.

  Attributes:
    w_: 1-D array of fitted weights; only exists after `fit` has been called.
  """

  @staticmethod
  def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Evaluated as exp(-log(1 + exp(-x))) through `np.logaddexp`, so very negative
    inputs yield 0.0 without triggering an overflow in `exp`.
    """
    return np.exp(-np.logaddexp(0.0, np.negative(x)))

  @staticmethod
  def net_input(theta, x):
    """Return the linear score `x @ theta` for every row of `x`."""
    return np.dot(x, theta)

  def probability(self, theta, x):
    """Return the modelled probability of the positive class for every row of `x`."""
    return self.sigmoid(self.net_input(theta, x))

  def cost_function(self, theta, x, y):
    """Return the mean binary cross-entropy of `theta` on the samples `(x, y)`.

    Args:
      theta: weight vector.
      x: feature matrix with one row per sample (must expose `.shape`).
      y: 0/1 labels, shaped so they broadcast element-wise against `x @ theta`.

    The two log terms are computed directly from the linear score:
      log(sigmoid(z))     = -logaddexp(0, -z)
      log(1 - sigmoid(z)) = -logaddexp(0,  z)
    This avoids log(0) = -inf (and the resulting 0 * -inf = nan) when a predicted
    probability saturates at exactly 0 or 1.
    """
    m = x.shape[0]  # number of samples
    z = self.net_input(theta, x)
    log_p = -np.logaddexp(0.0, -z)
    log_one_minus_p = -np.logaddexp(0.0, z)
    return -(1 / m) * np.sum(y * log_p + (1 - y) * log_one_minus_p)

  def gradient(self, theta, x, y):
    """Return the gradient of `cost_function` with respect to `theta`.

    Computed as (1 / m) * x.T @ (sigmoid(x @ theta) - y), where m is the number of rows.
    """
    m = x.shape[0]  # number of samples
    return (1 / m) * np.dot(x.T, self.sigmoid(self.net_input(theta, x)) - y)

  def fit(self, x, y, theta):
    """Estimate the weights by minimising `cost_function` with `fmin_tnc`.

    Args:
      x: feature matrix, one row per sample.
      y: 0/1 labels in any array-like form; flattened to 1-D before optimisation.
      theta: initial weights in any shape with one entry per column of `x`;
        flattened to 1-D before optimisation.

    Returns:
      self, with the optimised weights stored in `self.w_`.
    """
    x = np.asarray(x, dtype=float)
    labels = np.asarray(y).ravel()
    start = np.asarray(theta, dtype=float).ravel()
    opt_weights = fmin_tnc(func=self.cost_function, x0=start, fprime=self.gradient, args=(x, labels))
    # fmin_tnc returns a tuple whose first element is the solution vector
    self.w_ = opt_weights[0]
    return self

  def predict(self, x):
    """Return positive-class probabilities (not 0/1 labels) for the rows of `x`.

    The weights are applied as a column vector, so the result has shape
    (number of rows, 1). Requires `fit` to have been called first.
    """
    theta = self.w_[:, np.newaxis]
    return self.probability(theta, x)

  def accuracy(self, x, actual_classes, probab_threshold=0.5):
    """Return the percentage of rows whose thresholded prediction equals the true label.

    Args:
      x: feature matrix to score.
      actual_classes: true 0/1 labels, either 1-D or a column vector.
      probab_threshold: probabilities at or above this value count as class 1.
    """
    predicted_classes = (self.predict(x) >= probab_threshold).astype(int).ravel()
    # Flatten the labels as well: comparing an (m,) array with an (m, 1) array would
    # broadcast to an (m, m) matrix and silently report a wrong accuracy.
    actual = np.asarray(actual_classes).ravel()
    return np.mean(predicted_classes == actual) * 100



# **Read Data**

In [7]:
# Location of the example CSV inside the cloned course repository.
# The path is relative to the working directory, which is where `git clone` places the
# checkout; on Colab that directory is /content, so the same file is read as before,
# and the notebook also runs outside Colab.
csv_path = "hku_phys3151_2022/logistic-regression/logistic-regression-example-1.csv"
df = pd.read_csv(csv_path, sep=",")

# Preview the first 10 and last 10 rows in a single table
# (pd.concat is used because DataFrame.append is deprecated in newer pandas versions)
print(pd.concat([df.head(10), df.tail(10)]))

# `data` is the name the later cells read from; it refers to the same DataFrame as `df` (no copy is made)
data = df

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
5              5      116             74              0        0  25.6   
6              3       78             50             32       88  31.0   
7             10      115              0              0        0  35.3   
8              2      197             70             45      543  30.5   
9              8      125             96              0        0   0.0   
758            1      106             76              0        0  37.5   
759            6      190             92              0        0  35.5   
760            2       88             

In [8]:
# Positional split of the table: every column except the last is a feature,
# and the last column is the outcome to predict
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [9]:
# Build the model inputs directly from `data` so this cell can be re-run safely.
# Rebuilding them from an already-converted X / y would stack a second intercept column
# onto X and then fail on `y.values`, which a NumPy array does not have.

# Feature matrix with a leading column of ones, giving the model an intercept (bias) term
X = np.c_[np.ones((len(data), 1)), data.iloc[:, :-1].to_numpy()]

# Target as a NumPy column vector of shape (number of samples, 1)
y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

# Initial parameter vector: one zero per column of X (intercept included), as a column vector
theta = np.zeros((X.shape[1], 1))

In [11]:
# Create the model and train it in one step; `fit` returns the fitted instance itself
model1 = LogisticRegressionUsingGD().fit(X, y, theta)

# Weights found by the optimiser, one per column of X (the intercept column comes first)
parameters = model1.w_

# Accuracy in percent on the same data used for fitting;
# the labels are flattened to 1-D before being compared with the predictions
accuracy = model1.accuracy(X, y.flatten())

# Report both results
print(f"The accuracy of the model is {accuracy}")
print("The model parameters got by Gradient descent:")
print(parameters)

The accuracy of the model is 78.25520833333334
The model parameters got by Gradient descent:
[-8.33950666e+00  1.22992795e-01  3.49871049e-02 -1.33847683e-02
  6.20495143e-04 -1.17722550e-03  8.89250124e-02  9.37136526e-01
  1.46876967e-02]


In [22]:
# Two hand-written samples to score. Each row starts with the intercept term 1 and is
# followed by eight feature values, presumably in the same column order as the
# training matrix X (verify against the CSV header).
X1 = [
    [1, 6, 150, 72, 36, 170, 42, 1, 51],
    [1, 0, 85, 70, 24, 200, 25, 0.2, 25],
]

# `predict` returns, for each row, the probability of the positive class under the
# weights learned by `model1` (a probability, not a 0/1 label)
x1_probabilities = model1.predict(X1)
print('The prediction of the first method:', x1_probabilities)

The prediction of the first method: [[0.87281508]
 [0.02308211]]
