<div style='font-family: "Times New Roman"'>
    <h1 align="center">Zewail City of Science and Technology</h1>
    <h2 align="center">CSAI 253 - Machine Learning</h2>
    <h3 align="center">Assignment 5: Logistic Regression </h3>
</div>

---

<h3 style="text-align:center">Assignment Main Keypoints</h3>
<ul>
    <li><b>Data Cleaning:</b>
        <ul>
            <li>Perform data cleaning steps such as handling missing values, encoding categorical variables, and scaling numerical features as needed.</li>
        </ul>
    </li>
    <li><b>Logistic Regression Model:</b>
        <ul>
            <li>Implement a logistic regression model from scratch without using any pre-built libraries for the model itself. You can use libraries like NumPy or Pandas for data manipulation and preprocessing.</li>
            <li>Split the dataset into training and testing sets (e.g., 80% training, 20% testing).</li>
            <li>Train your logistic regression model on the training data.</li>
            <li>Evaluate the model's performance on the testing data using appropriate evaluation metrics such as accuracy, precision, recall, and F1-score.</li>
        </ul>
    </li>
    <li><b>Discussion:</b>
        <ul>
            <li>Discuss your observations about the dataset, any challenges faced during data cleaning or model implementation, and the insights gained from the model's performance.</li>
        </ul>
    </li>
</ul>

---

In [1]:
import warnings
warnings.filterwarnings('ignore')

### Import needed libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

### Read and Explore the data

In [3]:
df = pd.read_csv("Data/heart.csv")

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### Check for nulls

In [5]:
if df.isna().sum().sum() == 0:
    print("Data has no nulls")
else:
    print(f"There is {df.isna().sum().sum()} records has nulls")

Data has no nulls


### Check for duplicates

In [6]:
if df.duplicated().sum() == 0:
    print("Data has no duplicates")
else:
    print(f"There is {df.duplicated().sum()} duplicates")

There is 1 duplicates


### Drop the duplicates

In [7]:
df.drop_duplicates(inplace=True)

### Explore the data

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    int64  
 13  target    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.4 KB


In [9]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


---

### Split and Normalize the data

In [10]:
X, y = df.drop("target", axis=1), df["target"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [12]:
minMax = MinMaxScaler()
X_train_norm = minMax.fit_transform(X_train)
X_test_norm  = minMax.fit_transform(X_test)

pd.DataFrame(X_train_norm).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,0.528613,0.651452,0.333333,0.350779,0.266384,0.13278,0.263485,0.541494,0.302905,0.160621,0.707469,0.16805,0.77455
std,0.191255,0.477502,0.345607,0.167613,0.121452,0.340043,0.266291,0.196404,0.460471,0.17967,0.303635,0.246382,0.198087
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.395833,0.0,0.0,0.245283,0.187067,0.0,0.0,0.421053,0.0,0.0,0.5,0.0,0.666667
50%,0.5625,1.0,0.333333,0.339623,0.251732,0.0,0.5,0.561404,0.0,0.129032,0.5,0.0,0.666667
75%,0.666667,1.0,0.666667,0.433962,0.330254,0.0,0.5,0.675439,1.0,0.258065,1.0,0.25,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


---

### Logistic Regression Model

In [13]:
class LogisticRegression:
    def __init__(self, learning_rate=0.1, num_iterations=1000): 
        self.learning_rate, self.num_iterations = learning_rate, num_iterations
        self.weights, self.bias = None, None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.weights, self.bias = np.zeros(X.shape[1]), 0

        for _ in range(self.num_iterations):
            y_pred = self.sigmoid((X @ self.weights) + self.bias)
            
            self.weights -= self.learning_rate * (1 / X.shape[0]) * (X.T @ (y_pred - y))
            self.bias -= self.learning_rate * (1 / X.shape[0]) * np.sum(y_pred - y)

    def predict(self, X):
        y_pred = self.sigmoid((X @ self.weights) + self.bias)
        return np.where(y_pred > 0.5, 1, 0)

### Model Training and predication

In [14]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_pred_norm = lr.predict(X_test_norm)

### Model Evaluation

#### Without normalization

In [15]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.80      0.14      0.24        29
           1       0.55      0.97      0.70        32

    accuracy                           0.57        61
   macro avg       0.68      0.55      0.47        61
weighted avg       0.67      0.57      0.48        61



#### With normalization

In [16]:
report = classification_report(y_test, y_pred_norm)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.93      0.82        29
           1       0.92      0.69      0.79        32

    accuracy                           0.80        61
   macro avg       0.82      0.81      0.80        61
weighted avg       0.83      0.80      0.80        61

