In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [None]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
x = df.drop('Outcome', axis=1)
y = df['Outcome']
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [None]:
# prompt: split the data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
668,6,98,58,33,190,34.0,0.430,43
324,2,112,75,32,0,35.7,0.148,21
624,2,108,64,0,0,30.8,0.158,21
690,8,107,80,0,0,24.6,0.856,34
473,7,136,90,0,0,29.9,0.210,50
...,...,...,...,...,...,...,...,...
355,9,165,88,0,0,30.4,0.302,49
534,1,77,56,30,56,33.3,1.251,24
344,8,95,72,0,0,36.8,0.485,57
296,2,146,70,38,360,28.0,0.337,29


In [None]:
# prompt: log regression

model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Accuracy: 0.7402597402597403
Confusion Matrix:
[[78 21]
 [19 36]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# prompt: confusion matrix for train and test

y_train_pred = model.predict(x_train)

print(f"Training Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred)}")
print(f"Testing Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Training Confusion Matrix:
[[353  48]
 [ 93 120]]
Testing Confusion Matrix:
[[78 21]
 [19 36]]


In [None]:
# prompt: classification report train and test


print("Training Classification Report:")
print(classification_report(y_train, y_train_pred))

print("Testing Classification Report:")
print(classification_report(y_test, y_pred))

Training Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       401
           1       0.71      0.56      0.63       213

    accuracy                           0.77       614
   macro avg       0.75      0.72      0.73       614
weighted avg       0.76      0.77      0.76       614

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        99
           1       0.63      0.65      0.64        55

    accuracy                           0.74       154
   macro avg       0.72      0.72      0.72       154
weighted avg       0.74      0.74      0.74       154



In [None]:
# prompt: normalize the data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model_scaled = LogisticRegression()
model_scaled.fit(x_train_scaled, y_train)
y_pred_scaled = model_scaled.predict(x_test_scaled)

print("After Scaling:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_scaled)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_scaled)}")

y_train_pred_scaled = model_scaled.predict(x_train_scaled)

print(f"Training Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred_scaled)}")
print(f"Testing Confusion Matrix:\n{confusion_matrix(y_test, y_pred_scaled)}")

print("Training Classification Report:")
print(classification_report(y_train, y_train_pred_scaled))

print("Testing Classification Report:")
print(classification_report(y_test, y_pred_scaled))


After Scaling:
Accuracy: 0.7532467532467533
Confusion Matrix:
[[79 20]
 [18 37]]
Training Confusion Matrix:
[[354  47]
 [ 94 119]]
Testing Confusion Matrix:
[[79 20]
 [18 37]]
Training Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.88      0.83       401
           1       0.72      0.56      0.63       213

    accuracy                           0.77       614
   macro avg       0.75      0.72      0.73       614
weighted avg       0.76      0.77      0.76       614

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154



In [None]:
# prompt: SMOTE for train dataset

!pip install imbalanced-learn

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

print("Shape of original training data:", x_train.shape)
print("Shape of resampled training data:", x_train_resampled.shape)

print("\nValue counts of original training target:")
print(y_train.value_counts())
print("\nValue counts of resampled training target:")
print(y_train_resampled.value_counts())

Shape of original training data: (614, 8)
Shape of resampled training data: (802, 8)

Value counts of original training target:
Outcome
0    401
1    213
Name: count, dtype: int64

Value counts of resampled training target:
Outcome
0    401
1    401
Name: count, dtype: int64


In [None]:
# prompt: apply log reg

model_resampled = LogisticRegression()
model_resampled.fit(x_train_resampled, y_train_resampled)
y_pred_resampled = model_resampled.predict(x_test)

print("After SMOTE:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_resampled)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_resampled)}")

y_train_pred_resampled = model_resampled.predict(x_train_resampled)

print(f"Training Confusion Matrix:\n{confusion_matrix(y_train_resampled, y_train_pred_resampled)}")
print(f"Testing Confusion Matrix:\n{confusion_matrix(y_test, y_pred_resampled)}")

print("Training Classification Report:")
print(classification_report(y_train_resampled, y_train_pred_resampled))

print("Testing Classification Report:")
print(classification_report(y_test, y_pred_resampled))


# Combine SMOTE and Scaling
smote_scaled = SMOTE(random_state=42)
x_train_resampled_scaled, y_train_resampled_scaled = smote_scaled.fit_resample(x_train_scaled, y_train)

print("\nAfter SMOTE and Scaling:")
print("Shape of original scaled training data:", x_train_scaled.shape)
print("Shape of resampled and scaled training data:", x_train_resampled_scaled.shape)

print("\nValue counts of original scaled training target:")
print(y_train.value_counts())
print("\nValue counts of resampled and scaled training target:")
print(y_train_resampled_scaled.value_counts())

model_resampled_scaled = LogisticRegression()
model_resampled_scaled.fit(x_train_resampled_scaled, y_train_resampled_scaled)
y_pred_resampled_scaled = model_resampled_scaled.predict(x_test_scaled)

print(f"Accuracy: {accuracy_score(y_test, y_pred_resampled_scaled)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_resampled_scaled)}")

y_train_pred_resampled_scaled = model_resampled_scaled.predict(x_train_resampled_scaled)

print(f"Training Confusion Matrix:\n{confusion_matrix(y_train_resampled_scaled, y_train_pred_resampled_scaled)}")
print(f"Testing Confusion Matrix:\n{confusion_matrix(y_test, y_pred_resampled_scaled)}")

print("Training Classification Report:")
print(classification_report(y_train_resampled_scaled, y_train_pred_resampled_scaled))

print("Testing Classification Report:")
print(classification_report(y_test, y_pred_resampled_scaled))

After SMOTE:
Accuracy: 0.7077922077922078
Confusion Matrix:
[[68 31]
 [14 41]]
Training Confusion Matrix:
[[308  93]
 [101 300]]
Testing Confusion Matrix:
[[68 31]
 [14 41]]
Training Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.77      0.76       401
           1       0.76      0.75      0.76       401

    accuracy                           0.76       802
   macro avg       0.76      0.76      0.76       802
weighted avg       0.76      0.76      0.76       802

Testing Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.69      0.75        99
           1       0.57      0.75      0.65        55

    accuracy                           0.71       154
   macro avg       0.70      0.72      0.70       154
weighted avg       0.74      0.71      0.71       154


After SMOTE and Scaling:
Shape of original scaled training data: (614, 8)
Shape of resampled and scaled trai

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
