In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [37]:
# Read the diabetes CSV into the notebook-wide DataFrame `df`.
# NOTE(review): the absolute /content/... path looks like a Google Colab runtime location — confirm before running elsewhere.
csv_path = '/content/sample_data/diabetes.csv'
df = pd.read_csv(csv_path)

In [38]:
# Quick look at the raw data: the first rows, then dtypes and non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()
# print(df.describe())
# print(df.isnull().sum())


   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72  ...                     0.627   50        1
1            1       85             66  ...                     0.351   31        0
2            8      183             64  ...                     0.672   32        1
3            1       89             66  ...                     0.167   21        0
4            0      137             40  ...                     2.288   33        1

[5 rows x 9 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-n

In [39]:
# Replace 0s with np.nan in specific columns, then fill them with the column mean

def handle_missing_values(frame=None, cols_with_zeros=None):
  """Treat zeros in the selected columns as missing and impute the column mean.

  The frame is modified in place; nothing is returned.

  Args:
    frame: DataFrame to clean. Defaults to the notebook-level ``df`` so the
      existing zero-argument call keeps working.
    cols_with_zeros: Column names in which a value of 0 is treated as missing.
      Defaults to Glucose, BloodPressure, SkinThickness, Insulin and BMI.

  NOTE(review): the imputer is fit on every row of the frame it receives. When
  this runs on the full dataset before the train/test split, the imputed means
  are computed with test rows included — consider imputing inside the model
  pipeline instead.
  """
  target = df if frame is None else frame
  if cols_with_zeros is None:
    cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
  # Normalise to a list so any iterable of names can be used for column selection.
  cols_with_zeros = list(cols_with_zeros)

  # Step 1: mark zeros as NaN so the imputer recognises them as missing.
  target[cols_with_zeros] = target[cols_with_zeros].replace(0, np.nan)

  # Step 2: fill each NaN with the mean of the remaining values in its column.
  imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  target[cols_with_zeros] = imputer.fit_transform(target[cols_with_zeros])

In [40]:
handle_missing_values()

In [54]:
# Separate the predictor columns from the 'Outcome' target column.
x = df.drop(columns=['Outcome'])
# Keep the column names as an array so plots can label features by position.
x_features = np.array(list(x.columns))
X = x.to_numpy()
y = df["Outcome"].to_numpy()

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.15,
  random_state=42,
)
# print(X.shape)
# print(y.shape)

In [None]:
# One scatter panel per feature (feature value vs. outcome), two panels per row.
# The grid is derived from the number of feature columns instead of being
# hard-coded, so the loop cannot index past the last column of X_train.
n_features = X_train.shape[1]
n_cols = 2
n_rows = max(1, -(-n_features // n_cols))  # ceiling division
# squeeze=False keeps `ax` a 2-D array even for a single row, so flatten() always works.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(30, 15), squeeze=False)
ax = ax.flatten()
for i, panel in enumerate(ax):
  if i >= n_features:
    # Odd feature count: hide the unused trailing panel.
    panel.set_visible(False)
    continue
  panel.scatter(X_train[:, i], y_train)
  panel.set_ylabel("Outcome")
  panel.set_xlabel(x_features[i])
plt.show()


In [55]:
def normalizing_data():
  """Standardize the train and test feature matrices using training statistics.

  Reads the notebook-level ``X_train`` and ``X_test`` arrays, fits a
  ``StandardScaler`` on the training split only, and applies that same fitted
  scaler to both splits. Also prints the per-column peak-to-peak range before
  and after scaling, plus the shape and first row of the scaled test matrix.

  Returns:
    tuple: ``(x_norm, x_test_norm, scaler)`` — the scaled training matrix, the
    scaled test matrix, and the scaler fitted on the training split.
  """
  scaler = StandardScaler()
  x_norm = scaler.fit_transform(X_train)
  # Only transform the test split. Refitting here would scale the test data
  # with its own mean/std and would leave the returned scaler fitted on the
  # test split rather than the training split.
  x_test_norm = scaler.transform(X_test)
  print(f"peak to peak range by column  x:{np.ptp(X_train, axis=0)}")
  print(f"peak to peak range by column in normalization  x:{np.ptp(x_norm, axis=0)}")
  print(x_test_norm.shape)
  print(x_test_norm[0])
  return x_norm, x_test_norm, scaler

# x_norm, x_test_norm, scaler= normalizing_data()


In [None]:
def training_model(X_train, y_train):
  """Fit a scale-then-classify pipeline on the given training data.

  Args:
    X_train: Training feature matrix.
    y_train: Training labels.

  Returns:
    The fitted ``Pipeline`` with steps ``'scaler'`` (StandardScaler) and
    ``'lr_model'`` (LogisticRegression).
  """
  steps = [
    ('scaler', StandardScaler()),
    ("lr_model", LogisticRegression()),
  ]
  model = Pipeline(steps)
  model.fit(X_train, y_train)
  return model


lr_model = training_model(X_train, y_train)

In [59]:
# In-sample predictions; print the first training row and compare the first
# eight predictions against the true labels.
y_pred = lr_model.predict(X_train)
print(X_train[0])
print("Prediction on training set:", y_pred[:8])
print("actual value of training set:", y_train[:8])

# Predictions for the held-out split.
y_test_pred = lr_model.predict(X_test)

# Single hand-entered record with eight values.
# NOTE(review): presumably in the same order as the training feature columns — confirm.
sample = [[1., 139., 46., 19., 83., 28.7, 0.654, 22.]]
pred = lr_model.predict(sample)
print(pred)

[  3.         162.          52.          38.         155.54822335
  37.2          0.652       24.        ]
Prediction on training set: [1 0 0 0 0 0 0 0]
actual value of training set: [1 0 0 0 0 0 1 0]
[0]


In [None]:
# Confusion matrix for the training-set predictions (true labels first, predictions second).
cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
disp = ConfusionMatrixDisplay(
  display_labels=['No Diabetes', 'Diabetes'],
  confusion_matrix=cm,
)
disp.plot(cmap='Blues')

In [57]:
# Fraction of correctly classified rows on the training and test splits.
# accuracy_score's signature is (y_true, y_pred); both calls now use that
# order. The score itself is unchanged because accuracy is symmetric.
x_train_accuracy = accuracy_score(y_train, y_pred)
print(x_train_accuracy)
x_test_accuracy = accuracy_score(y_test, y_test_pred)
print(x_test_accuracy)

0.7760736196319018
0.7586206896551724


In [None]:
# Confusion matrix for the held-out split.
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predictions. Passing them the other way round transposes the
# matrix, so false positives and false negatives trade places under the
# "True label" / "Predicted label" axes drawn by ConfusionMatrixDisplay.
cm = confusion_matrix(y_test, y_test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Diabetes', 'Diabetes'])
disp.plot(cmap='Blues')


In [58]:
import pickle

# Serialize the fitted pipeline to disk so it can be reloaded without retraining.
model_path = 'trained_diabetes-0.1.0.pkl'
with open(model_path, mode='wb') as out_file:
  pickle.dump(lr_model, out_file)