## Imports

In [None]:
import pandas as pd
import numpy as np
import itertools
import warnings
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Suppress every warning category for the rest of the session so that library
# notices do not clutter cell output.
warnings.simplefilter("ignore")

In [None]:
# Load the raw dataset (path is relative to the notebook's working directory).
diabetes = pd.read_csv("../diabetes.csv")

# In columns 1-6 a literal 0 is converted to NaN so that the imputation cells
# further down can fill it.
# - np.nan is used because the np.NaN alias no longer exists in NumPy 2.x.
# - Assigning by column label (rather than through .iloc) lets pandas swap the
#   integer columns for float columns instead of attempting an in-place upcast.
zeroAsMissingCols = diabetes.columns[1:7]
diabetes[zeroAsMissingCols] = diabetes[zeroAsMissingCols].replace(0, np.nan)
diabetes

# Model to Predict Missing Values in Dataset

In [None]:
# Regression-based imputation: for each feature with missing values, fit a
# linear model on the fully observed rows and use it to predict the missing
# entries.  Results go into `diabetesPredicted`; `diabetes` is not modified.
filledRows = diabetes.dropna()
diabetesPredicted = diabetes.copy()

# Columns that are excluded from imputation.
skipCols = {"Pregnancies", "DiabetesPedigreeFunction", "Age", "Outcome"}

# Column means, used to plug gaps in the *predictor* columns of the rows whose
# target column is being imputed.
colMeans = diabetes.mean()

for col in diabetes.columns:
    if col in skipCols:
        continue

    missingMask = diabetes[col].isna()
    if not missingMask.any():
        # Nothing to impute for this column (predict() rejects empty input).
        continue
    nanRows = diabetes[missingMask]

    # NOTE(review): the predictors include "Outcome", so the label leaks into
    # the imputed feature values — confirm this is intended.
    filledRowsX = filledRows.drop(columns=[col])
    filledRowsY = filledRows[col]  # 1-D target -> 1-D predictions

    # fillna returns a new frame; the previous per-column
    # `nanRowsX[icol].fillna(..., inplace=True)` operated on a temporary and
    # could leave the NaNs in place.
    nanRowsX = nanRows.drop(columns=[col]).fillna(colMeans)

    linRegModel = LinearRegression()
    linRegModel.fit(filledRowsX, filledRowsY)

    linRegPred = linRegModel.predict(nanRowsX)
    diabetesPredicted.loc[missingMask, col] = linRegPred

display(diabetesPredicted)
# One regression model is fitted per column that has missing values.

# Simple Imputer to Replace Missing Values With the Average of the Column

In [None]:
# Mean imputation: each column is passed through the imputer on its own and
# written back into `diabetes`, replacing NaN with that column's mean.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

for columnName in diabetes.columns:
    columnAs2D = diabetes[columnName].values.reshape(-1, 1)  # imputer expects 2-D input
    diabetes[columnName] = imputer.fit_transform(columnAs2D)

diabetes

# Correlation Between All Variables

In [None]:
# Disabled: colour-graded variant of the correlation table.
# NOTE(review): Styler.set_precision appears to have been removed in pandas 2.0
# (replacement: .format(precision=3)) — confirm before re-enabling.
# diabetes.corr().style.background_gradient(cmap='winter_r').set_precision(3)

In [None]:
# Pairwise correlation between all columns (Pearson, the pandas default).
diabetes.corr(method="pearson")

# Balance Out The Data

In [None]:
# Outcome counts in the full dataset, one bar colour per class.
px.histogram(
    diabetes,
    x='Outcome',
    color='Outcome',
    title='Visualization of Bias Before',
    width=500,
    height=500,
    template='plotly_dark',
    color_discrete_sequence=['#F63366', '#00CC96'],
)

In [None]:
# Undersample so both outcomes are equally represented: sort the rows with
# Outcome == 1 to the top, then keep them plus an equal number of the rows
# that follow.
diabetesUB = diabetes.sort_values("Outcome", ascending=False).reset_index(drop=True)

# Count the positives on the sorted frame itself.  Indexing diabetesUB with a
# mask built from `diabetes` only gave the right count because pandas re-aligns
# the boolean Series by index label (and warns about it).
oneCount = int((diabetesUB["Outcome"] == 1).sum())

# NOTE(review): which majority-class rows survive depends on the sort order,
# not on a random draw — confirm that is acceptable.
diabetesUB = diabetesUB.iloc[: 2 * oneCount, :]
diabetesUB

In [None]:
# Outcome counts in the undersampled frame, one bar colour per class.
px.histogram(
    diabetesUB,
    x='Outcome',
    color='Outcome',
    title='Visualization of Bias After',
    width=500,
    height=500,
    template='plotly_dark',
    color_discrete_sequence=['#F63366', '#00CC96'],
)

# Splitting data for training and testing

In [None]:
# Separate predictors from the "Outcome" label, for the full frame ...
diabetes_y = diabetes["Outcome"]
diabetes_x = diabetes.drop("Outcome", axis=1)

# ... and for the undersampled frame.
diabetesUB_y = diabetesUB['Outcome']
diabetesUB_x = diabetesUB.drop("Outcome", axis=1)

In [None]:
# Standardise both feature sets.  The same scaler object is fitted twice, so
# after this cell `scaler` holds the statistics of the undersampled data only.
scaler = StandardScaler()

X_scaled = scaler.fit(diabetes_x).transform(diabetes_x)
X_scaledUB = scaler.fit(diabetesUB_x).transform(diabetesUB_x)

In [None]:
# Hold out 20 % of the full dataset for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    diabetes_y,
    test_size=0.2,
    random_state=60,
)

# Hold out 30 % of the undersampled dataset for testing.
X_trainUB, X_testUB, Y_trainUB, Y_testUB = train_test_split(
    X_scaledUB,
    diabetesUB_y,
    test_size=0.3,
    random_state=60,
)

# Detecting Outliers

In [None]:
# Disabled alternative: one separate box plot per feature, coloured by Outcome.
# for col in diabetesUB_x.columns:
#     fig = px.box(diabetesUB_x,y=col,color=diabetesUB["Outcome"],title=col,width=500,height=500,template="plotly_dark")
#     fig.show()

In [None]:
# Box plots for five of the features, drawn side by side in one figure with
# every individual observation shown next to its box.
fig = go.Figure()

for boxCol in ("Pregnancies", "SkinThickness", "BMI", "Age", "DiabetesPedigreeFunction"):
    fig.add_trace(go.Box(y=diabetesUB[boxCol], name=boxCol))

fig.update_traces(boxpoints='all', jitter=0)
fig.show()

In [None]:
# Box plots for Glucose and BloodPressure, with every individual observation
# shown next to its box.
fig = go.Figure()

for boxCol in ("Glucose", "BloodPressure"):
    fig.add_trace(go.Box(y=diabetesUB[boxCol], name=boxCol))

fig.update_traces(boxpoints='all', jitter=0)
fig.show()

In [None]:
# One figure with a box per feature; "Insulin" is left out.
fig = go.Figure()

plottedCols = [name for name in diabetesUB_x.columns if name != "Insulin"]
for col in plottedCols:
    fig.add_trace(go.Box(y=diabetesUB[col], name=col))

# fig.update_traces(boxpoints='all', jitter=0)
fig.show()

# Training and Predicting

In [None]:
# Feature names for the importance plots.  They are read from the same eight
# leading columns that the training cells select with .iloc[:, :8], so every
# bar is labelled with the column the model actually saw.  The previous
# hard-coded list put "Age" before "BMI" and "DiabetesPedigreeFunction", which
# mislabels the bars whenever the CSV stores those columns in another order.
features = list(diabetesPredicted.columns[:8])

# Empty result tables; each training cell appends one row per model.
scoreRelationship = pd.DataFrame(columns=["Model", "Accuracy Scores"])
f1Scores = pd.DataFrame(columns=["Model", "F1 Score 0", "F1 Score 1"])
ReScores = pd.DataFrame(columns=["Model", "Recall Score 0", "Recall Score 1"])
PrScores = pd.DataFrame(columns=["Model", "Precision Score 0", "Precision Score 1"])

In [None]:
def allScores(Y_test, predict):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    Y_test : array-like
        True labels.
    predict : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(precision, recall, f1, accuracy)``; the first three are computed
        with ``average=None`` (one value per class), accuracy is a single
        number.
    """
    return (
        precision_score(Y_test, predict, average=None),
        recall_score(Y_test, predict, average=None),
        f1_score(Y_test, predict, average=None),
        accuracy_score(Y_test, predict),
    )

## Linear Regression Training

In [None]:
# Linear regression used as a classifier, evaluated with 5-fold
# cross-validation: the continuous prediction is turned into a 0/1 label by
# comparing it with a fixed cut-off of 0.55.
accScores = []
f10, f11 = [], []
re0, re1 = [], []
pr0, pr1 = [], []

kfold = KFold(n_splits=5, shuffle=True, random_state=20)
for train, test in kfold.split(diabetesPredicted):
    # Columns 0-7 are the predictors, column 8 is used as the target.
    train_x = diabetesPredicted.iloc[train, :8]
    train_y = diabetesPredicted.iloc[train, 8]
    test_x = diabetesPredicted.iloc[test, :8]
    test_y = diabetesPredicted.iloc[test, 8]

    linmodel = LinearRegression()
    linmodel.fit(train_x, train_y)
    linear_pred = (linmodel.predict(test_x) > 0.55) * 1

    # One call yields every metric for this fold (index 0 / 1 = class 0 / 1).
    prsc, resc, f1sc, acc = allScores(test_y, linear_pred)
    print((prsc, resc, f1sc, acc))
    accScores.append(acc)
    f10.append(f1sc[0])
    f11.append(f1sc[1])
    re0.append(resc[0])
    re1.append(resc[1])
    pr0.append(prsc[0])
    pr1.append(prsc[1])

# Only the last fold's importance is plotted, so it is computed once here
# rather than on every iteration; the seed makes the shuffles repeatable.
linmodel_fi = permutation_importance(linmodel, train_x, train_y, random_state=20)

# Averages are taken over the folds actually run instead of a hard-coded 5.
scoreRelationship = pd.concat(
    [scoreRelationship,
     pd.DataFrame({"Model": "Linear Regression", "Accuracy Scores": np.mean(accScores)}, index=[0])],
    ignore_index=True)
f1Scores = pd.concat(
    [f1Scores,
     pd.DataFrame({"Model": "Linear Regression", "F1 Score 0": np.mean(f10), "F1 Score 1": np.mean(f11)}, index=[0])],
    ignore_index=True)
ReScores = pd.concat(
    [ReScores,
     pd.DataFrame({"Model": "Linear Regression", "Recall Score 0": np.mean(re0), "Recall Score 1": np.mean(re1)}, index=[0])],
    ignore_index=True)
PrScores = pd.concat(
    [PrScores,
     pd.DataFrame({"Model": "Linear Regression", "Precision Score 0": np.mean(pr0), "Precision Score 1": np.mean(pr1)}, index=[0])],
    ignore_index=True)

# Label the bars with the columns the model was trained on, so names and
# importances cannot get out of step.
tmp = pd.DataFrame({'Feature': list(train_x.columns), 'Feature importance': abs(linmodel_fi['importances_mean'])})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
fig=px.bar(tmp,x='Feature',y='Feature importance',color='Feature importance',title="Features Importance of Linear Regression Model",
            labels=dict(x="Feature",y="Feature importance",color="Feature importance"),color_continuous_midpoint=0.8,
            width=600,height=600,template="plotly_dark")
fig.show()

# Confusion matrix of the last fold only.
cmLin = confusion_matrix(test_y, linear_pred)
dispLin = ConfusionMatrixDisplay(confusion_matrix=cmLin)
dispLin.plot()
plt.show()

## Logistic Regression Training

In [None]:
# Logistic regression evaluated with 5-fold cross-validation.  A sample is
# labelled 1 when its predicted probability for class 1 exceeds 0.6.
accScores = []
f10, f11 = [], []
re0, re1 = [], []
pr0, pr1 = [], []

kfold = KFold(n_splits=5, shuffle=True, random_state=20)
for train, test in kfold.split(diabetesPredicted):
    # Columns 0-7 are the predictors, column 8 is used as the target.
    train_x = diabetesPredicted.iloc[train, :8]
    train_y = diabetesPredicted.iloc[train, 8]
    test_x = diabetesPredicted.iloc[test, :8]
    test_y = diabetesPredicted.iloc[test, 8]

    logreg_model = LogisticRegression(max_iter=30000)
    logreg_model.fit(train_x, train_y)

    # predict_proba has one column per class; column 1 is the positive class.
    logreg_pred = (logreg_model.predict_proba(test_x) > 0.6) * 1
    logreg_labels = logreg_pred[:, 1]

    # One call yields every metric for this fold (index 0 / 1 = class 0 / 1).
    prsc, resc, f1sc, acc = allScores(test_y, logreg_labels)
    print((prsc, resc, f1sc, acc))
    accScores.append(acc)
    f10.append(f1sc[0])
    f11.append(f1sc[1])
    re0.append(resc[0])
    re1.append(resc[1])
    pr0.append(prsc[0])
    pr1.append(prsc[1])

# Importance of the *logistic* model from the last fold.  This previously
# evaluated `linmodel`, a leftover from the linear-regression cell, so the
# chart showed the wrong model.  The seed makes the shuffles repeatable.
logmodel_fi = permutation_importance(logreg_model, train_x, train_y, random_state=20)

# Averages are taken over the folds actually run instead of a hard-coded 5.
scoreRelationship = pd.concat(
    [scoreRelationship,
     pd.DataFrame({"Model": "Logistic Regression", "Accuracy Scores": np.mean(accScores)}, index=[0])],
    ignore_index=True)
f1Scores = pd.concat(
    [f1Scores,
     pd.DataFrame({"Model": "Logistic Regression", "F1 Score 0": np.mean(f10), "F1 Score 1": np.mean(f11)}, index=[0])],
    ignore_index=True)
ReScores = pd.concat(
    [ReScores,
     pd.DataFrame({"Model": "Logistic Regression", "Recall Score 0": np.mean(re0), "Recall Score 1": np.mean(re1)}, index=[0])],
    ignore_index=True)
PrScores = pd.concat(
    [PrScores,
     pd.DataFrame({"Model": "Logistic Regression", "Precision Score 0": np.mean(pr0), "Precision Score 1": np.mean(pr1)}, index=[0])],
    ignore_index=True)


# Label the bars with the columns the model was trained on, so names and
# importances cannot get out of step.
tmp = pd.DataFrame({'Feature': list(train_x.columns), 'Feature importance': logmodel_fi['importances_mean']})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
fig=px.bar(tmp,x='Feature',y='Feature importance',color='Feature importance',title="Features Importance of Logistic Regression Model",
            labels=dict(x="Feature",y="Feature importance",color="Feature importance"),color_continuous_midpoint=0.8,
            width=600,height=600,template="plotly_dark")
fig.show()

# Confusion matrix of the last fold only.
cmLog = confusion_matrix(test_y, logreg_labels)
dispLog = ConfusionMatrixDisplay(confusion_matrix=cmLog)
dispLog.plot()
plt.show()

## Forest Training

In [None]:
# Random forest (100 trees, depth <= 4) evaluated with 5-fold cross-validation.
accScores = []
f10, f11 = [], []
re0, re1 = [], []
pr0, pr1 = [], []

kfold = KFold(n_splits=5, shuffle=True, random_state=20)
for train, test in kfold.split(diabetesPredicted):
    # Columns 0-7 are the predictors, column 8 is used as the target.
    train_x = diabetesPredicted.iloc[train, :8]
    train_y = diabetesPredicted.iloc[train, 8]
    test_x = diabetesPredicted.iloc[test, :8]
    test_y = diabetesPredicted.iloc[test, 8]

    # Seeded so that scores and importances are the same on every run; without
    # it only the fold split was reproducible.
    forestModel = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=20)
    forestModel.fit(train_x, train_y)
    forestModelPred = forestModel.predict(test_x)

    # One call yields every metric for this fold (index 0 / 1 = class 0 / 1).
    prsc, resc, f1sc, acc = allScores(test_y, forestModelPred)
    print((prsc, resc, f1sc, acc))
    accScores.append(acc)
    f10.append(f1sc[0])
    f11.append(f1sc[1])
    re0.append(resc[0])
    re1.append(resc[1])
    pr0.append(prsc[0])
    pr1.append(prsc[1])

# Averages are taken over the folds actually run instead of a hard-coded 5.
scoreRelationship = pd.concat(
    [scoreRelationship,
     pd.DataFrame({"Model": "Random Forest Classifier", "Accuracy Scores": np.mean(accScores)}, index=[0])],
    ignore_index=True)
f1Scores = pd.concat(
    [f1Scores,
     pd.DataFrame({"Model": "Random Forest Classifier", "F1 Score 0": np.mean(f10), "F1 Score 1": np.mean(f11)}, index=[0])],
    ignore_index=True)
ReScores = pd.concat(
    [ReScores,
     pd.DataFrame({"Model": "Random Forest Classifier", "Recall Score 0": np.mean(re0), "Recall Score 1": np.mean(re1)}, index=[0])],
    ignore_index=True)
PrScores = pd.concat(
    [PrScores,
     pd.DataFrame({"Model": "Random Forest Classifier", "Precision Score 0": np.mean(pr0), "Precision Score 1": np.mean(pr1)}, index=[0])],
    ignore_index=True)


# Impurity-based importances of the last fold's forest, labelled with the
# columns the model was trained on so names and values cannot get out of step.
tmp = pd.DataFrame({'Feature': list(train_x.columns), 'Feature importance': forestModel.feature_importances_})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
fig=px.bar(tmp,x='Feature',y='Feature importance',color='Feature importance',title="Features Importance of Forest Model",
            labels=dict(x="Feature",y="Feature importance",color="Feature importance"),color_continuous_midpoint=0.8,
            width=600,height=600,template="plotly_dark")
fig.show()

# Confusion matrix of the last fold only.
cmFor = confusion_matrix(test_y, forestModelPred)
dispFor = ConfusionMatrixDisplay(confusion_matrix=cmFor)
dispFor.plot()
plt.show()

In [None]:
# Render the four summary tables (accuracy, F1, recall, precision) in order.
for scoreTable in (scoreRelationship, f1Scores, ReScores, PrScores):
    display(scoreTable.style)

# Graph of our Tree Model

In [None]:
# Disabled: drawing of the fitted tree model.
# NOTE(review): plot_tree expects a single decision tree, whereas forestModel is
# a RandomForestClassifier — presumably one member such as
# forestModel.estimators_[0] has to be passed instead; confirm before re-enabling.
# plt.figure(figsize=(20,10))
# plot_tree(forestModel,max_depth=3,fontsize=10,feature_names=train_x.columns.to_list())
# plt.show()

![alt text](https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/7304a882-5280-4444-9156-b330524036c6/d60uxie-5cd0ab54-acce-4454-818f-b6b05dc0e12f.jpg?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzczMDRhODgyLTUyODAtNDQ0NC05MTU2LWIzMzA1MjQwMzZjNlwvZDYwdXhpZS01Y2QwYWI1NC1hY2NlLTQ0NTQtODE4Zi1iNmIwNWRjMGUxMmYuanBnIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.YwRvA-feO5GcHeVtp8FAF3ECswTyouAREnVh8Pop3EI)

Dataset source: https://www.kaggle.com/datasets/ashishkumarjayswal/diabetes-dataset

# TODO LIST

* Code reward system from scratch
* Create feature importance graph for each model (Done)
* Train model to predict missing values in original dataset (Done)
