In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [None]:
# Load the dataset; the path is relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="Diabetes Classification.csv")

In [None]:
# Preview the first five rows (the pandas default, spelled out).
df.head(n=5)

In [None]:
# Preview the last five rows (the pandas default, spelled out).
df.tail(n=5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column (isna is the alias of isnull).
df.isna().sum()

In [None]:
# Class distribution of the target column.
df.loc[:, 'Diagnosis'].value_counts()

In [None]:
# Columns holding text categories that must become integer codes before modelling.
categorical_columns = ['Gender']

# A single encoder is refit for every listed column, so after the loop `le` only
# remembers the classes of the last column processed.
# NOTE(review): the interactive cell later in the notebook hard-codes male -> 1;
# confirm that this matches le.classes_ for the Gender column.
le = LabelEncoder()
for col in categorical_columns:
    encoded = le.fit_transform(df[col])
    df[col] = encoded

In [None]:
# Target vector: the Diagnosis column.
y = df['Diagnosis']
# Feature matrix: every remaining column.
X = df.drop(columns=['Diagnosis'])

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed for reproducibility.
# stratify=y keeps the Diagnosis class proportions the same in the train and test
# partitions; without it a random split of an imbalanced target can leave the test
# set with a class ratio that differs from the data as a whole.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Oversample the training partition only; the test partition is left untouched.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled


In [None]:
# Random-forest configuration, gathered in one mapping so each setting is labelled.
rf_settings = dict(
    n_estimators=100,         # number of trees
    max_depth=15,             # maximum tree depth
    class_weight='balanced',  # weight classes inversely to their frequency
    random_state=42,          # fixed seed for reproducibility
)
# NOTE(review): the forest is later fitted on SMOTE-resampled data, so the
# 'balanced' weighting is presumably a no-op there — confirm it is still wanted.
random_forest = RandomForestClassifier(**rf_settings)

In [None]:
# Train the forest on the SMOTE-resampled training data.
random_forest.fit(X=X_train_res, y=y_train_res)

In [None]:
# Hard class predictions of the forest on the held-out test features.
predict = random_forest.predict(X=X_test)

In [None]:
# Score the random-forest predictions against the held-out labels.
model_accuracy = accuracy_score(y_true=y_test, y_pred=predict)
model_precision = precision_score(y_true=y_test, y_pred=predict)
model_recall = recall_score(y_true=y_test, y_pred=predict)
model_f1 = f1_score(y_true=y_test, y_pred=predict)

# Report the four metrics in the same order they were computed.
print("Model Accuracy: ", model_accuracy)
print("Model Precision: ", model_precision)
print("Model Recall Score: ", model_recall)
print("Model f1 score: ", model_f1)

In [None]:
# The booster is fitted on the SMOTE-resampled training data, so the positive-class
# weight must be derived from that same data. Using the ratio of the original
# (pre-SMOTE) labels would compensate for the imbalance a second time and bias the
# model towards the positive class.
negative_count = int((y_train_res == 0).sum())
positive_count = int((y_train_res == 1).sum())
# Guard against a training set without positive rows instead of dividing by zero.
class_ratio = negative_count / positive_count if positive_count else 1.0

xgboost_model = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    scale_pos_weight=class_ratio,  # negatives / positives of the data passed to fit
    random_state=42,
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
    # confirm against the installed version before removing it.
    use_label_encoder=False,
    eval_metric="logloss"
)

xgboost_model.fit(X_train_res, y_train_res)

In [None]:
# Hard class predictions of the boosted model on the held-out test features.
xgb_predict = xgboost_model.predict(X=X_test)

In [None]:
# Score the XGBoost predictions against the held-out labels.
xgb_model_accuracy = accuracy_score(y_true=y_test, y_pred=xgb_predict)
xgb_model_precision = precision_score(y_true=y_test, y_pred=xgb_predict)
xgb_model_recall = recall_score(y_true=y_test, y_pred=xgb_predict)
xgb_model_f1 = f1_score(y_true=y_test, y_pred=xgb_predict)

# Report the four metrics in the same order they were computed.
print("Model Accuracy: ", xgb_model_accuracy)
print("Model Precision: ", xgb_model_precision)
print("Model Recall Score: ", xgb_model_recall)
print("Model f1 score: ", xgb_model_f1)

In [None]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Probability of the positive class (second column) from the random forest.
y_probs = random_forest.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# precision and recall each carry one more entry than thresholds, so the final
# element is dropped to align every curve with the threshold axis.
for curve, curve_label in ((precision, "Precision"), (recall, "Recall")):
    plt.plot(thresholds, curve[:-1], label=curve_label)

plt.xlabel("Threshold")
plt.ylabel("Score")
plt.legend()
plt.show()

In [None]:
# Collect one patient's measurements interactively. int()/float() raise ValueError
# when the typed text is not a valid number.
age = int(input("Enter age: "))
gender = input("Enter your gender (Male / Female): ")
bmi = float(input("Enter your BMI: "))
chol = float(input("Enter your cholesterol level: "))
tg = float(input("Enter the TG level: "))
hdl = float(input("Enter the hdl level: "))
ldl = float(input("Enter the ldl level: "))
cr = float(input("Enter the cr level: "))
bun = float(input("Enter the BUN level: "))

# Normalise the answer so surrounding spaces and letter case do not matter, and
# reject anything unrecognised rather than silently treating it as female.
gender_key = gender.strip().lower()
if gender_key in ('male', 'm'):
    gender_low = 1
elif gender_key in ('female', 'f'):
    gender_low = 0
else:
    raise ValueError(
        f"Unrecognised gender {gender!r}; expected 'Male' or 'Female'."
    )
# NOTE(review): the 1/0 codes assume the training-time label encoding mapped
# male -> 1 and female -> 0 — confirm against the fitted encoder's classes.

In [None]:
# Pack the answers into a single-row 2-D array, the shape predict() expects.
# NOTE(review): the value order presumably mirrors the training feature columns —
# verify against X.columns.
feature_row = [age, gender_low, bmi, chol, tg, hdl, ldl, cr, bun]
user_input = np.array([feature_row])

In [None]:
# Class prediction of the boosted model for the single user-supplied row.
predictions = xgboost_model.predict(X=user_input)

In [None]:
# Echo every entered value, one per line, each followed by a blank line.
print("Diagnosis Details: \n")
for caption, reading in (
    ("Age: ", age),
    ("Gender: ", gender),
    ("BMI: ", bmi),
    ("Cholesterol Level: ", chol),
    ("TG level: ", tg),
    ("HDL Level: ", hdl),
    ("LDL Level: ", ldl),
    ("CR level: ", cr),
    ("BUN level: ", bun),
):
    print(caption, reading, "\n")
print("Possibility to have diabetes: ")


# A predicted class of 1 is reported as likely diabetic; anything else as unlikely.
verdict = (
    "The patient is likely to have diabetes."
    if predictions[0] == 1
    else "The patient is unlikely to have diabetes."
)
print(verdict)

In [None]:
# Class probabilities for the single user row: one row, one column per class.
probability = xgboost_model.predict_proba(user_input)

no_disease_prob = probability[0, 0]  # first class column: no diabetes
disease_prob = probability[0, 1]     # second class column: probable diabetes

In [None]:
# Slice sizes, captions and colours, in matching order.
sizes = [no_disease_prob, disease_prob]
labels = ['Non Diabetic', 'Diabetic']
colors = ["#EFEFC5","#A6B9FF"]  # pale yellow = non diabetic, light blue = diabetic


# Pie chart of the two predicted probabilities for the entered patient.
plt.figure(figsize=(6,6))
plt.pie(
    sizes,
    labels=labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
plt.title('Probability of Diabetes')
plt.show()

In [None]:
# Labels and probabilities
labels = ['Non Diabetic', 'Diabetic']
probabilities = [no_disease_prob, disease_prob]
colors = ["#789DC5","#D594CD"]  # blue = non diabetic, pink = diabetic

# Plot
plt.figure(figsize=(8,6))
plt.bar(labels, probabilities, color=colors)
plt.ylim(0, 1)  # probability ranges from 0 to 1
plt.ylabel('Probability')
# The notebook predicts diabetes, so the title names that condition (it previously
# said "Heart Disease"); this also matches the pie chart's title.
plt.title('Probability of Diabetes')

# Show probabilities on top of bars
for i, v in enumerate(probabilities):
    plt.text(i, v + 0.02, f"{v*100:.1f}%", ha='center', fontweight='bold')

plt.show()