### Libraries to be used

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

Reading the Data

In [None]:
# Load the raw dataset. The file name literal contains a space before ".csv";
# it must match the name of the file on disk exactly.
df = pd.read_csv("Dataset of Diabetes .csv")

### Examining the Data

In [None]:
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Preview the last rows as well.
df.tail()

As we can see, we have 1000 rows(data) that we will work on, also we have two nominal data columns(Gender and Class)

We also have two columns that we won't be using during the training phase (ID and No_Pation)

In [None]:
# Column names, dtypes and non-null counts.
df.info()

We have a dataset containing 14 columns and 1000 rows with no missing values. Out of the 14 columns, 12 are numerical and likely won't require encoding, while the remaining 2 are of object type and may need to be encoded.

In [None]:
# Summary statistics of the columns.
df.describe()

Let's check for null values

In [None]:
# Number of missing values per column.
df.isnull().sum()

No null values

Now let's check for duplicates

In [None]:
# Flag rows that exactly repeat an earlier row, report how many there are,
# and display them (reusing the mask instead of recomputing it).
duplicates = df.duplicated()
print(duplicates.sum())
df.loc[duplicates]

As we can see we don't have any duplicates

Let's drop useless columns, such as id and No_Pation

Instead of dropping data from original dataset, we will use the iloc method and save the data to be used in new variable called df_cleaned, that we will be using from now

In [None]:
# Keep every column from position 2 onwards, i.e. leave out the two leading
# identifier columns. .copy() gives df_cleaned its own data, so the column
# assignments made further down modify df_cleaned itself rather than a slice
# of df (which would raise SettingWithCopyWarning and may not stick).
df_cleaned = df.iloc[:, 2:].copy()
df_cleaned

### Encoding

In [None]:
# One encoder per categorical column, so each keeps its own label mapping.
label_Gender = LabelEncoder()
label_Class = LabelEncoder()

Check original unique values before mapping

In [None]:
# Show the raw category labels exactly as they appear in the file,
# before any normalisation is applied.
print("Original unique values in 'Gender':", df_cleaned['Gender'].unique())
print("Original unique values in 'CLASS':", df_cleaned['CLASS'].unique())

As we can see, there are some data-entry errors: we have both 'F' and 'f', as well as 'N' and 'N ', etc.
All of these need to be fixed before encoding these nominal columns.


First in 'Gender'; we have females and Males, so we must have encoded values of 'F' and 'M', let's handle this

In [None]:
# Normalise the labels: trim surrounding whitespace and upper-case them, so
# that variants such as 'f' and 'F' collapse into a single category.
df_cleaned['Gender'] = df_cleaned['Gender'].str.strip().str.upper()

In [None]:
# Verify the normalisation. These are the cleaned labels, so the message says
# "Cleaned" rather than "Original".
print("Cleaned unique values in 'Gender':", df_cleaned['Gender'].unique())

Now in 'CLASS'; we shall have unique values of Diabetic, Non-Diabetic, or Predict-Diabetic

In [None]:
# Same normalisation as for Gender: trim surrounding whitespace and
# upper-case, so that variants such as 'N' and 'N ' become one category.
df_cleaned['CLASS'] = df_cleaned['CLASS'].str.strip().str.upper()

In [None]:
# Verify the normalisation. These are the cleaned labels, so the message says
# "Cleaned" rather than "Original".
print("Cleaned unique values in 'CLASS':", df_cleaned['CLASS'].unique())

Now let's move on to the next part and encode these values, we shall have encoded values of 0 and 1 in the Gender attribute and 0, 1 and 2 in the CLASS attribute

In [None]:
# Replace the string labels with integer codes. Each fitted encoder keeps the
# code -> label mapping in its classes_ attribute (position i is code i).
df_cleaned["Gender"] = label_Gender.fit_transform(df_cleaned["Gender"])
df_cleaned["CLASS"] = label_Class.fit_transform(df_cleaned["CLASS"])

In [None]:
# Encoded values now present in each categorical column
for column in ("Gender", "CLASS"):
    print(f"Unique values in '{column}':", df_cleaned[column].unique())

# Integer code -> original label, read back from each fitted encoder
for title, encoder in (("Gender", label_Gender), ("Class", label_Class)):
    print(f"\n{title} mapping:")
    for code, category in enumerate(encoder.classes_):
        print(f"{code} -> {category}")

### Outlier Detection

We will start by plotting the box plot for each attribute

In [None]:
# Columns shown in the box-plot grids (12 entries, one per cell of the 3x4 layout).
attributes = ['Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS']

In [None]:
# One box plot per attribute on a 3x4 grid, before any outlier handling.
fig, axes = plt.subplots(3, 4, figsize=(16, 12))

for ax, col in zip(axes.flat, attributes):
    sns.boxplot(y=df_cleaned[col], ax=ax)
    ax.set_title(f'Box Plot - {col}')

fig.tight_layout()
plt.show()

We won't apply outlier detection to the Gender and CLASS attributes, as that would be meaningless: outliers only make sense for continuous data as they’re values that are way off the usual range, but Gender (0/1) and CLASS (0/1/2) are categories encoded as numbers, not continuous numbers.

In [None]:
def remove_outliers_iqr(data, columns):
    """Drop every row that is an IQR outlier in at least one of ``columns``.

    For each column the bounds ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are
    computed on the full, unfiltered ``data``. A row is kept only if its value
    lies inside the bounds for every listed column. Rows whose value is NaN in
    a checked column fail the comparison and are dropped as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified. Any index is supported.
    columns : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with their original index
        labels preserved.

    Side effects
    ------------
    Prints the number of out-of-range values per column and the total number
    of rows removed.
    """
    # Build the mask on data's own index. A fresh RangeIndex would only line up
    # with frames that still carry the default 0..n-1 index; for any other
    # index `mask &= col_mask` would misalign and the final selection would fail.
    mask = pd.Series(True, index=data.index)
    initial_len = len(data)
    for column in columns:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        col_mask = (data[column] >= lower) & (data[column] <= upper)
        removed = (~col_mask).sum()
        print(f"Outliers removed in '{column}': {removed}")
        # A row survives only if it is in range for every column checked so far.
        mask &= col_mask
    final_len = mask.sum()
    print(f"\nTotal rows after outlier removal: {final_len} (removed {initial_len - final_len} rows)")
    # Return a real copy so callers can assign into the result without
    # triggering pandas' SettingWithCopyWarning.
    return data.loc[mask].copy()

The mask lets us check all columns at once against quartiles computed on the full dataset before removing anything, instead of trimming the data down repeatedly. A row is dropped if it falls outside the IQR bounds in at least one column. Note that this is a per-column test while outliers can be multidimensional: a row might look like an outlier on column A alone, but if its values in columns B and C are perfectly normal, we might want to reconsider removing it.
Filtering column by column and reducing the dataframe after each step can over-filter: the quartiles of each later column would be recomputed on already-trimmed data, throwing out rows that would pass a combined all-columns check.

In [None]:
# Continuous measurements only; the encoded categorical columns (Gender, CLASS)
# are deliberately left out.
numerical_columns = ['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']

In [None]:
# Overwrites df_cleaned with the filtered rows. Re-running this cell filters
# the already-filtered frame again, so run it once per fresh kernel.
df_cleaned = remove_outliers_iqr(df_cleaned, numerical_columns)

Now let's plot once again

In [None]:
# Same 3x4 box-plot grid as before, now drawn on the filtered data.
fig, axes = plt.subplots(3, 4, figsize=(16, 12))

for ax, col in zip(axes.flat, attributes):
    sns.boxplot(y=df_cleaned[col], ax=ax)
    ax.set_title(f'Box Plot - {col}')

fig.tight_layout()
plt.show()

### Feature Scaling

In [None]:
# Scaler used below to rescale the continuous columns.
scaler = MinMaxScaler()

In [None]:
# Rescale the continuous columns in place; Gender and CLASS are left untouched.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split done later in the notebook, so test rows influence the scaling
# parameters. Consider fitting on the training split only.
df_cleaned[numerical_columns] = scaler.fit_transform(df_cleaned[numerical_columns])

In [None]:
# Quick look at the first rows after scaling.
print("\n----- SCALED FEATURES PREVIEW -----")
print(df_cleaned.head())

### Final Dataset Check

In [None]:
# Display the preprocessed frame (encoded, outlier-filtered, scaled).
df_cleaned

Feature Correlation Heatmap

In [None]:
# Pairwise correlations between all columns, annotated to two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_cleaned.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()

Gender VS Class percentage

In [None]:
# Count rows per (Gender, CLASS) pair, then express each gender's row as percentages.
gender_dist = df_cleaned.groupby(['Gender', 'CLASS']).size().unstack().fillna(0)
gender_dist_pct = gender_dist.div(gender_dist.sum(axis=1), axis=0) * 100

# Derive the tick and legend text from the fitted encoders instead of
# hard-coding an order. The bars follow the encoded values, and the encoder
# assigned 0 to 'F' and 1 to 'M', so the former ['Male', 'Female'] tick labels
# were swapped.
gender_names = {'F': 'Female', 'M': 'Male'}
# Assumes the raw CLASS labels are N / P / Y; unknown labels fall back to their raw text.
class_names = {'N': 'Non-Diabetic', 'P': 'Predict-Diabetic', 'Y': 'Diabetic'}
gender_labels = [
    gender_names.get(raw, str(raw))
    for raw in label_Gender.inverse_transform(gender_dist_pct.index.to_numpy())
]
class_labels = [
    class_names.get(raw, str(raw))
    for raw in label_Class.inverse_transform(gender_dist_pct.columns.to_numpy())
]

gender_dist_pct.plot(kind='bar', stacked=True, colormap='coolwarm')
plt.title("Diabetes Class Distribution by Gender (%)")
plt.ylabel("Percentage")
plt.xticks(ticks=range(len(gender_labels)), labels=gender_labels, rotation=0)
plt.legend(title='CLASS', labels=class_labels)
plt.tight_layout()
plt.show()

Age Group vs Diabetes percentage

In [None]:
# Bucket the min-max scaled AGE into three bands.
# - The bands live in their own Series instead of a new df_cleaned column: a
#   text-valued 'AgeGroup' column would end up in the feature matrices built
#   from df_cleaned further down and make clustering / model fitting fail.
# - include_lowest=True keeps rows whose scaled AGE is exactly 0 (the minimum)
#   in 'Young'; without it they fall outside the first bin and become NaN.
age_group = pd.cut(
    df_cleaned['AGE'],
    bins=[0, 0.3, 0.6, 1.0],
    labels=['Young', 'Middle-Aged', 'Older'],
    include_lowest=True,
).rename('AgeGroup')
# observed=False keeps empty age bands in the table.
age_dist = df_cleaned.groupby([age_group, 'CLASS'], observed=False).size().unstack().fillna(0)
age_dist_pct = age_dist.div(age_dist.sum(axis=1), axis=0) * 100

age_dist_pct.plot(kind='bar', stacked=True, colormap='viridis')
plt.title("Diabetes Class Distribution by Age Group (%)")
plt.ylabel("Percentage")
plt.xticks(rotation=0)
plt.legend(title='CLASS', labels=['Non-Diabetic', 'Predict-Diabetic', 'Diabetic'])
plt.tight_layout()
plt.show()


### Training Models

### Agglomerative Model (unsupervised)

In [None]:
# Clustering input: every numeric column except the target. select_dtypes keeps
# out any non-numeric column present in df_cleaned (e.g. text-valued age bands),
# which the distance-based estimators cannot consume.
X_clustering = df_cleaned.drop('CLASS', axis=1).select_dtypes(include='number')

In [None]:
# Two 3-cluster agglomerative models that differ only in linkage strategy.
agglo_complete = AgglomerativeClustering(
    n_clusters=3,
    linkage='complete',
    metric='euclidean'
)

agglo_single = AgglomerativeClustering(
    n_clusters=3,
    linkage='single',
    metric='euclidean'
)

# fit_predict returns the label array. Plain fit() returns the estimator
# itself, so the cluster_labels_* names previously held models, not labels.
cluster_labels_complete = agglo_complete.fit_predict(X_clustering)
cluster_labels_single = agglo_single.fit_predict(X_clustering)
print(len(cluster_labels_complete))
print(cluster_labels_complete)
print(len(cluster_labels_single))
print(cluster_labels_single)

In [None]:
# Hierarchical merge trees for both linkage strategies
linked_complete = linkage(X_clustering, method='complete', metric='euclidean')
linked_single = linkage(X_clustering, method='single', metric='euclidean')

# Draw one dendrogram per linkage matrix, each in its own figure
for title, linked in (
    ("Dendrogram (Complete Linkage)", linked_complete),
    ("Dendrogram (Single Linkage)", linked_single),
):
    plt.figure(figsize=(8, 4))
    dendrogram(linked)
    plt.title(title)
    plt.show()

### K-Means


In [None]:
# Seeded 3-cluster K-Means; fit() returns the fitted estimator, so it can be chained.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X_clustering)

# Cluster id assigned to each row, and the centroid coordinates
labels, centers = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Raw cluster assignments
print(labels)

# Cluster sizes, to compare against the true class sizes printed last
unique, counts = np.unique(labels, return_counts=True)
result = {cluster: size for cluster, size in zip(unique, counts)}
print(result)
print(df_cleaned['CLASS'].value_counts())

In [None]:
from scipy.optimize import linear_sum_assignment

# True classes vs. the K-Means cluster ids. Cluster ids are arbitrary, so they
# have to be matched to classes before any accuracy can be computed.
y_true = df_cleaned["CLASS"]
y_pred = labels

# Contingency table of true class (first argument) against cluster id (second)
cm = confusion_matrix(y_true, y_pred)

# Hungarian algorithm: negating the counts turns the cost minimisation into
# "maximise the number of rows that agree".
row_ind, col_ind = linear_sum_assignment(-cm)

# cluster id -> class label
# (assumes class codes and cluster ids are both 0..k-1 so that matrix
# positions equal the label values; verify if the labels ever change)
mapping = dict(zip(col_ind, row_ind))

# Translate every cluster id into its matched class label
mapped_preds = np.array([mapping[cluster_id] for cluster_id in y_pred])

print("Best mapping:", mapping)
mapped_preds

As we can see, the cluster ids have now been remapped to the class labels they match best.

In [None]:
# Score the remapped K-Means clusters against the true classes.
# Note: cm is reassigned here and now holds the matrix of true vs. remapped labels.
acc = accuracy_score(y_true, mapped_preds)
cm = confusion_matrix(y_true, mapped_preds)
report = classification_report(y_true, mapped_preds)

print(f"Accuracy: {acc:.4f}")
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", report)

### Data splitting

In [None]:
# Feature matrix: every numeric column except the target. select_dtypes keeps
# out any non-numeric column present in df_cleaned (e.g. text-valued age bands),
# which the estimators cannot consume.
x = df_cleaned.drop('CLASS', axis='columns').select_dtypes(include='number')
x

In [None]:
# Target vector: the encoded CLASS column.
y = df_cleaned['CLASS']
y

In [None]:
# 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# consider passing stratify=y so both splits keep the class proportions.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Training multiple models in a pipeline

In [None]:
# Candidate classifiers, keyed by display name. The tree-based models are
# seeded so that a "Restart & Run All" reproduces the same scores; the seed
# matches the one used for the train/test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

In [None]:
performance = {}
for name, model in models.items():
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    performance[name] = {
        "Accuracy": accuracy_score(y_test, preds),
        "Report": classification_report(y_test, preds),
        "Confusion Matrix": confusion_matrix(y_test, preds)
    }

### Performance Metrics

In [None]:
# Print the stored hold-out metrics for each model.
for name, metrics in performance.items():
    print(f"\n{name} - Accuracy: {metrics['Accuracy']:.4f}")
    print("Classification Report:\n", metrics["Report"])
    print("Confusion Matrix:\n", metrics["Confusion Matrix"])

### Visualize Accuracy

In [None]:
# One-column frame: model names as the index, hold-out accuracy as the value
acc_df = pd.Series(
    {name: metrics["Accuracy"] for name, metrics in performance.items()},
    name="Accuracy",
).to_frame()

# Horizontal bars on a fixed 0-1 axis so the models are directly comparable
ax = acc_df.plot(kind='barh', legend=False, color='teal')
ax.set_title("Model Accuracy Comparison")
ax.set_xlabel("Accuracy")
ax.set_xlim(0, 1)
plt.tight_layout()
plt.show()

Feature Importance (Random Forest)

In [None]:
# Fit a forest on the full dataset purely to rank the features. It is seeded so
# the importance ranking is the same on every run.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x, y)
importances = pd.Series(rf_model.feature_importances_, index=x.columns)

# Sorted ascending so the most important feature ends up at the top of the chart.
importances.sort_values().plot(kind='barh', color='skyblue', figsize=(10, 6))
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance Score")
plt.tight_layout()
plt.show()
