In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import hdbscan
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [14]:
file_path = 'diabetes_prediction_dataset.csv'  
df = pd.read_csv(file_path)
data = df.copy()
data=data.dropna()

numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
numeric_data = data[numeric_columns]

#Scaling
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(numeric_data)
scaled_df = pd.DataFrame(scaled_data, columns=numeric_columns)
result_data = pd.concat([data.drop(columns=numeric_columns), scaled_df], axis=1)

#Subsample
num_samples_per_class = 5000
diabetes_class_0 = result_data[result_data['diabetes'] == 0].sample(n=num_samples_per_class, random_state=42)
diabetes_class_1 = result_data[result_data['diabetes'] == 1].sample(n=num_samples_per_class, random_state=42)
random_sample = pd.concat([diabetes_class_0, diabetes_class_1], axis=0)
random_sample = random_sample.sample(frac=1, random_state=42).reset_index(drop=True)

In [61]:
X = random_sample[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']]
y = random_sample['diabetes']

In [36]:
#Create a dict for each feature with value count and safe as csv for whole subset

def create_unique_count_df(column_name):
    if column_name not in X.columns:
        print(f"Column '{column_name}' not found in the DataFrame.")
        return pd.DataFrame()  # Return an empty DataFrame
    unique_values = X[column_name].unique()
    value_counts = X[column_name].value_counts()
    unique_count_df = pd.DataFrame({'Value': unique_values, 'Count': value_counts})
    return unique_count_df


columns = X.columns
unique_values_counts_dict = {}

for col in columns:
    unique_count_df = create_unique_count_df(col)
    unique_values_counts_dict[col] = unique_count_df

# Save each dict for each feature to separate csv files
for col, df_count in unique_values_counts_dict.items():
    csv_file_name = f"{col}_unique_values_counts.csv"
    df_count.to_csv(csv_file_name, index=False)

In [41]:
file_path = 'bmi_unique_values_counts.csv'  # read one feature file 
df = pd.read_csv(file_path)

df = pd.DataFrame(df)

# Define bins for the intervals
bins = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]

# Create intervals and sum 'Count' values within each interval
df['Interval'] = pd.cut(df['Value'], bins=bins, include_lowest=False)
result_df = df.groupby('Interval')['Count'].sum().reset_index()

print(result_df)

# Save result in a csv file
result_df.to_csv("bmi_all.csv", index=False)   

       Interval  Count
0   (0.0, 0.05]     56
1   (0.05, 0.1]    610
2   (0.1, 0.15]    947
3   (0.15, 0.2]   3555
4   (0.2, 0.25]   1447
5   (0.25, 0.3]   1162
6   (0.3, 0.35]    908
7   (0.35, 0.4]    575
8   (0.4, 0.45]    387
9   (0.45, 0.5]    197
10  (0.5, 0.55]     83
11  (0.55, 0.6]     39
12  (0.6, 0.65]     14
13  (0.65, 0.7]     12
14  (0.7, 0.75]      4
15  (0.75, 0.8]      0
16  (0.8, 0.85]      2
17  (0.85, 0.9]      1
18  (0.9, 0.95]      1
19  (0.95, 1.0]      0


In [62]:
# print the feature value ranges with iterations

list = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

for l in list:
    print(l)
    #file_path = 'age_unique_values_counts.csv'  
    df = X[l].copy()
    df = pd.DataFrame(df)

    bins = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]

    df['Interval'] = pd.cut(df[l], bins=bins, include_lowest=False)

    bin_counts = df.groupby('Interval').size().reset_index(name='Count')

    bin_counts['Interval'] = bin_counts['Interval'].astype(str)
    bin_counts = bin_counts.sort_values(by='Interval')

    print(bin_counts)

    ordered_df = df.groupby('Interval')
    print(ordered_df)

age
       Interval  Count
0   (0.0, 0.05]    269
1   (0.05, 0.1]    184
2   (0.1, 0.15]    182
3   (0.15, 0.2]    247
4   (0.2, 0.25]    270
5   (0.25, 0.3]    282
6   (0.3, 0.35]    314
7   (0.35, 0.4]    369
8   (0.4, 0.45]    397
9   (0.45, 0.5]    498
10  (0.5, 0.55]    457
11  (0.55, 0.6]    574
12  (0.6, 0.65]    663
13  (0.65, 0.7]    692
14  (0.7, 0.75]    832
15  (0.75, 0.8]    749
16  (0.8, 0.85]    722
17  (0.85, 0.9]    589
18  (0.9, 0.95]    532
19  (0.95, 1.0]   1176
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000012C604270D0>
bmi
       Interval  Count
0   (0.0, 0.05]     33
1   (0.05, 0.1]    436
2   (0.1, 0.15]    905
3   (0.15, 0.2]   1621
4   (0.2, 0.25]   4026
5   (0.25, 0.3]   1295
6   (0.3, 0.35]    796
7   (0.35, 0.4]    429
8   (0.4, 0.45]    234
9   (0.45, 0.5]    119
10  (0.5, 0.55]     53
11  (0.55, 0.6]     30
12  (0.6, 0.65]      9
13  (0.65, 0.7]      8
14  (0.7, 0.75]      2
15  (0.75, 0.8]      0
16  (0.8, 0.85]      2
17  (0.85, 0.9]    

In [42]:
import pandas as pd
from sklearn.cluster import KMeans

# Use a K-Means clustering algorithm to partitioning data and calculate purity and counts for each cluster

number_clusters = [3, 4, 5, 6, 7, 8, 9]

for n in number_clusters:
    kmeans = KMeans(n_clusters=n, random_state=42)
    cluster_labels = kmeans.fit_predict(X)

    X_n = random_sample[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']].copy()
    X_n['cluster'] = cluster_labels

    print(f"Number of clusters: {n}")

    # Class count for cluster
    for cluster in range(n):
        cluster_data = X_n[X_n['cluster'] == cluster]
        total_in_cluster = len(cluster_data)
        class_0_in_cluster = len(cluster_data[cluster_data['diabetes'] == 0])
        class_1_in_cluster = len(cluster_data[cluster_data['diabetes'] == 1])
        
        # Calculate purity score for the current cluster
        if total_in_cluster > 0:
            majority_class = cluster_data['diabetes'].mode()[0]
            correct_predictions = len(cluster_data[cluster_data['diabetes'] == majority_class])
            cluster_purity_score = correct_predictions / total_in_cluster
        else:
            cluster_purity_score = 0

        print(f"Cluster {cluster}:")
        print(f"  Total observations: {total_in_cluster}")
        print(f"  Observations in class 0: {class_0_in_cluster}")
        print(f"  Observations in class 1: {class_1_in_cluster}")
        print(f"  Purity score: {cluster_purity_score:.2f}")

    # Calculate overall purity score
    purity_scores = []
    for cluster in range(n):
        cluster_data = X_n[X_n['cluster'] == cluster]
        if len(cluster_data) > 0:
            majority_class = cluster_data['diabetes'].mode()[0]
            correct_predictions = len(cluster_data[cluster_data['diabetes'] == majority_class])
            purity_scores.append(correct_predictions / len(cluster_data))
    
    overall_purity_score = sum(purity_scores) / n
    print(f"Overall purity score: {overall_purity_score:.2f}\n")

Overall Purity for 3 clusters: 0.83
0, 0.92, 3291, 3035/256
1, 0.99, 2152, 22/2130
2, 0.57, 4557, 1943/2614
Overall Purity for 4 clusters: 0.89
0, 0.91, 2573, 2330/243
1, 0.67, 4013, 1305/2708
2, 0.99, 1379, 1364/15
3, 1.00, 2035, 1/2034
Overall Purity for 5 clusters: 0.89
0, 0.91, 2461, 2247/214
1, 0.56, 3387, 1498/1889
2, 1.00, 1255, 1255/0
3, 1.00, 1091, 0/1091
4, 1.00, 1806, 0/1806
Overall Purity for 6 clusters: 0.86
0, 0.96, 1639, 1579/60
1, 0.60, 2277, 914/1363
2, 1.00, 1240, 1240/0
3, 1.00, 998, 0/998
4, 1.00, 1810, 3/1807
5, 0.62, 2036, 1264/772
Overall Purity for 7 clusters: 0.88
0, 0.96, 1631, 1572/59
1, 0.59, 2195, 896/1299
2, 1.00, 1239, 1239/0
3, 1.00, 877, 0/877
4, 1.00, 593, 0/593
5, 0.63, 1990, 1256/734
6, 0.97, 1475, 37/1438
Overall Purity for 8 clusters: 0.89
0, 1.00, 1012, 1012/0
1, 0.64, 1951, 702/1249
2, 1.00, 932, 932/0
3, 1.00, 591, 0/591
4, 0.98, 1455, 32/1423
5, 0.61, 1973, 1199/774
6, 1.00, 852, 0/852
7, 0.91, 1234, 1123/111
Overall Purity for 9 clusters: 0.89

In [28]:
# The selected partitioning

kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X)

X_5 = X.copy()
X_5['cluster'] = cluster_labels
print(X_5)

           age       bmi  HbA1c_level  blood_glucose_level  cluster
0     1.000000  0.191293     0.854545             0.209091        3
1     0.161662  0.241713     0.472727             0.354545        0
2     0.061562  0.202031     0.490909             0.000000        0
3     0.587087  0.418651     0.090909             0.359091        2
4     0.324324  0.202031     0.454545             0.227273        0
...        ...       ...          ...                  ...      ...
9995  0.712212  0.270775     0.854545             0.340909        3
9996  0.662162  0.202031     0.472727             1.000000        4
9997  0.436937  0.327848     0.854545             0.295455        3
9998  0.324324  0.202031     0.545455             0.000000        0
9999  0.699700  0.202031     0.418182             0.545455        1

[10000 rows x 5 columns]


In [29]:
# Save a df with all clusters in partitioning

cluster_0_df = X_5[X_5['cluster'] == 0][['age', 'bmi', 'blood_glucose_level', 'HbA1c_level']]
cluster_1_df = X_5[X_5['cluster'] == 1][['age', 'bmi', 'blood_glucose_level', 'HbA1c_level']]
cluster_2_df = X_5[X_5['cluster'] == 2][['age', 'bmi', 'blood_glucose_level', 'HbA1c_level']]
cluster_3_df = X_5[X_5['cluster'] == 3][['age', 'bmi', 'blood_glucose_level', 'HbA1c_level']]
cluster_4_df = X_5[X_5['cluster'] == 4][['age', 'bmi', 'blood_glucose_level', 'HbA1c_level']]

print(len(cluster_1_df))

3387


In [32]:
# Function to create a dict with unique values and their counts
def create_unique_count_df(column_name):
    if column_name not in cluster_1_df.columns:
        print(f"Column '{column_name}' not found in the DataFrame.")
        return pd.DataFrame()  # Return an empty DataFrame
    unique_values = cluster_1_df[column_name].unique()
    value_counts = cluster_1_df[column_name].value_counts()
    unique_count_df = pd.DataFrame({'Value': unique_values, 'Count': value_counts})
    return unique_count_df

columns = cluster_1_df.columns
unique_values_counts_dict = {}

# Iterate over each feature, and store it in the dictionary
for col in columns:
    unique_count_df = create_unique_count_df(col)
    unique_values_counts_dict[col] = unique_count_df

# Save each df for each feature to separate csv files
for col, df_count in unique_values_counts_dict.items():
    csv_file_name = f"{col}_unique_values_counts.csv"
    df_count.to_csv(csv_file_name, index=True)

In [60]:
# function to print all feature distributions 
list = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

for l in list:
    print(l)
    #file_path = 'age_unique_values_counts.csv'  
    df = cluster_1_df[l].copy()
    # Create DataFrame
    df = pd.DataFrame(df)

    # Define bins for the intervals
    bins = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00]

    # Create intervals and sum 'Count' values within each interval
    df['Interval'] = pd.cut(df[l], bins=bins, include_lowest=False)

    # Group by 'Interval' and count the number of occurrences
    bin_counts = df.groupby('Interval').size().reset_index(name='Count')

    # Sort bins by interval
    bin_counts['Interval'] = bin_counts['Interval'].astype(str)
    bin_counts = bin_counts.sort_values(by='Interval')

    print(bin_counts)

    ordered_df = df.groupby('Interval')
    print(ordered_df)
#result_df = df.groupby('Interval')['Count'].sum().reset_index()

#print(result_df)

#result_df.to_csv("age_bins.csv", index=True)

age
       Interval  Count
0   (0.0, 0.05]      0
1   (0.05, 0.1]      0
2   (0.1, 0.15]      0
3   (0.15, 0.2]      0
4   (0.2, 0.25]      0
5   (0.25, 0.3]      0
6   (0.3, 0.35]      0
7   (0.35, 0.4]      0
8   (0.4, 0.45]      1
9   (0.45, 0.5]     35
10  (0.5, 0.55]    198
11  (0.55, 0.6]    286
12  (0.6, 0.65]    310
13  (0.65, 0.7]    331
14  (0.7, 0.75]    388
15  (0.75, 0.8]    358
16  (0.8, 0.85]    330
17  (0.85, 0.9]    291
18  (0.9, 0.95]    272
19  (0.95, 1.0]    587
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000012C6046E760>
bmi
       Interval  Count
0   (0.0, 0.05]      1
1   (0.05, 0.1]     28
2   (0.1, 0.15]    205
3   (0.15, 0.2]    557
4   (0.2, 0.25]   1476
5   (0.25, 0.3]    497
6   (0.3, 0.35]    280
7   (0.35, 0.4]    175
8   (0.4, 0.45]     88
9   (0.45, 0.5]     44
10  (0.5, 0.55]     19
11  (0.55, 0.6]      9
12  (0.6, 0.65]      2
13  (0.65, 0.7]      3
14  (0.7, 0.75]      1
15  (0.75, 0.8]      0
16  (0.8, 0.85]      1
17  (0.85, 0.9]    

In [None]:
file_path = 'bmi_unique_values_counts.csv'  
df = pd.read_csv(file_path)
# Create DataFrame
df = pd.DataFrame(df)

# Define bins for the intervals
bins = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90]

# Create intervals and sum 'Count' values within each interval
df['Interval'] = pd.cut(df['Value'], bins=bins, right=False, include_lowest=True)
result_df = df.groupby('Interval')['Count'].sum().reset_index()

print(result_df)

result_df.to_csv("bmi_bins.csv", index=False)

In [None]:
print(type(cluster_labels))
print(type(y))
y_array = y.values
print(type(y_array))

In [None]:
df_overlap = pd.concat([pd.DataFrame(X, columns=X.columns), pd.DataFrame(cluster_labels, columns=['Clusters']), pd.DataFrame(y_array, columns=['Class'])], axis=1)

In [None]:
measure = df_overlap[df_overlap['Clusters'] == 1].copy()
grouped = measure.groupby('Class')

# Create an empty dictionary to store the mean and standard deviation for each feature
stats_dict = {}

# Iterate over each group
for group_name, group_df in grouped:
    # Calculate the mean and standard deviation for each column in the group
    mean_values = group_df.mean()
    std_values = group_df.std()
    
    # Store mean and std in a dictionary for each feature
    for feature in group_df.columns:
        if feature not in stats_dict:
            stats_dict[feature] = {}
        
        stats_dict[feature]['mean_' + str(group_name)] = mean_values[feature]
        stats_dict[feature]['std_' + str(group_name)] = std_values[feature]

# Display the dictionary
print(stats_dict)

In [None]:
# Assume 'feature_to_delete' is the feature you want to delete
feature_to_delete = 'Class'

# Check if the feature exists in stats_dict before deleting
if feature_to_delete in stats_dict:
    del stats_dict[feature_to_delete]
    print(f"{feature_to_delete} deleted from stats_dict")
else:
    print(f"{feature_to_delete} does not exist in stats_dict")

In [None]:
# Assume 'feature_to_delete' is the feature you want to delete
feature_to_delete = 'Clusters'

# Check if the feature exists in stats_dict before deleting
if feature_to_delete in stats_dict:
    del stats_dict[feature_to_delete]
    print(f"{feature_to_delete} deleted from stats_dict")
else:
    print(f"{feature_to_delete} does not exist in stats_dict")

In [None]:
# Create an empty dictionary to store the rf_i value for each feature
rf_dict = {}

# Iterate over each feature in stats_dict
for feature, stats_values in stats_dict.items():
    # Calculate rf_i value using the provided formula
    rf_i = ((stats_values['mean_0'] - stats_values['mean_1'])**2) / (stats_values['std_0']**2 + stats_values['std_1']**2)
    
    # Store rf_i value in rf_dict
    rf_dict[feature] = rf_i

# Display the dictionary with rf values
print(rf_dict)