In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import hdbscan
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
import warnings
# Install a warnings filter that ignores every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the raw CSV and work on a copy so `df` keeps the untouched data.
file_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(file_path)
data = df.copy()
print(data.dtypes)

# Drop rows with any missing value. The surviving rows keep their original
# index labels, so the index may contain gaps after this step.
data = data.dropna()

numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
numeric_data = data[numeric_columns]

# Min-max scaling: each numeric column is rescaled independently
# (MinMaxScaler's default target range is [0, 1]).
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(numeric_data)

# fit_transform returns a plain array, so the row labels must be restored
# explicitly. pd.concat(axis=1) aligns on index labels; without `index=`
# the scaled frame would get a fresh 0..n-1 index and, whenever dropna()
# removed rows, scaled values would be paired with the wrong records and
# extra NaN rows would appear.
scaled_df = pd.DataFrame(scaled_data, columns=numeric_columns, index=data.index)

# Non-numeric columns first, followed by the scaled numeric columns.
result_data = pd.concat([data.drop(columns=numeric_columns), scaled_df], axis=1)
print(result_data.head())

In [None]:
num_samples_per_class = 5000

# Draw the same number of rows from the records whose 'diabetes' value is 0
# and from those whose value is 1 (fixed seed for repeatability).
diabetes_class_0 = result_data.loc[result_data['diabetes'] == 0].sample(
    n=num_samples_per_class, random_state=42
)
diabetes_class_1 = result_data.loc[result_data['diabetes'] == 1].sample(
    n=num_samples_per_class, random_state=42
)

# Stack both subsets, shuffle all rows, then renumber the index from zero.
random_sample = (
    pd.concat([diabetes_class_0, diabetes_class_1], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Feature matrix and target column taken from random_sample.
X = random_sample.loc[:, ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']]
y = random_sample['diabetes']

# 70 % / 30 % split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=100
)

# Re-attach the target to each split; assign() returns a new frame, so the
# X_train / X_test frames themselves are left without a 'diabetes' column.
train_data = X_train.assign(diabetes=y_train)
test_data = X_test.assign(diabetes=y_test)

In [None]:
# Overall minimum and maximum of every feature column in X.
min_max_values = X.describe().loc[['min', 'max'], :]

# Same information as {column: [min, max]}.
min_max_dict = min_max_values.to_dict(orient='list')

print(min_max_values, min_max_dict, sep='\n')

In [None]:
# Split the selected columns of random_sample by their 'diabetes' value.
measure = random_sample.loc[
    :, ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']
].copy()
grouped = measure.groupby('diabetes')

# Maps each 'diabetes' value to {column: [min, max]} for that group.
min_max_dict = {}

for group_name, group_df in grouped:
    # Keep only the 'min' and 'max' rows of the group's summary table.
    min_max_values = group_df.describe().loc[['min', 'max'], :]
    min_max_dict[group_name] = min_max_values.to_dict(orient='list')

print(min_max_dict)

In [None]:
# Each list is laid out as: [class 0 min, class 0 max, class 1 min, class 1 max].
# min_max_dict[label][feature] is a two-element [min, max] list, so the inner
# comprehension walks label 0 then label 1, taking position 0 then position 1.
age, bmi, hba, blood = (
    [min_max_dict[label][feature][bound] for label in (0, 1) for bound in (0, 1)]
    for feature in ('age', 'bmi', 'HbA1c_level', 'blood_glucose_level')
)

print(age, bmi, hba, blood, sep='\n')


In [None]:
# Split the selected columns of random_sample by their 'diabetes' value.
measure = random_sample.loc[
    :, ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']
].copy()
grouped = measure.groupby('diabetes')

# Layout: {column: {'mean_<label>': ..., 'std_<label>': ...}}, one mean/std
# pair per 'diabetes' value.
stats_dict = {}

for group_name, group_df in grouped:
    mean_values = group_df.mean()
    std_values = group_df.std()
    label = str(group_name)

    for feature in group_df.columns:
        # Create the per-column entry on first sight, then fill in this group's stats.
        feature_stats = stats_dict.setdefault(feature, {})
        feature_stats['mean_' + label] = mean_values[feature]
        feature_stats['std_' + label] = std_values[feature]

print(stats_dict)

In [None]:
# Name of the stats_dict entry to remove.
feature_to_delete = 'diabetes'

# Attempt the removal directly and report whether the entry was present.
try:
    del stats_dict[feature_to_delete]
except KeyError:
    print(f"{feature_to_delete} does not exist in stats_dict")
else:
    print(f"{feature_to_delete} deleted from stats_dict")

In [None]:
# Per-feature ratio rf_i = (mean_0 - mean_1)^2 / (std_0^2 + std_1^2),
# computed from the class-wise statistics stored in stats_dict.
rf_dict = {}

for feature, stats_values in stats_dict.items():
    mean_gap = stats_values['mean_0'] - stats_values['mean_1']
    variance_sum = stats_values['std_0']**2 + stats_values['std_1']**2
    rf_i = (mean_gap**2) / variance_sum
    rf_dict[feature] = rf_i

print(rf_dict)

In [None]:
# F1 overlap measure: 1 / (1 + max_i rf_i), where rf_i are the per-feature
# ratios stored in rf_dict.
# The value is derived from rf_dict instead of a number pasted from an earlier
# run's output, so it stays in sync when the data, seed, or sample size changes.
# NOTE(review): the previous hard-coded constant was 1.1373834470939403,
# presumably the largest rf_dict value of an earlier run — confirm it matches.
f1_overlap = 1 / (1 + max(rf_dict.values()))
print(f1_overlap)