In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import hdbscan
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
file_path = 'diabetes_prediction_dataset.csv'  
df = pd.read_csv(file_path)
data = df.copy()
print(data.dtypes)
data=data.dropna()

dummy_gender = pd.get_dummies(data['gender'], prefix='gender', drop_first=True)

dummy_gender = dummy_gender.astype(int)


data = pd.concat([data, dummy_gender], axis=1)

data = data.drop('gender', axis=1)

dummy_smoking = pd.get_dummies(data['smoking_history'], prefix='smoking_history', drop_first=True)

dummy_smoking = dummy_smoking.astype(int)


data = pd.concat([data, dummy_smoking], axis=1)

data = data.drop('smoking_history', axis=1)

numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
numeric_data = data[numeric_columns]

# Initialize StandardScaler
scaler = MinMaxScaler()

# Scale the numeric columns
scaled_data = scaler.fit_transform(numeric_data)

# Create a DataFrame with the scaled numeric columns
scaled_df = pd.DataFrame(scaled_data, columns=numeric_columns)

# Concatenate the scaled numeric columns with the one-hot encoded columns and non-numeric columns
result_data = pd.concat([data.drop(columns=numeric_columns), scaled_df], axis=1)

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object


In [4]:
num_samples_per_class = 6000

diabetes_class_0 = result_data[result_data['diabetes'] == 0].sample(n=num_samples_per_class, random_state=42)
diabetes_class_1 = result_data[result_data['diabetes'] == 1].sample(n=num_samples_per_class, random_state=42)

random_sample = pd.concat([diabetes_class_0, diabetes_class_1], axis=0)

random_sample = random_sample.sample(frac=1, random_state=42).reset_index(drop=True)
random_sample = random_sample[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']]

In [22]:
# Randomly select 3000 observation to test set
df_X_test = random_sample.sample(n=3000, random_state=42)

# Remove the selected observations from the original subset to create the second subset
df_subset_2 = random_sample.drop(df_X_test.index)

# df_subset_1 contains 3000 randomly selected observations for test set
# df_subset_2 contains the rest of the observations from the original subset

print("Number of rows in df_subset_1:", len(df_X_test))
print("Number of rows in df_subset_2:", len(df_subset_2))

Number of rows in df_subset_1: 3000
Number of rows in df_subset_2: 9000


In [23]:
df_X_test1 = df_X_test.drop('diabetes', axis=1)
df_y_test1 = df_X_test['diabetes']

In [24]:
X = df_subset_2[['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']]
y = df_subset_2['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)


In [9]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9004444444444445


In [10]:
error_mask = y_test != y_pred
error_features = X_test[error_mask]
error_points = y_test[error_mask]

errors_0 = error_features[y_test[error_mask] == 0]
errors_1 = error_features[y_test[error_mask] == 1]
errors_0_y = error_points[y_test[error_mask] == 0]
errors_1_y = error_points[y_test[error_mask] == 1]


print(len(errors_0))
print(len(errors_1))

113
111


In [11]:
print(errors_0.describe().loc[['min', 'max']])
print(errors_1.describe().loc[['min', 'max']])

          age       bmi  HbA1c_level  blood_glucose_level
min  0.374374  0.098506     0.400000             0.209091
max  1.000000  0.539566     0.563636             0.545455
          age       bmi  HbA1c_level  blood_glucose_level
min  0.061562  0.062675     0.400000             0.209091
max  1.000000  0.455882     0.563636             0.545455


In [26]:
# Ranges for each feature
feature_ranges_0 = {
    'age': (0.374, 1.0),
    'bmi': (0.098, 0.539566),
    'HbA1c_level': (0.400000, 0.563636),
    'blood_glucose_level': (0.209091, 0.545455)
}

feature_ranges_1 = {
    'age': (0.0615, 1.0),
    'bmi': (0.062675, 0.455882),
    'HbA1c_level': (0.400000, 0.563636),
    'blood_glucose_level': (0.209091, 0.545455)
}

# Filter class 0
filtered_class_0_with_y = df_subset_2[(df_subset_2['diabetes'] == 0) &
    (df_subset_2['age'].between(feature_ranges_0['age'][0], feature_ranges_0['age'][1])) &
    (df_subset_2['bmi'].between(feature_ranges_0['bmi'][0], feature_ranges_0['bmi'][1])) &
    (df_subset_2['HbA1c_level'].between(feature_ranges_0['HbA1c_level'][0], feature_ranges_0['HbA1c_level'][1])) &
    (df_subset_2['blood_glucose_level'].between(feature_ranges_0['blood_glucose_level'][0], feature_ranges_0['blood_glucose_level'][1]))
]


# Filter class 1
filtered_class_1_with_y = df_subset_2[(df_subset_2['diabetes'] == 1) &
    (df_subset_2['age'].between(feature_ranges_1['age'][0], feature_ranges_1['age'][1])) &
    (df_subset_2['bmi'].between(feature_ranges_1['bmi'][0], feature_ranges_1['bmi'][1])) &
    (df_subset_2['HbA1c_level'].between(feature_ranges_1['HbA1c_level'][0], feature_ranges_1['HbA1c_level'][1])) &
    (df_subset_2['blood_glucose_level'].between(feature_ranges_1['blood_glucose_level'][0], feature_ranges_1['blood_glucose_level'][1])) 
]


print(filtered_class_0_with_y)
print(filtered_class_1_with_y)
print(df_subset_2)

            age       bmi  HbA1c_level  blood_glucose_level  diabetes
9      0.712212  0.202031     0.472727             0.359091         0
12     0.987487  0.202031     0.490909             0.295455         0
18     0.574575  0.167134     0.418182             0.227273         0
26     0.486987  0.202031     0.418182             0.363636         0
40     0.937437  0.098506     0.418182             0.295455         0
...         ...       ...          ...                  ...       ...
11900  0.461962  0.202031     0.545455             0.363636         0
11901  0.637137  0.202031     0.400000             0.295455         0
11918  0.712212  0.207283     0.490909             0.272727         0
11978  0.374374  0.193277     0.490909             0.359091         0
11989  0.386887  0.202031     0.418182             0.545455         0

[890 rows x 5 columns]
            age       bmi  HbA1c_level  blood_glucose_level  diabetes
20     0.649650  0.342204     0.490909             0.545455       

In [38]:
iterations = 500
all_predicted_probabilities = []

for i in range(iterations):
    
    # Randomly select X instances from outside the error ranges for each class
    additional_samples_class_0 = df_subset_2[(df_subset_2['diabetes'] == 0) & 
                                               (~df_subset_2.index.isin(filtered_class_0_with_y.index))].sample(n=250)
    
    additional_samples_class_1 = df_subset_2[(df_subset_2['diabetes'] == 1) & 
                                               (~df_subset_2.index.isin(filtered_class_1_with_y.index))].sample(n=250)
    
    # Combine selected instances
    combined_df = pd.concat([filtered_class_0_with_y, filtered_class_1_with_y, additional_samples_class_0, additional_samples_class_1])
    
    X = combined_df.drop(columns=['diabetes'])
    y = combined_df['diabetes']
    
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X, y)
    
    all_predicted_probabilities.append(clf.predict_proba(df_X_test1))

# Perform soft voting
soft_voting_predictions = np.mean(all_predicted_probabilities, axis=0)
soft_voting_accuracy = accuracy_score(df_y_test1, np.argmax(soft_voting_predictions, axis=1))
print("Soft Voting Accuracy:", soft_voting_accuracy)

Soft Voting Accuracy: 0.8813333333333333
Hard Voting Accuracy: 0.8813333333333333


In [36]:
from sklearn.utils import resample

# Bootstrapping

n_bootstraps = 500
accuracy_scores = []
all_predicted_probabilities = []

for i in range(n_bootstraps):
    X_train_bootstrap, y_train_bootstrap = resample(X, y, replace=True, random_state=i)
    
    model = RandomForestClassifier()
    model.fit(X_train_bootstrap, y_train_bootstrap)
    
    y_pred = model.predict(df_X_test1)
    
    accuracy = accuracy_score(df_y_test1, y_pred)
    accuracy_scores.append(accuracy)
    all_predicted_probabilities.append(clf.predict_proba(df_X_test1))

# Calculate mean accuracy
mean_accuracy = np.mean(accuracy_scores)
print("Mean Accuracy:", mean_accuracy)

# Perform soft voting
soft_voting_predictions = np.mean(all_predicted_probabilities, axis=0)
soft_voting_accuracy = accuracy_score(df_y_test1, np.argmax(soft_voting_predictions, axis=1))
print("Soft Voting Accuracy:", soft_voting_accuracy)

Mean Accuracy: 0.8836933333333333
Soft Voting Accuracy: 0.8876666666666667
Hard Voting Accuracy: 0.8876666666666667


In [34]:
# Single random forest 

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

y_pred = model.predict(df_X_test1)

accuracy = accuracy_score(df_y_test1, y_pred)
print(accuracy)

0.8706666666666667
