In [1]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
from collections import defaultdict
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import accuracy_score # type: ignore
import seaborn as sns # type: ignore
import matplotlib.pyplot as plt # type: ignore
sns.set_style("darkgrid")
import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox




In [2]:
data = pd.read_csv("diabetes_prediction_dataset.csv")
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
# Calculate the distribution of the target variable
class_distribution = data['diabetes'].value_counts()

# Print the distribution
print("Class Distribution:")
print(class_distribution)


Class Distribution:
diabetes
0    91500
1     8500
Name: count, dtype: int64


**Data Preprocessing**

In [4]:
print("Number of missing values:\n",data.isnull().sum())

Number of missing values:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


**Features types**

In [5]:
categorical_features = [i for i in data.columns if data[i].dtype == 'O' and i != 'Diabetes']
numerical_features = [i for i in data.columns if data[i].dtype != 'O']
print(f"The categorical features are:\n{categorical_features}\n\n")
print(f"The numerical features are:\n{numerical_features}")

The categorical features are:
['gender', 'smoking_history']


The numerical features are:
['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']


*Encoding Categrical Data*

In [6]:
def encode_categorical_columns(data):

    if 'gender' in data.columns:
        data.drop('gender', axis=1, inplace=True)
    
    data = pd.get_dummies(data, columns=['smoking_history'])
    
    return data

data = encode_categorical_columns(data)

*Bin Numerical Data*

In [7]:
def bin_numerical_columns(data, column, num_bins=5):
    # Perform binning using pd.cut
    data[f'{column}_bin'] = pd.cut(data[column], bins=num_bins, labels=False)
    return data

numerical_columns_to_bin = ['bmi', 'HbA1c_level', 'blood_glucose_level']

for column in numerical_columns_to_bin:
    data = bin_numerical_columns(data, column)

data.drop(numerical_columns_to_bin, axis=1, inplace=True)
data.drop('age', axis=1, inplace=True)



*Split data into Test And trainind dataset*

In [8]:
print(data)
X = data.drop('diabetes', axis=1)
y = data['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


       hypertension  heart_disease  diabetes  smoking_history_No Info  \
0                 0              1         0                    False   
1                 0              0         0                     True   
2                 0              0         0                    False   
3                 0              0         0                    False   
4                 1              1         0                    False   
...             ...            ...       ...                      ...   
99995             0              0         0                     True   
99996             0              0         0                     True   
99997             0              0         0                    False   
99998             0              0         0                    False   
99999             0              0         0                    False   

       smoking_history_current  smoking_history_ever  smoking_history_former  \
0                        False             

*Calculate Entropy*

In [9]:
def calculate_entropy(y):
    unique_classes, class_counts = np.unique(y, return_counts=True)
    total_samples = len(y)
    entropy = 0

    for count in class_counts:
        probability = count / total_samples
        entropy -= probability * np.log2(probability)

    return entropy

# Example usage
entropy = calculate_entropy(y_train)
print("Entropy of the target variable:", entropy)


Entropy of the target variable: 0.4192135797578374


**Calculate Information Gain**

In [10]:
def calculate_information_gain(X, y, feature):
    # Calculate entropy of the entire dataset
    total_entropy = calculate_entropy(y)

    # Calculate weighted entropy of the two child nodes after splitting by the given feature
    unique_values = X[feature].unique()
    weighted_entropy = 0
    for value in unique_values:
        subset_y = y[X[feature] == value]
        subset_weight = len(subset_y) / len(y)
        subset_entropy = calculate_entropy(subset_y)

        weighted_entropy += subset_weight * subset_entropy

    information_gain = total_entropy - weighted_entropy

    return total_entropy, weighted_entropy, information_gain


information_gains = {}
for feature in X_train.columns:
    total_entropy, weighted_entropy, information_gain = calculate_information_gain(X_train, y_train, feature)
    print(f"Feature: {feature},\n Weighted Entropy: {weighted_entropy},\n Gain: {information_gain}\n\n")
    information_gains[feature] = information_gain


Feature: hypertension,
 Weighted Entropy: 0.3999741254767586,
 Gain: 0.019239454281078816


Feature: heart_disease,
 Weighted Entropy: 0.40512640535396627,
 Gain: 0.014087174403871139


Feature: smoking_history_No Info,
 Weighted Entropy: 0.40802118929968123,
 Gain: 0.011192390458156176


Feature: smoking_history_current,
 Weighted Entropy: 0.4189555734453783,
 Gain: 0.00025800631245909145


Feature: smoking_history_ever,
 Weighted Entropy: 0.4189199547812374,
 Gain: 0.0002936249766000154


Feature: smoking_history_former,
 Weighted Entropy: 0.41353561386579274,
 Gain: 0.005677965892044667


Feature: smoking_history_never,
 Weighted Entropy: 0.4187782184509814,
 Gain: 0.00043536130685600805


Feature: smoking_history_not current,
 Weighted Entropy: 0.4187568489343521,
 Gain: 0.00045673082348529626


Feature: bmi_bin,
 Weighted Entropy: 0.4028231499700497,
 Gain: 0.016390429787787708


Feature: HbA1c_level_bin,
 Weighted Entropy: 0.2652147739838584,
 Gain: 0.153998805773979


Feature: b

**Best Split**

In [11]:
def select_best_split_feature(X, y):
    information_gains = {}
    for feature in X.columns:
        total_entropy, weighted_entropy,information_gains[feature] = calculate_information_gain(X, y, feature)

    best_feature = max(information_gains, key=information_gains.get)
    return best_feature

# Example usage
best_split_feature = select_best_split_feature(X_train, y_train)
print(f"Best splitting feature:{best_split_feature}" )


Best splitting feature:HbA1c_level_bin


*Splitting Data*

In [12]:
def split_data(X, y, feature, value):
    X_left = X[X[feature] == value]
    y_left = y[X[feature] == value]
    X_right = X[X[feature] != value]
    y_right = y[X[feature] != value]
    return X_left, y_left, X_right, y_right


In [13]:
def majority_vote(y):
    unique_classes, class_counts = np.unique(y, return_counts=True)
    majority_class_index = np.argmax(class_counts)
    return unique_classes[majority_class_index]

**Decision Tree Model**

In [14]:
def build_decision_tree(X, y, max_depth=None):
    if max_depth is None:
        max_depth = len(X.columns)

    # Base cases
    if max_depth == 0 or len(np.unique(y)) == 1:
        return majority_vote(y)

    # Find the best feature to split on
    best_feature = select_best_split_feature(X, y)

    # Create the decision node
    decision_node = {'feature': best_feature, 'branches': {}}

    # Recursively build subtrees for each branch
    for value in np.unique(X[best_feature]):
        X_subset = X[X[best_feature] == value]
        y_subset = y[X[best_feature] == value]
        decision_node['branches'][value] = build_decision_tree(X_subset.drop(columns=[best_feature]), y_subset, max_depth - 1)

    return decision_node

decision_tree = build_decision_tree(X_train, y_train)


**Check the Sort Of Test set and predection result**

In [15]:
# Check if the indices of y_test and predictions are the same
indices_match = all(y_test.index == X_test.index)

if indices_match:
    print("The order of predictions matches the order of instances in the test set.")
else:
    print("The order of predictions does not match the order of instances in the test set.")


The order of predictions matches the order of instances in the test set.


**Prediction Function**

In [16]:
def predict_sample(tree, sample):
    while isinstance(tree, dict):  # Traverse the tree until reaching a leaf node
        feature = tree['feature']
        value = sample[feature]
        if value in tree['branches']:
            tree = tree['branches'][value]
        else:
            # If the value is not present in the tree, return majority vote
            return majority_vote(sample)
    return tree

def predict_samples(tree, samples):
    predictions = []
    for _, sample in samples.iterrows():
        predicted_class = predict_sample(tree, sample)
        predictions.append(predicted_class)
    return predictions

# Predict using the decision tree model for the test set
y_pred_custom = predict_samples(decision_tree, X_test)
person_ids = X_test.index
# Print the statements from the test set followed by the corresponding predictions
for person_id, true_label, prediction in zip(person_ids, y_test, y_pred_custom):
    test_statement = f"Person {person_id} {'has' if true_label == 1 else 'does not have'} diabetes"
    pred_statement = f"Predicted: {f'{person_id} has' if prediction == 1 else f'{person_id} does not have'} diabetes"
    print(test_statement)
    print(pred_statement)




Person 75721 does not have diabetes
Predicted: 75721 does not have diabetes
Person 80184 does not have diabetes
Predicted: 80184 does not have diabetes
Person 19864 does not have diabetes
Predicted: 19864 does not have diabetes
Person 76699 does not have diabetes
Predicted: 76699 does not have diabetes
Person 92991 has diabetes
Predicted: 92991 does not have diabetes
Person 76434 does not have diabetes
Predicted: 76434 does not have diabetes
Person 84004 does not have diabetes
Predicted: 84004 does not have diabetes
Person 80917 does not have diabetes
Predicted: 80917 does not have diabetes
Person 60767 does not have diabetes
Predicted: 60767 does not have diabetes
Person 50074 does not have diabetes
Predicted: 50074 does not have diabetes
Person 27701 does not have diabetes
Predicted: 27701 does not have diabetes
Person 42141 does not have diabetes
Predicted: 42141 does not have diabetes
Person 45080 does not have diabetes
Predicted: 45080 does not have diabetes
Person 16638 does not 

In [17]:
# Calculate accuracy using sklearn's accuracy_score function
accuracy_custom = accuracy_score(y_test, y_pred_custom) * 100
print("Accuracy (Custom Decision Tree):", accuracy_custom)

Accuracy (Custom Decision Tree): 96.73


**GUI**

In [18]:

# Function to preprocess and train the model
def preprocess_and_train(file_path, train_size):
    # Read data from file
    try:
        data = pd.read_csv(file_path)  # Assuming CSV file for simplicity
    except Exception as e:
        messagebox.showerror("Error", f"Error reading file: {e}")
        return
    
    # Preprocessing steps (similar to what you've implemented)
    # Encode categorical columns
    data = encode_categorical_columns(data)
    X = data.drop('diabetes', axis=1)
    y = data['diabetes']
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1-train_size), random_state=42)
    
    # Train decision tree model
    decision_tree = build_decision_tree(X_train, y_train)
    
    # Predict using the model
    y_pred = predict_samples(decision_tree, X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Prepare test statements and predicted statements
    result_statements = []
    for person_id, true_label, prediction in zip(X_test.index, y_test, y_pred):
        test_statement = f"Person {person_id} {'has' if true_label == 1 else 'does not have'} diabetes"
        pred_statement = f"Predicted: {f'{person_id} has' if prediction == 1 else f'{person_id} does not have'} diabetes"
        result_statements.append(test_statement)
        result_statements.append(pred_statement)
    
    return accuracy, result_statements

# Function to handle button click event
def browse_files():
    file_path = filedialog.askopenfilename()
    file_entry.delete(0, tk.END)
    file_entry.insert(0, file_path)

def process_data():
    file_path = file_entry.get()
    train_size = float(train_size_entry.get())
    
    accuracy, result_statements = preprocess_and_train(file_path, train_size)
    
    # Display results
    result_label.config(text=f"Accuracy: {accuracy*100:.2f} %")
    result_text.config(state=tk.NORMAL)
    result_text.delete(1.0, tk.END)
    result_text.insert(tk.END, '\n'.join(result_statements))
    result_text.config(state=tk.DISABLED)

# Create GUI window
window = tk.Tk()
window.title("Diabetes Decision Tree Prediction")

# Create file selection widgets
file_label = tk.Label(window, text="Select file:")
file_label.grid(row=0, column=0)
file_entry = tk.Entry(window, width=50)
file_entry.grid(row=0, column=1)
browse_button = tk.Button(window, text="Browse", command=browse_files)
browse_button.grid(row=0, column=2)

# Create train size input widget
train_size_label = tk.Label(window, text="Training Data Size:")
train_size_label.grid(row=1, column=0)
train_size_entry = tk.Entry(window)
train_size_entry.grid(row=1, column=1)

# Create process button
process_button = tk.Button(window, text="Process Data", command=process_data)
process_button.grid(row=2, column=1)

# Create result display widgets
result_label = tk.Label(window, text="")
result_label.grid(row=3, column=0, columnspan=3)

result_text = tk.Text(window, height=20, width=80)
result_text.grid(row=4, column=0, columnspan=3)
result_text.config(state=tk.DISABLED)

# Start GUI event loop
window.mainloop()
