In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import seaborn as sns
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore', category=FutureWarning, module='seaborn')
warnings.filterwarnings('ignore', category=FutureWarning, module='pandas')

In [None]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("diabetes.csv")

In [None]:
# Print a structural summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# sns.set(style="whitegrid")

# # Create a figure and axis for the subplots
# fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# # Flatten the 2D axes array for easier indexing
# axes = axes.flatten()

# # Loop through each column (excluding the 'Outcome' column)
# for i, column in enumerate(df.columns[:-1]):
#     sns.histplot(data=df, x=column, ax=axes[i], kde=True)
#     axes[i].set_title(f'Distribution of {column}')

# # Adjust the layout to prevent overlap
# plt.tight_layout()

# # Show the plots
# plt.show()

In [None]:
# # Create a figure and axis for the subplots
# fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))

# # Flatten the 2D axes array for easier indexing
# axes = axes.flatten()   

# # Loop through each column (excluding the 'Outcome' column)
# for i, column in enumerate(df.columns):
#     # Exclude the 'Outcome' column
#     if column != 'Outcome':
#         # Create a boxplot for the current column
#         sns.boxplot(data=df, y=column, ax=axes[i])
#         axes[i].set_title(f'Boxplot of {column}')
        
# # Adjust the layout to prevent overlap
# plt.tight_layout()

# # Show the plots
# plt.show()

In [None]:
# Derived feature: row-wise BMI divided by Age.
# NOTE(review): assumes 'Age' is never 0; a zero age would produce inf/NaN here — confirm against the data.
df['BMI_to_Age_Ratio'] = df['BMI'] / df['Age']

In [None]:
# Binary flag: 1 where the 'Insulin' value is strictly positive, 0 otherwise.
df['Insulin_Indicator'] = (df['Insulin'] > 0).astype(int)

In [None]:
# Overwrites 'Glucose' with the row-wise mean of the 'Glucose' and 'Insulin' columns.
# NOTE(review): this cell is not idempotent — re-running it averages the already-blended
# value with 'Insulin' again — and the later 'Glucose_Category' binning is applied to this
# blended value rather than to the original reading. Confirm that this is intended.
df['Glucose'] = df[['Glucose', 'Insulin']].mean(axis=1)

In [None]:
# Ratio of blood pressure to BMI. A BMI of 0 is turned into NaN before dividing so the
# division cannot yield +/-inf; the NaN ratios that result are then replaced by 0.
# The filled Series is assigned back to the frame rather than calling
# `fillna(..., inplace=True)` on `df['BP_BMI_Ratio']`: that form mutates an intermediate
# selection, is deprecated in pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['BP_BMI_Ratio'] = (df['BloodPressure'] / df['BMI'].replace(0, np.nan)).fillna(0)

# Quick sanity check of the new feature's distribution.
df['BP_BMI_Ratio'].describe()

In [None]:
# Define the bins and labels for age categories.
# Six edges define five intervals, one per label; pd.cut closes intervals on the right
# by default, so the bands are (0, 12], (12, 20], (20, 30], (30, 50] and (50, 100].
age_bins = [0, 12, 20, 30, 50, 100]
age_labels = ['Children', 'Teenagers', 'Young Adults', 'Middle-Aged', 'Senior Citizens']

# Create the 'Age_Category' feature (an ordered categorical column derived from 'Age').
df['Age_Category'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)

In [None]:
# Define the bins and labels for blood pressure categories.
# With pd.cut's default right-closed intervals the bands are
# (0, 80], (80, 89], (89, 120], (120, 130] and (130, 1000].
# NOTE(review): a reading of exactly 0 lies outside (0, 80] and is therefore mapped to NaN;
# unlike 'BMI_Category', no explicit fallback label is assigned here — confirm this is intended.
blood_pressure_bins = [0, 80, 89, 120, 130, 1000]
blood_pressure_labels = ['Low', 'Normal', 'Elevated', 'High Stage 1', 'High Stage 2']

# Create the 'BloodPressure_Category' feature
df['BloodPressure_Category'] = pd.cut(df['BloodPressure'], bins=blood_pressure_bins, labels=blood_pressure_labels)

In [None]:
# BMI bands; pd.cut closes intervals on the right, so the first band is (0, 18.5].
bins = [0, 18.5, 24.9, 29.9, 34.9, 100]
labels = ['Underweight', 'Normal Weight', 'Overweight', 'Obese (Class I)', 'Obese (Class II)']

df['BMI_Category'] = pd.cut(df['BMI'], bins=bins, labels=labels)

# Values that fall in no band (a BMI of exactly 0 is outside (0, 18.5]) come back as NaN.
# Register an extra category and use it for those rows. The result is assigned back to the
# frame instead of calling `fillna(..., inplace=True)` on the column selection, which is
# deprecated in pandas 2.x and does not modify `df` under Copy-on-Write.
df['BMI_Category'] = df['BMI_Category'].cat.add_categories("Zero BMI").fillna("Zero BMI")

In [None]:
# no

# Define the bins and labels for glucose categories.
# Right-closed intervals: (0, 100], (100, 125], (125, 150], (150, 200], (200, 1000].
# NOTE(review): at this point 'Glucose' has already been replaced by the mean of the
# 'Glucose' and 'Insulin' columns in an earlier cell, so these thresholds are applied to
# that blended value; a value of exactly 0 falls outside every bin and becomes NaN.
glucose_bins = [0, 100, 125, 150, 200, 1000]
glucose_labels = ['Normal', 'Prediabetes', 'Mild Diabetes', 'Moderate Diabetes', 'Severe Diabetes']

# Create the 'Glucose_Category' feature
df['Glucose_Category'] = pd.cut(df['Glucose'], bins=glucose_bins, labels=glucose_labels)

In [None]:
# no

# Define the bins and labels for skin thickness categories.
# Right-closed intervals: (0, 10], (10, 20], (20, 30], (30, 40], (40, 100].
# NOTE(review): a value of exactly 0 lies outside (0, 10] and is mapped to NaN — confirm
# whether zeros (possibly missing measurements) should get their own label.
skin_thickness_bins = [0, 10, 20, 30, 40, 100]
skin_thickness_labels = ['Very Thin', 'Thin', 'Moderate', 'Thick', 'Very Thick']

# Create the 'SkinThickness_Category' feature
df['SkinThickness_Category'] = pd.cut(df['SkinThickness'], bins=skin_thickness_bins, labels=skin_thickness_labels)

In [None]:
# Creating the one hot encode function
def one_hot_encode(data, column, prefix=None):
    """Replace a categorical column with its one-hot (dummy) columns.

    The first category is dropped (``drop_first=True``), so a column with ``k``
    categories yields ``k - 1`` dummy columns. Rows whose value is NaN get 0 in
    every dummy column and are therefore indistinguishable from rows holding
    the dropped first category.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is not modified; a new frame is returned.
    column : str
        Name of the column to encode.
    prefix : str, optional
        When given, every generated column is named ``f"{prefix}_{label}"``.
        The default (``None``) keeps the bare category labels as column names,
        which is the historical behaviour. Supplying a prefix avoids name
        clashes when two encoded columns share a label.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``column``, with the dummy columns appended on the right.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.join`` when a generated dummy column name
        already exists in ``data``.
    """
    encoded = pd.get_dummies(data[column], prefix=prefix, drop_first=True)
    remaining = data.drop(columns=column)
    return remaining.join(encoded)

In [None]:
# One-hot encode each engineered categorical feature in turn; every pass replaces
# `df` with a new frame in which that column has been swapped for its dummy columns.
for categorical_column in (
    'Age_Category',
    'BMI_Category',
    'BloodPressure_Category',
    'SkinThickness_Category',
    'Glucose_Category',
):
    df = one_hot_encode(df, categorical_column)


In [None]:
from sklearn.model_selection import train_test_split

# Target vector and predictor matrix: everything except 'Outcome' is a feature.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Display the predictor matrix to inspect the engineered and one-hot encoded columns.
X

In [None]:
from imblearn.over_sampling import SMOTE

# Initialize SMOTE with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(sampling_strategy='auto', random_state=0)

# Fit and apply SMOTE to the training data only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Per-class counts before and after oversampling, ordered by class value.
class_distribution_before = y_train.value_counts().sort_index()
class_distribution_after = y_train_resampled.value_counts().sort_index()

# Tick labels, listed in the order the bars are drawn. countplot orders numeric
# categories ascending, so the first bar is class 0 and the second is class 1.
# The labels were previously ['True', 'False'], which put 'True' under the class-0 bar.
# Assumes 'Outcome' is coded 0 = negative, 1 = positive, as the class names used for
# the tree plot and confusion matrix in this notebook imply.
class_labels = ['False', 'True']

# One panel per training set: original on the left, SMOTE-resampled on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

panels = (
    (axes[0], y_train, 'Class Distribution of Outcome Before SMOTE'),
    (axes[1], y_train_resampled, 'Class Distribution of Outcome After SMOTE'),
)
for panel_ax, target, panel_title in panels:
    sns.countplot(x=target, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Class')
    panel_ax.set_ylabel('Count')
    panel_ax.set_xticklabels(class_labels)

plt.tight_layout()
plt.show()

In [None]:
# Baseline tree: entropy splits, depth capped at 3, trained on the resampled training set.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=0,
)
clf.fit(X_train_resampled, y_train_resampled)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hyper-parameter grid for the decision tree. Every combination is evaluated:
# 5 * 3 * 3 * 2 * 3 * 2 * 5 = 2700 candidates, each fitted once per CV fold.
param_grid = {
    'max_depth': [2, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'sqrt', 'log2'],
    'splitter': ['best', 'random'],
    'ccp_alpha': [0.0, 0.01, 0.1, 0.2, 0.5]
}

# Fresh, unfitted estimator; this rebinds `clf`, replacing the baseline tree fitted earlier.
clf = DecisionTreeClassifier(random_state=0)

# 5-fold cross-validated exhaustive search, ranked by accuracy.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='accuracy', cv=5)

# NOTE(review): the folds are cut from the SMOTE-resampled training set, so validation
# folds contain synthetic samples derived from rows that may sit in the training folds;
# the CV scores are presumably optimistic. Resampling inside each fold would avoid this.
grid_search.fit(X_train_resampled, y_train_resampled)

# Parameter combination with the best mean cross-validated accuracy.
best_params = grid_search.best_params_

print(best_params)

In [None]:
# Pull each tuned hyper-parameter out of the grid-search result.
best_ccp_alpha = best_params['ccp_alpha']
best_min_splitter = best_params['splitter']
best_min_max_features = best_params['max_features']
best_min_criterion = best_params['criterion']
best_min_samples_leaf = best_params['min_samples_leaf']
best_min_samples_split = best_params['min_samples_split']
best_max_depth = best_params['max_depth']

# Build the tuned tree (pre-pruning limits plus cost-complexity pruning strength).
# The estimator is only constructed here; it is left unfitted in this cell.
clf = DecisionTreeClassifier(
    criterion=best_min_criterion,
    splitter=best_min_splitter,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
    max_features=best_min_max_features,
    ccp_alpha=best_ccp_alpha,
    random_state=0,
)

In [None]:
from sklearn.feature_selection import RFE

# Number of predictors to keep; adjust as needed.
num_features_to_select = 5

# Recursive feature elimination around the tuned tree, dropping one feature per round.
# `fit` returns the selector itself, so construction and fitting are chained.
rfe = RFE(
    estimator=clf,
    n_features_to_select=num_features_to_select,
    step=1,
).fit(X_train_resampled, y_train_resampled)

# Boolean mask over the training columns, and the names of the columns that survived.
selected_features_mask = rfe.support_
selected_features = X_train_resampled.columns[selected_features_mask]

# Training matrix restricted to the surviving columns.
X_train_selected = X_train_resampled.loc[:, selected_features]

# Final model: the tuned tree trained on the selected columns only.
clf.fit(X_train_selected, y_train_resampled)

In [None]:
# # Access feature importances
# feature_importances = clf.feature_importances_

# # Print or visualize feature importances
# for feature, importance in zip(X.columns, feature_importances):
#     print(f"{feature}: {importance}")

In [None]:
# `clf` was fitted on the RFE-selected columns only, so the test matrix must be reduced
# to those same columns. Passing the full X_test fails scikit-learn's feature-name/count
# validation because it carries columns the model never saw.
y_preds = clf.predict(X_test[selected_features])

In [None]:
# Report the headline classification metrics on the held-out split, four decimals each.
for report_template, metric_fn in (
    ("Accuracy %.4f", accuracy_score),
    ("Precision %.4f", precision_score),
    ("Recall %.4f", recall_score),
    ("F1 %.4f", f1_score),
):
    print(report_template % metric_fn(y_test, y_preds))

In [None]:
# plt.figure(figsize=(15, 12))

# Draw the fitted tree. The feature names must be the columns the tree was trained on
# (the RFE-selected subset), in training order: the tree stores feature *indices*, so
# passing every column of X would attach the wrong names to the split nodes.
plot_tree(clf,
          feature_names=list(selected_features),
          class_names=['No Diabetes', 'Diabetes'],
          filled=True,
          rounded=True,
          )

# Add a title
plt.title("Decision Tree Visualization")

# Show the plot
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Cross-tabulate true against predicted classes on the held-out split.
conf_matrix = confusion_matrix(y_test, y_preds)

# Render the counts as an annotated heatmap (rows: true class, columns: predicted class).
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=['No Diabetes', 'Diabetes'],
    yticklabels=['No Diabetes', 'Diabetes'],
)
plt.title('Confusion Matrix')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Compute the ROC curve and its area from the predicted probability of the positive class.
# The model was fitted on the RFE-selected columns, so the test matrix is restricted to
# the same columns before scoring; the full X_test would be rejected by the estimator.
positive_scores = clf.predict_proba(X_test[selected_features])[:, 1]
fpr, tpr, _ = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

# Plot ROC curve, with the diagonal as the chance-level reference.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()