In [2]:
# Import required libraries
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from io import StringIO

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# Upload dataset from your PC (opens the Colab file picker)
uploaded = files.upload()

# `uploaded` is a mapping keyed by uploaded filename. If nothing was uploaded
# (e.g. the dialog was dismissed) it is empty, and next(iter(...)) would raise
# a bare StopIteration -- fail with an actionable message instead.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select the dataset file."
    )

# Use the first uploaded file; any additional files are ignored.
filename = next(iter(uploaded))
print(f"\nUploaded file: {filename}")

# Read the dataset by path. Either a plain .csv or a compressed archive
# holding a single CSV (e.g. .zip) works, because pandas infers the
# compression from the file extension.
df = pd.read_csv(filename)
print("\nDataset preview:")
display(df.head())

Saving diabetes.zip to diabetes.zip

Uploaded file: diabetes.zip

Dataset preview:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Separate the predictors from the label column
y = df['Outcome']            # Target (0 = no diabetes, 1 = diabetes)
X = df.drop(columns='Outcome')  # Everything except the target

# Quick look at what we are working with
print("Feature names:", X.columns.tolist())
print("\nClass distribution:")
print(y.value_counts())
print("\nMissing values per column:")
print(df.isna().sum())

# Hold out 30% of the rows for testing; stratify on the target so both
# splits keep the same class proportions, and fix the seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Same class-count report for each split
for split_label, split_target in (("training", y_train), ("testing", y_test)):
    print(f"\nClass distribution in {split_label} set:")
    print(split_target.value_counts())

Feature names: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

Class distribution:
Outcome
0    500
1    268
Name: count, dtype: int64

Missing values per column:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Training set size: (537, 8)
Testing set size: (231, 8)

Class distribution in training set:
Outcome
0    350
1    187
Name: count, dtype: int64

Class distribution in testing set:
Outcome
0    150
1     81
Name: count, dtype: int64


In [5]:
# Build a 200-tree Random Forest (fixed seed) and fit it on the training split
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Report accuracy, the confusion matrix and the per-class metrics
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_rf))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_rf))

Random Forest Accuracy: 76.19%

Confusion Matrix:
 [[132  18]
 [ 37  44]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.88      0.83       150
           1       0.71      0.54      0.62        81

    accuracy                           0.76       231
   macro avg       0.75      0.71      0.72       231
weighted avg       0.76      0.76      0.75       231



In [6]:
# Build a 200-stage Gradient Boosting model (learning rate 0.1, fixed seed)
# and fit it on the training split
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_gb = gb_model.predict(X_test)

# Report accuracy, the confusion matrix and the per-class metrics
accuracy_gb = accuracy_score(y_true=y_test, y_pred=y_pred_gb)
print(f"Gradient Boosting Accuracy: {accuracy_gb * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_gb))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_gb))

Gradient Boosting Accuracy: 75.32%

Confusion Matrix:
 [[127  23]
 [ 34  47]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       150
           1       0.67      0.58      0.62        81

    accuracy                           0.75       231
   macro avg       0.73      0.71      0.72       231
weighted avg       0.75      0.75      0.75       231



In [7]:
# Sample prediction using Random Forest
sample_idx = 10  # You can change this index (position within X_test)

# Select the row with a list indexer so it stays a one-row DataFrame.
# The model was fitted on a DataFrame, so predicting on a bare ndarray makes
# scikit-learn warn that X has no valid feature names; keeping the frame also
# preserves each column's dtype (taking a single row via .values upcasts the
# integer columns to float).
sample_df = X_test.iloc[[sample_idx]]

# Array form kept under the original name, with the original (1, n_features)
# shape, for any later cell that still expects it.
sample = sample_df.to_numpy()

true_label = "Diabetic" if y_test.iloc[sample_idx] == 1 else "Non-Diabetic"
predicted_label = "Diabetic" if rf_model.predict(sample_df)[0] == 1 else "Non-Diabetic"

print("Sample features:")
# Shown with the row's original index label and original column dtypes.
display(sample_df)
print(f"\nTrue label: {true_label}")
print(f"Predicted label: {predicted_label}")

# Feature importance: pair each feature with the fitted forest's importance
# score and rank from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 5 most important features:")
display(feature_importances.head(5))

Sample features:




Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2.0,88.0,74.0,19.0,53.0,29.0,0.229,22.0



True label: Non-Diabetic
Predicted label: Non-Diabetic

Top 5 most important features:


Unnamed: 0,Feature,Importance
1,Glucose,0.265229
5,BMI,0.153544
7,Age,0.131911
6,DiabetesPedigreeFunction,0.120567
0,Pregnancies,0.092917



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

