In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel

# Load the dataset
file_path = r"C:\Users\H\Downloads\diabetes.csv"
data = pd.read_csv(file_path)

# Display the first 5 rows of the dataset
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [54]:
# Display general information about the dataset
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None


In [73]:
# Display descriptive statistics
print(data.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [74]:
# Check for missing values
print(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [75]:
# Define features and target variable
X = data.drop(columns='Outcome')
y = data['Outcome']
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [76]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [77]:
# Feature selection using Logistic Regression
log_reg = LogisticRegression(max_iter=200, solver='saga')
log_reg.fit(X_train_scaled, y_train)
sfm_log_reg = SelectFromModel(log_reg, threshold='mean', prefit=True)
X_train_selected_log_reg = sfm_log_reg.transform(X_train_scaled)
X_test_selected_log_reg = sfm_log_reg.transform(X_test_scaled)

# Print the number of features before and after selection
print("Number of features before selection (Logistic Regression):", X_train_scaled.shape[1])
print("Number of features after selection (Logistic Regression):", X_train_selected_log_reg.shape[1])

# Print the selected features' names
selected_features_log_reg = X.columns[sfm_log_reg.get_support()]
print("Selected features (Logistic Regression):", selected_features_log_reg)

Number of features before selection (Logistic Regression): 8
Number of features after selection (Logistic Regression): 2
Selected features (Logistic Regression): Index(['Glucose', 'BMI'], dtype='object')


In [78]:
# Feature selection using RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_train_scaled, y_train)
sfm = SelectFromModel(clf, threshold='mean', prefit=True)
X_train_selected = sfm.transform(X_train_scaled)
X_test_selected = sfm.transform(X_test_scaled)

# Print the number of features before and after selection
print("Number of features before selection:", X_train_scaled.shape[1])
print("Number of features after selection:", X_train_selected.shape[1])

# Print the selected features' names
selected_features_log_reg = X.columns[sfm_log_reg.get_support()]
print("Selected features (Logistic Regression):", selected_features_log_reg)

Number of features before selection: 8
Number of features after selection: 3
Selected features (Logistic Regression): Index(['Glucose', 'BMI'], dtype='object')


In [79]:
# Feature selection using Decision Tree
dt = DecisionTreeClassifier(max_depth=7, min_samples_split=10, min_samples_leaf=4)
dt.fit(X_train_scaled, y_train)
sfm_dt = SelectFromModel(dt, threshold='mean', prefit=True)
X_train_selected_dt = sfm_dt.transform(X_train_scaled)
X_test_selected_dt = sfm_dt.transform(X_test_scaled)

# Print the number of features before and after selection
print("Number of features before selection (Decision Tree):", X_train_scaled.shape[1])
print("Number of features after selection (Decision Tree):", X_train_selected_dt.shape[1])

# Print the selected features' names
selected_features_dt = X.columns[sfm_dt.get_support()]
print("Selected features (Decision Tree):", selected_features_dt)

Number of features before selection (Decision Tree): 8
Number of features after selection (Decision Tree): 2
Selected features (Decision Tree): Index(['Glucose', 'Age'], dtype='object')


In [80]:
# Feature selection using SVM
svm = SVC(C=2, kernel='linear')
svm.fit(X_train_scaled, y_train)
sfm_svm = SelectFromModel(svm, threshold='mean', prefit=True)
X_train_selected_svm = sfm_svm.transform(X_train_scaled)
X_test_selected_svm = sfm_svm.transform(X_test_scaled)

# Print the number of features before and after selection
print("Number of features before selection (SVM):", X_train_scaled.shape[1])
print("Number of features after selection (SVM):", X_train_selected_svm.shape[1])

# Print the selected features' names
selected_features_svm = X.columns[sfm_svm.get_support()]
print("Selected features (SVM):", selected_features_svm)

Number of features before selection (SVM): 8
Number of features after selection (SVM): 2
Selected features (SVM): Index(['Glucose', 'BMI'], dtype='object')


In [81]:
# Train Logistic Regression model with selected features
log_reg_model = LogisticRegression(max_iter=200, solver='saga')
log_reg_model.fit(X_train_selected_log_reg, y_train)
y_pred_log_reg = log_reg_model.predict(X_test_selected_log_reg)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)

print("Logistic Regression Test Accuracy:", log_reg_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_log_reg))

Logistic Regression Test Accuracy: 0.7619047619047619
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       157
           1       0.68      0.49      0.57        74

    accuracy                           0.76       231
   macro avg       0.73      0.69      0.70       231
weighted avg       0.75      0.76      0.75       231



In [82]:
# Train Random Forest model with selected features
rf_model_selected = RandomForestClassifier(n_estimators=150, max_depth=7, min_samples_split=10, min_samples_leaf=4)
rf_model_selected.fit(X_train_selected, y_train)
y_pred_rf_selected = rf_model_selected.predict(X_test_selected)
rf_selected_accuracy = accuracy_score(y_test, y_pred_rf_selected)

print("Random Forest Test Accuracy with Selected Features:", rf_selected_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_rf_selected))

Random Forest Test Accuracy with Selected Features: 0.7792207792207793
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.90      0.85       157
           1       0.71      0.53      0.60        74

    accuracy                           0.78       231
   macro avg       0.76      0.71      0.73       231
weighted avg       0.77      0.78      0.77       231



In [83]:
# Train Decision Tree model with selected features
dt_model = DecisionTreeClassifier(max_depth=7, min_samples_split=10, min_samples_leaf=4)
dt_model.fit(X_train_selected_dt, y_train)
y_pred_dt = dt_model.predict(X_test_selected_dt)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

print("Decision Tree Test Accuracy:", dt_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_dt))

Decision Tree Test Accuracy: 0.7445887445887446
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       157
           1       0.63      0.49      0.55        74

    accuracy                           0.74       231
   macro avg       0.71      0.68      0.69       231
weighted avg       0.73      0.74      0.73       231



In [84]:
# Train SVM model with selected features
svm_model = SVC(C=2, kernel='linear')
svm_model.fit(X_train_selected_svm, y_train)
y_pred_svm = svm_model.predict(X_test_selected_svm)
svm_accuracy = accuracy_score(y_test, y_pred_svm)

print("SVM Test Accuracy:", svm_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_svm))

SVM Test Accuracy: 0.7662337662337663
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       157
           1       0.69      0.49      0.57        74

    accuracy                           0.77       231
   macro avg       0.74      0.69      0.71       231
weighted avg       0.76      0.77      0.75       231



In [90]:
# Bagging with Decision Tree
bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, random_state=0)
bagging.fit(X_train_selected, y_train)
y_pred_bagging = bagging.predict(X_test_selected)
bagging_accuracy = accuracy_score(y_test, y_pred_bagging)

print("Bagging Test Accuracy:", bagging_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_bagging))

Bagging Test Accuracy: 0.7402597402597403
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.86      0.82       157
           1       0.62      0.49      0.55        74

    accuracy                           0.74       231
   macro avg       0.70      0.67      0.68       231
weighted avg       0.73      0.74      0.73       231



In [89]:
# Boosting with Gradient Boosting
boosting = GradientBoostingClassifier(n_estimators=100, random_state=0)
boosting.fit(X_train_selected, y_train)
y_pred_boosting = boosting.predict(X_test_selected)
boosting_accuracy = accuracy_score(y_test, y_pred_boosting)

print("Boosting Test Accuracy:", boosting_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_boosting))

Boosting Test Accuracy: 0.7532467532467533
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       157
           1       0.65      0.49      0.56        74

    accuracy                           0.75       231
   macro avg       0.72      0.68      0.69       231
weighted avg       0.74      0.75      0.74       231



In [86]:
# Stacking with multiple models
estimators = [
    ('log_reg', LogisticRegression(max_iter=200, solver='saga')),
    ('dt', DecisionTreeClassifier(max_depth=7, min_samples_split=10, min_samples_leaf=4)),
    ('svm', SVC(C=2, kernel='linear', probability=True))
]
stacking = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(n_estimators=100, random_state=0))
stacking.fit(X_train_selected, y_train)
y_pred_stacking = stacking.predict(X_test_selected)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)

print("Stacking Test Accuracy:", stacking_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_stacking))

Stacking Test Accuracy: 0.7662337662337663
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       157
           1       0.69      0.49      0.57        74

    accuracy                           0.77       231
   macro avg       0.74      0.69      0.71       231
weighted avg       0.76      0.77      0.75       231



In [99]:
# Voting Classifier with multiple models
voting = VotingClassifier(estimators=estimators, voting='soft')
voting.fit(X_train_selected, y_train)
y_pred_voting = voting.predict(X_test_selected)

voting_accuracy = accuracy_score(y_test, y_pred_voting)
print("Voting Test Accuracy:", voting_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_voting))

Voting Test Accuracy: 0.7662337662337663
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       157
           1       0.70      0.47      0.56        74

    accuracy                           0.77       231
   macro avg       0.74      0.69      0.70       231
weighted avg       0.76      0.77      0.75       231



In [95]:
# Save the final model
model_classification = 'final_model.joblib'
dump(ensemble_model, model_classification)

['final_model.joblib']

In [96]:
# Reload the final model and test it on new data
loaded_model = load(model_classification)

In [97]:
# Test the model on new data (same data used as examples)
y_pred_loaded = loaded_model.predict(X_test_selected)
loaded_model_accuracy = accuracy_score(y_test, y_pred_loaded)

print("Loaded Model Test Accuracy:", loaded_model_accuracy)
print("-" * 50)
print(classification_report(y_test, y_pred_loaded))

Loaded Model Test Accuracy: 0.7619047619047619
--------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       157
           1       0.69      0.47      0.56        74

    accuracy                           0.76       231
   macro avg       0.73      0.69      0.70       231
weighted avg       0.75      0.76      0.75       231

