In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier


In [3]:
# Load the cleaned diabetes dataset from the Resources folder.
# The path is relative, so it resolves against the current working directory.
file = "Resources/diabetes_data_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=file)

In [4]:
# The target (y) is the 'DiabetesDiagnosis' column; every remaining column is a feature (X)
y = df['DiabetesDiagnosis']
X = df.drop(columns=['DiabetesDiagnosis'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Registry of candidate models, keyed by display name.
# Each entry is a single-step Pipeline wrapping one classifier, seeded for reproducibility.
# Insertion order matters: it is the order in which the models are iterated later.
pipelines = {
    'Random Forest': Pipeline(steps=[
        ('random_forest', RandomForestClassifier(random_state=42)),
    ]),
    'Gradient Boosting': Pipeline(steps=[
        ('gradient_boosting', GradientBoostingClassifier(random_state=42)),
    ]),
}

# Standalone handles to the same pipeline objects held in the registry
random_forest_pipeline = pipelines['Random Forest']
gradient_boosting_pipeline = pipelines['Gradient Boosting']


In [6]:
# Fit every registered pipeline on the training split, then compare how it scores
# on the data it was trained on versus the held-out test data.
for model_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)

    # score() on a classifier pipeline reports mean accuracy; evaluate train first, then test
    train_accuracy, test_accuracy = (
        pipeline.score(features, labels)
        for features, labels in ((X_train, y_train), (X_test, y_test))
    )

    # A large gap between the two numbers indicates overfitting to the training split
    print(f"{model_name} Training Accuracy: {train_accuracy:.3f}")
    print(f"{model_name} Test Accuracy: {test_accuracy:.3f}")

Random Forest Training Accuracy: 0.972
Random Forest Test Accuracy: 0.728
Gradient Boosting Training Accuracy: 0.754
Gradient Boosting Test Accuracy: 0.753


In [7]:
# Build the gradient boosting classifier with explicit hyperparameters and fit it
# on the training split. fit() returns the estimator itself, so `model` is the
# fitted classifier.
model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=5,
    subsample=0.8,
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

# Overall test accuracy, rounded to three decimals
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")


              precision    recall  f1-score   support

         0.0       0.78      0.71      0.75      7090
         1.0       0.73      0.80      0.77      7049

    accuracy                           0.76     14139
   macro avg       0.76      0.76      0.76     14139
weighted avg       0.76      0.76      0.76     14139

Accuracy: 0.756
