In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier


In [None]:
# Load the dataset CSV from the Resources folder into a DataFrame.
# The path is relative, so it resolves against the current working directory.
file = "Resources/diabetes_data_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=file)

In [None]:
# The target (y) is the 'DiabetesDiagnosis' column; the features (X) are
# every remaining column of the DataFrame.
y = df['DiabetesDiagnosis']
X = df.drop(columns=['DiabetesDiagnosis'])

# Hold out 20% of the rows for testing (80-20 split); the fixed
# random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# One single-step pipeline per classifier; each estimator is seeded with a
# fixed random_state for reproducibility.

random_forest_pipeline = Pipeline(steps=[
    ('random_forest', RandomForestClassifier(random_state=42)),
])

gradient_boosting_pipeline = Pipeline(steps=[
    ('gradient_boosting', GradientBoostingClassifier(random_state=42)),
])

# Display name -> pipeline, in the order the models should be iterated.
pipelines = dict([
    ('Random Forest', random_forest_pipeline),
    ('Gradient Boosting', gradient_boosting_pipeline),
])


In [None]:
# Fit every pipeline on the training split, then report its accuracy on both
# the training split and the held-out test split.
for model_name, pipeline in pipelines.items():
    # fit() returns the fitted pipeline itself, so the training score can be
    # chained directly onto the fit call.
    train_accuracy = pipeline.fit(X_train, y_train).score(X_train, y_train)
    test_accuracy = pipeline.score(X_test, y_test)

    # Show both scores formatted to three decimal places.
    print(f"{model_name} Training Accuracy: {train_accuracy:.3f}")
    print(f"{model_name} Test Accuracy: {test_accuracy:.3f}")