In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Data
# Load the dataset from the current working directory.
diabetes_data = pd.read_csv('diabetes.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() echoed a stray "None" line.
diabetes_data.info()
print(diabetes_data.describe())



# Data Visualization
# Bar chart of how many rows carry each value of the 'Outcome' column.
fig, ax = plt.subplots(figsize=(6, 4))
outcome_counts = diabetes_data['Outcome'].value_counts()
outcome_counts.plot.bar(ax=ax, color=['blue', 'orange'])
ax.set_title('Diabetes: Count vs Outcome')
ax.set_xlabel('Outcome')
ax.set_ylabel('Count')
# Keep the category labels horizontal.
plt.xticks(rotation=0)
plt.show()



# Data processing
# Separate the target column from the remaining feature columns.
y = diabetes_data['Outcome']
X = diabetes_data.drop(columns='Outcome')

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=12,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Model 1: Logistic Regression
# fit() returns the estimator itself, so construction and training are chained.
logreg_model = LogisticRegression(random_state=12).fit(X_train, y_train)
# Predicted labels for the held-out rows.
logreg_predictions = logreg_model.predict(X_test)


# Model 2:  Decision Tree
# fit() returns the estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=12).fit(X_train, y_train)
# Predicted labels for the held-out rows.
dt_predictions = dt_model.predict(X_test)


# Model 3:  Random Forest
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=12).fit(X_train, y_train)
# Predicted labels for the held-out rows.
rf_predictions = rf_model.predict(X_test)



# Evaluate models
def evaluate_model(model, predictions, y_true=None):
    """Score a set of predictions against the true labels.

    Args:
        model: The estimator that produced ``predictions``. It is not used in
            the computation; the parameter is kept so existing calls keep
            working.
        predictions: Predicted labels, aligned with ``y_true``.
        y_true: True labels to score against. When omitted, the module-level
            ``y_test`` is used, which matches the previous behaviour.

    Returns:
        A tuple ``(accuracy, report, matrix)`` holding the results of
        ``accuracy_score``, ``classification_report`` and
        ``confusion_matrix`` respectively.
    """
    if y_true is None:
        # Fall back to the held-out labels from the module-level split so
        # two-argument callers behave exactly as before.
        y_true = y_test
    accuracy = accuracy_score(y_true, predictions)
    report = classification_report(y_true, predictions)
    matrix = confusion_matrix(y_true, predictions)
    return accuracy, report, matrix

# Score each model's predictions on the held-out data.
logreg_accuracy, logreg_report, logreg_matrix = evaluate_model(logreg_model, logreg_predictions)
dt_accuracy, dt_report, dt_matrix = evaluate_model(dt_model, dt_predictions)
rf_accuracy, rf_report, rf_matrix = evaluate_model(rf_model, rf_predictions)

# Print evaluation results
model_results = (
    ("Logistic Regression", logreg_accuracy, logreg_report, logreg_matrix),
    ("Decision Tree", dt_accuracy, dt_report, dt_matrix),
    ("Random Forest", rf_accuracy, rf_report, rf_matrix),
)
for position, (label, accuracy, report, matrix) in enumerate(model_results):
    # Every section after the first is preceded by a blank line.
    lead = "\n" if position else ""
    print(lead + label + " Accuracy:", accuracy)
    print(label + " Classification Report:\n", report)
    print(label + " Confusion Matrix:\n", matrix)

# Save the Random Forest model. It is written unconditionally: nothing here
# compares the accuracies computed above, so check them before relying on it
# as the best model.
joblib.dump(rf_model, 'diabetes_model.pkl')
# The model was trained on features transformed by `scaler`, so the fitted
# scaler is saved as well; new inputs must go through it before predict().
joblib.dump(scaler, 'diabetes_scaler.pkl')