# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so that code drawing from it
# produces the same numbers on every run.
np.random.seed(42)

# In [None]:


# Load the dataset
def load_data(path='diabetes.csv'):
    """Load the Pima Indians Diabetes Dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``'diabetes.csv'``
            (resolved against the current working directory), which is the
            file a zero-argument call has always read.

    Returns:
        pandas.DataFrame: The file as parsed by ``pd.read_csv`` with its
        default settings, i.e. the first row supplies the column names.
        The rest of this script addresses columns by name ('Glucose',
        'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Outcome'),
        so the header row has to provide them.
    """
    return pd.read_csv(path)

# Exploratory Data Analysis
def perform_eda(df):
    """Perform exploratory data analysis and create visualizations.

    Draws and shows three separate figures:

    1. a count plot of the 'Outcome' column,
    2. an annotated heatmap of ``df.corr()``,
    3. one box plot per feature column, grouped by 'Outcome'.

    Args:
        df: DataFrame containing the feature columns and an 'Outcome'
            column. It is only read here, never modified.

    Returns:
        None. The function is called for its plotting side effects.
    """
    # Class distribution before handling imbalance. The figure is finished
    # and shown on its own: later plotting code cannot rely on this figure
    # still being the current one once plt.show() has been called, so no
    # empty panel is reserved for it.
    plt.figure(figsize=(6, 5))
    sns.countplot(x='Outcome', data=df)
    plt.title('Class Distribution (Before SMOTE)')
    plt.tight_layout()
    plt.show()

    # Correlation heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()

    # Distribution of features. Features are selected by name so the target
    # does not have to be the last column, and the number of grid rows is
    # derived from the feature count (8 features still give a 3x3 grid).
    feature_columns = [column for column in df.columns if column != 'Outcome']
    n_cols = 3
    n_rows = max(1, -(-len(feature_columns) // n_cols))  # ceiling division
    plt.figure(figsize=(15, 10))
    for position, column in enumerate(feature_columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x='Outcome', y=column, data=df)
        plt.title(f'{column} Distribution by Outcome')
    plt.tight_layout()
    plt.show()

# Data Preprocessing
def preprocess_data(df):
    """Preprocess the data: impute, split, scale and rebalance.

    Steps, in order:

    1. In the columns listed in ``zero_not_allowed``, zeros are treated as
       missing and replaced by the column median.
    2. Features and the 'Outcome' target are split 80/20 into train and
       test sets (``random_state=42``).
    3. A ``StandardScaler`` is fitted on the training features and applied
       to both splits.
    4. SMOTE oversamples the scaled training split only; the test split is
       left untouched.
    5. A count plot of the resampled training labels is shown.

    Args:
        df: DataFrame with the feature columns and an 'Outcome' column.
            Step 1 writes the imputed columns back into this object, so the
            caller's frame is modified.

    Returns:
        tuple: ``(X_train_balanced, X_test_scaled, y_train_balanced, y_test,
        scaler)`` where ``scaler`` is the fitted ``StandardScaler``.
    """
    # Replace 0 values with NaN for certain features, then fill them with the
    # column median (median() skips NaN by default).
    zero_not_allowed = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    with_missing = df[zero_not_allowed].replace(0, np.nan)
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[column]: that form is chained assignment, which pandas warns about
    # and which does not update df when Copy-on-Write is enabled.
    # NOTE(review): the medians are computed over all rows before the
    # train/test split, so test rows influence the imputed values; consider
    # deriving them from the training split only.
    df[zero_not_allowed] = with_missing.fillna(with_missing.median())

    # Split features and target
    X = df.drop('Outcome', axis=1)
    y = df['Outcome']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Handle class imbalance using SMOTE (training split only)
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

    # Plot class distribution after SMOTE on a figure of its own, so the plot
    # does not depend on whichever figure happens to be current.
    plt.figure(figsize=(6, 5))
    sns.countplot(x=y_train_balanced)
    plt.title('Class Distribution (After SMOTE)')
    plt.tight_layout()
    plt.show()

    return X_train_balanced, X_test_scaled, y_train_balanced, y_test, scaler

# Model Training and Evaluation
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """Train and evaluate multiple models.

    Fits an RBF-kernel SVM, an MLP with two hidden layers (100 and 50
    units), a decision tree and a 5-nearest-neighbours classifier on the
    training data. For each one it predicts on the test data, prints the
    accuracy and classification report, and shows the confusion matrix as a
    heatmap.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Test features passed to ``predict``.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.

    Returns:
        tuple: ``(results, best_model)``. ``results`` maps each model name
        to a dict with the keys 'accuracy', 'classification_report' and
        'confusion_matrix'. ``best_model`` is ``(name, fitted_model)`` for
        the highest test accuracy; on a tie the model listed first wins.
    """
    models = {
        # probability=True makes the fitted SVC expose predict_proba, which
        # predict_diabetes calls on whichever model is selected as best.
        'SVM': SVC(kernel='rbf', probability=True, random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
        'Decision Tree': DecisionTreeClassifier(random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
    }

    results = {}
    best_accuracy = 0
    best_model = None

    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Store results
        results[name] = {
            'accuracy': accuracy,
            'classification_report': classification_report(y_test, y_pred),
            'confusion_matrix': confusion_matrix(y_test, y_pred),
        }

        # Update best model if current model is better. The `is None` check
        # guarantees a model is always returned, even if every accuracy is 0.
        # NOTE(review): selection uses the same test split that is reported,
        # so the reported best accuracy is optimistically biased.
        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = (name, model)

        print(f"\nResults for {name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(results[name]['classification_report'])

        # Plot confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(results[name]['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

    return results, best_model

# Save the best model
def save_model(model, scaler, model_name):
    """Persist the chosen model and the scaler with joblib.

    Args:
        model: Object to store under a file name derived from ``model_name``.
        scaler: Object to store as 'scaler.joblib'.
        model_name: Display name of the model; its lower-cased form is used
            as the prefix of the model file name.

    Returns:
        None. Both files are written relative to the current working
        directory, the model file first.
    """
    model_path = f'{model_name.lower()}_model.joblib'
    for artifact, destination in ((model, model_path), (scaler, 'scaler.joblib')):
        joblib.dump(artifact, destination)

# Function to make predictions using the saved model
def predict_diabetes(data, model, scaler):
    """Scale the input and return the model's predictions and probabilities.

    Args:
        data: Input rows, passed unchanged to ``scaler.transform``.
        model: Object providing ``predict`` and ``predict_proba``.
        scaler: Object providing ``transform``.

    Returns:
        tuple: ``(prediction, probability)`` -- the results of
        ``model.predict`` and ``model.predict_proba`` on the scaled input,
        called in that order.
    """
    features = scaler.transform(data)
    return model.predict(features), model.predict_proba(features)

def main():
    """Run the whole workflow from loading the data to an example prediction.

    Loads the dataset, shows the exploratory plots, preprocesses the data,
    trains and compares the models, saves the best model together with the
    scaler, and finally predicts for the first row of ``df`` as it stands
    after ``preprocess_data`` has run. Progress and results are printed.
    """
    print("Loading data...")
    df = load_data()

    print("\nPerforming exploratory data analysis...")
    perform_eda(df)

    print("\nPreprocessing data...")
    X_train, X_test, y_train, y_test, scaler = preprocess_data(df)

    print("\nTraining and evaluating models...")
    results, best_model = train_and_evaluate_models(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

    # Report the winner, then persist it along with the scaler
    print(f"\nBest model: {best_model[0]} with accuracy: {results[best_model[0]]['accuracy']:.4f}")
    best_name, best_estimator = best_model
    save_model(best_estimator, scaler, best_name)

    # Example prediction on the first row, with the target column removed
    print("\nExample prediction with the best model:")
    feature_frame = df.drop(columns='Outcome')
    sample_data = feature_frame.iloc[[0]]
    prediction, probability = predict_diabetes(sample_data, best_estimator, scaler)
    print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-diabetic'}")
    print(f"Probability: {probability[0][1]:.4f}")

# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()