In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE # Import SMOTE
import joblib # For saving and loading models

# --- 1. Load the dataset ---
# The dataset 'diabetes.csv' is assumed to be available in the environment.
# Only the read itself sits inside `try`, so an unrelated failure in the
# reporting code below can never be mistaken for a missing file.
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    print("Error: 'diabetes.csv' not found. Please ensure the file is uploaded correctly.")
    # `exit()` is a helper injected by the `site` module for interactive
    # sessions and reports success (status 0). Raising SystemExit works in
    # every context and signals the failure with a non-zero status.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.")
    print("First 5 rows of the dataset:")
    print(df.head())
    print("\nDataset information:")
    df.info()

# --- 2. Data Preprocessing ---

# Separate the label from the predictors.
# 'Outcome' is typically the target variable (0 for no diabetes, 1 for diabetes);
# every remaining column is used as a feature.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

# Columns in which a recorded 0 may stand for a missing measurement
cols_to_replace_zero = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Function to apply preprocessing steps (missing value handling, outlier treatment, scaling)
# This function will be used for both training and new data prediction
def preprocess_data(data_df, scaler=None, is_training=True, train_means=None, train_iqr_bounds=None):
    """Impute zero placeholders, cap outliers and standardise a feature frame.

    The same routine serves the fitting pass and later inference, so that new
    rows are transformed with exactly the statistics learned during training.

    Args:
        data_df: Feature DataFrame. It is copied, never modified in place, and
            must contain every column listed in the module-level
            ``cols_to_replace_zero``.
        scaler: Already-fitted scaler exposing ``transform``. Required when
            ``is_training`` is False; when ``is_training`` is True any value
            passed here is ignored and a new ``StandardScaler`` is fitted.
        is_training: True to learn fill values, IQR bounds and scaling from
            ``data_df``; False to apply previously learned ones.
        train_means: Mapping ``column -> fill value`` used to replace zeros.
            Required when ``is_training`` is False.
        train_iqr_bounds: Mapping ``column -> {'lower': x, 'upper': y}`` used
            to clip each column. Required when ``is_training`` is False.

    Returns:
        When ``is_training`` is True, the tuple
        ``(scaled_df, scaler, means, iqr_bounds)``; otherwise only
        ``scaled_df``. ``scaled_df`` keeps the columns and the row index of
        ``data_df``.

    Raises:
        ValueError: If ``is_training`` is False and ``train_means``,
            ``train_iqr_bounds`` or ``scaler`` is missing.
    """
    # Fail fast, before any work is done, when inference inputs are incomplete.
    if not is_training:
        if train_means is None:
            raise ValueError("train_means must be provided for new data preprocessing.")
        if train_iqr_bounds is None:
            raise ValueError("train_iqr_bounds must be provided for new data preprocessing.")
        if scaler is None:
            raise ValueError("Scaler must be provided for new data preprocessing.")

    processed_df = data_df.copy()

    # --- Missing values: zeros in the listed columns are placeholders ---
    if is_training:
        current_means = {}
        for col in cols_to_replace_zero:
            # Average only the real (non-zero) readings: including the
            # placeholders would drag the fill value towards zero.
            observed = processed_df.loc[processed_df[col] != 0, col]
            fill_value = observed.mean()
            if pd.isna(fill_value):
                # No real reading at all; leave the zeros as they are rather
                # than injecting NaN into the scaler.
                fill_value = 0.0
            current_means[col] = fill_value
    else:
        current_means = train_means  # No need to recalculate for new data

    # The value stored in `current_means` is exactly the value applied here, so
    # training rows and future rows are imputed identically.
    for col in cols_to_replace_zero:
        processed_df[col] = processed_df[col].replace(0, current_means[col])

    # --- Outlier treatment: cap every column at Q1 - 1.5*IQR / Q3 + 1.5*IQR ---
    if is_training:
        current_iqr_bounds = {}
        for column in processed_df.columns:
            q1 = processed_df[column].quantile(0.25)
            q3 = processed_df[column].quantile(0.75)
            iqr = q3 - q1
            current_iqr_bounds[column] = {
                'lower': q1 - 1.5 * iqr,
                'upper': q3 + 1.5 * iqr,
            }
    else:
        current_iqr_bounds = train_iqr_bounds  # No need to recalculate for new data

    for column in processed_df.columns:
        bounds = current_iqr_bounds[column]
        processed_df[column] = processed_df[column].clip(lower=bounds['lower'], upper=bounds['upper'])

    # --- Feature scaling ---
    if is_training:
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(processed_df)
    else:
        scaled_data = scaler.transform(processed_df)

    # Carry the original row labels over; without `index=` the result would be
    # renumbered from 0 and stop lining up with the caller's target Series.
    processed_df_scaled = pd.DataFrame(
        scaled_data,
        columns=processed_df.columns,
        index=processed_df.index,
    )

    if is_training:
        return processed_df_scaled, scaler, current_means, current_iqr_bounds
    return processed_df_scaled


# --- 3. Split the data into training and testing sets, then preprocess ---
# The split is done BEFORE preprocessing so that the imputation means, the IQR
# bounds and the scaler statistics are learned from the training rows only.
# Fitting them on the full dataset would leak test-set information into
# training and make the test metrics optimistic.
# We'll use 80% for training and 20% for testing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the preprocessing statistics from the training rows...
print("\nApplying preprocessing steps to training data...")
X_train, scaler, train_means, train_iqr_bounds = preprocess_data(X_train_raw, is_training=True)

# ...and apply those same statistics, unchanged, to the held-out rows.
X_test = preprocess_data(
    X_test_raw,
    scaler=scaler,
    is_training=False,
    train_means=train_means,
    train_iqr_bounds=train_iqr_bounds,
)

# NOTE: this counts exact zeros in the already *scaled* training frame.
print("\nMissing values (after replacement of 0s in specific columns):")
print(X_train[cols_to_replace_zero].isin([0]).sum()) # Should show 0 for these columns now

print("Outliers capped for all features.")

print("\nFirst 5 rows of scaled features (after outlier treatment):")
print(X_train.head())

print(f"\nOriginal Training set size: {X_train.shape[0]} samples")
print(f"Original Testing set size: {X_test.shape[0]} samples")
print(f"Original Outcome distribution in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Original Outcome distribution in testing set:\n{y_test.value_counts(normalize=True)}")

# --- 4. Apply SMOTE to the training data ---
# Only the training split is passed to the sampler; the test split is not resampled.
print("\nApplying SMOTE to balance the training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

# Report the size and class balance after oversampling
resampled_rows = X_train_resampled.shape[0]
resampled_balance = y_train_resampled.value_counts(normalize=True)
print(f"\nResampled Training set size: {resampled_rows} samples")
print(f"Resampled Outcome distribution in training set:\n{resampled_balance}")


# --- 5. Train an SVM Classifier with Enhanced Hyperparameter Tuning on Resampled Data ---
print("\nStarting Enhanced Hyperparameter Tuning for SVM on resampled data (this might take a moment)...")

# Search space: every combination of C and gamma below, RBF kernel only
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],        # regularization strength
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],    # RBF kernel coefficient
    'kernel': ['rbf'],
}

# probability=True is what makes predict_proba available on the fitted model
svm_estimator = SVC(probability=True, class_weight='balanced', random_state=42)

# 5-fold cross-validated search, scored by weighted F1, using all CPU cores.
# NOTE(review): the folds are cut from data that was oversampled beforehand, so
# synthetic points can land in validation folds and the CV score may be
# optimistic - consider resampling inside each fold instead.
grid_search = GridSearchCV(
    estimator=svm_estimator,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_resampled, y_train_resampled)

# The winning model, taken from the search's best_estimator_ attribute
best_svm_model = grid_search.best_estimator_

print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1-weighted score on resampled data: {grid_search.best_score_:.4f}")
print("\nBest SVM model trained successfully using best parameters on resampled data.")

# --- 6. Evaluate the best model's performance on the ORIGINAL Test Set ---

# Score the held-out rows once and derive every metric from the same predictions
y_pred = best_svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Overall accuracy
print(f"\nModel Accuracy on ORIGINAL Test Set (with best parameters, SMOTE, and Outlier Treatment): {accuracy:.4f}")

# Per-class precision, recall and f1-score
print("\nClassification Report on ORIGINAL Test Set (with best parameters, SMOTE, and Outlier Treatment):")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("\nConfusion Matrix on ORIGINAL Test Set (with best parameters, SMOTE, and Outlier Treatment):")
print(conf_matrix)

# How to read the confusion matrix:
# [[True Negatives (TN)  False Positives (FP)]
#  [False Negatives (FN) True Positives (TP)]]
# In terms of diabetes prediction:
# TN: non-diabetic, predicted non-diabetic
# FP: non-diabetic, predicted diabetic (Type I error)
# FN: diabetic, predicted non-diabetic (Type II error, more critical in medical diagnosis)
# TP: diabetic, predicted diabetic

# --- 7. Save the trained model and scaler ---
# The imputation means and IQR bounds are saved alongside the model and scaler
# so that new data can later be preprocessed with the stored values.
model_filename = 'diabetes_svm_model.joblib'
scaler_filename = 'diabetes_scaler.joblib'
train_means_filename = 'diabetes_train_means.joblib'
train_iqr_bounds_filename = 'diabetes_train_iqr_bounds.joblib'

# Write each artefact to its own file
for artefact, destination in (
    (best_svm_model, model_filename),
    (scaler, scaler_filename),
    (train_means, train_means_filename),
    (train_iqr_bounds, train_iqr_bounds_filename),
):
    joblib.dump(artefact, destination)

print(f"\nModel saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")
print(f"Training means for imputation saved to {train_means_filename}")
print(f"Training IQR bounds for outlier treatment saved to {train_iqr_bounds_filename}")


# --- 8. Example: Load model and predict on new data ---
print("\n--- Example: Loading saved model and predicting on new data ---")

# Read back the four artefacts written in the previous step
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(scaler_filename)
loaded_train_means = joblib.load(train_means_filename)
loaded_train_iqr_bounds = joblib.load(train_iqr_bounds_filename)

print("Model, scaler, means, and IQR bounds loaded successfully.")


def _prepare_new_patient(patient_df):
    """Preprocess unseen patient rows with the loaded scaler, means and IQR bounds."""
    return preprocess_data(
        patient_df,
        scaler=loaded_scaler,
        is_training=False,
        train_means=loaded_train_means,
        train_iqr_bounds=loaded_train_iqr_bounds,
    )


# First hypothetical patient.
# The columns must match the training data, in the same order:
# Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age
new_patient_data = pd.DataFrame([{
    'Pregnancies': 2,
    'Glucose': 180,
    'BloodPressure': 80,
    'SkinThickness': 30,
    'Insulin': 0,  # a 0 here is treated as a missing value during preprocessing
    'BMI': 35.5,
    'DiabetesPedigreeFunction': 0.5,
    'Age': 45,
}])

print("\nNew patient data for prediction:")
print(new_patient_data)

# Preprocess in inference mode (is_training=False), then classify
new_patient_processed = _prepare_new_patient(new_patient_data)
new_prediction = loaded_model.predict(new_patient_processed)
prediction_proba = loaded_model.predict_proba(new_patient_processed)  # class probabilities

print(f"\nPredicted Outcome for new patient: {'Diabetic' if new_prediction[0] == 1 else 'Non-Diabetic'}")
print(f"Prediction Probability (Non-Diabetic, Diabetic): {prediction_proba[0]}")

# Second hypothetical patient: one that is likely non-diabetic
new_patient_data_2 = pd.DataFrame([{
    'Pregnancies': 0,
    'Glucose': 90,
    'BloodPressure': 70,
    'SkinThickness': 20,
    'Insulin': 70,
    'BMI': 22.0,
    'DiabetesPedigreeFunction': 0.2,
    'Age': 25,
}])

print("\nNew patient data 2 for prediction:")
print(new_patient_data_2)

new_patient_processed_2 = _prepare_new_patient(new_patient_data_2)
new_prediction_2 = loaded_model.predict(new_patient_processed_2)
prediction_proba_2 = loaded_model.predict_proba(new_patient_processed_2)

print(f"\nPredicted Outcome for new patient 2: {'Diabetic' if new_prediction_2[0] == 1 else 'Non-Diabetic'}")
print(f"Prediction Probability (Non-Diabetic, Diabetic): {prediction_proba_2[0]}")


Dataset loaded successfully.
First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Preg