In [None]:
import pandas as pd
import numpy as np

# Load the Pima Indians Diabetes dataset
data = pd.read_csv('Pima-Indian-Dataset.csv')
# Basic data inspection
print("Dataset Shape:", data.shape)
print("\nFirst 5 rows:\n", data.head())
print("\nMissing Values (Zeros):\n", data.eq(0).sum())
print("\nClass Distribution:\n", data['Outcome'].value_counts(normalize=True))

Dataset Shape: (768, 9)

First 5 rows:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Missing Values (Zeros):
 Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age          

In [12]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Step 1: Handle missing values (zeros in specific columns)
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_with_zeros] = data[cols_with_zeros].replace(0, np.nan)
imputer = KNNImputer(n_neighbors=5, weights="uniform")
data[cols_with_zeros] = imputer.fit_transform(data[cols_with_zeros])

# Step 2: Separate features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Step 3: Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% variance
X_pca = pca.fit_transform(X_scaled)
print(f"Number of components after PCA: {X_pca.shape[1]}")
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_):.4f}")

# Step 5: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42, stratify=y)

# Step 6: Handle class imbalance with SMOTE (only on training data)
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Verify class distribution after SMOTE
print("\nClass Distribution before SMOTE:\n", y_train.value_counts(normalize=True))
print("\nClass Distribution after SMOTE:\n", pd.Series(y_train_balanced).value_counts(normalize=True))

# Save preprocessed data for next steps
preprocessed_data = {
    'X_train': X_train_balanced,
    'X_test': X_test,
    'y_train': y_train_balanced,
    'y_test': y_test,
    'X_scaled': X_scaled,  # For non-PCA comparisons if needed
}

Number of components after PCA: 7
Explained variance ratio: 0.9614

Class Distribution before SMOTE:
 Outcome
0    0.651466
1    0.348534
Name: proportion, dtype: float64

Class Distribution after SMOTE:
 Outcome
0    0.5
1    0.5
Name: proportion, dtype: float64


In [13]:
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, SpectralClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, silhouette_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Use preprocessed data from previous step
# Assuming preprocessed_data contains: X_train_balanced, X_test, y_train_balanced, y_test
# If not available, you can load it from the previous artifact or redefine it
try:
    X_train_balanced = preprocessed_data['X_train']
    X_test = preprocessed_data['X_test']
    y_train_balanced = preprocessed_data['y_train']
    y_test = preprocessed_data['y_test']
except NameError:
    raise Exception("Preprocessed data not found. Please run the preprocessing step first.")