1. Clean your data

Clean and prepare your data: validate data types, duplicates, inconsistency, extreme values

In [7]:
import pandas as pd

# Load the dataset
file_path = 'diabetes.csv'
df = pd.read_csv(file_path)
# Step 1: Remove duplicates
df = df.drop_duplicates()

# Step 2: Handle missing values
# List of numeric columns
numeric_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']
# Convert DiabetesPedigreeFunction to numeric
df['DiabetesPedigreeFunction'] = pd.to_numeric(df['DiabetesPedigreeFunction'], errors='coerce')
# Impute missing numeric values with the median
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Step 3: Correct categorical inconsistencies
# Replace "MISSING" in WeightGroup with NaN and impute with mode
df['WeightGroup'] = df['WeightGroup'].replace('MISSING', pd.NA)
df['WeightGroup'] = df['WeightGroup'].fillna(df['WeightGroup'].mode()[0])

# Replace "<65" in AgeGroup with "18 - 44"
df['AgeGroup'] = df['AgeGroup'].replace('<65', '18 - 44')

# Standardize Gender to uppercase
df['Gender'] = df['Gender'].str.upper()

# Step 4: Address extreme values
# Cap Pregnancies at the 99th percentile
pregnancy_cap = df['Pregnancies'].quantile(0.99)
df['Pregnancies'] = df['Pregnancies'].clip(upper=pregnancy_cap)

# Display the cleaned dataset summary
print("Missing Values After Cleaning:\n", df.isnull().sum())
print("\nCategorical Columns Summary:")
print(df['WeightGroup'].value_counts())
print(df['AgeGroup'].value_counts())
print(df['Gender'].value_counts())
print("\nNumeric Columns Summary:")
print(df.describe())

Missing Values After Cleaning:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
DiabetesPedigreeFunction    0
WeightGroup                 0
AgeGroup                    0
Gender                      0
Outcome                     0
dtype: int64

Categorical Columns Summary:
WeightGroup
obese_1           196
overweight        146
obese_2           126
obsese_3           87
healthy weight     75
MISSING             8
underweight         4
Name: count, dtype: int64
AgeGroup
18 - 44    528
45 - 64    103
>65          8
<65          3
Name: count, dtype: int64
Gender
F    565
M     60
m     17
Name: count, dtype: int64

Numeric Columns Summary:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   639.000000  639.000000     639.000000     639.000000  639.000000   
mean      3.913928  123.902973      72.464789      29.624413  144.960876   
std       3.406675   30.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
file_path = 'diabetes.csv'
data = pd.read_csv(file_path)

# 1. Split data into features (X) and target (y)
X = data.drop(columns='Outcome')
y = data['Outcome']

# 2. Split into training and testing sets to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 3. Define preprocessing for numeric and categorical columns
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction']
categorical_features = ['WeightGroup', 'AgeGroup', 'Gender']

# Imputation for numeric columns: Median (robust against outliers)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())  # Scale numeric features
])

# Encoding for categorical columns: One-Hot Encoding (non-ordinal data)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing with the most frequent category
    ('onehot', OneHotEncoder(drop='first'))  # Avoid dummy variable trap
])

# Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# 4. Create a pipeline for preprocessing
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply preprocessing
X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)

# Check for missing values post-processing
missing_train = pd.DataFrame(X_train_preprocessed).isnull().sum().sum()
missing_test = pd.DataFrame(X_test_preprocessed).isnull().sum().sum()

print(f"Missing values in training set after preprocessing: {missing_train}")
print(f"Missing values in testing set after preprocessing: {missing_test}")


Missing values in training set after preprocessing: 0
Missing values in testing set after preprocessing: 0


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Step 1: Define the model
model = RandomForestClassifier(random_state=42)

# Step 2: Set up a pipeline combining preprocessing and modeling
pipeline_with_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Step 3: Hyperparameter tuning using GridSearchCV
param_grid = {
    'classifier__n_estimators': [50, 100, 150],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(pipeline_with_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Train the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

# Step 4: Evaluate the final model on the test set
y_pred = best_model.predict(X_test)

# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
print("Best Hyperparameters:", grid_search.best_params_)
print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Justification for the final model choice
final_model_justification = """
The RandomForestClassifier was chosen for its robust performance with minimal tuning. It is less sensitive to scaling
and can handle both numeric and categorical features efficiently. The model also provides insights into feature
importance, which can aid interpretability. Based on hyperparameter tuning, the model with the best parameters
provided the highest accuracy and balanced Precision and Recall, making it a suitable choice for this classification task.
"""

print("\nFinal Model Justification:")
print(final_model_justification)


Best Hyperparameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 100}

Evaluation Metrics:
Accuracy: 0.71
Precision: 0.65
Recall: 0.63
F1 Score: 0.64

Confusion Matrix:
[[57 18]
 [20 34]]

Final Model Justification:

The RandomForestClassifier was chosen for its robust performance with minimal tuning. It is less sensitive to scaling
and can handle both numeric and categorical features efficiently. The model also provides insights into feature
importance, which can aid interpretability. Based on hyperparameter tuning, the model with the best parameters
provided the highest accuracy and balanced Precision and Recall, making it a suitable choice for this classification task.



In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Step 1: Define the model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
random_forest_model = RandomForestClassifier(random_state=42)
gradient_boosting_model = GradientBoostingClassifier(random_state=42)

# Step 2: Combine preprocessing and modeling into a pipeline
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logistic_model)
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', random_forest_model)
])

gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', gradient_boosting_model)
])

# Step 3: Train and evaluate the models
def evaluate_model(pipeline, X_train, y_train, X_test, y_test):
    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    # Print the metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")
    
    return pipeline

# Evaluate Logistic Regression
print("Logistic Regression Evaluation:")
logistic_pipeline = evaluate_model(logistic_pipeline, X_train, y_train, X_test, y_test)

# Evaluate Random Forest
print("\nRandom Forest Evaluation:")
random_forest_pipeline = evaluate_model(random_forest_pipeline, X_train, y_train, X_test, y_test)

# Evaluate Gradient Boosting
print("\nGradient Boosting Evaluation:")
gradient_boosting_pipeline = evaluate_model(gradient_boosting_pipeline, X_train, y_train, X_test, y_test)

# Step 4: Tune the best model (e.g., Random Forest)
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(random_forest_pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)
# Evaluate on the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# Best parameters and score
print("\nBest Parameters from GridSearchCV:")
print(grid_search.best_params_)
print(f"Best ROC-AUC Score: {grid_search.best_score_:.4f}")
print(f"Testing MSE: {mse}")
print(f"Testing RMSE: {rmse}")
# Evaluate the tuned model
print("\nTuned Random Forest Evaluation:")
tuned_random_forest_pipeline = grid_search.best_estimator_
evaluate_model(tuned_random_forest_pipeline, X_train, y_train, X_test, y_test)

# Step 5: Choose the final model
# Based on evaluation metrics, select the best-performing model
final_model = tuned_random_forest_pipeline
print("\nFinal Model: Tuned Random Forest")

Logistic Regression Evaluation:
Accuracy: 0.7054
Precision: 0.6905
Recall: 0.5370
F1-Score: 0.6042
ROC-AUC: 0.7899

Random Forest Evaluation:
Accuracy: 0.6899
Precision: 0.6346
Recall: 0.6111
F1-Score: 0.6226
ROC-AUC: 0.7557

Gradient Boosting Evaluation:
Accuracy: 0.6822
Precision: 0.6327
Recall: 0.5741
F1-Score: 0.6019
ROC-AUC: 0.7649


NameError: name 'best_model' is not defined

In [None]:
//------------------

KeyError: 'BMI'