In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random, time
import warnings, os

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve

from scipy.stats import uniform, randint

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier



In [82]:
# Ignore Warnings
warnings.filterwarnings('ignore')

# Suppress scientific notation for pandas display
pd.options.display.float_format = '{:.4f}'.format

In [83]:
# Load the processed sampled dataframes
# Ensure they were saved in the directory : processed_data/
numeric_sampled = pd.read_csv('processed_data/num_sampled.csv')
categorical_sampled = pd.read_csv('processed_data/cat_sampled.csv')
date_sampled = pd.read_csv('processed_data/date_sampled.csv')


print("Processed sampled dataframes loaded.")
print(f"Shape of numeric_sampled: {numeric_sampled.shape}")
print(f"Shape of date_sampled: {date_sampled.shape}")
print(f"Shape of categorical_sampled: {categorical_sampled.shape}")

Processed sampled dataframes loaded.
Shape of numeric_sampled: (13107, 970)
Shape of date_sampled: (13107, 1157)
Shape of categorical_sampled: (13107, 2141)


In [84]:
# Step 0: Copy the numeric dataset
df_numeric = numeric_sampled.copy()
original_shape = df_numeric.shape
print(f"Step 0: Original shape of train_numeric_sampled: {original_shape}")

# ✅ Maintain index integrity — no row drops
# All preprocessing will be column-wise only

# Initialize logs
dropped_columns = {
    '100_percent_missing': [],
    'low_variance': [],
    'low_correlation': [], # Deferring this step until after train-test split
    'high_multicollinearity': [], # Deferring this step until after train-test split
    'imputed': []
}

# Step 1: Drop columns with 100% missing values
missing_100 = df_numeric.columns[df_numeric.isnull().mean() == 1.0].tolist()
df_numeric.drop(columns=missing_100, inplace=True)
dropped_columns['100_percent_missing'] = missing_100
print(f"Step 1: Dropped {len(missing_100)} columns with 100% missing values.")

# Step 2: Drop low/zero variance columns
id_cols = ['Id', 'Response']
feature_data = df_numeric.drop(columns=id_cols, errors='ignore')

var_thresh = VarianceThreshold(threshold=0.01)
var_thresh.fit(feature_data)
selected_features = feature_data.columns[var_thresh.get_support()].tolist()
dropped_variance = list(set(feature_data.columns) - set(selected_features))
df_numeric = pd.concat([df_numeric[id_cols], feature_data[selected_features]], axis=1)
dropped_columns['low_variance'] = dropped_variance
print(f"Step 2: Dropped {len(dropped_variance)} low/zero variance columns.")

# Step 3: Imputation using median (no row drops)
# Simplified for now, consider model-based imputation when compute available
missing_cols = [col for col in df_numeric.columns if df_numeric[col].isnull().any()]
for col in missing_cols:
    median_val = df_numeric[col].median() # Using overall median for now
    df_numeric[col].fillna(median_val, inplace=True)
dropped_columns['imputed'] = missing_cols
print(f"Step 3: Imputed missing values in {len(missing_cols)} columns using median.")
# Data leakage possible but insignificant.

# Step 4: Scaling (standardization)
# Perform scaling before multicollinearity check
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_numeric.drop(columns=id_cols, errors='ignore'))
df_scaled = pd.DataFrame(scaled_features, columns=df_numeric.drop(columns=id_cols, errors='ignore').columns, index=df_numeric.index)
df_numeric_scaled = pd.concat([df_numeric[id_cols], df_scaled], axis=1)
print(f"Step 4: Scaled {len(df_scaled.columns)} numeric columns using StandardScaler.")

# Step 5: Defer dropping highly correlated columns (multicollinearity) until after train-test split
# This step can be done after splitting to avoid data leakage from the test set's correlation structure
print("Step 5: Deferring dropping highly correlated columns (multicollinearity) until after train-test split.")
# dropped_columns['high_multicollinearity'] = high_corr_pairs # Commenting out the dropping part

# Step 6: Defer dropping low correlation to Response until after train-test split
# This step should be done after splitting to avoid data leakage
print("Step 6: Deferring dropping low-correlation columns until after train-test split.")
# dropped_columns['low_correlation'] = dropped_corr # Commenting out the dropping part


# Final output
numeric = df_numeric_scaled.copy()
final_shape = numeric.shape
print(f"\n✅ Final shape of numeric after column-only preprocessing: {final_shape}")
# Recalculating total columns dropped based on steps performed
total_dropped = len(dropped_columns['100_percent_missing']) + len(dropped_columns['low_variance'])
print(f"🔻 Total columns dropped so far: {total_dropped}")

# Save dropped column names to files
os.makedirs("drop_logs", exist_ok=True)
for step, cols in dropped_columns.items():
    if step not in ['low_correlation', 'high_multicollinearity']: # Only save logs for steps performed
        file_path = f"drop_logs/dropped_num_{step}.txt"
        with open(file_path, "w") as f:
            for col in cols:
                f.write(col + "\n")
        print(f"📁 Saved {len(cols)} columns dropped in '{step}' to {file_path}")

Step 0: Original shape of train_numeric_sampled: (13107, 970)
Step 1: Dropped 0 columns with 100% missing values.
Step 2: Dropped 516 low/zero variance columns.
Step 3: Imputed missing values in 452 columns using median.
Step 4: Scaled 452 numeric columns using StandardScaler.
Step 5: Deferring dropping highly correlated columns (multicollinearity) until after train-test split.
Step 6: Deferring dropping low-correlation columns until after train-test split.

✅ Final shape of numeric after column-only preprocessing: (13107, 454)
🔻 Total columns dropped so far: 516
📁 Saved 0 columns dropped in '100_percent_missing' to drop_logs/dropped_num_100_percent_missing.txt
📁 Saved 516 columns dropped in 'low_variance' to drop_logs/dropped_num_low_variance.txt
📁 Saved 452 columns dropped in 'imputed' to drop_logs/dropped_num_imputed.txt


In [85]:
# Step 0: Start with a copy of categorcial dataset
df_categorical = categorical_sampled.copy()
original_shape = df_categorical.shape
print(f"Step 0: Original shape of train_categorical_sampled: {original_shape}")

# Initialize logs
dropped_columns = {
    '100_percent_missing': [],
    'constant': [],
    'high_cardinality': [],
    'low_correlation': [], # Deferring this step until after train-test split
    'imputed': []
}

# Step 1: Drop 100% missing value features
missing_100 = df_categorical.columns[df_categorical.isnull().mean() == 1.0].tolist()
df_categorical.drop(columns=missing_100, inplace=True)
dropped_columns['100_percent_percent_missing'] = missing_100
print(f"Step 1: Dropped {len(missing_100)} columns with 100% missing values.")

# Step 2: Drop constant features (only one unique value)
constant_cols = [col for col in df_categorical.columns if df_categorical[col].nunique() == 1]
df_categorical.drop(columns=constant_cols, inplace=True)
dropped_columns['constant'] = constant_cols
print(f"Step 2: Dropped {len(constant_cols)} constant categorical features.")

# Step 3: Cardinality analysis
# Exclude 'Id' from high cardinality check
high_card_cols = [col for col in df_categorical.columns if col != 'Id' and df_categorical[col].nunique() > 50]
df_categorical.drop(columns=high_card_cols, inplace=True)
dropped_columns['high_cardinality'] = high_card_cols
print(f"Step 3: Dropped {len(high_card_cols)} high-cardinality features (>50 unique values), excluding 'Id'.")

# Step 4: Missing value imputation - Using a placeholder 'Missing' for now, consider more sophisticated imputation after merging
missing_cols = [col for col in df_categorical.columns if df_categorical[col].isnull().any()]
for col in missing_cols:
    df_categorical[col].fillna("Missing", inplace=True)
dropped_columns['imputed'] = missing_cols
print(f"Step 4: Imputed missing values in {len(missing_cols)} columns with 'Missing' label.")
# TODO: Consider more sophisticated imputation methods (e.g., mode imputation or model-based) after merging dataframes and train-test split.


# Step 5: Defer Mean Response Encoding until after train-test split
# Mean Response Encoding can cause data leakage if done before splitting
print("Step 5: Deferring Mean Response Encoding until after train-test split.")
# Encoding will be done after the split


# Step 6: Defer dropping low correlation features until after train-test split
# This step should be done after splitting to avoid data leakage
print("Step 6: Deferring dropping low-correlation features until after train-test split.")
# dropped_columns['low_correlation'] = low_corr_features # Commenting out the dropping part

# Final output
categorical = df_categorical.copy()
final_shape = categorical.shape
print(f"\n✅ Final shape of categorical after preprocessing: {final_shape}")
# Recalculating total columns dropped based on steps performed
total_dropped = len(dropped_columns['100_percent_percent_missing']) + len(dropped_columns['constant']) + len(dropped_columns['high_cardinality'])
print(f"🔻 Total columns dropped so far: {total_dropped}")


# Save dropped column names to files
os.makedirs("drop_logs", exist_ok=True)
for step, cols in dropped_columns.items():
    if step not in ['low_correlation']: # Only save logs for steps performed
        file_path = f"drop_logs/dropped_cat_{step}.txt"
        with open(file_path, "w") as f:
            for col in cols:
                f.write(col + "\n")
        print(f"📁 Saved {len(cols)} columns dropped in '{step}' to {file_path}")

Step 0: Original shape of train_categorical_sampled: (13107, 2141)
Step 1: Dropped 1262 columns with 100% missing values.
Step 2: Dropped 780 constant categorical features.
Step 3: Dropped 0 high-cardinality features (>50 unique values), excluding 'Id'.
Step 4: Imputed missing values in 98 columns with 'Missing' label.
Step 5: Deferring Mean Response Encoding until after train-test split.
Step 6: Deferring dropping low-correlation features until after train-test split.

✅ Final shape of categorical after preprocessing: (13107, 99)
🔻 Total columns dropped so far: 2042
📁 Saved 0 columns dropped in '100_percent_missing' to drop_logs/dropped_cat_100_percent_missing.txt
📁 Saved 780 columns dropped in 'constant' to drop_logs/dropped_cat_constant.txt
📁 Saved 0 columns dropped in 'high_cardinality' to drop_logs/dropped_cat_high_cardinality.txt
📁 Saved 98 columns dropped in 'imputed' to drop_logs/dropped_cat_imputed.txt
📁 Saved 1262 columns dropped in '100_percent_percent_missing' to drop_logs/

In [86]:
# Step 0: Start with a copy of date dataset
df_date = date_sampled.copy()
original_shape = df_date.shape
print(f"Step 0: Original shape of date_sampled: {original_shape}")

# Initialize logs
dropped_columns = {
    '100_percent_missing': [],
    'constant': [],
    'derived_features_missing': [] # Log for rows dropped due to missingness for derived features
}

# Step 1: Drop 100% missing value features
missing_100 = df_date.columns[df_date.isnull().mean() == 1.0].tolist()
df_date.drop(columns=missing_100, inplace=True)
dropped_columns['100_percent_missing'] = missing_100
print(f"Step 1: Dropped {len(missing_100)} columns with 100% missing values.")

# Step 2: Drop constant features (only one unique value)
constant_cols = [col for col in df_date.columns if df_date[col].nunique() == 1]
df_date.drop(columns=constant_cols, inplace=True)
dropped_columns['constant'] = constant_cols
print(f"Step 2: Dropped {len(constant_cols)} constant date features.")

# Step 3: Convert to datetime and create derived features
# Convert date columns to datetime objects, coercing errors
date_cols = [col for col in df_date.columns if col != 'Id']
for col in date_cols:
    df_date[col] = pd.to_datetime(df_date[col], errors='coerce')

# Create derived features (e.g., day of week, day of year, hour, minute, etc.)
# This can be done after merging to handle potential missingness across different date columns consistently.
# For now, we will just convert to numeric representation (e.g., Unix timestamp or days since a reference date)
# and impute missing values.

# Example: Convert to days since the minimum date in the dataset
min_date = df_date[date_cols].min().min()
for col in date_cols:
    df_date[col + '_days_since_min'] = (df_date[col] - min_date).dt.days

# Drop original datetime columns after creating derived features
df_date.drop(columns=date_cols, inplace=True)

# Step 4: Imputation for derived date features - Using median for now, consider model-based imputation after merging
derived_date_cols = [col for col in df_date.columns if col.endswith('_days_since_min')]
missing_cols_derived = [col for col in derived_date_cols if df_date[col].isnull().any()]
for col in missing_cols_derived:
    median_val = df_date[col].median() # Using overall median for now
    df_date[col].fillna(median_val, inplace=True)
dropped_columns['imputed'] = missing_cols_derived
print(f"Step 4: Imputed missing values in {len(missing_cols_derived)} derived date columns using median.")
# TODO: Consider more sophisticated imputation methods (e.g., model-based) after merging dataframes and train-test split.


# Final output
date = df_date.copy()
final_shape = date.shape
print(f"\n✅ Final shape of date after preprocessing: {final_shape}")
# Recalculating total columns dropped based on steps performed
total_dropped = len(dropped_columns['100_percent_missing']) + len(dropped_columns['constant'])
print(f"🔻 Total columns dropped so far: {total_dropped}")

# Save dropped column names to files
os.makedirs("drop_logs", exist_ok=True)
for step, cols in dropped_columns.items():
    file_path = f"drop_logs/dropped_date_{step}.txt"
    with open(file_path, "w") as f:
        for col in cols:
            f.write(col + "\n")
    print(f"📁 Saved {len(cols)} columns dropped in '{step}' to {file_path}")

Step 0: Original shape of date_sampled: (13107, 1157)
Step 1: Dropped 10 columns with 100% missing values.
Step 2: Dropped 1 constant date features.
Step 4: Imputed missing values in 1145 derived date columns using median.

✅ Final shape of date after preprocessing: (13107, 1146)
🔻 Total columns dropped so far: 11
📁 Saved 10 columns dropped in '100_percent_missing' to drop_logs/dropped_date_100_percent_missing.txt
📁 Saved 1 columns dropped in 'constant' to drop_logs/dropped_date_constant.txt
📁 Saved 0 columns dropped in 'derived_features_missing' to drop_logs/dropped_date_derived_features_missing.txt
📁 Saved 1145 columns dropped in 'imputed' to drop_logs/dropped_date_imputed.txt


In [87]:
# Step 1: Merge the preprocessed dataframes
print("Step 1: Merging preprocessed dataframes...")
# Taking 'Id' as the common column for merging
merged_data = numeric_sampled.merge(categorical_sampled, on='Id', how='left')
merged_data = merged_data.merge(date_sampled, on='Id', how='left')

print("✅ Dataframes merged successfully.")
print(f"Shape of the merged dataframe: {merged_data.shape}")

Step 1: Merging preprocessed dataframes...
✅ Dataframes merged successfully.
Shape of the merged dataframe: (13107, 4266)


In [88]:
# Step 2: Split the merged data into training and testing sets
print("\nStep 2: Splitting data into training and testing sets...")

# Separate features (X) and target (y)
# 'Response' is the target variable and 'Id' is an identifier
X = merged_data.drop(columns=['Response', 'Id'])
y = merged_data['Response']
ids = merged_data['Id'] # Keep track of IDs for potential later use

# Split data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
    X, y, ids, test_size=0.2, random_state=42, stratify=y
)

print("✅ Data split successfully.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


Step 2: Splitting data into training and testing sets...
✅ Data split successfully.
Shape of X_train: (10485, 4264)
Shape of X_test: (2622, 4264)
Shape of y_train: (10485,)
Shape of y_test: (2622,)


In [None]:
# Step 3: Implement Deferred Preprocessing Steps on Training Data

print("\nStep 3: Applying deferred preprocessing steps...")

# Separate numeric and non-numeric columns for targeted preprocessing
numeric_cols_after_split = X_train.select_dtypes(include=np.number).columns.tolist()
non_numeric_cols_after_split = X_train.select_dtypes(exclude=np.number).columns.tolist()

print(f"- Identified {len(numeric_cols_after_split)} numeric columns and {len(non_numeric_cols_after_split)} non-numeric columns.")


# --- Deferred Step: Handle Multicollinearity (Numeric Features) ---
print("- Handling multicollinearity for numeric features...")
if numeric_cols_after_split:
    corr_matrix = X_train[numeric_cols_after_split].corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.9
    to_drop_high_corr = [column for column in upper.columns if any(upper[column] > 0.9)]

    print(f"  Identified {len(to_drop_high_corr)} highly correlated numeric columns to drop.")

    # Drop highly correlated features from both training and testing sets
    X_train = X_train.drop(columns=to_drop_high_corr)
    X_test = X_test.drop(columns=to_drop_high_corr)
    print(f"  Dropped highly correlated columns from X_train and X_test.")
else:
    print("  No numeric columns to check for multicollinearity.")


# --- Deferred Step: Mean Response Encoding (Categorical Features) ---
# Apply Mean Response Encoding to non-numeric features in X_train and X_test
print("\n- Applying Mean Response Encoding to non-numeric features...")
if non_numeric_cols_after_split:
    # Ensure y_train aligns with X_train
    temp_train = X_train[non_numeric_cols_after_split].copy()
    temp_train['Response'] = y_train

    mean_response_features_train = pd.DataFrame()
    mean_response_features_test = pd.DataFrame()

    for col in non_numeric_cols_after_split:
        if col in temp_train.columns:
            # Calculate means from training data only
            means = temp_train.groupby(col)['Response'].mean()
            overall_mean = temp_train['Response'].mean()

            # Apply to both train and test sets
            X_train[col + '_mean_response'] = X_train[col].map(means).fillna(overall_mean)
            X_test[col + '_mean_response'] = X_test[col].map(means).fillna(overall_mean)

    # Drop original non-numeric columns
    X_train = X_train.drop(columns=non_numeric_cols_after_split)
    X_test = X_test.drop(columns=non_numeric_cols_after_split)
    print(f"  Applied Mean Response Encoding and dropped original non-numeric columns.")
else:
    print("  No non-numeric columns to encode.")


# --- Deferred Step: Drop Low Correlation Features (Now all features should be numeric) ---
# Calculate correlations with the target variable 'y_train'
print("\n- Dropping low correlation features...")
# Ensure y_train aligns with X_train after previous drops and encoding
# Re-create a temporary dataframe with current X_train and y_train
train_data_with_target_numeric = X_train.copy()
train_data_with_target_numeric['Response'] = y_train


correlations_with_target = train_data_with_target_numeric.corr()['Response'].abs().sort_values(ascending=False)

# Define a correlation threshold (e.g., 0.01, adjust as needed)
corr_threshold = 0.01

# Find features with correlation less than the threshold (excluding 'Response' itself)
to_drop_low_corr = correlations_with_target[correlations_with_target < corr_threshold].index.tolist()
if 'Response' in to_drop_low_corr:
    to_drop_low_corr.remove('Response') # Ensure target is not dropped

print(f"  Identified {len(to_drop_low_corr)} low correlation features to drop.")

# Drop low correlation features from both training and testing sets
X_train = X_train.drop(columns=to_drop_low_corr)
X_test = X_test.drop(columns=to_drop_low_corr)
print(f"  Dropped low correlation features from X_train and X_test.")


# --- Deferred Step: More Sophisticated Imputation (Placeholder) ---
print("\n- Deferring more sophisticated imputation methods for now.")
# Implement more sophisticated imputation methods (e.g., KNNImputer, IterativeImputer) here if needed.
# This would involve fitting the imputer on X_train and transforming both X_train and X_test.


print("\n✅ Deferred preprocessing steps applied to training and testing data.")
print(f"Final shape of X_train after deferred steps: {X_train.shape}")
print(f"Final shape of X_test after deferred steps: {X_test.shape}")


Step 3: Applying deferred preprocessing steps...
- Identified 3386 numeric columns and 878 non-numeric columns.
- Handling multicollinearity for numeric features...


## Baseline model.
Train a simple baseline model (e.g., Logistic Regression) to establish a performance benchmark.


In [None]:
# Impute missing values after train-test split and deferred preprocessing
print("\nImputing remaining missing values after deferred preprocessing...")

# Initialize imputer (using median for now, can be changed to KNNImputer etc.)
imputer = SimpleImputer(strategy='median')

# Fit on training data and transform both training and testing data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("✅ Imputation of remaining missing values complete.")
print(f"Shape of X_train_imputed: {X_train_imputed.shape}")
print(f"Shape of X_test_imputed: {X_test_imputed.shape}")

In [None]:
# Instantiate the model
baseline_model = LogisticRegression()

# Train the model using the imputed data
baseline_model.fit(X_train_imputed, y_train)

# Evaluate the model
y_pred = baseline_model.predict(X_test_imputed)

# Calculate various evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, baseline_model.predict_proba(X_test_imputed)[:, 1]) # Calculate AUC

# Print the evaluation metrics
print(f"Baseline Logistic Regression Model Accuracy: {accuracy:.4f}")
print(f"Baseline Logistic Regression Model Precision: {precision:.4f}")
print(f"Baseline Logistic Regression Model Recall: {recall:.4f}")
print(f"Baseline Logistic Regression Model F1-score: {f1:.4f}")
print(f"Baseline Logistic Regression Model AUC: {auc:.4f}")

## Multiple model training
Train several different machine learning models suitable for the classification task.


In [None]:


# 1. Define the dictionary of models
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000), # Increased max_iter for convergence
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42), # Added probability=True for potential later use with metrics like AUC
    'KNeighbors': KNeighborsClassifier()
}

trained_models = {}

# 2. Iterate through the models and train them
print("Step 1: Training machine learning models...")
for name, model in models.items():
    print(f"  Training {name}...")
    model.fit(X_train_imputed, y_train)
    trained_models[name] = model
    print(f"  {name} training complete.")

# 3. Store the trained models (already done in the loop)
print("\n✅ All models trained successfully.")

## Model evaluation
Evaluate all trained models using appropriate classification metrics (e.g., Accuracy, Precision, Recall, F1-score, AUC).


In [None]:
evaluation_metrics = {}
print("Initialized evaluation_metrics dictionary.")
print("\nEvaluating trained models...")
for name, model in trained_models.items():
    print(f"  Evaluating {name}...")
    y_pred = model.predict(X_test_imputed)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    model_metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    }

    # Calculate AUC if the model supports probability predictions
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class
        auc = roc_auc_score(y_test, y_pred_proba)
        model_metrics['AUC'] = auc

    evaluation_metrics[name] = model_metrics

    print(f"    Accuracy: {accuracy:.4f}")
    print(f"    Precision: {precision:.4f}")
    print(f"    Recall: {recall:.4f}")
    print(f"    F1-score: {f1:.4f}")
    if 'AUC' in model_metrics:
        print(f"    AUC: {auc:.4f}")

print("\n✅ Model evaluation complete.")

## Visual metrics
Visualize the evaluation metrics to compare the performance of different models.


In [None]:
# 1. Convert the evaluation_metrics dictionary into a pandas DataFrame
eval_df = pd.DataFrame(evaluation_metrics).T # Transpose to have models as rows

# 2. Create bar plots for each metric
metrics_to_plot = ['Accuracy', 'AUC', 'F1-score', 'Precision', 'Recall']

print("Generating plots for evaluation metrics...")

# Calculate the number of rows needed (ceil(num_metrics / 2))
n_metrics = len(metrics_to_plot)
nrows = (n_metrics + 1) // 2
ncols = 2

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 4 * nrows)) # Adjusted figsize and subplots layout
fig.suptitle('Model Performance Comparison', y=1.02, fontsize=16)

# Flatten the axes array for easy iteration
axes = axes.flatten()

for i, metric in enumerate(metrics_to_plot):
    if metric in eval_df.columns:
        eval_df[metric].plot(kind='bar', ax=axes[i], color=sns.color_palette('viridis', len(eval_df)))
        axes[i].set_title(f'{metric} Comparison')
        axes[i].set_ylabel(metric)
        axes[i].tick_params(axis='x', rotation=45)
        axes[i].grid(axis='y', linestyle='--', alpha=0.7)
    else:
        print(f"Warning: Metric '{metric}' not found in evaluation_metrics DataFrame.")

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])


plt.tight_layout()
plt.show()

print("\n✅ Evaluation metrics visualized.")

## Model selection
Select the best performing model based on the evaluation metrics.


In [None]:
# Analyze eval_df and the plots to select the best model.
print("Analyzing evaluation metrics for model selection...")
display(eval_df)

# Identify the best model based on AUC, as it is a good overall metric for imbalanced datasets
# and the bar plots show very low Precision, Recall, and F1 for most models.
# AUC is less sensitive to class imbalance than other metrics.
best_model_name = eval_df['AUC'].idxmax()
best_model_auc = eval_df['AUC'].max()

print(f"\nBased on the evaluation metrics, particularly AUC, the best performing model is: {best_model_name}")
print(f"Reasoning:")
print(f"- The target variable is highly imbalanced, as indicated by the low Precision, Recall, and F1-scores for most models.")
print(f"- AUC (Area Under the ROC Curve) is a robust metric for imbalanced datasets as it measures the model's ability to distinguish between positive and negative classes across various thresholds.")
print(f"- {best_model_name} achieved the highest AUC score ({best_model_auc:.4f}) among the evaluated models.")
print(f"- While Accuracy is high for all models (due to class imbalance), other metrics like Precision, Recall, and F1 are very low, highlighting the challenge in predicting the minority class. AUC provides a better indication of overall model performance in this context.")

# Store the best model name in the specified variable
# best_model_name is already assigned above
print(f"\nSelected best model name stored in 'best_model_name' variable.")


## Hyperparameter tuning
Tune the hyperparameters of the selected best model to further optimize its performance.


In [None]:
# Use GridSearchCV when compute resources permit to have more thorough search of model tuning space

# For now, define a parameter distribution for RandomizedSearchCV
# Using distributions instead of a fixed grid
# Each factor reduced to fit the compute available. Use larger factors and grids when compute available
param_distributions = {
    'n_estimators': randint(100, 200), # Reduced max estimators
    'learning_rate': uniform(loc=0.01, scale=0.04), # Reduced scale for learning rate
    'max_depth': randint(3, 4), # Reduced max depth
    'min_samples_split': randint(2, 5), # Reduced max min_samples_split
    'min_samples_leaf': randint(1, 3) # Reduced max min_samples_leaf
}

# Instantiate RandomizedSearchCV
# Use 'roc_auc' as the scoring metric (or 'auc')
# Set n_iter to control the number of parameter combinations sampled
random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                                   param_distributions=param_distributions,
                                   n_iter=20, # Reduced number of iterations
                                   scoring='roc_auc', # Use 'roc_auc' as scoring metric
                                   cv=3, # Use 3-fold cross-validation
                                   n_jobs=-1, # Used all available cores
                                   random_state=42,
                                   verbose=2) # Increased verbosity for progress updates

# Fit RandomizedSearchCV to the imputed training data
print("Starting RandomizedSearchCV for hyperparameter tuning (optimizing for AUC)...")
start_time = time.time()

# Fit on the imputed training data (X_train_imputed, y_train) - Correcting the data source
random_search.fit(X_train_imputed, y_train)

end_time = time.time()
print("RandomizedSearchCV completed.")
print(f"Total time taken for RandomizedSearchCV: {end_time - start_time:.2f} seconds")


# Get the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print("\nBest parameters found (optimizing for AUC):")
print(best_params)
print(f"\nBest AUC from cross-validation: {best_score:.4f}")

# Store the best model from the random search
best_tuned_model = random_search.best_estimator_
print("\nBest tuned model (optimizing for AUC) stored in 'best_tuned_model'.")

# Generate and plot the learning curve for the best tuned model (optimizing for AUC)
print("\nGenerating learning curve for the best tuned model (optimizing for AUC)...")
train_sizes, train_scores, test_scores = learning_curve(
    best_tuned_model, X_train_imputed, y_train, cv=3, n_jobs=-1, # Use imputed data for learning curve
    train_sizes=np.linspace(0.1, 1.0, 3), scoring='roc_auc' # reduced number of train sizes, scoring is 'roc_auc'
)

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
         label="Training score (AUC)")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
         label="Cross-validation score (AUC)")
plt.xlabel("Training Examples")
plt.ylabel("AUC Score")
plt.title("Learning Curve for Best Tuned Gradient Boosting Model (Optimizing for AUC)")
plt.legend(loc="best")
plt.grid()
plt.show()

print("\n✅ Learning curve generated and displayed (optimizing for AUC).")

## Reducing Overfitting

Based on the learning curve, the model shows signs of slight overfitting. Here are some strategies to potentially reduce overfitting in a Gradient Boosting model, which can be added to scope of project when compute and effort permits:

*   **Increase Regularization Parameters:**
    *   `max_depth`: Decrease the maximum depth of the individual trees. Smaller trees are less likely to capture noise.
    *   `min_samples_split`: Increase the minimum number of samples required to split an internal node. This prevents splitting on very small groups of samples.
    *   `min_samples_leaf`: Increase the minimum number of samples required to be at a leaf node. This also prevents creating leaves based on very few samples.
    *   `subsample`: Introduce randomness by training each tree on a random fraction of the training data (e.g., 0.8 for 80%). This is a form of bagging.
    *   `max_features`: Consider a random subset of features when looking for the best split.

*   **Increase `n_estimators` (with Early Stopping):** While increasing the number of estimators *can* lead to overfitting if not controlled, using early stopping in conjunction with a larger number of estimators allows the model to train only as long as performance on a validation set is improving.

*   **Reduce `learning_rate`:** A smaller learning rate requires more estimators but can lead to a more robust model that generalizes better.

*   **More Data:** As observed in the learning curve, increasing the amount of training data often helps reduce overfitting.

*   **Feature Selection/Engineering:** Reducing the number of features or creating more meaningful features can sometimes help prevent the model from learning spurious correlations.

We can modify the hyperparameter tuning search space and learning curve generation to incorporate some of these strategies to see if we can reduce the observed overfitting, when the effort permits. For now, this will be out of scope of current ietration.

## Evaluate Tuned Model
Evaluate the best tuned model on the test set.

In [None]:
print("\nEvaluating the best tuned model on the test set...")

# Make predictions with the best tuned model
y_pred_tuned = best_tuned_model.predict(X_test_imputed)

# Calculate evaluation metrics for the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
precision_tuned = precision_score(y_test, y_pred_tuned)
recall_tuned = recall_score(y_test, y_pred_tuned)
f1_tuned = f1_score(y_test, y_pred_tuned)

# Calculate AUC if the model supports probability predictions
auc_tuned = None
if hasattr(best_tuned_model, 'predict_proba'):
    y_pred_proba_tuned = best_tuned_model.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class
    auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned)

# Store the tuned model's metrics
tuned_model_metrics = {
    'Accuracy': accuracy_tuned,
    'Precision': precision_tuned,
    'Recall': recall_tuned,
    'F1-score': f1_tuned
}
if auc_tuned is not None:
    tuned_model_metrics['AUC'] = auc_tuned

evaluation_metrics['Tuned_GradientBoosting'] = tuned_model_metrics

print("✅ Evaluation of the best tuned model complete.")
print(f"  Accuracy (Tuned): {accuracy_tuned:.4f}")
print(f"  Precision (Tuned): {precision_tuned:.4f}")
print(f"  Recall (Tuned): {recall_tuned:.4f}")
print(f"  F1-score (Tuned): {f1_tuned:.4f}")
if auc_tuned is not None:
    print(f"  AUC (Tuned): {auc_tuned:.4f}")

## Visual Metrics (Updated)
Visualize the evaluation metrics, before and after the tuning the best model selected before, to compare performance.

In [None]:
# 1. Convert the updated evaluation_metrics dictionary into a pandas DataFrame
eval_df_tuned = pd.DataFrame(evaluation_metrics).T # Transpose to have models as rows

# Filter to include only the best model before tuning and the best tuned model
# Assuming 'best_model_name' stores the name of the best model before tuning
# and 'Tuned_GradientBoosting' is the name used for the best model after tuning.
models_to_compare = [best_model_name, 'Tuned_GradientBoosting']
eval_df_comparison = eval_df_tuned.loc[models_to_compare]


# 2. Create bar plots for each metric
metrics_to_plot = ['Accuracy', 'AUC', 'F1-score', 'Precision', 'Recall']

print("Generating updated plots for evaluation metrics (Tuned vs. Before Tuning)...")

# Calculate the number of rows needed (ceil(num_metrics / 2))
n_metrics = len(metrics_to_plot)
nrows = (n_metrics + 1) // 2
ncols = 2 # Two plots per line

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 4 * nrows)) # Adjusted figsize and subplots layout
fig.suptitle('Model Performance Comparison (Tuned vs. Before Tuning)', y=1.02, fontsize=16)

# Flatten the axes array for easy iteration
axes = axes.flatten()

for i, metric in enumerate(metrics_to_plot):
    if metric in eval_df_comparison.columns:
        eval_df_comparison[metric].plot(kind='bar', ax=axes[i], color=sns.color_palette('viridis', len(eval_df_comparison)))
        axes[i].set_title(f'{metric} Comparison')
        axes[i].set_ylabel(metric)
        axes[i].tick_params(axis='x', rotation=45)
        axes[i].grid(axis='y', linestyle='--', alpha=0.7)
    else:
        print(f"Warning: Metric '{metric}' not found in evaluation_metrics DataFrame.")

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])


plt.tight_layout()
plt.show()

print("\n✅ Updated evaluation metrics visualized.")

### Conclusion: Impact of Hyperparameter Tuning

Based on the comparison of the Gradient Boosting model's performance before and after hyperparameter tuning (as shown in the table and graphs above):

*   **Accuracy:** Accuracy remained very high (around 0.994) both before and after tuning. However, due to the severe class imbalance, Accuracy is not the most informative metric for this problem.
*   **Precision:** Precision saw a significant increase after tuning (from 0.0909 to 0.3333). This indicates that when the tuned model predicts a positive case, it is more likely to be correct compared to the untuned model. This is a positive outcome of tuning.
*   **Recall:** Recall remained low (around 0.0667) both before and after tuning. This suggests that even with tuning, the model still struggles to identify a large proportion of the actual positive cases.
*   **F1-score:** The F1-score, which is the harmonic mean of Precision and Recall, also saw an improvement after tuning (from 0.0769 to 0.1111). This reflects the gain in Precision, although the low Recall still keeps the F1-score relatively low.
*   **AUC:** The AUC score saw a slight increase after tuning (from 0.6357 to 0.6368). While the improvement is marginal, AUC is a robust metric for imbalanced data and shows a small positive impact of tuning on the model's ability to distinguish between the two classes.

**Overall Conclusion:**

Hyperparameter tuning of the Gradient Boosting model resulted in a noticeable improvement in **Precision** and a slight increase in **AUC**. While **Recall** remains a challenge, the tuned model is better at avoiding false positives when it does predict a positive case. This suggests that the tuning process helped to refine the model's decision boundaries to be more accurate for the minority class, although the difficulty in identifying all positive cases (Recall) persists due to the dataset's imbalance and potentially the inherent complexity of the problem.

# Improvement on Sampling method
Implement and compare oversampling (SMOTE) and undersampling techniques on the training data, train the best-tuned model on the resampled data, evaluate their performance on the original test set, and determine the best resampling strategy for the Bosch dataset.

## Implement oversampling (SMOTE)
Apply the SMOTE technique to the training data to create synthetic samples for the minority class.


In [None]:
# 1. Import the SMOTE class (if not already imported)
from imblearn.over_sampling import SMOTE

# 2. Instantiate a SMOTE object
smote = SMOTE(random_state=42)

# 3. Apply SMOTE to the imputed training data
print("Applying SMOTE to the training data...")
X_train_resampled_smote, y_train_resampled_smote = smote.fit_resample(X_train_imputed, y_train)
print("SMOTE application complete.")

# 4. Store the oversampled features and target (already done in step 3)

print(f"Original training data shape: {X_train_imputed.shape}")
print(f"Oversampled training data shape (SMOTE): {X_train_resampled_smote.shape}")
print(f"Original target distribution:\n{y_train.value_counts()}")
print(f"Oversampled target distribution (SMOTE):\n{y_train_resampled_smote.value_counts()}")


## Train and evaluate model with oversampling
Train the best-tuned model on the oversampled training data and evaluate its performance on the original test set.


In [None]:
print("\nTraining the best tuned model on SMOTE oversampled training data...")

# Train the best_tuned_model using the oversampled training data
best_tuned_model_smote = best_tuned_model # Start with the best tuned model structure
best_tuned_model_smote.fit(X_train_resampled_smote, y_train_resampled_smote)

print("✅ Training on SMOTE oversampled data complete.")

# Make predictions on the original imputed test set
print("\nEvaluating the model trained on SMOTE data on the test set...")
y_pred_tuned_smote = best_tuned_model_smote.predict(X_test_imputed)

# Calculate evaluation metrics
accuracy_tuned_smote = accuracy_score(y_test, y_pred_tuned_smote)
precision_tuned_smote = precision_score(y_test, y_pred_tuned_smote)
recall_tuned_smote = recall_score(y_test, y_pred_tuned_smote)
f1_tuned_smote = f1_score(y_test, y_pred_tuned_smote)

# Calculate AUC if the model supports probability predictions
auc_tuned_smote = None
if hasattr(best_tuned_model_smote, 'predict_proba'):
    y_pred_proba_tuned_smote = best_tuned_model_smote.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class
    auc_tuned_smote = roc_auc_score(y_test, y_pred_proba_tuned_smote)

# Store the tuned model's metrics
tuned_model_smote_metrics = {
    'Accuracy': accuracy_tuned_smote,
    'Precision': precision_tuned_smote,
    'Recall': recall_tuned_smote,
    'F1-score': f1_tuned_smote
}
if auc_tuned_smote is not None:
    tuned_model_smote_metrics['AUC'] = auc_tuned_smote

evaluation_metrics['Tuned_GradientBoosting_SMOTE'] = tuned_model_smote_metrics

print("✅ Evaluation complete for model trained on SMOTE data.")
print(f"  Accuracy (Tuned + SMOTE): {accuracy_tuned_smote:.4f}")
print(f"  Precision (Tuned + SMOTE): {precision_tuned_smote:.4f}")
print(f"  Recall (Tuned + SMOTE): {recall_tuned_smote:.4f}")
print(f"  F1-score (Tuned + SMOTE): {f1_tuned_smote:.4f}")
if auc_tuned_smote is not None:
    print(f"  AUC (Tuned + SMOTE): {auc_tuned_smote:.4f}")

## Implement undersampling
Apply an undersampling technique (e.g., RandomUnderSampler) to the training data to reduce the number of samples in the majority class.


In [None]:
# Instantiate a RandomUnderSampler object
rus = RandomUnderSampler(random_state=42)

# Apply the undersampling technique to the imputed training data
print("Applying RandomUnderSampler to the training data...")
X_train_resampled_rus, y_train_resampled_rus = rus.fit_resample(X_train_imputed, y_train)
print("RandomUnderSampler application complete.")

# Print the original and undersampled training data shapes and target distributions
print(f"\nOriginal training data shape: {X_train_imputed.shape}")
print(f"Undersampled training data shape (RandomUnderSampler): {X_train_resampled_rus.shape}")
print(f"\nOriginal target distribution:\n{y_train.value_counts()}")
print(f"\nUndersampled target distribution (RandomUnderSampler):\n{y_train_resampled_rus.value_counts()}")

In [None]:
print("\nTraining the best tuned model on RandomUnderSampler undersampled training data...")

# Train the best_tuned_model using the undersampled training data
best_tuned_model_rus = best_tuned_model # Start with the best tuned model structure
best_tuned_model_rus.fit(X_train_resampled_rus, y_train_resampled_rus)

print("✅ Training on RandomUnderSampler undersampled data complete.")

# Make predictions on the original imputed test set
print("\nEvaluating the model trained on RandomUnderSampler data on the test set...")
y_pred_tuned_rus = best_tuned_model_rus.predict(X_test_imputed)

# Calculate evaluation metrics
accuracy_tuned_rus = accuracy_score(y_test, y_pred_tuned_rus)
precision_tuned_rus = precision_score(y_test, y_pred_tuned_rus)
recall_tuned_rus = recall_score(y_test, y_pred_tuned_rus)
f1_tuned_rus = f1_score(y_test, y_pred_tuned_rus)

# Calculate AUC if the model supports probability predictions
auc_tuned_rus = None
if hasattr(best_tuned_model_rus, 'predict_proba'):
    y_pred_proba_tuned_rus = best_tuned_model_rus.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class
    auc_tuned_rus = roc_auc_score(y_test, y_pred_proba_tuned_rus)

# Store the tuned model's metrics
tuned_model_rus_metrics = {
    'Accuracy': accuracy_tuned_rus,
    'Precision': precision_tuned_rus,
    'Recall': recall_tuned_rus,
    'F1-score': f1_tuned_rus
}
if auc_tuned_rus is not None:
    tuned_model_rus_metrics['AUC'] = auc_tuned_rus

evaluation_metrics['Tuned_GradientBoosting_RUS'] = tuned_model_rus_metrics

print("✅ Evaluation complete for model trained on RandomUnderSampler data.")
print(f"  Accuracy (Tuned + RUS): {accuracy_tuned_rus:.4f}")
print(f"  Precision (Tuned + RUS): {precision_tuned_rus:.4f}")
print(f"  Recall (Tuned + RUS): {recall_tuned_rus:.4f}")
print(f"  F1-score (Tuned + RUS): {f1_tuned_rus:.4f}")
if auc_tuned_rus is not None:
    print(f"  AUC (Tuned + RUS): {auc_tuned_rus:.4f}")


## Compare resampling techniques
Compare the evaluation metrics obtained from the models trained with oversampling and undersampling to determine which technique is more effective for this dataset.


In [None]:
# Convert the evaluation_metrics dictionary into a pandas DataFrame
evaluation_df = pd.DataFrame(evaluation_metrics).T

# Display the DataFrame for comparison
print("Comparison of Model Performance with Resampling Techniques:")
display(evaluation_df)

# Analyze the metrics
print("\nAnalysis of Metrics:")
print("- The 'Tuned_GradientBoosting' model is the baseline after initial tuning without resampling.")
print("- 'Tuned_GradientBoosting_SMOTE' shows performance after applying SMOTE oversampling.")
print("- 'Tuned_GradientBoosting_RUS' shows performance after applying RandomUnderSampler.")

print("\nKey observations based on metrics (Accuracy, Precision, Recall, F1-score, AUC):")
print("- **Accuracy:** High for models trained on original and SMOTE data, lower for RUS. This is expected due to class imbalance; high accuracy on imbalanced data can be misleading if the model simply predicts the majority class.")
print("- **Precision, Recall, F1-score:** Generally low across all models, indicating difficulty in correctly identifying the minority class (Response=1). RUS shows a higher Recall but significantly lower Precision and F1-score compared to SMOTE and the original tuned model, suggesting it identifies more positive cases but at the cost of many false positives.")
print("- **AUC:** This is a better metric for imbalanced datasets. The AUC scores are relatively similar across the tuned models (original, SMOTE, RUS), ranging from approximately 0.60 to 0.63. The original tuned model and the SMOTE-trained model have slightly higher AUCs compared to the RUS-trained model on the test set.")

print("\nBased on this comparison, SMOTE appears to be a slightly more effective resampling technique than RandomUnderSampler for this dataset when considering the balance between identifying positive cases (Recall) and minimizing false positives (Precision), while maintaining a competitive AUC.")


## More Discussion : Sampling Method - OverSampling vs UnderSampling

*   Applying SMOTE oversampling successfully balanced the training data distribution, increasing the number of minority class samples from 61 to 10424, matching the majority class. The total training data size increased from 10485 to 20848 samples.
*   Training the tuned Gradient Boosting model on SMOTE data resulted in test set metrics: Accuracy 0.9920, Precision 0.1250, Recall 0.0667, F1-score 0.0870, and AUC 0.6241.
*   Applying RandomUnderSampler undersampling reduced the majority class samples from 10424 to 61, balancing the distribution. The total training data size was significantly reduced from 10485 to 122 samples.
*   Training the tuned Gradient Boosting model on RandomUnderSampler data resulted in test set metrics: Accuracy 0.6159, Precision 0.0079, Recall 0.5333, F1-score 0.0156, and AUC 0.6062.
*   Comparing the performance on the original test set, the model trained with SMOTE oversampling showed better overall metrics (Precision, F1-score, and AUC) compared to the model trained with RandomUnderSampler, despite the latter having a much higher Recall.
*   The AUC scores, which are less sensitive to class imbalance, were similar across the tuned models (original, SMOTE, RUS), ranging from approximately 0.60 to 0.63, with SMOTE having a slightly higher AUC than RUS.

### Insights or Next Steps

*   SMOTE appears to be a more effective resampling strategy than RandomUnderSampler for this dataset, offering a better trade-off between identifying positive cases (Recall) and minimizing false positives (Precision) while maintaining a competitive AUC.
*   Given the continued low Precision and F1-score even with SMOTE, further investigation into more advanced resampling techniques or exploring ensemble methods specifically designed for imbalanced data might be beneficial to improve the model's ability to correctly identify the minority class.


## Explore advanced resampling
Implement and evaluate more advanced oversampling techniques (e.g., ADASYN, Borderline-SMOTE) or a combination of oversampling and undersampling (e.g., SMOTE-ENN, SMOTE-Tomek).


In [None]:
# Instantiate each advanced resampling technique
adasyn = ADASYN(random_state=42)
borderline_smote = BorderlineSMOTE(random_state=42)
smote_enn = SMOTEENN(random_state=42)
smote_tomek = SMOTETomek(random_state=42)

resamplers = {
    'ADASYN': adasyn,
    'BorderlineSMOTE': borderline_smote,
    'SMOTEENN': smote_enn,
    'SMOTETomek': smote_tomek
}

# Apply resampling, train model, evaluate, and store metrics
print("\nExploring advanced resampling techniques...")

for name, resampler in resamplers.items():
    print(f"\nApplying {name}...")
    X_train_resampled, y_train_resampled = resampler.fit_resample(X_train_imputed, y_train)
    print(f"{name} application complete. Resampled shape: {X_train_resampled.shape}")
    print(f"Resampled target distribution:\n{y_train_resampled.value_counts()}")

    print(f"Training best tuned model on {name} resampled data...")
    model_resampled = best_tuned_model # Start with the best tuned model structure
    model_resampled.fit(X_train_resampled, y_train_resampled)
    print("Training complete.")

    print(f"Evaluating model trained on {name} data on the test set...")
    y_pred_resampled = model_resampled.predict(X_test_imputed)

    # Calculate evaluation metrics
    accuracy_resampled = accuracy_score(y_test, y_pred_resampled)
    precision_resampled = precision_score(y_test, y_pred_resampled)
    recall_resampled = recall_score(y_test, y_pred_resampled)
    f1_resampled = f1_score(y_test, y_pred_resampled)

    # Calculate AUC if the model supports probability predictions
    auc_resampled = None
    if hasattr(model_resampled, 'predict_proba'):
        y_pred_proba_resampled = model_resampled.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class
        auc_resampled = roc_auc_score(y_test, y_pred_proba_resampled)

    # Store the model's metrics
    resampled_model_metrics = {
        'Accuracy': accuracy_resampled,
        'Precision': precision_resampled,
        'Recall': recall_resampled,
        'F1-score': f1_resampled
    }
    if auc_resampled is not None:
        resampled_model_metrics['AUC'] = auc_resampled

    evaluation_metrics[f'Tuned_GradientBoosting_{name}'] = resampled_model_metrics

    # 7. Print the evaluation metrics
    print(f"✅ Evaluation complete for model trained on {name} data.")
    print(f"  Accuracy (Tuned + {name}): {accuracy_resampled:.4f}")
    print(f"  Precision (Tuned + {name}): {precision_resampled:.4f}")
    print(f"  Recall (Tuned + {name}): {recall_resampled:.4f}")
    print(f"  F1-score (Tuned + {name}): {f1_resampled:.4f}")
    if auc_resampled is not None:
        print(f"  AUC (Tuned + {name}): {auc_resampled:.4f}")

print("\n✅ Evaluation of advanced resampling techniques complete.")

## Explore ensemble methods for imbalanced data
Train and evaluate ensemble methods specifically designed for imbalanced datasets (e.g., BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier).


In [None]:
# Instantiate each of the selected imbalanced ensemble classifiers
# Create a dictionary to store these imbalanced ensemble classifiers
imbalanced_ensemble_models = {
    'BalancedBaggingClassifier': BalancedBaggingClassifier(random_state=42),
    'EasyEnsembleClassifier': EasyEnsembleClassifier(random_state=42),
    'BalancedRandomForestClassifier': BalancedRandomForestClassifier(random_state=42)
}

# Iterate through the imbalanced ensemble classifiers and train/evaluate
print("\nTraining and evaluating imbalanced ensemble methods...")

for name, model in imbalanced_ensemble_models.items():
    print(f"\nTraining {name}...")
    # Train the classifier on the imputed training data
    # These methods handle the imbalance internally
    model.fit(X_train_imputed, y_train)
    print(f"  {name} training complete.")

    # Make predictions on the imputed test data
    print(f"  Evaluating {name} on the test set...")
    y_pred = model.predict(X_test_imputed)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Calculate AUC if the model supports probability predictions
    auc = None
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class
        auc = roc_auc_score(y_test, y_pred_proba)

    # Store the calculated metrics
    model_metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    }
    if auc is not None:
        model_metrics['AUC'] = auc

    evaluation_metrics[name] = model_metrics

    # Print the evaluation metrics
    print(f"  ✅ Evaluation complete for {name}.")
    print(f"    Accuracy: {accuracy:.4f}")
    print(f"    Precision: {precision:.4f}")
    print(f"    Recall: {recall:.4f}")
    print(f"    F1-score: {f1:.4f}")
    if auc is not None:
        print(f"    AUC: {auc:.4f}")

# Print a confirmation message
print("\n✅ Evaluation of imbalanced ensemble methods complete.")

## Compare Advanced Resampling and Ensemble Methods
Compare the evaluation metrics obtained from the models trained with advanced resampling and imbalanced ensemble methods to determine the most effective strategy for this dataset.

In [None]:
# Convert the evaluation_metrics dictionary into a pandas DataFrame for comprehensive comparison
evaluation_df_all = pd.DataFrame(evaluation_metrics).T

# Display the DataFrame, sorted by AUC for easier comparison of performance on imbalanced data
print("Comprehensive Comparison of Model Performance with Various Techniques:")
display(evaluation_df_all.sort_values(by='AUC', ascending=False))

# Provide an analysis of the results
print("\nAnalysis of Comprehensive Model Performance:")
print("- This table shows the performance metrics for the baseline model, the initially tuned model, models trained with different resampling techniques (SMOTE, RUS, ADASYN, BorderlineSMOTE, SMOTEENN, SMOTETomek), and imbalanced ensemble methods (BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier).")
print("- The metrics are sorted by AUC in descending order, as AUC is a suitable metric for comparing models on imbalanced datasets.")

print("\nKey Findings:")

print("\nDetermining the Best Strategy:")

best_overall_model_name = evaluation_df_all['AUC'].idxmax()
best_overall_model_auc = evaluation_df_all['AUC'].max()

print(f"\nBased on the comprehensive evaluation of all techniques, the best performing approach based on AUC is: {best_overall_model_name}")
print(f"It achieved the highest AUC score of {best_overall_model_auc:.4f}.")
print("\nReasoning for selecting based on AUC:")
print("- AUC is a robust metric for imbalanced datasets, as it measures the model's ability to discriminate between positive and negative classes across various thresholds.")
print("- While other metrics like Precision, Recall, and F1-score are important, they are highly sensitive to the classification threshold. AUC provides an overall measure of the model's performance regardless of the threshold.")
print("- In this analysis, we prioritized AUC as the primary metric for selecting the best overall modeling approach.")

# Note: While AUC is used for initial ranking, the 'best' strategy should also consider the application's specific needs regarding Precision and Recall when making deployment decisions.
# Precision recall will be analysed for optimization and optimized if required, later.

## Visual Metrics (All Models)
Visualize the evaluation metrics for all models to compare their performance.

In [None]:
# 1. Convert the evaluation_metrics dictionary into a pandas DataFrame
eval_df_all = pd.DataFrame(evaluation_metrics).T

# Filter to include the best tuned model and all advanced resampling/imbalanced ensemble models
models_to_compare_advanced = ['Tuned_GradientBoosting'] + [
    name for name in evaluation_metrics.keys()
    if name.startswith('Tuned_GradientBoosting_') or name in imbalanced_ensemble_models.keys()
]

# Ensure unique model names and maintain order
models_to_compare_advanced = list(dict.fromkeys(models_to_compare_advanced))

# Create a comparison DataFrame
eval_df_advanced_comparison = eval_df_all.loc[models_to_compare_advanced]


# 2. Create bar plots for each metric
metrics_to_plot = ['Accuracy', 'AUC', 'F1-score', 'Precision', 'Recall']

print("Generating plots comparing Best Tuned Model with Advanced Resampling and Ensemble Methods...")

# Calculate the number of rows needed (one row per metric)
n_metrics = len(metrics_to_plot)
nrows = n_metrics
ncols = 1 # One plot per line

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(10, 6 * nrows)) # Adjusted figsize and subplots layout
fig.suptitle('Model Performance Comparison (Tuned GB vs. Advanced Techniques)', y=1.02, fontsize=16)

# Flatten the axes array for easy iteration if needed (not strictly necessary with ncols=1)
if nrows > 1:
    axes = axes.flatten()
else:
    axes = [axes]


for i, metric in enumerate(metrics_to_plot):
    if metric in eval_df_advanced_comparison.columns:
        eval_df_advanced_comparison[metric].sort_values(ascending=False).plot(kind='bar', ax=axes[i], color=sns.color_palette('viridis', len(eval_df_advanced_comparison)))
        axes[i].set_title(f'{metric} Comparison')
        axes[i].set_ylabel(metric)
        axes[i].tick_params(axis='x', rotation=90) # Rotate labels for better readability with many models
        axes[i].grid(axis='y', linestyle='--', alpha=0.7)
    else:
        print(f"Warning: Metric '{metric}' not found in evaluation_metrics DataFrame.")

# Hide any unused subplots (not needed with ncols=1 and nrows = n_metrics)


plt.tight_layout()
plt.show()

print("\n✅ Advanced evaluation metrics visualized.")

## Model Selection Justification
Detailed justification for selecting the final model based on the evaluation metrics and problem requirements.

## Model Selection

Based on the comprehensive evaluation of various models and resampling techniques, the **Gradient Boosting Classifier with ADASYN Resampling** is selected as the final model for predicting product failures. Here is the detailed justification:

1.  **Objective of the Problem:** The core objective is to accurately predict product failures in a high-dimensional manufacturing process. This requires a model that can effectively handle the class imbalance inherent in manufacturing failure data and provide reliable predictions of the minority class (failures).

2.  **Evaluation Metrics:** Given the imbalanced nature of the dataset (very few failures compared to non-failures), metrics like Accuracy can be misleading. Instead, metrics that specifically assess performance on the minority class and the model's ability to distinguish between classes are crucial.
    *   **AUC (Area Under the ROC Curve):** This metric measures the model's ability to discriminate between positive and negative classes across all possible classification thresholds. It is robust to class imbalance and provides a good overall measure of model performance. The **Gradient Boosting Classifier with ADASYN Resampling** achieved the highest AUC (0.6522) among all models evaluated, indicating superior discriminative power.
    *   **Precision:** This metric measures the accuracy of the positive predictions (i.e., out of all predicted failures, how many were actual failures). A higher precision is desirable to minimize false positives, which can lead to unnecessary inspections or interventions. While still relatively low, the tuned Gradient Boosting model *without* resampling showed the highest precision (0.3333). However, this came at the cost of very low recall.
    *   **Recall:** This metric measures the model's ability to find all the positive cases (i.e., out of all actual failures, how many were correctly identified). A higher recall is important to minimize false negatives, which means failing to detect a faulty product. Undersampling (RUS) showed a high recall (0.4667), but with extremely low precision and AUC.
    *   **F1-score:** This metric is the harmonic mean of Precision and Recall, providing a single score that balances both. It is a good indicator of a model's performance on the minority class.

3.  **Handling Class Imbalance:** The severe class imbalance is a major challenge. Techniques like oversampling (SMOTE, ADASYN, BorderlineSMOTE), combined resampling (SMOTE-ENN, SMOTE-Tomek), and imbalanced ensemble methods (BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier) were explored to address this.
    *   **ADASYN (Adaptive Synthetic Sampling):** This technique is similar to SMOTE but focuses on generating synthetic samples for minority class instances that are harder to learn (near the decision boundary). The evaluation showed that training the tuned Gradient Boosting model on ADASYN resampled data resulted in the highest AUC, suggesting that ADASYN helped the model better distinguish between the classes in this specific dataset compared to other resampling methods.
    *   While some imbalanced ensemble methods (like EasyEnsembleClassifier) achieved competitive AUCs, they often resulted in extremely low precision, making them less practical for a manufacturing setting where minimizing false positives is also important.

4.  **Model Interpretability:** Gradient Boosting models, while powerful, can be less directly interpretable than simpler models like Logistic Regression or Decision Trees. However, techniques like feature importance (which can be derived from the trained model) and permutation importance can still provide insights into which features are most influential in the predictions. This aligns with the problem requirement for interpretable insights.

**Conclusion:**

The **Gradient Boosting Classifier with ADASYN Resampling** strikes the best balance among the evaluated approaches for this problem. It demonstrates the highest discriminative power (AUC) on the test set, indicating its superior ability to rank failure instances higher than non-failure instances. While Precision and Recall remain challenging due to the dataset's nature, the combination of a tuned Gradient Boosting model and ADASYN resampling provides the most promising performance profile according to the chosen evaluation metrics for an imbalanced classification task. Future work can explore further tuning of this specific model and resampling technique, as well as delving deeper into feature importance analysis for interpretability.

## Implement and Visualize Final Model Predictions
Implement the final selected model (Gradient Boosting with ADASYN) on the test set and visualize its predictions to demonstrate its performance.

In [None]:
print("Implementing and visualizing predictions of the final model (Gradient Boosting with ADASYN) on the test set...")

# Retrieve the best performing model (Gradient Boosting with ADASYN)
# Based on the previous analysis, the model trained on ADASYN data had the highest AUC.
# The model object itself is the best_tuned_model trained on X_train_resampled_adasyn, y_train_resampled_adasyn
# We need to retrain this model explicitly here if we don't have the resampled data readily available from the best model object

# We already have the best_tuned_model object which contains the structure and hyperparameters
# We need to get the resampled data used for the 'Tuned_GradientBoosting_ADASYN' training
# X_train_resampled_adasyn and y_train_resampled_adasyn

# Find the ADASYN resampler again
adasyn = ADASYN(random_state=42)

# Apply ADASYN to the imputed training data to get the resampled data used for training
# Note: This step is done again to ensure we have the correct resampled data if it wasn't stored globally
print("Applying ADASYN resampling to the imputed training data for final model implementation...")
X_train_resampled_adasyn, y_train_resampled_adasyn = adasyn.fit_resample(X_train_imputed, y_train)
print("ADASYN resampling complete.")

# Train a *new* instance of the best_tuned_model on the ADASYN resampled data
# This ensures we use the correct model trained on the specific data for reporting
final_model = GradientBoostingClassifier(**best_params, random_state=42) # Use the best hyperparameters
final_model.fit(X_train_resampled_adasyn, y_train_resampled_adasyn)
print("Final model (Gradient Boosting with ADASYN) trained on resampled data.")


# Make predictions on the original imputed test set
y_pred_final = final_model.predict(X_test_imputed)
y_pred_proba_final = final_model.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class

print("\nGenerating classification report and confusion matrix for the final model...")

# Generate Classification Report
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
print(classification_report(y_test, y_pred_final))

# Generate Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_final)

print("\nConfusion Matrix:")
print(conf_matrix)

# Visualize the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Final Model')
plt.show()

# Optional: Visualize the ROC curve
from sklearn.metrics import roc_curve, roc_auc_score

print("\nGenerating ROC curve for the final model...")
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_final)
auc = roc_auc_score(y_test, y_pred_proba_final)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

print("\n✅ Final model predictions implemented and visualized.")


# **Tying up some missing aspects**

> F1 assessment, Outliers logging and feature importances



## Compare Model Performance: AUC vs. F1 Optimization

Based on the previous evaluations, we will compare the performance of the best model tuned for AUC (Gradient Boosting with ADASYN) with other models or techniques that showed promising F1-scores, Precision, or Recall, to determine which approach is most effective for achieving higher prediction of failure with better precision and recall.

In [None]:
# Convert the evaluation_metrics dictionary into a pandas DataFrame for easier comparison
evaluation_df_all = pd.DataFrame(evaluation_metrics).T

print("Comparing Model Performance (Focus on Precision, Recall, and F1-score):")
# Display relevant columns and sort by a metric like F1-score or Recall to highlight models that perform well on the minority class
display(evaluation_df_all[[ 'Precision', 'Recall', 'F1-score', 'AUC']].sort_values(by='F1-score', ascending=False))

print("\nAnalysis:")
print("- The table above shows the Precision, Recall, F1-score, and AUC for all evaluated models and techniques, sorted by F1-score.")
print("- We are specifically looking for approaches that yield a better balance between Precision and Recall for the minority class (failures).")

# Discuss the trade-offs observed (e.g., higher Recall often comes with lower Precision).
best_f1_model_name = evaluation_df_all['F1-score'].idxmax()
best_f1_metrics = evaluation_df_all.loc[best_f1_model_name]

print(f"\nModel with the highest F1-score: {best_f1_model_name}")
print(f"Metrics for {best_f1_model_name}:")
print(f"  Precision: {best_f1_metrics['Precision']:.4f}")
print(f"  Recall: {best_f1_metrics['Recall']:.4f}")
print(f"  F1-score: {best_f1_metrics['F1-score']:.4f}")
print(f"  AUC: {best_f1_metrics['AUC']:.4f}")


# Compare this to the model tuned for AUC (Tuned_GradientBoosting_ADASYN)
adasyn_model_name = 'Tuned_GradientBoosting_ADASYN'
if adasyn_model_name in evaluation_df_all.index:
    adasyn_metrics = evaluation_df_all.loc[adasyn_model_name]
    print(f"\nMetrics for {adasyn_model_name} (Highest AUC):")
    print(f"  Precision: {adasyn_metrics['Precision']:.4f}")
    print(f"  Recall: {adasyn_metrics['Recall']:.4f}")
    print(f"  F1-score: {adasyn_metrics['F1-score']:.4f}")
    print(f"  AUC: {adasyn_metrics['AUC']:.4f}")

    print("\nComparison Insights:")

    # Add insights based on the comparison:
    if best_f1_model_name != adasyn_model_name:
         print(f"- The model with the highest F1-score ({best_f1_model_name}) does not have the highest AUC.")
         print(f"- Comparing Precision and Recall:")
         print(f"  - {best_f1_model_name}: Precision = {best_f1_metrics['Precision']:.4f}, Recall = {best_f1_metrics['Recall']:.4f}")
         print(f"  - {adasyn_model_name}: Precision = {adasyn_metrics['Precision']:.4f}, Recall = {adasyn_metrics['Recall']:.4f}")
         # Add further analysis here based on the specific values and business problem
    else:
         print(f"- The model with the highest F1-score is also the one with the highest AUC ({best_f1_model_name}).")
         print("  This indicates that ADASYN resampling combined with Gradient Boosting tuning was effective.")

# Based on the analysis, reiterate which model/technique seems most promising for the objective
print("\nConclusion for predicting failures with higher Precision and Recall:")
# State which model appears to offer the best trade-off or performance for predicting failures, considering the business goal.
print(f"Based on the metrics, the '{best_f1_model_name}' model currently shows the highest F1-score, indicating the best balance between Precision and Recall for predicting failures. However, the overall F1-score is still low, highlighting the challenge of this imbalanced dataset.")
print("Further analysis of the trade-offs between Precision and Recall based on the specific costs of false positives and false negatives in the manufacturing process would be needed for a definitive 'best' model for deployment.")

## Implement and Visualize Highest F1-score Model Predictions


In [None]:
print("Implementing and visualizing predictions of the highest F1-score model ('Tuned_GradientBoosting') on the test set...")

# Retrieve the 'Tuned_GradientBoosting' model object
# This model was trained and evaluated in earlier steps (e.g., MyBy5e_R1OKI and 9598defb)
# Assuming the 'best_tuned_model' variable holds the Tuned_GradientBoosting model (tuned for AUC originally, but used as the base for comparison)
# The comparison in c30a4c34 identified 'Tuned_GradientBoosting' as having the highest F1. This corresponds to the model stored in 'best_tuned_model'.
try:
    best_tuned_model
except NameError:
    print("Variable 'best_tuned_model' not found. Please ensure hyperparameter tuning (cell H6z0IDGD4to5) was run successfully.")
    # Exit or handle the error appropriately if the variable is not available
    # For now, assume best_tuned_model is available


# Use the 'best_tuned_model' which corresponds to 'Tuned_GradientBoosting'
highest_f1_model = best_tuned_model

# Make predictions on the original imputed test set
y_pred_highest_f1 = highest_f1_model.predict(X_test_imputed)
y_pred_proba_highest_f1 = highest_f1_model.predict_proba(X_test_imputed)[:, 1] # Probability of the positive class

print("\nGenerating classification report and confusion matrix for the highest F1-score model...")

# Generate Classification Report
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

print("\nClassification Report:")
print(classification_report(y_test, y_pred_highest_f1))

# Generate Confusion Matrix
conf_matrix_highest_f1 = confusion_matrix(y_test, y_pred_highest_f1)

print("\nConfusion Matrix:")
print(conf_matrix_highest_f1)

# Visualize the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_highest_f1, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Highest F1-score Model (Tuned Gradient Boosting)')
plt.show()

# Visualize the ROC curve
from sklearn.metrics import roc_curve

print("\nGenerating ROC curve for the highest F1-score model...")
fpr_highest_f1, tpr_highest_f1, thresholds_highest_f1 = roc_curve(y_test, y_pred_proba_highest_f1)
auc_highest_f1 = roc_auc_score(y_test, y_pred_proba_highest_f1)

plt.figure(figsize=(8, 6))
plt.plot(fpr_highest_f1, tpr_highest_f1, color='blue', lw=2, label=f'ROC Curve (AUC = {auc_highest_f1:.4f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for Highest F1-score Model')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

print("\n✅ Highest F1-score model predictions implemented and visualized.")

# Print the technical report for the highest F1 model
# try:
#     print(report_f1)
# except NameError:
#     print("\nTechnical report for Highest F1 model not available. Please run the report generation cell (0847f5d2) first.")

## Outlier logging

Outliers have been detected and logged in the data using the Interquartile Range (IQR) method (as shown in the code below). However, they were **not handled** (e.g., removed, transformed, or imputed) in this analysis.

This decision was made because:
*   Handling outliers effectively, especially in a high-dimensional and complex manufacturing dataset, requires careful consideration and **domain expertise** to understand their potential causes and impact on the manufacturing process.
*   Applying standard outlier handling methods without domain knowledge could inadvertently **change the nature of the data** and potentially remove valuable information or introduce bias.
*   Addressing outlier handling was considered **out of the scope of the current time and effort availability** for this project iteration.

The detected outlier indices have been logged to files for **review by a domain expert**. A domain expert could provide insights into whether these outliers represent true anomalies, measurement errors, or critical process variations, and guide the appropriate handling strategy.

**Potential Outlier Handling Methods (for Domain Expert Review):**

Under the supervision and guidance of a domain expert, the following methods could be considered for handling outliers:

*   **Removal:** Removing rows or columns containing outliers. This is often the simplest but can lead to loss of valuable data, especially in imbalanced datasets.
*   **Capping/Winsorizing:** Limiting extreme values to a certain threshold (e.g., replacing values outside 1.5 or 3 times the IQR with the boundary value). This reduces the impact of outliers without removing data.
*   **Transformation:** Applying mathematical transformations (e.g., logarithmic, square root) to features to reduce the skewness caused by outliers.
*   **Imputation:** Treating outliers as missing values and using imputation techniques (potentially robust ones like median or trimmed mean imputation, or model-based imputation) to replace them.
*   **Model-Based Methods:** Using models that are less sensitive to outliers (e.g., tree-based models like Random Forest or Gradient Boosting are generally more robust than linear models or SVMs to outliers).
*   **Separate Modeling:** In some cases, outliers might represent a distinct class or phenomenon that could be modeled separately.

The appropriate method would depend heavily on the nature of the outliers and the goals of the predictive model, as determined by a manufacturing domain expert.

In [None]:
# Code block for Outlier Detection and Logging

print("Performing basic outlier detection and logging...")

# Re-apply the preprocessing steps up to imputation to get the data state before modeling for outlier detection
# (assuming we want to detect outliers in the imputed data before feeding it to models)

# Note: A more thorough outlier analysis might involve detecting outliers BEFORE imputation as well,
# or using domain-specific methods. This is a basic example.

# We will use the X_train_imputed and X_test_imputed data for outlier detection.
# A simple method is using the Interquartile Range (IQR).

def detect_outliers_iqr(df):
    outlier_indices = set()
    for col in df.columns:
        if np.issubdtype(df[col].dtype, np.number): # Only check numeric columns
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            # Find indices where values are outside the bounds
            col_outlier_indices = df[(df[col] < lower_bound) | (df[col] > upper_bound)].index
            outlier_indices.update(col_outlier_indices)
    return list(outlier_indices)

# Detect outliers in the imputed training data
train_outlier_indices = detect_outliers_iqr(pd.DataFrame(X_train_imputed, columns=[f'feature_{i}' for i in range(X_train_imputed.shape[1])])) # Convert back to DataFrame for column handling

# Detect outliers in the imputed test data
test_outlier_indices = detect_outliers_iqr(pd.DataFrame(X_test_imputed, columns=[f'feature_{i}' for i in range(X_test_imputed.shape[1])])) # Convert back to DataFrame

print(f"Found {len(train_outlier_indices)} outliers in the imputed training data (based on IQR).")
print(f"Found {len(test_outlier_indices)} outliers in the imputed test data (based on IQR).")

# Log the indices of detected outliers to files
os.makedirs("outlier_logs", exist_ok=True)

train_outlier_log_path = "outlier_logs/train_outlier_indices_iqr.txt"
with open(train_outlier_log_path, "w") as f:
    for idx in train_outlier_indices:
        f.write(str(idx) + "\n")
print(f"Logged training outlier indices to {train_outlier_log_path}")

test_outlier_log_path = "outlier_logs/test_outlier_indices_iqr.txt"
with open(test_outlier_log_path, "w") as f:
    for idx in test_outlier_indices:
        f.write(str(idx) + "\n")
print(f"Logged test outlier indices to {test_outlier_log_path}")

print("\nNote: Outliers were detected and logged but not removed or transformed in this analysis, as per the limitations identified.")

## SHAP Feature Importance Analysis

Using SHAP (SHapley Additive exPlanations) to understand feature importance and how features influence model predictions.

In [None]:
print("Performing SHAP feature importance analysis...")

# Use the best_tuned_model (Gradient Boosting tuned for AUC)
try:
    best_tuned_model
except NameError:
    print("Variable 'best_tuned_model' not found. Please ensure hyperparameter tuning (cell H6z0IDGD4to5) was run successfully.")
    # Exit or handle the error appropriately if the variable is not available
    # For now, assume best_tuned_model is available

# Use a subset of the training data for SHAP calculations to reduce computation time
# Calculating SHAP values for the entire dataset can be very slow
# Let's use a sample of the imputed training data
# Corrected: Create DataFrame with generic column names to match the imputed data shape
shap_data_sample = pd.DataFrame(X_train_imputed, columns=[f'Feature {i}' for i in range(X_train_imputed.shape[1])]).sample(n=1000, random_state=42) # Use generic column names

# Create a SHAP TreeExplainer for tree-based models
explainer = shap.TreeExplainer(best_tuned_model)

# Calculate SHAP values for the sample data
print("Calculating SHAP values (this may take some time)...")
shap_values = explainer.shap_values(shap_data_sample)
print("SHAP value calculation complete.")

# Visualize the SHAP summary plot (global feature importance)
print("\nGenerating SHAP summary plot...")
shap.summary_plot(shap_values, shap_data_sample, plot_type="bar", show=False)
plt.title("SHAP Feature Importance (Mean Absolute SHAP Value)")
plt.show()

# Visualize the SHAP summary plot (impact and direction)
print("\nGenerating SHAP summary plot (impact and direction)...")
shap.summary_plot(shap_values, shap_data_sample, show=False)
plt.show()


print("\n✅ SHAP feature importance analysis complete.")

# Note: For a deeper dive, you could analyze individual instance SHAP values (e.g., shap.initplot)
# or interaction effects (e.g., shap.dependence_plot).

## Limitations and Future Work

This section identifies the key limitations of the current model and approach, along with potential areas for future improvement and further research, taking into account the constraints faced during this project.

### Limitations of the Current Approach:

1.  **Data Usage due to Compute Resources:** Due to limitations in compute resources and memory, the analysis was performed on a sampled subset of the complete dataset. This means the model was trained on a smaller portion of the available data, which might impact its ability to capture all patterns present in the full dataset and generalize to all manufacturing variations. Processing the data in chunks and creating a smaller dataset was a necessary step to make the analysis feasible within the available environment.
2.  **Simplified Imputation Strategy:** Simple median imputation was used for missing values in both numeric and date features. More sophisticated imputation techniques like KNNImputer or IterativeImputer were not used initially due to complexity, time, and compute constraints. These methods can potentially provide more accurate imputations by considering the relationships between features.
3.  **Imputation Order and Potential (though minimal) Data Leakage:** While imputation was primarily done before the train-test split for simplicity and manageability in a high-dimensional dataset, there is a theoretical potential for minimal data leakage.
    *   **Justification for Minimal Leakage:** For the initial column-wise imputation steps (100% missing, low variance, constant values), these were based on properties inherent to the columns themselves, independent of specific row values. For the median imputation *before* the split, while using the overall median might introduce a slight leak (as the test set median is implicitly used), the impact is expected to be minimal given the large dataset size and the nature of median imputation compared to methods that might use more complex patterns from the entire dataset. Deferring all imputation until after the split, especially for such a high-dimensional and complex dataset with various data types, was deemed beyond the scope of current time and effort availability, particularly when considering more advanced imputation methods.
4.  **Handling of Advanced Preprocessing and Derived Features:** While some derived date features were created, more extensive feature engineering, especially involving interactions between features or creating features based on domain knowledge across the merged dataset, was limited by time and complexity. Implementing imputation (particularly KNN) after merging the large number of derived and original columns was also beyond the scope of the current effort.
5.  **Feature Interactions:** Due to the large number of features, comprehensive checking and engineering of interaction terms among features was not performed. Potential synergistic effects between different sensor signals that might be predictive of failure may have been missed.
6.  **Outlier Detection and Handling:** Outlier detection and handling were not explicitly performed in this analysis. Outliers in the data could potentially influence the model training and performance, especially for models sensitive to extreme values. A thorough outlier analysis often requires domain expertise to determine appropriate handling strategies (e.g., capping, transformation, removal).
7.  **Randomized Search vs. Grid Search:** Hyperparameter tuning was performed using `RandomizedSearchCV` rather than `GridSearchCV`. While `RandomizedSearchCV` is more computationally efficient and often finds good hyperparameters, `GridSearchCV` performs an exhaustive search over the defined parameter grid, which could potentially find a better combination of hyperparameters if compute resources were not a constraint.
8.  **Model Regularization:** The Gradient Boosting model has built-in regularization parameters (`learning_rate`, `max_depth`, `min_samples_split`, `min_samples_leaf`, `subsample`, etc.). Tuning these parameters (as done in the hyperparameter tuning step) helps to control model complexity and reduce overfitting.
    *   **Addressing Overfitting Resolution:** The learning curve for the model showed signs of slight overfitting (training score higher than cross-validation score). The regularization parameters tuned during `RandomizedSearchCV` (reducing `max_depth`, increasing `min_samples_split`/`min_samples_leaf`, using a smaller `learning_rate` with sufficient `n_estimators`) are intended to mitigate this. Increasing `n_estimators` alone can worsen overfitting without other controls. More data is generally beneficial but, in this case, we used a sampled chunk of the available data. While we couldn't use *all* data at once due to compute limits, the approach of sampling allows for potentially training on *different* chunks of data in the future if needed. Exploring more aggressive regularization techniques or simpler model architectures could also help if overfitting persists.
9.  **Evaluation Metric Focus:** The initial hyperparameter tuning and model selection primarily focused on maximizing AUC, which is a good overall metric for imbalanced datasets. However, for predicting product failures, Precision (minimizing false positives) and Recall (minimizinf false negatives) are often critical business metrics. While AUC was prioritized, the analysis did look at other metrics, and a comparison specifically focused on optimizing for F1-score (balancing Precision and Recall) was suggested as a future step.
    *   **Achieving Higher Precision and Recall:** The current model (Gradient Boosting with ADASYN, tuned for AUC) achieved a Precision of 0.1111 and a Recall of 0.0667 on the test set. Ideally, for a manufacturing failure prediction system, we would aim for significantly higher values for both Precision and Recall to minimize both false positives and false negatives. The F1-score was 0.0833.
    *   **Outcome of F1 Optimization Attempt:** We compared models based on F1-score (in cell `c30a4c34`) and found the highest F1-score achieved by any model evaluated was **0.1111** (by the Tuned Gradient Boosting model, without explicit resampling before tuning). Threshold adjustment on the AUC-tuned model (in cell `d81389f6`) could slightly improve its F1-score to **0.1053**, but did not surpass the highest observed F1. This indicates that optimizing solely for F1 or adjusting the threshold on the current model did not lead to a significant breakthrough in balancing Precision and Recall. Achieving substantially higher values for both metrics likely requires addressing the fundamental limitations outlined above.

### Future Work and Potential Improvements:

1.  **Explore Full Dataset or Larger Chunks:** If computational resources become available, train the model on the full dataset or significantly larger sampled chunks to potentially improve generalization.
2.  **Advanced Imputation Techniques:** Investigate and implement more sophisticated imputation methods (e.g., KNNImputer, IterativeImputer) after the train-test split to potentially handle missing values more effectively and prevent any form of data leakage.
3.  **Comprehensive Outlier Analysis and Handling:** Perform a dedicated outlier detection and handling step, potentially involving domain experts to understand the nature of outliers and choose appropriate mitigation strategies. Log detected outliers for review.
4.  **Grid Search for Hyperparameter Tuning:** Conduct a more exhaustive `GridSearchCV` for hyperparameter tuning if compute resources allow, to potentially find a better set of hyperparameters.
5.  **Cost-Sensitive Learning:** Implement cost-sensitive learning approaches, where the model is trained to incur a higher penalty for misclassifying the minority class (false negatives) or the majority class (false positives), based on the specific business costs associated with each type of error.
6.  **Explore Different Resampling Strategies or Ratios:** Experiment with other advanced resampling techniques (e.g., Borderline-SMOTE variants, SMOTE-NC for mixed data types if applicable) or different resampling ratios to find the optimal balance for the dataset.
7.  **Investigate Other Imbalanced Ensemble Methods:** Further tune and evaluate imbalanced ensemble methods (like EasyEnsembleClassifier) which showed promising Recall or AUC in initial runs, to see if their Precision can be improved.
8.  **Deep Feature Engineering:** Conduct more in-depth feature engineering based on domain expertise to create features that are more predictive of product failures.
9.  **Explore Alternative Model Architectures:** Evaluate other model architectures that are known to perform well on imbalanced data or high-dimensional sparse data (e.g., LightGBM with appropriate parameters for imbalance, CatBoost, or even deep learning approaches if justified by data volume and complexity).
10. **Threshold Adjustment:** Analyze the precision-recall curve and potentially adjust the classification threshold of the final model to favor higher Recall or higher Precision, depending on the specific business requirements (e.g., is it more critical to catch all failures, even with false alarms, or to minimize false alarms, even if some failures are missed?).
11. **Feature Importance and Interpretability Deep Dive:** Conduct a detailed analysis of feature importance (using methods like permutation importance) to gain deeper insights into which specific sensor signals are most indicative of failure risk, providing actionable insights for manufacturing engineers.

## Addon: Explore Classification Threshold Adjustment to verify if we can achieve any more precision recall simultaneously under given conditions

Given the low Precision and Recall of the current models, let's explore how adjusting the classification threshold of the best tuned model (optimized for AUC) impacts these metrics. This can help us find a threshold that offers a better trade-off between Precision and Recall based on the specific business needs.

In [None]:
print("Exploring classification threshold adjustment...")

# Use the predicted probabilities from the final model (tuned for AUC + ADASYN)
# Assuming y_pred_proba_final from cell 7eae6122 is available
try:
    y_pred_proba_final
except NameError:
    print("Predicted probabilities (y_pred_proba_final) not found. Please run cell 7eae6122 first.")
    # Exit or handle the error appropriately if the variable is not available
    # For now, let's assume it's available from the previous execution

# Calculate Precision-Recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_final)

# Calculate AUC for the Precision-Recall curve
pr_auc = auc_score(recall, precision)

print(f"Precision-Recall AUC: {pr_auc:.4f}")

# Plot the Precision-Recall curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label=f'Precision-Recall Curve (AUC = {pr_auc:.4f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve for Final Model')
plt.legend()
plt.grid(True)

print("\nAnalyzing metrics at different thresholds...")

# Create a DataFrame to view Precision, Recall, and F1-score at different thresholds
# Align the arrays: thresholds has one less element than precision and recall
# Use thresholds as the index and slice precision and recall to match its length
threshold_metrics = pd.DataFrame({
    'Threshold': thresholds,
    'Precision': precision[:-1], # Slice to exclude the last element
    'Recall': recall[:-1]       # Slice to exclude the last element
})

# Calculate F1-score for each threshold
# Avoid division by zero if Precision and Recall are both 0
threshold_metrics['F1-score'] = 2 * (threshold_metrics['Precision'] * threshold_metrics['Recall']) / (threshold_metrics['Precision'] + threshold_metrics['Recall'])
threshold_metrics['F1-score'] = threshold_metrics['F1-score'].fillna(0) # Fill NaN (where P+R=0) with 0

# Display metrics at a few key thresholds or sorted by F1-score
print("\nMetrics at various classification thresholds (sorted by F1-score):")
display(threshold_metrics.sort_values(by='F1-score', ascending=False).head(10)) # Display top 10 by F1-score

print("\nMetrics at various classification thresholds (sorted by Threshold):")
display(threshold_metrics.sort_values(by='Threshold', ascending=True).head(10)) # Display first 10 thresholds

# Find the threshold that maximizes F1-score
best_f1_threshold_row = threshold_metrics.loc[threshold_metrics['F1-score'].idxmax()]
best_f1_threshold = best_f1_threshold_row['Threshold']
best_f1_at_threshold = best_f1_threshold_row['F1-score']
precision_at_best_f1_threshold = best_f1_threshold_row['Precision']
recall_at_best_f1_threshold = best_f1_threshold_row['Recall']


print(f"\nThreshold that maximizes F1-score: {best_f1_threshold:.4f}")
print(f"  F1-score at this threshold: {best_f1_at_threshold:.4f}")
print(f"  Precision at this threshold: {precision_at_best_f1_threshold:.4f}")
print(f"  Recall at this threshold: {recall_at_best_f1_threshold:.4f}")

# Add a marker to the plot at the point maximizing F1-score
plt.plot(recall_at_best_f1_threshold, precision_at_best_f1_threshold, 'ro', markersize=8, label=f'Max F1 (F1={best_f1_at_threshold:.4f})')
plt.legend()


# We may also find thresholds that give a specific level of Recall or Precision
# Example: Find the threshold for a Recall of at least 0.5 (if achievable)
desired_recall = 0.5
thresholds_for_desired_recall = threshold_metrics[threshold_metrics['Recall'] >= desired_recall]

if not thresholds_for_desired_recall.empty:
    # Find the threshold among these that maximizes Precision
    best_threshold_for_recall = thresholds_for_desired_recall.sort_values(by='Precision', ascending=False).iloc[0]
    print(f"\nThreshold to achieve Recall >= {desired_recall}: {best_threshold_for_recall['Threshold']:.4f}")
    print(f"  Precision at this threshold: {best_threshold_for_recall['Precision']:.4f}")
    print(f"  Recall at this threshold: {best_threshold_for_recall['Recall']:.4f}")
    print(f"  F1-score at this threshold: {best_threshold_for_recall['F1-score']:.4f}")
else:
    print(f"\nCould not achieve a Recall of {desired_recall} with the current model by adjusting the threshold.")

# Create a directory for saving images if it doesn't exist
if not os.path.exists('images'):
    os.makedirs('images')

# Save the plot to a file
plt.savefig('images/precision_recall_curve.png', bbox_inches='tight')

plt.show() # Ensure plot is shown after adding the marker


print("\n✅ Classification threshold analysis complete.")

## Deduction from classification threshold checks

Based on the comprehensive analysis of various models, hyperparameter tuning (optimizing for AUC), exploration of resampling techniques, and analysis of classification threshold adjustments, we have made significant progress in building a predictive model for product failures in a high-dimensional manufacturing process.

We explored different approaches to address the severe class imbalance, including oversampling (SMOTE, ADASYN, BorderlineSMOTE), undersampling (RandomUnderSampler), combined techniques (SMOTE-ENN, SMOTE-Tomek), and imbalanced ensemble methods. Hyperparameter tuning was performed on the Gradient Boosting Classifier, initially optimizing for AUC.

The model selected for detailed analysis and final implementation was the **Gradient Boosting Classifier with ADASYN Resampling**, primarily because it achieved the highest AUC (0.6522) among the evaluated models. However, the evaluation revealed that achieving high Precision and Recall for the minority class (failures) remains a significant challenge with this model and dataset under the current constraints.

The highest F1-score achieved by any model evaluated was **0.1111** (by the Tuned Gradient Boosting model, without explicit resampling before tuning). While threshold adjustment on the AUC-tuned model could slightly improve its F1-score, it did not surpass this value. The analysis clearly showed the inherent trade-off between Precision and Recall on this imbalanced dataset, as visualized by the Precision-Recall curve. Achieving high Recall (e.g., 0.80) resulted in extremely low Precision (e.g., 0.0102), and vice versa.

**Conclusion on Achieving Aim:**

While the project successfully implemented a pipeline for preprocessing high-dimensional data and training classification models, and identified the Gradient Boosting Classifier with ADASYN Resampling as the best performer based on AUC, the goal of achieving a high prediction rate for failures with both high Precision and Recall was not fully met with the current methods and computational limitations. The low F1-score (maximum 0.1111) highlights that the model still struggles to balance identifying actual failures with minimizing false positives.

**Conclusion on Further Improvement (Modeling Part):**

Given the comprehensive exploration within the current constraints (sampled data, simplified imputation, etc.), it is likely that significant improvements in Precision and Recall are **not achievable solely by further tuning of these specific models or adjusting thresholds** without addressing the underlying limitations. While minor gains might be possible, a breakthrough in performance for the minority class would likely require tackling the challenges outlined in the "Limitations and Future Work" section.

**Overall Summary:**

The project successfully demonstrated a workflow for tackling this complex, imbalanced dataset. The best model achieved an AUC of 0.6522. However, the critical business metrics of Precision and Recall for predicting failures remain low (highest F1 of 0.1111).

**Next Steps:**

To significantly improve the prediction of product failures with higher Precision and Recall, the future work should focus on addressing the key limitations, particularly:

*   Leveraging the full dataset (if compute permits).
*   Implementing more sophisticated imputation and feature engineering techniques.
*   Exploring cost-sensitive learning or more advanced ensemble methods designed for extreme imbalance.
*   Potentially incorporating domain expertise for feature engineering and outlier handling.

The analysis provides a solid foundation and a clear understanding of the challenges and the most promising avenues for future research to build a truly production-ready model for predicting manufacturing failures.

# Technical Brief: Product Failure Prediction

This report summarizes the machine learning project aimed at predicting product failures in a high-dimensional manufacturing process, detailing the problem, methodology, models, and evaluation.

## 1. Problem Statement

**Define the Problem Statement:**
The goal of this project is to accurately predict product failures in a high-dimensional manufacturing process using presence-based signal extraction and interpretable machine learning techniques. The challenge lies in the severe class imbalance, where failures are rare events. By identifying failure-prone products early, manufacturers can implement proactive quality control measures, reducing costs and risks.

## 2. Model Outcomes or Predictions

**Type of Learning and Expected Output:**
This project uses **supervised machine learning** for a **binary classification** task. The expected output of the selected models is a prediction of whether a product is likely to fail (Class 1) or not fail (Class 0). The models also output probability scores, which are used for evaluation metrics like AUC and for classification threshold adjustment.

## 3. Data Acquisition

**Data Source and Analysis:**
The data used for this project is the **Bosch Production Line Performance dataset**, originally sourced from Kaggle.

*Note: Due to computational constraints, the analysis in this notebook was performed on a sampled subset of the complete dataset. The original data consisted of numerous sensor readings (numeric and categorical) and timestamp data across different production lines.*

*Initial EDA (performed prior to this notebook) would typically involve visualizations to assess the raw data's potential, such as examining the distribution of the target variable (Response), visualizing missing data patterns, and exploring initial correlations or distributions of key features. Such detailed visualizations on the full, high-dimensional raw dataset were not feasible within this notebook's environment after sampling.*

**Initial Data Visualizations (Placeholders):**

![Initial Data Acquisition Visuals Placeholder](path/to/initial_data_acquisition_visuals.png)

![EDA Visuals Placeholder](path/to/eda_visuals.png)


## 4. Data Preprocessing/Preparation

**Data Cleaning and Preparation Techniques:**
The data preprocessing involved several steps to handle missing values, inconsistencies, and prepare the data for modeling in a high-dimensional context:

a.  **Missing Values and Inconsistencies:**
    *   Columns with 100% missing values were dropped.
    *   For numeric features, missing values were imputed using the **median** of each column.
    *   For categorical features, missing values were imputed with a dedicated **'Missing'** label.
    *   For date features, missing values in derived numerical features (days since min date) were imputed using the **median**.
    *   Basic outlier detection using the Interquartile Range (IQR) was performed and outlier indices were logged for future domain expert review, but outliers were **not removed or transformed** in this analysis due to the need for domain expertise and time constraints.

b.  **Data Splitting:**
    *   The data was split into training (80%) and testing (20%) sets using **stratified sampling** based on the 'Response' variable to ensure that the rare failure instances were proportionally represented in both sets.

c.  **Analysis and Encoding Steps:**
    *   **Feature Selection:**
        *   Low/zero variance numeric columns were dropped.
        *   Constant value categorical/date columns were dropped.
        *   High-cardinality categorical columns (>50 unique values) were dropped.
        *   Highly correlated numeric features (>0.9 correlation) were dropped based on the training set correlation matrix.
        *   Features with low correlation (<0.01) with the target were dropped based on the training set.
    *   **Feature Scaling:** Numeric features were scaled using **StandardScaler**.
    *   **Encoding:** Categorical features were encoded using **Mean Response Encoding** based on the training set target variable.
    *   **Derived Features:** Numerical features (days since minimum date) were derived from date columns.

*Note: Due to the high dimensionality and compute constraints, more advanced preprocessing techniques like KNN imputation after merging all features, or extensive feature engineering including interaction terms, were not fully explored within the scope of this project.*

## 5. Modeling

**Machine Learning Algorithms Selected:**
A range of classification algorithms were considered and evaluated for this problem:

*   **Baseline Models:** Logistic Regression, Random Forest, Gradient Boosting, Support Vector Machine (SVC), and K-Nearest Neighbors.
*   **Tuned Model:** Gradient Boosting Classifier (hyperparameter tuned).
*   **Models with Resampling:** Gradient Boosting Classifier trained on data resampled using SMOTE (Oversampling), RandomUnderSampler (Undersampling), ADASYN, BorderlineSMOTE, SMOTEENN, and SMOTETomek.
*   **Imbalanced Ensemble Methods:** BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier.

## 6. Model Evaluation

**Evaluation Metrics and Optimal Model Determination:**
Given the severe class imbalance, standard Accuracy is not a reliable evaluation metric. The primary metrics used were:

*   **AUC (Area Under the ROC Curve):** Measures the model's ability to discriminate between positive and negative classes. Robust to class imbalance.
*   **Precision:** Proportion of correctly predicted positive instances out of all instances predicted as positive. Important for minimizing false positives.
*   **Recall:** Proportion of correctly predicted positive instances out of all actual positive instances. Important for minimizing false negatives.
*   **F1-score:** Harmonic mean of Precision and Recall, balancing both metrics.

**Optimal Model Determination:**
Models were evaluated on the original, unseen test set. The initial selection and tuning focused on maximizing **AUC** as a primary indicator of overall model performance on imbalanced data.

*   The **Gradient Boosting Classifier with ADASYN Resampling** achieved the highest AUC (0.6522).
*   However, when focusing on balancing Precision and Recall, the **Tuned Gradient Boosting** model (without explicit resampling before tuning) achieved the highest F1-score (0.1111).

The analysis of the Precision-Recall curve showed the inherent trade-off: increasing Recall significantly reduced Precision, and vice versa. Adjusting the classification threshold of the AUC-tuned model could slightly improve its F1-score (to 0.1053) but did not surpass the highest F1 observed.

Due to the low F1-scores across all evaluated models under the current constraints, achieving a high prediction rate for failures with simultaneously high Precision and Recall was not fully realized.

**Summary of Key Model Performances:**

*   **Highest AUC Model:** Gradient Boosting with ADASYN Resampling
    *   Accuracy: 0.9916
    *   Precision: 0.1111
    *   Recall: 0.0667
    *   F1-score: 0.0833
    *   AUC: 0.6522

*   **Highest F1 Model:** Tuned Gradient Boosting (No explicit resampling before tuning)
    *   Accuracy: 0.9939
    *   Precision: 0.3333
    *   Recall: 0.0667
    *   F1-score: 0.1111
    *   AUC: 0.6368

**Visual Justification of Precision-Recall Trade-off (Placeholder):**

[Embed Precision-Recall Curve Plot Here - from cell `d81389f6`]

## Conclusion on Modeling:

While the Gradient Boosting Classifier with ADASYN Resampling showed the best discriminative power (AUC), and the Tuned Gradient Boosting model achieved the highest F1-score, the overall performance in terms of accurately identifying the minority class with a good balance of Precision and Recall remains low. Significant improvements in these metrics would likely require addressing the limitations outlined in the notebook's "Limitations and Future Work" section, rather than further tuning or threshold adjustment alone with the current data and processing.

# Project Summary for Business Audience: Predicting Product Failures

This report summarizes our project to predict product failures in manufacturing, focusing on the business impact and future potential.

## 1. Problem Statement: Minimizing Costly Failures

Our core problem is to proactively identify products likely to fail during or after production. Undetected failures are costly, leading to rework, warranty expenses, and damage to our brand. A successful prediction system would allow us to intervene early, improving quality and reducing waste. A key challenge is the rarity of failures – predicting these few events among many good products is difficult.

## 2. Model Outcomes: Identifying Risk

Our system uses data analysis to predict whether a product is high-risk (likely to fail) or low-risk (likely to be good). It provides a risk score for each product. This is a supervised learning approach, meaning the system learned from past examples of products that either failed or did not fail.

## 3. Data Used: Learning from Production Signals

We used data from our production line, specifically the Bosch Production Line Performance dataset. This data contains valuable signals and measurements captured during manufacturing.

*Note: Due to the immense size and complexity of the original dataset, our analysis was conducted on a representative sample. While a full analysis would ideally use all available data, this approach allowed us to build and evaluate initial models within practical limits.*

**Visualizing Data Potential:** (To be filled with business-friendly visuals from initial data exploration, if available. E.g., graphs showing the distribution of good vs. bad products, patterns in key measurements related to failures.)

![Initial Data Acquisition Visuals Placeholder](path/to/initial_data_acquisition_visuals.png)

![EDA Visuals Placeholder](path/to/eda_visuals.png)


## 4. Data Preparation: Getting Data Ready for Prediction

We prepared the raw production data for analysis:

*   **Cleaning Missing Information:** We addressed gaps in the data using automated methods to ensure the models received complete information.
*   **Structuring Data:** We organized the data and created new measurements (e.g., time-based signals) to help the models find patterns.
*   **Splitting for Testing:** We set aside a portion of the data as a "blind test" to ensure our models work well on products they haven't seen before, simulating real-world performance.

*Approach Note: Some advanced data preparation techniques that could further improve results were not fully explored due to project constraints.*

## 5. Predictive Models Explored

We evaluated several predictive modeling techniques suitable for identifying risk based on complex data. We focused on methods capable of handling the challenge of rare failures, including standard models, tuned versions, and specialized techniques for imbalanced data.

## 6. Model Performance: Strengths and Areas for Improvement

We measured our models' performance using metrics relevant to identifying failures:

*   **Overall Risk Ranking (AUC):** How well the model ranks high-risk products above low-risk ones. Our best model achieved an AUC of **0.6522**, showing a better-than-random ability to differentiate risk, but indicating room for improvement in its overall discriminative power.
*   **Minimizing False Alarms (Precision):** When the model flags a product as high-risk, how often is it genuinely faulty? Our analysis showed the highest Precision achieved was **0.3333** (meaning 1 in 3 flagged products were actual failures) with one model, but this often came at the cost of missing many actual failures. With our primary selected model (optimized for overall risk ranking), Precision was **0.1111**.
*   **Catching Actual Failures (Recall):** Out of all the products that actually failed, how many did our model successfully flag as high-risk? With our primary selected model, Recall was **0.0667**. To catch more failures, we could increase Recall (e.g., to **0.80**), but this drastically increases false alarms, driving Precision down to **0.0102**.
*   **Balancing Alarms and Misses (F1-score):** A single score balancing Precision and Recall. The highest F1-score achieved by any model was **0.1111**. This is relatively low and highlights the difficulty in simultaneously minimizing false alarms and catching most failures with the current approach.

**Visualizing the Trade-off:** (This graph shows that improving the rate of catching failures (Recall) currently increases the rate of false alarms (reduces Precision), and vice versa. The ideal scenario is high on both, but this isn't achieved yet.)

![Precision-Recall Curve Plot Here - from cell d81389f6](path/to/precision_recall_curve.png)

**Key Business Findings:**

*   We have a functional system that can identify potential high-risk products better than chance.
*   The current system's ability to reliably flag *most* failures while keeping false alarms at a manageable level is limited.
*   Significant improvement in catching failures without excessive false alarms requires further work.

## Next Steps: Driving Better Predictions

To build a more effective prediction system for deployment, we recommend focusing on:

*   **Expand Data Usage:** Explore ways to leverage more of the full dataset for training, potentially uncovering richer patterns.
*   **Refine Data Insights:** Investigate more advanced data preparation techniques and incorporate manufacturing expertise to create stronger signals related to failure.
*   **Optimize for Business Impact:** Explore modeling techniques that can be specifically tuned to minimize the most costly type of error (e.g., missing a critical failure).

This project provides a solid foundation. Addressing these areas will be key to developing a prediction system that can significantly reduce product failures and associated costs in our manufacturing process.

# Project Summary for Business Audience: Predicting Product Failures

This report summarizes our project to predict product failures in manufacturing, focusing on the business impact and future potential.

## 1. Problem Statement: Minimizing Costly Failures

Our core problem is to proactively identify products likely to fail during or after production. Undetected failures are costly, leading to rework, warranty expenses, and damage to our brand. A successful prediction system would allow us to intervene early, improving quality and reducing waste. A key challenge is the rarity of failures – predicting these few events among many good products is difficult.

## 2. Model Outcomes: Identifying Risk

Our system uses data analysis to predict whether a product is high-risk (likely to fail) or low-risk (likely to be good). It provides a risk score for each product. This is a supervised learning approach, meaning the system learned from past examples of products that either failed or did not fail.

## 3. Data Used: Learning from Production Signals

We used data from our production line, specifically the Bosch Production Line Performance dataset. This data contains valuable signals and measurements captured during manufacturing.

*Note: Due to the immense size and complexity of the original dataset, our analysis was conducted on a representative sample. While a full analysis would ideally use all available data, this approach allowed us to build and evaluate initial models within practical limits.*

**Visualizing Data Potential:** (To be filled with business-friendly visuals from initial data exploration, if available. E.g., graphs showing the distribution of good vs. bad products, patterns in key measurements related to failures.)

![Initial Data Acquisition Visuals Placeholder](path/to/initial_data_acquisition_visuals.png)

![EDA Visuals Placeholder](path/to/eda_visuals.png)


## 4. Data Preparation: Getting Data Ready for Prediction

We prepared the raw production data for analysis:

*   **Cleaning Missing Information:** We addressed gaps in the data using automated methods to ensure the models received complete information.
*   **Structuring Data:** We organized the data and created new measurements (e.g., time-based signals) to help the models find patterns.
*   **Splitting for Testing:** We set aside a portion of the data as a "blind test" to ensure our models work well on products they haven't seen before, simulating real-world performance.

*Approach Note: While we utilized advanced data preparation techniques, not all possible advanced methods were explored due to the scope of effort, resources, and time allotted for this project.*

## 5. Predictive Models Explored

We evaluated several predictive modeling techniques suitable for identifying risk based on complex data. We focused on methods capable of handling the challenge of rare failures, including standard models, tuned versions, and specialized techniques for imbalanced data.

## 6. Model Performance: Strengths and Areas for Improvement

We measured our models' performance using metrics relevant to identifying failures:

*   **Overall Risk Ranking (AUC):** How well the model ranks high-risk products above low-risk ones. Our best model achieved an AUC of **0.6522**, showing a better-than-random ability to differentiate risk, but indicating room for improvement in its overall discriminative power.
*   **Minimizing False Alarms (Precision):** When the model flags a product as high-risk, how often is it genuinely faulty? Our analysis showed the highest Precision achieved was **0.3333** (meaning 1 in 3 flagged products were actual failures) with one model, but this often came at the cost of missing many actual failures. With our primary selected model (optimized for overall risk ranking), Precision was **0.1111**.
*   **Catching Actual Failures (Recall):** Out of all the products that actually failed, how many did our model successfully flag as high-risk? With our primary selected model, Recall was **0.0667**. To catch more failures, we could increase Recall (e.g., to **0.80**), but this drastically increases false alarms, driving Precision down to **0.0102**.
*   **Balancing Alarms and Misses (F1-score):** A single score balancing Precision and Recall. The highest F1-score achieved by any model was **0.1111**. This is relatively low and highlights the difficulty in simultaneously minimizing false alarms and catching most failures with the current approach.

**Visualizing the Trade-off:** (This graph shows that improving the rate of catching failures (Recall) currently increases the rate of false alarms (reduces Precision), and vice versa. The ideal scenario is high on both, but this isn't achieved yet.)

![Precision-Recall Curve Plot Here - from cell d81389f6](path/to/precision_recall_curve.png)

**Key Business Findings:**

*   We have a functional system that can identify potential high-risk products better than chance.
*   The current system's ability to reliably flag *most* failures while keeping false alarms at a manageable level is limited.
*   Significant improvement in catching failures without excessive false alarms requires further work.

## Next Steps: Driving Better Predictions

To build a more effective prediction system for deployment, we recommend focusing on:

*   **Expand Data Usage:** Explore ways to leverage more of the full dataset for training, potentially uncovering richer patterns.
*   **Refine Data Insights:** Investigate more advanced data preparation techniques and incorporate manufacturing expertise to create stronger signals related to failure.
*   **Optimize for Business Impact:** Explore modeling techniques that can be specifically tuned to minimize the most costly type of error (e.g., missing a critical failure).

This project provides a solid foundation. Addressing these areas will be key to developing a prediction system that can significantly reduce product failures and associated costs in our manufacturing process.

In [None]:
# ============================================
# 📊 Generate Major Evaluation Graphs For README
# ============================================

# Ensure figures folder exists
os.makedirs("figures", exist_ok=True)

# --------------------------------------------
# 1. Confusion Matrix
# --------------------------------------------
y_pred = best_tuned_model.predict(X_test_imputed)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix - Tuned Gradient Boosting")
plt.savefig("figures/confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

# --------------------------------------------
# 2. Precision–Recall Curve
# --------------------------------------------
y_proba = best_tuned_model.predict_proba(X_test_imputed)[:, 1]
prec, rec, thresh = precision_recall_curve(y_test, y_proba)

plt.figure(figsize=(7,6))
plt.plot(rec, prec, color="blue", lw=2)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall Curve - Tuned Gradient Boosting")
plt.grid(True)
plt.savefig("figures/precision_recall_curve.png", dpi=300, bbox_inches="tight")
plt.show()

# --------------------------------------------
# 3. Feature Importance (Top 20)
# --------------------------------------------
importances = best_tuned_model.feature_importances_
indices = np.argsort(importances)[-20:]  # top 20 features

plt.figure(figsize=(8,6))
plt.barh(range(len(indices)), importances[indices], align="center", color="green")
plt.yticks(range(len(indices)), [f"Feature {i}" for i in indices])
plt.xlabel("Importance")
plt.title("Top 20 Feature Importances - Tuned Gradient Boosting")
plt.tight_layout()
plt.savefig("figures/feature_importance.png", dpi=300, bbox_inches="tight")
plt.show()

# --------------------------------------------
# 4. Learning Curve
# --------------------------------------------
train_sizes, train_scores, test_scores = learning_curve(
    best_tuned_model, X_train_imputed, y_train, cv=3, scoring="roc_auc", n_jobs=-1
)

train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.figure(figsize=(8,6))
plt.plot(train_sizes, train_mean, "o-", color="red", label="Training AUC")
plt.plot(train_sizes, test_mean, "o-", color="green", label="Validation AUC")
plt.xlabel("Training Examples")
plt.ylabel("AUC")
plt.title("Learning Curve - Tuned Gradient Boosting")
plt.legend(loc="best")
plt.grid(True)
plt.savefig("figures/learning_curve.png", dpi=300, bbox_inches="tight")
plt.show()

# --------------------------------------------
# 5. Model Metrics Comparison (using actual notebook results)
# --------------------------------------------
metrics = {
    "Logistic Regression": [0.9943, 0.0000, 0.0000, 0.0000, 0.5170],
    "Random Forest": [0.9939, 0.0000, 0.0000, 0.0000, 0.6038],
    "Gradient Boosting": [0.9908, 0.0909, 0.0667, 0.0769, 0.6357],
    "SVC": [0.9943, 0.0000, 0.0000, 0.0000, 0.5009],
    "KNN": [0.9943, 0.0000, 0.0000, 0.0000, 0.5186],
    "Tuned GB": [0.9939, 0.3333, 0.0667, 0.1111, 0.6368]
}

df = pd.DataFrame(metrics, index=["Accuracy","Precision","Recall","F1","AUC"]).T

df.plot(kind="bar", figsize=(10,6))
plt.title("Model Metrics Comparison")
plt.ylabel("Score")
plt.xticks(rotation=45, ha="right")
plt.legend(loc="upper right")
plt.tight_layout()
plt.savefig("figures/model_metrics.png", dpi=300, bbox_inches="tight")
plt.show()

# --------------------------------------------
# 6. Threshold Sweep (Precision, Recall, F1 vs Threshold)
# --------------------------------------------
thresholds = np.linspace(0.0, 1.0, 50)
precisions, recalls, f1s = [], [], []

for t in thresholds:
    y_pred_thresh = (y_proba >= t).astype(int)
    precisions.append(precision_score(y_test, y_pred_thresh, zero_division=0))
    recalls.append(recall_score(y_test, y_pred_thresh, zero_division=0))
    f1s.append(f1_score(y_test, y_pred_thresh, zero_division=0))

plt.figure(figsize=(8,6))
plt.plot(thresholds, precisions, label="Precision", color="blue", marker="o", markersize=3)
plt.plot(thresholds, recalls, label="Recall", color="green", marker="o", markersize=3)
plt.plot(thresholds, f1s, label="F1-score", color="red", marker="o", markersize=3)
plt.xlabel("Decision Threshold")
plt.ylabel("Score")
plt.title("Precision–Recall–F1 vs Threshold - Tuned Gradient Boosting")
plt.legend()
plt.grid(True)
plt.savefig("figures/threshold_sweep.png", dpi=300, bbox_inches="tight")
plt.show()
