In [1]:
import glob
import json
import logging
import os
import pickle
import random
import re
import shutil
import threading
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

import joblib
import matplotlib.pyplot as plt
import numpy as np
import onnx
import onnxruntime as ort
import optuna
import pandas as pd
import seaborn as sns
import skl2onnx
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from PIL import Image
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.utils import resample
from tqdm import tqdm
from xgboost import XGBClassifier

# Pick the compute device first so the CUDA seeding can key off it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG used in this notebook (Python, NumPy, PyTorch) with the
# same value so that reruns are repeatable.
torch.manual_seed(123)
np.random.seed(123)
random.seed(123)

# The CUDA generators are only seeded when a GPU is actually present.
if device.type == "cuda":
    torch.cuda.manual_seed_all(123)

print("Random seed set to 123")

ModuleNotFoundError: No module named 'optuna'

In [None]:
# Input metadata and the two output roots (split listings / copied images).
metadata_path = "image_metadata.csv"
split_info_dir, splits_dir = "splits_info", "splits_images"
os.makedirs(split_info_dir, exist_ok=True)
os.makedirs(splits_dir, exist_ok=True)

# Read the metadata and normalise the shape column (lower-case, trimmed)
# in a single chained expression.
df_metadata = pd.read_csv(metadata_path).assign(
    shape=lambda frame: frame["shape"].str.lower().str.strip()
)

# Every image path listed in the metadata, and the raw defect labels.
image_paths = df_metadata["filepath"].to_list()
y = df_metadata["defect"].values

In [None]:
# Exploratory plots of the metadata. Each figure is built through the pyplot
# state machine, so the order of the plt.* calls inside a figure matters.

# Bar chart: number of images per defect type, most frequent first.
plt.figure(figsize=(10, 5))
sns.countplot(data=df_metadata, x="defect", hue="defect", order=df_metadata["defect"].value_counts().index, palette="Set2", legend=False)
plt.title("Distribution by defect types")
plt.xlabel("Defect type")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Bar chart: number of images per tablet shape, most frequent first.
plt.figure(figsize=(10, 5))
sns.countplot(data=df_metadata, x="shape", hue="shape", order=df_metadata["shape"].value_counts().index, palette="Set2", legend=False)
plt.title("Distribution by tablet shapes")
plt.xlabel("Shape")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Heatmap of shape vs. defect: each cell is the number of rows with that
# (shape, defect) pair; missing combinations are filled with 0.
pivot = df_metadata.pivot_table(index="shape", columns="defect", aggfunc="size", fill_value=0)
plt.figure(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt="d", cmap="Blues")
plt.title("Count of tablets by shape and defect type")
plt.xlabel("Defect type")
plt.ylabel("Shape")
plt.tight_layout()
plt.show()

# Cast customer, defect and shape to str (this converts the columns in place;
# it does not merely check them). NOTE: this mutates df_metadata, and the
# string comparisons in later cells see the converted columns.
df_metadata["customer"] = df_metadata["customer"].astype(str)
df_metadata["defect"] = df_metadata["defect"].astype(str)
df_metadata["shape"] = df_metadata["shape"].astype(str)

# Per-customer defect distribution; the customer match is case-insensitive.
for customer in ["customer1", "customer2"]:
    df_customer = df_metadata[df_metadata["customer"].str.lower() == customer.lower()]
    
    # Bar chart: defect counts for this customer only.
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df_customer, x="defect", hue="defect", order=df_customer["defect"].value_counts().index, palette="Set2", legend=False)
    plt.title(f"Defect distribution — customer: {customer}")
    plt.xlabel("Defect type")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
#define and modify cnn models
def get_modified_model(name):
    """Return an ImageNet-pretrained torchvision model with its classifier removed.

    Parameters
    ----------
    name : str
        A torchvision model builder name, e.g. "resnet50" or "vit_b_16".

    Returns
    -------
    torch.nn.Module
        The model in eval mode on the global `device`; calling it on an image
        batch yields the backbone features instead of class logits.
    """
    # Resolve the weights enum through torchvision's registry. Building the
    # enum name by capitalising the first token of `name` produces names such
    # as "Resnet50_Weights" / "Vit_Weights", which do not exist.
    weights = models.get_model_weights(name).IMAGENET1K_V1
    model = getattr(models, name)(weights=weights)

    if "resnet" in name or "efficientnet" in name:
        # Drop the last child (the final classification module).
        model = torch.nn.Sequential(*(list(model.children())[:-1]))
    elif "vgg" in name:
        # Keep the classifier stack but remove its final layer.
        model.classifier = model.classifier[:-1]
    elif "googlenet" in name or "inception" in name:
        model.fc = torch.nn.Identity()
    elif "mobilenet" in name or "densenet" in name:
        model.classifier = torch.nn.Identity()
    elif "convnext" in name:
        # ConvNeXt keeps its head in `classifier` (norm, flatten, linear);
        # only the trailing linear layer is replaced so the output stays flat.
        model.classifier[-1] = torch.nn.Identity()
    elif "vit" in name:
        # VisionTransformer names its head `heads`; assigning to `head` would
        # only add an unused attribute and leave the 1000-way logits in place.
        model.heads = torch.nn.Identity()
    elif "swin" in name:
        model.head = torch.nn.Identity()

    # Set eval mode last so any wrapper created above is in eval mode as well.
    return model.to(device).eval()

# Backbones used as fixed feature extractors.
model_names = [
    "resnet50",
    "resnet101",
    "resnet152",
    "vgg16",
    "vgg19",
    "efficientnet_b2",
    "efficientnet_b3",
    "googlenet",
    "inception_v3",
    "mobilenet_v2",
    "mobilenet_v3_large",
    "mobilenet_v3_small",
    "densenet121",
    "swin_t",
    "vit_b_16",
    "convnext_base",
]

# Instantiate every backbone once, keyed by its name.
cnn_models = {}
for name in model_names:
    cnn_models[name] = get_modified_model(name)

# Preprocessing: tensor conversion followed by per-channel normalisation.
# No resize step is applied (the images are stated to be 224x224 already).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Extract one feature vector per image and per backbone.
# features_dict[name][i] always belongs to file_names[i]: an image is recorded
# only after every backbone has produced its vector, so a failure part-way
# through cannot leave the per-model lists longer than file_names.
features_dict = {name: [] for name in cnn_models}
file_names = []

for img_path in tqdm(image_paths, desc="processing images", unit="image"):
    try:
        # Close the file handle as soon as the RGB copy exists.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        img_tensor = transform(img).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = {
                model_name: model(img_tensor).squeeze().cpu().numpy().flatten()
                for model_name, model in cnn_models.items()
            }
    except Exception as e:
        # Best effort: report the failing image and continue with the rest.
        print(f"error processing file {img_path}: {e}")
        continue

    for model_name, feature in image_features.items():
        features_dict[model_name].append(feature)
    file_names.append(img_path)

# Write one CSV per backbone: a "filename" column followed by the features.
for model_name, features in features_dict.items():
    df_feat = pd.DataFrame(features)
    df_feat.insert(0, "filename", file_names)
    df_feat.to_csv(f"features_{model_name}.csv", index=False)

In [None]:
# Named subsets of the metadata: by customer, by shape and by defect family.
# Every defect-based subset also keeps the "PROPER" rows.
splits = {
    "all_data": df_metadata,
    "customer1": df_metadata.loc[df_metadata["customer"].eq("customer1")],
    "customer2": df_metadata.loc[df_metadata["customer"].eq("customer2")],
    "oval_round_oblong": df_metadata.loc[df_metadata["shape"].isin(["oval", "round", "oblong"])],
    "capsules": df_metadata.loc[df_metadata["shape"].eq("capsule")],
    "tablets": df_metadata.loc[df_metadata["shape"].ne("capsule")],
    "broken": df_metadata.loc[df_metadata["defect"].isin(["BROKEN", "PROPER"])],
    "double": df_metadata.loc[df_metadata["defect"].isin(["DOUBLE", "PROPER"])],
    "minor_major": df_metadata.loc[df_metadata["defect"].isin(["DEFECT_MINOR", "DEFECT_MAJOR", "PROPER"])],
}

# For each subset: write its file listing to <split_info_dir>/<name>.txt and
# copy the images into <splits_dir>/<name>/ under their base names.
for split_name, split_df in splits.items():
    split_folder = os.path.join(splits_dir, split_name)
    os.makedirs(split_folder, exist_ok=True)
    split_txt_file = os.path.join(split_info_dir, f"{split_name}.txt")

    with open(split_txt_file, "w") as listing:
        for source_path in split_df["filepath"]:
            listing.write(source_path + "\n")
            target_path = os.path.join(split_folder, os.path.basename(source_path))
            shutil.copy(source_path, target_path)

In [None]:
# Stratified train/val/test partition (70/10/20) of every named subset.
for split_name, split_df in splits.items():
    split_txt_folder = os.path.join(split_info_dir, split_name)
    split_image_folder = os.path.join(splits_dir, split_name)

    os.makedirs(split_txt_folder, exist_ok=True)
    os.makedirs(split_image_folder, exist_ok=True)

    # 20% goes to test; 12.5% of the remaining 80% (= 10% overall) goes to val.
    train_df, test_df = train_test_split(split_df, test_size=0.2, random_state=123, stratify=split_df["defect"])
    train_df, val_df = train_test_split(train_df, test_size=0.125, random_state=123, stratify=train_df["defect"])

    partitions = {"train": train_df, "val": val_df, "test": test_df}

    # Create the three image folders before anything is written.
    image_folders = {}
    for part_name in partitions:
        image_folders[part_name] = os.path.join(split_image_folder, part_name)
        os.makedirs(image_folders[part_name], exist_ok=True)

    # One listing file per partition, plus a copy of each listed image.
    for part_name, part_df in partitions.items():
        listing_path = os.path.join(split_txt_folder, f"{part_name}.txt")
        with open(listing_path, "w") as listing:
            for filepath in part_df["filepath"]:
                listing.write(filepath + "\n")
                dest_path = os.path.join(image_folders[part_name], os.path.basename(filepath))
                shutil.copy(filepath, dest_path)

In [None]:
# loads train/val/test filename sets for a given split
def load_split_filenames(split_name, base_dir="splits_info"):
    """Read the train/val/test listings of one split.

    Looks for train.txt, val.txt and test.txt inside `base_dir/split_name`
    and returns their lines as three sets, in that order.
    """
    split_dir = os.path.join(base_dir, split_name)

    def _read_listing(part):
        # One path per line; duplicate lines collapse in the set.
        with open(os.path.join(split_dir, f"{part}.txt")) as handle:
            return set(handle.read().splitlines())

    return _read_listing("train"), _read_listing("val"), _read_listing("test")

#loads feature vectors and corresponding binary labels (0 = proper, 1 = defective)
def load_features_and_labels(feature_file, filenames):
    """Load the feature rows for `filenames` together with binary labels.

    The CSV must have a "filename" column; all remaining columns are treated
    as features. A row is labelled 0 when its filename contains "proper"
    (case-insensitive) and 1 otherwise.

    Returns
    -------
    (X, y) : feature matrix and label array, in the CSV's row order.
    """
    table = pd.read_csv(feature_file)
    table["filename"] = table["filename"].astype(str)

    # Keep only the rows whose filename belongs to the requested set.
    selected = table.loc[table["filename"].isin(filenames)]
    names = selected["filename"]

    X = selected.drop(columns="filename").values
    y = np.array([int("proper" not in name.lower()) for name in names])
    return X, y

#trains the model and prints precision, recall, and F1-score for class 1
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """Fit `model`, predict on the test set and print the class-1 metrics.

    Prints precision, recall and F1 of the class labelled "1" in the
    classification report. Nothing is returned.
    """
    print(f"\ntraining {model_name}")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Only the entry for class "1" of the report is used.
    defective = classification_report(y_test, predictions, output_dict=True, zero_division=0)["1"]
    print(f"{model_name} → precision: {defective['precision']:.4f}, recall: {defective['recall']:.4f}, f1-score: {defective['f1-score']:.4f}")

In [None]:
# Build split_data[(split_name, model_name)] -> train/val/test arrays.
split_data = {}
split_info_dir = "splits_info"

# Only the plain per-backbone feature files belong here. The pattern
# "features_*.csv" also matches the "features_aug_*" files written by the
# augmentation cell, which would otherwise be loaded as bogus "aug_..." models
# whenever the notebook is rerun in the same directory.
feature_files = sorted(
    path
    for path in glob.glob("features_*.csv")
    if not os.path.basename(path).startswith("features_aug_")
)

# sorted() keeps the iteration order (and therefore the key order of
# split_data) independent of the file system's listing order.
for split_name in sorted(os.listdir(split_info_dir)):
    split_path = os.path.join(split_info_dir, split_name)
    if not os.path.isdir(split_path):
        # Skip the flat "<split>.txt" listings stored next to the folders.
        continue

    train_files, val_files, test_files = load_split_filenames(split_name)

    for feature_file in feature_files:
        model_name = os.path.basename(feature_file).replace("features_", "").replace(".csv", "")

        X_train, y_train = load_features_and_labels(feature_file, train_files)
        X_val, y_val = load_features_and_labels(feature_file, val_files)
        X_test, y_test = load_features_and_labels(feature_file, test_files)

        split_data[(split_name, model_name)] = {
            "X_train": X_train, "y_train": y_train,
            "X_val": X_val, "y_val": y_val,
            "X_test": X_test, "y_test": y_test
        }

In [None]:
# Baseline run: every classifier with fixed hyper-parameters, trained on
# train+val and scored on the test partition.

# Request GPU training only when CUDA is actually available; with the GPU
# settings hard-coded this cell fails on CPU-only machines even though the
# rest of the notebook falls back to CPU.
use_gpu = torch.cuda.is_available()

model_params = {
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=123, n_jobs=-1),
    "SVM": SVC(C=1, kernel="rbf"),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), learning_rate_init=0.01, max_iter=500, random_state=123),
    "KNN": KNeighborsClassifier(n_neighbors=5, weights="uniform"),
    "LogisticRegression": LogisticRegression(C=1, solver="lbfgs", random_state=123),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=123, algorithm="SAMME"),
    "CatBoost": CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, verbose=0, random_seed=123,
                                   task_type="GPU" if use_gpu else "CPU"),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, verbosity=0, random_state=123,
                             tree_method="gpu_hist" if use_gpu else "hist", n_jobs=-1),
    "LightGBM": LGBMClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, verbosity=0, n_jobs=-1, random_state=123)
}

# Restrict the run to the feature sets of these two backbones.
selected_models = ["convnext_base", "resnet50"]

combinations = [
    (split_name, model_name)
    for (split_name, model_name) in split_data
    if model_name in selected_models
]

# "<classifier>__<split>__<backbone>" -> precision / recall / f1 of class 1.
metrics_scores = {}

for split_name, model_name in combinations:
    print(f"\n=== running {model_name} on {split_name} ===")

    data = split_data[(split_name, model_name)]
    # No tuning happens here, so the validation rows are added to the training set.
    X_train_full = np.vstack((data["X_train"], data["X_val"]))
    y_train_full = np.hstack((data["y_train"], data["y_val"]))
    X_test = data["X_test"]
    y_test = data["y_test"]

    for model_label, model in model_params.items():
        print(f"\ntraining {model_label}")
        # The same estimator object is refitted for every combination.
        model.fit(X_train_full, y_train_full)

        y_test_pred = model.predict(X_test)
        report = classification_report(y_test, y_test_pred, output_dict=True, zero_division=0)

        # Metrics of class "1" (defective).
        precision = report["1"]["precision"]
        recall = report["1"]["recall"]
        f1 = report["1"]["f1-score"]

        print(f"test results for {model_label}:")
        print(f"precision: {precision:.4f}, recall: {recall:.4f}, f1-score: {f1:.4f}")

        key = f"{model_label}__{split_name}__{model_name}"
        metrics_scores[key] = {
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        }

# Persist the baseline metrics.
with open("metrics_scores_default.json", "w") as f:
    json.dump(metrics_scores, f, indent=4)

In [None]:
# Backbones whose features take part in hyper-parameter tuning.
allowed_models = [
    "convnext_base",
    "efficientnet_b3",
    "mobilenet_v3_large",
    "vit_b_16",
    "inception_v3",
    "resnet50",
    "densenet121",
    "resnet152",
]

# Base estimator per classifier family; the tuned parameters are applied to
# these instances after each study.
ml_models = dict(
    SVM=SVC(probability=True),
    MLP=MLPClassifier(max_iter=500, random_state=123),
    LogisticRegression=LogisticRegression(random_state=123),
    CatBoost=CatBoostClassifier(verbose=0, random_seed=123),
    XGBoost=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123),
    LightGBM=LGBMClassifier(random_state=123),
)

#optuna objective function

def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: best class-1 recall on the validation set.

    Samples hyper-parameters for `model_type`, fits on the training data and
    scans decision thresholds 0.10, 0.15, ..., 0.90 on the validation
    probabilities. The winning threshold is stored on the trial as the user
    attribute "threshold".

    Parameters
    ----------
    trial : optuna.trial.Trial
    X_train, y_train, X_val, y_val : training / validation features and labels
    model_type : str
        One of "SVM", "MLP", "LogisticRegression", "CatBoost", "XGBoost",
        "LightGBM".

    Returns
    -------
    float
        The highest class-1 recall found over the scanned thresholds.

    Raises
    ------
    ValueError
        If `model_type` is not one of the supported names.
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        model = SVC(**params, probability=True)

    elif model_type == "MLP":
        params = {
            "hidden_layer_sizes": trial.suggest_categorical("hidden_layer_sizes", [(100,), (50, 50), (200,)]),
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True),
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)

    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)

    elif model_type == "CatBoost":
        # ml_models lists CatBoost, so the tuning loop calls this function with
        # it; without this branch `model` stays unbound and the study crashes.
        params = {
            "iterations": trial.suggest_int("iterations", 50, 200),
            "depth": trial.suggest_int("depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        }
        model = CatBoostClassifier(**params, verbose=0, random_seed=123)

    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)

    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)

    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]

    best_recall = 0
    best_thresh = 0.5
    # NOTE(review): recall cannot increase as the threshold rises, so with the
    # strict ">" below the first (lowest) threshold wins whenever it detects
    # any positive. Confirm that recall alone is the intended criterion.
    for thresh in np.arange(0.1, 0.91, 0.05):
        preds = (probs >= thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)
        recall = report["1"]["recall"]
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh

    trial.set_user_attr("threshold", best_thresh)
    return best_recall

# Tune every classifier family on every (split, backbone) feature set.
best_params = {}
best_models = {}
metrics_scores = {}

for ml_model_name in ml_models:
    best_params[ml_model_name] = {}

    for (split_name, model_name), data in split_data.items():
        # Only the whitelisted backbones are tuned.
        if model_name not in allowed_models:
            continue

        X_train, y_train = data["X_train"], data["y_train"]
        X_val, y_val = data["X_val"], data["y_val"]

        sampler = optuna.samplers.TPESampler(seed=123)
        study = optuna.create_study(direction="maximize", sampler=sampler)

        def run_trial(trial):
            # Bind the current data and classifier family to the objective.
            return objective(trial, X_train, y_train, X_val, y_val, ml_model_name)

        study.optimize(run_trial, n_trials=15, n_jobs=18)

        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # Refit the base estimator with the winning parameters.
        model = ml_models[ml_model_name].set_params(**best_hyperparams)
        model.fit(X_train, y_train)

        # Score on the validation rows at the threshold chosen by the study.
        val_probs = model.predict_proba(X_val)[:, 1]
        probs = val_probs
        preds = (val_probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)
        defective = report["1"]

        best_params[ml_model_name][(split_name, model_name)] = {
            "params": best_hyperparams,
            "threshold": best_thresh
        }

        metrics_scores[(ml_model_name, split_name, model_name)] = {
            "recall": defective["recall"],
            "precision": defective["precision"],
            "f1_score": defective["f1-score"]
        }

        model_path = f"best_model_{ml_model_name}_{split_name}_{model_name}.pkl"
        joblib.dump(model, model_path)
        print(f"Saved model: {model_path}")

# JSON keys must be strings, so the tuple keys are flattened with "__".
flat_metrics = {}
for (ml_name, split, feat), scores in metrics_scores.items():
    flat_metrics[f"{ml_name}__{split}__{feat}"] = scores
with open("metrics_scores_all.json", "w") as f:
    json.dump(flat_metrics, f, indent=4)

flat_params = {}
for ml_name, per_feature_set in best_params.items():
    for (split, feat), val in per_feature_set.items():
        flat_params[f"{ml_name}__{split}__{feat}"] = val
with open("best_hyperparams_all.json", "w") as f:
    json.dump(flat_params, f, indent=4)

In [None]:
# Backbones used for the augmented feature extraction.
selected_models = ["convnext_base", "efficientnet_b3", "mobilenet_v3_large", "densenet121", "vit_b_16"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ImageNet-pretrained instances, keyed by backbone name.
model_mapping = {
    "convnext_base": models.convnext_base(weights=models.ConvNeXt_Base_Weights.IMAGENET1K_V1),
    "efficientnet_b3": models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.IMAGENET1K_V1),
    "mobilenet_v3_large": models.mobilenet_v3_large(weights=models.MobileNet_V3_Large_Weights.IMAGENET1K_V1),
    "densenet121": models.densenet121(weights=models.DenseNet121_Weights.IMAGENET1K_V1),
    "vit_b_16": models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1),
}

# Strip the classification layer of each backbone so that a forward pass
# returns features rather than class logits.
cnn_models = {}
for name in selected_models:
    model = model_mapping[name].to(device).eval()
    if "efficientnet" in name or "resnet" in name:
        model = torch.nn.Sequential(*(list(model.children())[:-1]))
    elif "vgg" in name:
        model.classifier = model.classifier[:-1]
    elif "googlenet" in name or "inception" in name:
        model.fc = torch.nn.Identity()
    elif "mobilenet" in name or "densenet" in name:
        model.classifier = torch.nn.Identity()
    elif "convnext" in name:
        # ConvNeXt keeps its head in `classifier` (norm, flatten, linear);
        # assigning `model.head` leaves it untouched, so replace the final
        # linear layer instead.
        model.classifier[-1] = torch.nn.Identity()
    elif "vit" in name:
        # VisionTransformer names its head `heads`, not `head`.
        model.heads = torch.nn.Identity()
    elif "swin" in name:
        model.head = torch.nn.Identity()
    cnn_models[name] = model

# Training-time pipeline: light random geometric and colour perturbations,
# followed by tensor conversion and per-channel normalisation.
aug_transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        transforms.RandomAffine(degrees=0, translate=(0.05, 0.05)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Deterministic pipeline: tensor conversion and normalisation only.
base_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Walk every split folder and extract features for its train/val/test images.
# NOTE(review): this rebinds the global `splits_dir` (set to "splits_images"
# in the paths cell) to the listings directory.
splits_dir = "splits_info"
for split_name in os.listdir(splits_dir):
    split_path = os.path.join(splits_dir, split_name)
    if not os.path.isdir(split_path):
        continue

    # Read the three listings, dropping blank lines.
    sets = {}
    for split_type in ["train", "val", "test"]:
        with open(os.path.join(split_path, f"{split_type}.txt")) as f:
            stripped_lines = (line.strip() for line in f)
            sets[split_type] = [entry for entry in stripped_lines if entry]

    for split_type, file_list in sets.items():
        print(f"  - {split_type}: {len(file_list)} images")

        # Random augmentation is applied to the training images only.
        if split_type == "train":
            transform = aug_transform
        else:
            transform = base_transform

        features_dict = {name: [] for name in selected_models}
        file_names = []

        for img_path in tqdm(file_list, desc=f"{split_type} - {split_name}", unit="img"):
            try:
                img = Image.open(img_path).convert("RGB")
                img_tensor = transform(img).unsqueeze(0).to(device)

                # Run every backbone before recording anything, so a failure
                # leaves the lists for this image untouched.
                with torch.no_grad():
                    feature_vectors = [
                        model(img_tensor).squeeze().cpu().numpy().flatten()
                        for model in cnn_models.values()
                    ]

                file_names.append(img_path)
                # cnn_models was filled by iterating selected_models, so the
                # two sequences line up position by position.
                for model_name, vector in zip(selected_models, feature_vectors):
                    features_dict[model_name].append(vector)

            except Exception as e:
                print(f"Error processing {img_path}: {e}")

        # One CSV per (split, backbone, partition).
        for model_name, features in features_dict.items():
            df_features = pd.DataFrame(features)
            df_features.insert(0, "filename", file_names)
            out_name = f"features_aug_{split_name}_{model_name}_{split_type}.csv"
            df_features.to_csv(out_name, index=False)

In [None]:
# Augmentation: tune the classifiers on features extracted with augmentation.
split_info_dir = "splits_info"

allowed_models_aug = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "densenet121", "vit_b_16"
]

ml_models = {
    "SVM": SVC(probability=True),
    "MLP": MLPClassifier(max_iter=500, random_state=123),
    "LogisticRegression": LogisticRegression(random_state=123),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123),
    "LightGBM": LGBMClassifier(random_state=123),
}

# The extraction cell writes one file per (split, backbone, partition), named
# "features_aug_<split>_<backbone>_<partition>.csv". Stripping only the prefix
# and the extension yields "<split>_<backbone>_<partition>", which never equals
# a backbone name, so nothing would ever be loaded. The expected file names are
# therefore rebuilt from the known splits, backbones and partitions.
print("Loading augmented features...")
split_names = sorted(
    entry for entry in os.listdir(split_info_dir)
    if os.path.isdir(os.path.join(split_info_dir, entry))
)
partition_names = ("train", "val", "test")

# (split_name, model_name, partition) -> DataFrame with "filename" + features.
all_features = {}
for split_name in split_names:
    for model_name in allowed_models_aug:
        for partition in partition_names:
            path = f"features_aug_{split_name}_{model_name}_{partition}.csv"
            if not os.path.exists(path):
                print(f"Missing augmented features: {path}")
                continue
            df = pd.read_csv(path)
            df["filename"] = df["filename"].astype(str)
            all_features[(split_name, model_name, partition)] = df

# Assemble train/val/test arrays per (split, backbone). Each file already holds
# exactly the rows of one partition, so no filtering by listing is required.
print("Loading splits...")
split_data = {}
for split_name in split_names:
    for model_name in allowed_models_aug:
        parts = {}
        for partition in partition_names:
            df = all_features.get((split_name, model_name, partition))
            if df is None:
                # An incomplete feature set cannot be trained on; skip it.
                break
            parts[f"X_{partition}"] = df.drop(columns=["filename"]).values
            # Label 0 when the file name contains "proper", 1 otherwise.
            parts[f"y_{partition}"] = [0 if 'proper' in fn.lower() else 1 for fn in df["filename"]]
        else:
            split_data[(split_name, model_name)] = parts

# Result files for this experiment and the in-memory stores behind them.
metrics_file = "metrics_scores_aug.json"
params_file = "best_hyperparams_aug.json"

metrics_scores = {}
best_params = {}

#optuna objective
def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: best class-1 recall on the validation set.

    Samples hyper-parameters for the requested classifier family, fits it on
    the training data and scans decision thresholds from 0.10 to 0.90 in
    steps of 0.05. The winning threshold is stored on the trial under the
    user attribute "threshold"; the matching recall is returned.
    """
    if model_type == "SVM":
        model = SVC(
            C=trial.suggest_float("C", 0.1, 10, log=True),
            kernel=trial.suggest_categorical("kernel", ["linear", "rbf"]),
            probability=True,
        )
    elif model_type == "MLP":
        model = MLPClassifier(
            hidden_layer_sizes=trial.suggest_categorical(
                "hidden_layer_sizes", [(100,), (50, 50), (200,)]
            ),
            learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True),
            max_iter=500,
            random_state=123,
        )
    elif model_type == "LogisticRegression":
        model = LogisticRegression(
            C=trial.suggest_float("C", 0.01, 10, log=True),
            solver=trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
            random_state=123,
        )
    elif model_type == "XGBoost":
        model = XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=123,
        )
    elif model_type == "LightGBM":
        model = LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            num_leaves=trial.suggest_int("num_leaves", 15, 60),
            random_state=123,
        )

    model.fit(X_train, y_train)
    positive_probs = model.predict_proba(X_val)[:, 1]

    # Keep the first threshold that strictly improves the class-1 recall.
    top_recall, top_cut = 0, 0.5
    for cut in np.arange(0.1, 0.91, 0.05):
        val_report = classification_report(
            y_val, (positive_probs >= cut).astype(int), output_dict=True, zero_division=0
        )
        cut_recall = val_report["1"]["recall"]
        if cut_recall > top_recall:
            top_recall, top_cut = cut_recall, cut

    trial.set_user_attr("threshold", top_cut)
    return top_recall

#training function
# train_model runs on several worker threads at once; this lock serialises
# every update of the shared result dictionaries and of the JSON files.
_RESULTS_LOCK = threading.Lock()


def train_model(ml_model_name, split_name, model_name, data):
    """Tune, refit and persist one classifier for one (split, backbone) pair.

    Runs a 15-trial Optuna study, refits a fresh copy of the base estimator
    with the best parameters, evaluates it on the validation rows at the
    selected threshold, stores the model with joblib and records the metrics
    and parameters in `metrics_scores` / `best_params` and their JSON files.

    Parameters
    ----------
    ml_model_name : str
        Key into `ml_models`.
    split_name, model_name : str
        Identify the feature set; used for the result keys and file names.
    data : dict
        Must provide "X_train", "y_train", "X_val" and "y_val".
    """
    key = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_aug_{ml_model_name}_{split_name}_{model_name}.pkl"

    print(f"\n[TRAIN] {key}")
    X_train, y_train = data["X_train"], data["y_train"]
    X_val, y_val = data["X_val"], data["y_val"]

    study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
    study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)

    best_hyperparams = study.best_params
    best_thresh = study.best_trial.user_attrs["threshold"]

    # Work on a private copy: the estimators in `ml_models` are shared by all
    # worker threads, and calling set_params()/fit() on the shared object lets
    # concurrent tasks overwrite each other's parameters and fitted state.
    model = clone(ml_models[ml_model_name]).set_params(**best_hyperparams)
    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)

    joblib.dump(model, model_path)
    print(f"[SAVED] {model_path} → F1: {report['1']['f1-score']:.4f}")

    # Update the shared stores and rewrite the progress files under the lock;
    # otherwise one thread may serialise a dictionary while another mutates
    # it, or two threads may write the same file at once.
    with _RESULTS_LOCK:
        metrics_scores[key] = {
            "recall": report["1"]["recall"],
            "precision": report["1"]["precision"],
            "f1_score": report["1"]["f1-score"]
        }

        if ml_model_name not in best_params:
            best_params[ml_model_name] = {}
        best_params[ml_model_name][f"{split_name}__{model_name}"] = {
            "params": best_hyperparams,
            "threshold": best_thresh
        }

        with open(metrics_file, "w") as f:
            json.dump(metrics_scores, f, indent=4)
        with open(params_file, "w") as f:
            json.dump(best_params, f, indent=4)

# One task per (classifier family, split, backbone) combination.
tasks = [
    (ml_model_name, split_name, model_name, data)
    for ml_model_name in ml_models.keys()
    for (split_name, model_name), data in split_data.items()
]

# Run the tasks on a pool of 10 threads. Results are awaited in submission
# order, so the first failing task re-raises its exception here.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for task in tasks:
        futures.append(executor.submit(train_model, *task))
    for future in futures:
        future.result()

In [None]:
# Downsampling experiment (1:1 class ratio): logging and configuration.
logging.basicConfig(
    filename="train_log_down.txt",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Split names to process, in processing order.
# NOTE(review): this rebinds `splits`, which an earlier cell defined as a dict
# of DataFrames.
splits = [
    "customer1",
    "all_data",
    "broken",
    "capsules",
    "customer2",
    "double",
    "minor_major",
    "oval_round_oblong",
    "tablets",
]

# Backbones whose feature files are used.
allowed_models = [
    "convnext_base",
    "efficientnet_b3",
    "mobilenet_v3_large",
    "vit_b_16",
    "densenet121",
]

# Classifier families, in the order they are listed for training.
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]

split_info_dir = "splits_info"

def get_model_instance(name):
    """Create a fresh, unfitted classifier for the given family name.

    Supported names: "SVM", "MLP", "LogisticRegression", "XGBoost" and
    "LightGBM". Any other name yields None.
    """
    # Factories are wrapped in lambdas so that only the requested estimator
    # is actually constructed.
    factories = {
        "SVM": lambda: SVC(probability=True),
        "MLP": lambda: MLPClassifier(max_iter=500, random_state=123),
        "LogisticRegression": lambda: LogisticRegression(random_state=123),
        "XGBoost": lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123),
        "LightGBM": lambda: LGBMClassifier(random_state=123),
    }
    factory = factories.get(name)
    if factory is None:
        return None
    return factory()

def convert_for_json(obj):
    """Convert values that `json` cannot serialise into plain Python values.

    Meant to be passed as `default=` to `json.dump`. Tuples and sets become
    lists, NumPy scalars become the matching Python scalar and NumPy arrays
    become nested lists. Any other object is returned unchanged.
    """
    if isinstance(obj, (tuple, set, frozenset)):
        return list(obj)
    # json only calls `default` for objects it cannot encode itself, which in
    # practice means NumPy values rather than tuples. Returning such an object
    # unchanged makes json fail with "Circular reference detected".
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

def save_jsons():
    """Write the global metrics and hyper-parameter stores to their JSON files.

    `metrics_scores` goes to metrics_scores_down.json and `best_params` to
    best_hyperparams_down.json; the latter is serialised with
    `convert_for_json` as the fallback encoder.
    """
    outputs = (
        ("metrics_scores_down.json", metrics_scores, {}),
        ("best_hyperparams_down.json", best_params, {"default": convert_for_json}),
    )
    for path, payload, extra_options in outputs:
        with open(path, "w") as handle:
            json.dump(payload, handle, indent=4, **extra_options)

def find_best_threshold(y_true, probs):
    """Return the decision threshold with the highest class-1 recall.

    Thresholds 0.10, 0.15, ..., 0.90 are scanned in increasing order and the
    first one reaching the maximum recall is returned. When no threshold
    achieves a recall above 0 (including when `y_true` has no positive
    sample), the default 0.5 is returned.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    probs : array-like of predicted probabilities for class 1

    Returns
    -------
    float
        The selected threshold.
    """
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)

    best_recall = 0
    best_thresh = 0.5

    positives = y_true == 1
    n_positives = int(np.count_nonzero(positives))
    if n_positives == 0:
        # Recall is undefined without positives. Building a full report here
        # could raise KeyError: '1' when class 1 is also absent from the
        # predictions, so the default is returned directly.
        return best_thresh

    # NOTE(review): recall cannot increase as the threshold rises, so this
    # returns the lowest grid value whenever it detects any positive. Confirm
    # that recall alone is the intended selection criterion.
    for thresh in np.arange(0.1, 0.91, 0.05):
        # Recall of class 1 = true positives / all positives.
        recall = np.count_nonzero(positives & (probs >= thresh)) / n_positives
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return best_thresh

def apply_downsampling(X, y):
    """Balance a binary data set 1:1 by downsampling the larger class.

    Parameters
    ----------
    X : pandas.DataFrame (or anything `pd.DataFrame` accepts) of features
    y : sequence of 0/1 labels, aligned with the rows of `X` by position

    Returns
    -------
    (X_balanced, y_balanced) : NumPy arrays holding equally many rows of each
    class, shuffled with a fixed seed.
    """
    # Wrapping keeps DataFrame inputs unchanged and also accepts plain arrays.
    df = pd.DataFrame(X).copy()
    df["label"] = y
    df_zero = df[df["label"] == 0]
    df_one = df[df["label"] == 1]

    # Downsample whichever class is larger. Sampling class 0 unconditionally
    # raises when class 1 is the majority, because more rows are requested
    # without replacement than class 0 contains.
    if len(df_zero) >= len(df_one):
        df_zero = resample(df_zero, replace=False, n_samples=len(df_one), random_state=123)
    else:
        df_one = resample(df_one, replace=False, n_samples=len(df_zero), random_state=123)

    # Shuffle so the two classes are interleaved.
    df_balanced = pd.concat([df_zero, df_one]).sample(frac=1, random_state=123)
    y_balanced = df_balanced["label"].values
    X_balanced = df_balanced.drop(columns=["label"]).values
    return X_balanced, y_balanced

# Load the per-backbone feature tables.
print("Loading features...")
all_features = {}
for model_name in allowed_models:
    path = f"features_{model_name}.csv"
    if os.path.exists(path):
        df = pd.read_csv(path)
        df["filename"] = df["filename"].astype(str)
        all_features[model_name] = df
    else:
        logging.warning(f"Missing features: {path}")

# Build split_data[(split, backbone)] from the train/val/test listings.
split_data = {}
for split in splits:
    split_path = os.path.join(split_info_dir, split)
    try:
        with open(os.path.join(split_path, "train.txt")) as f:
            train_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "val.txt")) as f:
            val_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "test.txt")) as f:
            test_files = {line.strip() for line in f}
    except Exception as e:
        # A split without readable listings is logged and skipped.
        logging.warning(f"Skipping split {split}: {str(e)}")
        continue

    for model_name, df in all_features.items():
        df_train = df[df["filename"].isin(train_files)]
        df_val = df[df["filename"].isin(val_files)]
        df_test = df[df["filename"].isin(test_files)]

        # Features stay DataFrames here (apply_downsampling calls X.copy() and
        # adds a column). Labels: 0 when the file name contains "proper",
        # 1 otherwise.
        X_train = df_train.drop(columns=["filename"])
        y_train = [0 if "proper" in fn.lower() else 1 for fn in df_train["filename"]]

        X_val = df_val.drop(columns=["filename"])
        y_val = [0 if "proper" in fn.lower() else 1 for fn in df_val["filename"]]

        X_test = df_test.drop(columns=["filename"])
        y_test = [0 if "proper" in fn.lower() else 1 for fn in df_test["filename"]]

        split_data[(split, model_name)] = {
            "X_train": X_train, "y_train": y_train,
            "X_val": X_val, "y_val": y_val,
            "X_test": X_test, "y_test": y_test
        }

# Resume from earlier results when the JSON files exist. The files are opened
# in `with` blocks so the handles are closed right after reading.
metrics_scores = {}
if os.path.exists("metrics_scores_down.json"):
    with open("metrics_scores_down.json") as f:
        metrics_scores = json.load(f)

best_params = {}
if os.path.exists("best_hyperparams_down.json"):
    with open("best_hyperparams_down.json") as f:
        best_params = json.load(f)

#objective function 
def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate classifier and return its validation F1 for class 1.

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The model is
    fitted on the training data, ``find_best_threshold`` picks a probability threshold on
    the validation set, and that threshold is stored on the trial as the user attribute
    ``"threshold"``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        model_type: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        The class-1 F1 score on the validation set at the chosen threshold.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # Seeded like the other candidates so the probability estimates are repeatable.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Layer layouts are sampled as string keys and mapped to the tuples sklearn expects.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        # Fail with a clear message instead of hitting an unbound `model` below.
        raise ValueError(f"Unknown model_type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    # Stored so the caller can reuse the threshold of the winning trial.
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

#training wrapper
import threading

# Guards the shared metrics_scores / best_params dicts and the JSON files written from them.
_RESULTS_LOCK = threading.Lock()

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, persist and score one classifier on one (split, feature set) pair.

    Steps: downsample the training data, run a 15-trial Optuna study that maximizes the
    value returned by ``objective``, refit a fresh model with the best parameters, dump
    it with joblib, then record class-1 validation recall/precision/F1 and the chosen
    parameters in the shared result dicts and rewrite the JSON files.

    Any exception is logged with its traceback and swallowed, so one failing combination
    does not stop the remaining tasks.

    Args:
        ml_model_name: Classifier name understood by ``objective`` and ``get_model_instance``.
        split_name: Name of the data split; used in result keys and the model file name.
        model_name: Name of the feature set; used in result keys and the model file name.
        data: Dict providing ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_down_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_downsampling(data["X_train"], data["y_train"])
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): trials run in parallel (n_jobs=18), so the seeded sampler may not
        # give repeatable studies - confirm against Optuna's reproducibility guidance.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # The study stores the MLP layout as a string key; turn it back into a tuple.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        # This function runs in several worker threads at once. Updating the shared dicts
        # and serializing them must happen atomically, otherwise a dump can iterate a dict
        # while another thread inserts into it, or two threads can write the same file.
        with _RESULTS_LOCK:
            metrics_scores[key_json] = {
                "recall": report["1"]["recall"],
                "precision": report["1"]["precision"],
                "f1_score": report["1"]["f1-score"]
            }

            if ml_model_name not in best_params:
                best_params[ml_model_name] = {}
            best_params[ml_model_name][f"{split_name}__{model_name}"] = {
                "params": best_hyperparams,
                "threshold": float(best_thresh)
            }

            save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#training 
# One task per (classifier, split, feature set) combination.
tasks = [(ml, split, model, split_data[(split, model)])
         for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs ordinary failures itself; anything that still escapes
        # it would otherwise vanish with the discarded future, so record it here.
        error = future.exception()
        if error is not None:
            logging.error(f"[ERROR] worker failed: {error!r}")

In [None]:
#Downsampling 1:3
# force=True replaces handlers installed by an earlier basicConfig call in the same kernel.
# Without it this call does nothing once the root logger is configured, and messages from
# this cell keep going to the previously configured log file.
logging.basicConfig(filename="train_log_down3.txt", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits, feature sets and classifiers to combine in this run.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
split_info_dir = "splits_info"

def get_model_instance(name):
    """Create an untuned classifier for ``name``.

    Args:
        name: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A new, unfitted estimator with this notebook's fixed settings (seed 123).

    Raises:
        ValueError: If ``name`` is not a supported classifier name.
    """
    if name == "SVM":
        # Seeded like the other estimators so the probability estimates are repeatable.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    # Falling through used to return None, which only failed later at the call site.
    raise ValueError(f"Unknown model name: {name!r}")

def convert_for_json(obj):
    """Return a list copy of ``obj`` when it is a tuple; any other value is returned as-is.

    Passed as the ``default`` hook to ``json.dump`` in ``save_jsons``.
    """
    if not isinstance(obj, tuple):
        return obj
    return [item for item in obj]

def save_jsons():
    """Overwrite both result files with the current in-memory results.

    Writes the module-level ``metrics_scores`` to ``metrics_scores_down3.json`` and
    ``best_params`` to ``best_hyperparams_down3.json`` with a 4-space indent. The second
    dump uses ``convert_for_json`` as the fallback encoder.
    """
    def _dump(path, payload, **encoder_kwargs):
        # One file per call, opened in write mode so old content is replaced.
        with open(path, "w") as handle:
            json.dump(payload, handle, indent=4, **encoder_kwargs)

    _dump("metrics_scores_down3.json", metrics_scores)
    _dump("best_hyperparams_down3.json", best_params, default=convert_for_json)

def find_best_threshold(y_true, probs):
    """Pick the probability threshold with the highest recall for class 1.

    Scans the grid 0.10, 0.15, ..., 0.90 and keeps the first threshold whose recall is
    strictly greater than every earlier one. Recall is computed directly as
    TP / (TP + FN), so the function also works when class 1 is missing from both the
    labels and the predictions (recall is then treated as 0).

    NOTE(review): recall can only stay equal or fall as the threshold rises and ties keep
    the earlier value, so the result is 0.1 whenever any positive scores >= 0.1 and 0.5
    otherwise. If a precision/recall trade-off was intended, the selection metric needs
    to change.

    Args:
        y_true: Sequence of 0/1 ground-truth labels.
        probs: Predicted class-1 probabilities, aligned with ``y_true``.

    Returns:
        float: The selected threshold, or 0.5 when no threshold gives recall above 0.
    """
    positives = np.asarray(y_true) == 1
    scores = np.asarray(probs)
    n_positive = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        true_positives = int(np.count_nonzero(positives & (scores >= thresh)))
        # No positives at all -> recall is defined as 0, like zero_division=0.
        recall = true_positives / n_positive if n_positive else 0.0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return float(best_thresh)

def apply_downsampling(X, y):
    """Undersample label 0 to at most three rows per label-1 row.

    ``X`` is copied, ``y`` is attached as a temporary "label" column, label-0 rows are
    sampled without replacement (seed 123) and the combined rows are shuffled (seed 123).

    Returns:
        tuple: ``(X_balanced, y_balanced)`` as NumPy arrays.
    """
    labelled = X.copy()
    labelled["label"] = y
    majority = labelled[labelled["label"] == 0]
    minority = labelled[labelled["label"] == 1]
    # Never ask for more label-0 rows than exist.
    keep = min(len(majority), 3 * len(minority))
    majority_kept = resample(majority, replace=False, n_samples=keep, random_state=123)
    shuffled = pd.concat([majority_kept, minority]).sample(frac=1, random_state=123)
    targets = shuffled["label"].values
    features = shuffled.drop(columns=["label"]).values
    return features, targets

#load features
# One feature table per name in allowed_models, read from features_<name>.csv.
# A missing file is logged and that name is left out of all_features.
all_features = {}
for model_name in allowed_models:
    path = f"features_{model_name}.csv"
    if os.path.exists(path):
        df = pd.read_csv(path)
        df["filename"] = df["filename"].astype(str)
        all_features[model_name] = df
    else:
        logging.warning(f"Missing features: {path}")

#split_data
# Maps (split, model_name) -> train/val/test features and labels.
split_data = {}
for split in splits:
    split_path = os.path.join(split_info_dir, split)
    try:
        # Each split directory lists its member filenames, one per line.
        with open(os.path.join(split_path, "train.txt")) as f:
            train_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "val.txt")) as f:
            val_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "test.txt")) as f:
            test_files = {line.strip() for line in f}
    except Exception as e:
        # A split whose list files cannot be read is skipped rather than aborting the run.
        logging.warning(f"Skipping split {split}: {str(e)}")
        continue

    for model_name, df in all_features.items():
        df_train = df[df["filename"].isin(train_files)]
        df_val = df[df["filename"].isin(val_files)]
        df_test = df[df["filename"].isin(test_files)]

        # Label 0 when the filename contains "proper" (case-insensitive), otherwise 1.
        # NOTE(review): a name containing "improper" would also match and get label 0 -
        # confirm against the actual naming scheme.
        X_train = df_train.drop(columns=["filename"])
        y_train = [0 if "proper" in fn.lower() else 1 for fn in df_train["filename"]]

        X_val = df_val.drop(columns=["filename"])
        y_val = [0 if "proper" in fn.lower() else 1 for fn in df_val["filename"]]

        X_test = df_test.drop(columns=["filename"])
        y_test = [0 if "proper" in fn.lower() else 1 for fn in df_test["filename"]]

        split_data[(split, model_name)] = {
            "X_train": X_train, "y_train": y_train,
            "X_val": X_val, "y_val": y_val,
            "X_test": X_test, "y_test": y_test
        }

#load jsons
# Start from previously saved results when the files exist. Reading inside `with`
# closes each file as soon as it has been parsed instead of leaving the handle open.
if os.path.exists("metrics_scores_down3.json"):
    with open("metrics_scores_down3.json") as f:
        metrics_scores = json.load(f)
else:
    metrics_scores = {}
if os.path.exists("best_hyperparams_down3.json"):
    with open("best_hyperparams_down3.json") as f:
        best_params = json.load(f)
else:
    best_params = {}

#objective function
def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate classifier and return its validation F1 for class 1.

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The model is
    fitted on the training data, ``find_best_threshold`` picks a probability threshold on
    the validation set, and that threshold is stored on the trial as the user attribute
    ``"threshold"``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        model_type: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        The class-1 F1 score on the validation set at the chosen threshold.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # Seeded like the other candidates so the probability estimates are repeatable.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Layer layouts are sampled as string keys and mapped to the tuples sklearn expects.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        # Fail with a clear message instead of hitting an unbound `model` below.
        raise ValueError(f"Unknown model_type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    # Stored so the caller can reuse the threshold of the winning trial.
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

#training wrapper
import threading

# Guards the shared metrics_scores / best_params dicts and the JSON files written from them.
_RESULTS_LOCK = threading.Lock()

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, persist and score one classifier on one (split, feature set) pair.

    Steps: downsample the training data, run a 15-trial Optuna study that maximizes the
    value returned by ``objective``, refit a fresh model with the best parameters, dump
    it with joblib, then record class-1 validation recall/precision/F1 and the chosen
    parameters in the shared result dicts and rewrite the JSON files.

    Any exception is logged with its traceback and swallowed, so one failing combination
    does not stop the remaining tasks.

    Args:
        ml_model_name: Classifier name understood by ``objective`` and ``get_model_instance``.
        split_name: Name of the data split; used in result keys and the model file name.
        model_name: Name of the feature set; used in result keys and the model file name.
        data: Dict providing ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_down3_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_downsampling(data["X_train"], data["y_train"])
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): trials run in parallel (n_jobs=18), so the seeded sampler may not
        # give repeatable studies - confirm against Optuna's reproducibility guidance.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # The study stores the MLP layout as a string key; turn it back into a tuple.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        # This function runs in several worker threads at once. Updating the shared dicts
        # and serializing them must happen atomically, otherwise a dump can iterate a dict
        # while another thread inserts into it, or two threads can write the same file.
        with _RESULTS_LOCK:
            metrics_scores[key_json] = {
                "recall": report["1"]["recall"],
                "precision": report["1"]["precision"],
                "f1_score": report["1"]["f1-score"]
            }

            if ml_model_name not in best_params:
                best_params[ml_model_name] = {}
            best_params[ml_model_name][f"{split_name}__{model_name}"] = {
                "params": best_hyperparams,
                "threshold": float(best_thresh)
            }

            save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#training
# One task per (classifier, split, feature set) combination.
tasks = [(ml, split, model, split_data[(split, model)])
         for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs ordinary failures itself; anything that still escapes
        # it would otherwise vanish with the discarded future, so record it here.
        error = future.exception()
        if error is not None:
            logging.error(f"[ERROR] worker failed: {error!r}")

In [None]:
#Downsampling 1:5
# force=True replaces handlers installed by an earlier basicConfig call in the same kernel.
# Without it this call does nothing once the root logger is configured, and messages from
# this cell keep going to the previously configured log file.
logging.basicConfig(filename="train_log_down5.txt", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits, feature sets and classifiers to combine in this run.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
split_info_dir = "splits_info"

def get_model_instance(name):
    """Create an untuned classifier for ``name``.

    Args:
        name: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A new, unfitted estimator with this notebook's fixed settings (seed 123).

    Raises:
        ValueError: If ``name`` is not a supported classifier name.
    """
    if name == "SVM":
        # Seeded like the other estimators so the probability estimates are repeatable.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    # Falling through used to return None, which only failed later at the call site.
    raise ValueError(f"Unknown model name: {name!r}")

def convert_for_json(obj):
    """Return a list copy of ``obj`` when it is a tuple; any other value is returned as-is.

    Passed as the ``default`` hook to ``json.dump`` in ``save_jsons``.
    """
    if not isinstance(obj, tuple):
        return obj
    return [item for item in obj]

def save_jsons():
    """Overwrite both result files with the current in-memory results.

    Writes the module-level ``metrics_scores`` to ``metrics_scores_down5.json`` and
    ``best_params`` to ``best_hyperparams_down5.json`` with a 4-space indent. The second
    dump uses ``convert_for_json`` as the fallback encoder.
    """
    def _dump(path, payload, **encoder_kwargs):
        # One file per call, opened in write mode so old content is replaced.
        with open(path, "w") as handle:
            json.dump(payload, handle, indent=4, **encoder_kwargs)

    _dump("metrics_scores_down5.json", metrics_scores)
    _dump("best_hyperparams_down5.json", best_params, default=convert_for_json)

def find_best_threshold(y_true, probs):
    """Pick the probability threshold with the highest recall for class 1.

    Scans the grid 0.10, 0.15, ..., 0.90 and keeps the first threshold whose recall is
    strictly greater than every earlier one. Recall is computed directly as
    TP / (TP + FN), so the function also works when class 1 is missing from both the
    labels and the predictions (recall is then treated as 0).

    NOTE(review): recall can only stay equal or fall as the threshold rises and ties keep
    the earlier value, so the result is 0.1 whenever any positive scores >= 0.1 and 0.5
    otherwise. If a precision/recall trade-off was intended, the selection metric needs
    to change.

    Args:
        y_true: Sequence of 0/1 ground-truth labels.
        probs: Predicted class-1 probabilities, aligned with ``y_true``.

    Returns:
        float: The selected threshold, or 0.5 when no threshold gives recall above 0.
    """
    positives = np.asarray(y_true) == 1
    scores = np.asarray(probs)
    n_positive = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        true_positives = int(np.count_nonzero(positives & (scores >= thresh)))
        # No positives at all -> recall is defined as 0, like zero_division=0.
        recall = true_positives / n_positive if n_positive else 0.0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return float(best_thresh)

def apply_downsampling(X, y):
    """Undersample label 0 to at most five rows per label-1 row.

    ``X`` is copied, ``y`` is attached as a temporary "label" column, label-0 rows are
    sampled without replacement (seed 123) and the combined rows are shuffled (seed 123).

    Returns:
        tuple: ``(X_balanced, y_balanced)`` as NumPy arrays.
    """
    labelled = X.copy()
    labelled["label"] = y
    majority = labelled[labelled["label"] == 0]
    minority = labelled[labelled["label"] == 1]
    # Never ask for more label-0 rows than exist.
    keep = min(len(majority), 5 * len(minority))
    majority_kept = resample(majority, replace=False, n_samples=keep, random_state=123)
    shuffled = pd.concat([majority_kept, minority]).sample(frac=1, random_state=123)
    targets = shuffled["label"].values
    features = shuffled.drop(columns=["label"]).values
    return features, targets

#load features
# One feature table per name in allowed_models, read from features_<name>.csv.
# A missing file is logged and that name is left out of all_features.
all_features = {}
for model_name in allowed_models:
    path = f"features_{model_name}.csv"
    if os.path.exists(path):
        df = pd.read_csv(path)
        df["filename"] = df["filename"].astype(str)
        all_features[model_name] = df
    else:
        logging.warning(f"Missing features: {path}")

#split_data
# Maps (split, model_name) -> train/val/test features and labels.
split_data = {}
for split in splits:
    split_path = os.path.join(split_info_dir, split)
    try:
        # Each split directory lists its member filenames, one per line.
        with open(os.path.join(split_path, "train.txt")) as f:
            train_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "val.txt")) as f:
            val_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "test.txt")) as f:
            test_files = {line.strip() for line in f}
    except Exception as e:
        # A split whose list files cannot be read is skipped rather than aborting the run.
        logging.warning(f"Skipping split {split}: {str(e)}")
        continue

    for model_name, df in all_features.items():
        df_train = df[df["filename"].isin(train_files)]
        df_val = df[df["filename"].isin(val_files)]
        df_test = df[df["filename"].isin(test_files)]

        # Label 0 when the filename contains "proper" (case-insensitive), otherwise 1.
        # NOTE(review): a name containing "improper" would also match and get label 0 -
        # confirm against the actual naming scheme.
        X_train = df_train.drop(columns=["filename"])
        y_train = [0 if "proper" in fn.lower() else 1 for fn in df_train["filename"]]

        X_val = df_val.drop(columns=["filename"])
        y_val = [0 if "proper" in fn.lower() else 1 for fn in df_val["filename"]]

        X_test = df_test.drop(columns=["filename"])
        y_test = [0 if "proper" in fn.lower() else 1 for fn in df_test["filename"]]

        split_data[(split, model_name)] = {
            "X_train": X_train, "y_train": y_train,
            "X_val": X_val, "y_val": y_val,
            "X_test": X_test, "y_test": y_test
        }

#load jsons
# Start from previously saved results when the files exist. Reading inside `with`
# closes each file as soon as it has been parsed instead of leaving the handle open.
if os.path.exists("metrics_scores_down5.json"):
    with open("metrics_scores_down5.json") as f:
        metrics_scores = json.load(f)
else:
    metrics_scores = {}
if os.path.exists("best_hyperparams_down5.json"):
    with open("best_hyperparams_down5.json") as f:
        best_params = json.load(f)
else:
    best_params = {}

#objective function
def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate classifier and return its validation F1 for class 1.

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The model is
    fitted on the training data, ``find_best_threshold`` picks a probability threshold on
    the validation set, and that threshold is stored on the trial as the user attribute
    ``"threshold"``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        model_type: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        The class-1 F1 score on the validation set at the chosen threshold.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # Seeded like the other candidates so the probability estimates are repeatable.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Layer layouts are sampled as string keys and mapped to the tuples sklearn expects.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        # Fail with a clear message instead of hitting an unbound `model` below.
        raise ValueError(f"Unknown model_type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    # Stored so the caller can reuse the threshold of the winning trial.
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

#training wrapper
import threading

# Guards the shared metrics_scores / best_params dicts and the JSON files written from them.
_RESULTS_LOCK = threading.Lock()

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, persist and score one classifier on one (split, feature set) pair.

    Steps: downsample the training data, run a 15-trial Optuna study that maximizes the
    value returned by ``objective``, refit a fresh model with the best parameters, dump
    it with joblib, then record class-1 validation recall/precision/F1 and the chosen
    parameters in the shared result dicts and rewrite the JSON files.

    Any exception is logged with its traceback and swallowed, so one failing combination
    does not stop the remaining tasks.

    Args:
        ml_model_name: Classifier name understood by ``objective`` and ``get_model_instance``.
        split_name: Name of the data split; used in result keys and the model file name.
        model_name: Name of the feature set; used in result keys and the model file name.
        data: Dict providing ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_down5_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_downsampling(data["X_train"], data["y_train"])
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): trials run in parallel (n_jobs=18), so the seeded sampler may not
        # give repeatable studies - confirm against Optuna's reproducibility guidance.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # The study stores the MLP layout as a string key; turn it back into a tuple.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        # This function runs in several worker threads at once. Updating the shared dicts
        # and serializing them must happen atomically, otherwise a dump can iterate a dict
        # while another thread inserts into it, or two threads can write the same file.
        with _RESULTS_LOCK:
            metrics_scores[key_json] = {
                "recall": report["1"]["recall"],
                "precision": report["1"]["precision"],
                "f1_score": report["1"]["f1-score"]
            }

            if ml_model_name not in best_params:
                best_params[ml_model_name] = {}
            best_params[ml_model_name][f"{split_name}__{model_name}"] = {
                "params": best_hyperparams,
                "threshold": float(best_thresh)
            }

            save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#training 
# One task per (classifier, split, feature set) combination.
tasks = [(ml, split, model, split_data[(split, model)])
         for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs ordinary failures itself; anything that still escapes
        # it would otherwise vanish with the discarded future, so record it here.
        error = future.exception()
        if error is not None:
            logging.error(f"[ERROR] worker failed: {error!r}")

In [None]:
#Downsampling 1:1 + data augmentation
log_file = "train_log_aug_down.txt"
# force=True replaces handlers installed by an earlier basicConfig call in the same kernel.
# Without it this call does nothing once the root logger is configured, and messages from
# this cell keep going to the previously configured log file.
logging.basicConfig(filename=log_file, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits, feature sets and classifiers to combine in this run.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ml_model_names = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]

def get_model_instance(name):
    """Create an untuned classifier for ``name``.

    Args:
        name: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A new, unfitted estimator with this notebook's fixed settings (seed 123).

    Raises:
        ValueError: If ``name`` is not a supported classifier name.
    """
    if name == "SVM":
        # Seeded like the other estimators so the probability estimates are repeatable.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    # Falling through used to return None, which only failed later at the call site.
    raise ValueError(f"Unknown model name: {name!r}")

def load_json(path):
    """Return the parsed JSON content of ``path``, or an empty dict if the file does not exist."""
    if not os.path.exists(path):
        return {}
    # The context manager closes the handle as soon as parsing finishes.
    with open(path) as f:
        return json.load(f)

def convert_for_json(obj):
    """Return a list copy of ``obj`` when it is a tuple; any other value is returned as-is.

    Passed as the ``default`` hook to ``json.dump`` in ``save_jsons``.
    """
    if not isinstance(obj, tuple):
        return obj
    return [item for item in obj]

def save_jsons():
    """Overwrite both result files with the current in-memory results.

    Writes the module-level ``metrics_scores`` to ``metrics_scores_aug_down.json`` and
    ``best_params`` to ``best_hyperparams_aug_down.json`` with a 4-space indent. The
    second dump uses ``convert_for_json`` as the fallback encoder.
    """
    def _dump(path, payload, **encoder_kwargs):
        # One file per call, opened in write mode so old content is replaced.
        with open(path, "w") as handle:
            json.dump(payload, handle, indent=4, **encoder_kwargs)

    _dump("metrics_scores_aug_down.json", metrics_scores)
    _dump("best_hyperparams_aug_down.json", best_params, default=convert_for_json)

def find_best_threshold(y_true, probs):
    """Pick the probability threshold with the highest recall for class 1.

    Scans the grid 0.10, 0.15, ..., 0.90 and keeps the first threshold whose recall is
    strictly greater than every earlier one. Recall is computed directly as
    TP / (TP + FN), so the function also works when class 1 is missing from both the
    labels and the predictions (recall is then treated as 0).

    NOTE(review): recall can only stay equal or fall as the threshold rises and ties keep
    the earlier value, so the result is 0.1 whenever any positive scores >= 0.1 and 0.5
    otherwise. If a precision/recall trade-off was intended, the selection metric needs
    to change.

    Args:
        y_true: Sequence of 0/1 ground-truth labels.
        probs: Predicted class-1 probabilities, aligned with ``y_true``.

    Returns:
        float: The selected threshold, or 0.5 when no threshold gives recall above 0.
    """
    positives = np.asarray(y_true) == 1
    scores = np.asarray(probs)
    n_positive = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        true_positives = int(np.count_nonzero(positives & (scores >= thresh)))
        # No positives at all -> recall is defined as 0, like zero_division=0.
        recall = true_positives / n_positive if n_positive else 0.0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return float(best_thresh)

def apply_downsampling(X, y, ratio=1):
    """Undersample label 0 to at most ``ratio`` rows per label-1 row.

    Args:
        X: Feature DataFrame. It is copied, so the caller's frame is not modified.
        y: Labels (0 or 1) aligned row by row with ``X``.
        ratio: Integer cap on kept label-0 rows per label-1 row. Defaults to 1 (1:1).

    Returns:
        tuple: ``(X_balanced, y_balanced)`` as NumPy arrays, rows shuffled with seed 123.
    """
    df = X.copy()
    # A temporary "label" column keeps features and labels together while sampling
    # (an existing column with that name would be overwritten in the copy).
    df["label"] = y
    df_major = df[df["label"] == 0]
    df_minor = df[df["label"] == 1]
    # Sampling without replacement cannot return more rows than exist, so cap the
    # request at the number of label-0 rows actually available.
    n_samples = min(len(df_major), ratio * len(df_minor))
    df_major_down = resample(df_major, replace=False, n_samples=n_samples, random_state=123)
    df_balanced = pd.concat([df_major_down, df_minor]).sample(frac=1, random_state=123)
    y_balanced = df_balanced["label"].values
    X_balanced = df_balanced.drop(columns=["label"]).values
    return X_balanced, y_balanced

#load features
# Maps (split, model_name) -> train/val/test features and labels, read from the
# per-split CSV files features_aug_<split>_<model_name>_{train,val,test}.csv.
split_data = {}
for split in splits:
    for model_name in allowed_models:
        try:
            df_train = pd.read_csv(f"features_aug_{split}_{model_name}_train.csv")
            df_val = pd.read_csv(f"features_aug_{split}_{model_name}_val.csv")
            df_test = pd.read_csv(f"features_aug_{split}_{model_name}_test.csv")

            df_train["filename"] = df_train["filename"].astype(str)
            df_val["filename"] = df_val["filename"].astype(str)
            df_test["filename"] = df_test["filename"].astype(str)

            # Label 0 when the filename contains "proper" (case-insensitive), otherwise 1.
            # NOTE(review): a name containing "improper" would also match and get label 0 -
            # confirm against the actual naming scheme.
            X_train = df_train.drop(columns=["filename"])
            y_train = [0 if "proper" in fn.lower() else 1 for fn in df_train["filename"]]

            X_val = df_val.drop(columns=["filename"])
            y_val = [0 if "proper" in fn.lower() else 1 for fn in df_val["filename"]]

            X_test = df_test.drop(columns=["filename"])
            y_test = [0 if "proper" in fn.lower() else 1 for fn in df_test["filename"]]

            split_data[(split, model_name)] = {
                "X_train": X_train, "y_train": y_train,
                "X_val": X_val, "y_val": y_val,
                "X_test": X_test, "y_test": y_test
            }
        except Exception as e:
            # Any failure (e.g. a missing CSV) only drops this (split, model_name) pair.
            logging.warning(f"Skipping {split}-{model_name} due to error: {str(e)}")

# Start from previously saved results when the files exist, otherwise from empty dicts.
metrics_scores = load_json("metrics_scores_aug_down.json")
best_params = load_json("best_hyperparams_aug_down.json")

def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate classifier and return its validation F1 for class 1.

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The model is
    fitted on the training data, ``find_best_threshold`` picks a probability threshold on
    the validation set, and that threshold is stored on the trial as the user attribute
    ``"threshold"``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        model_type: One of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        The class-1 F1 score on the validation set at the chosen threshold.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # Seeded like the other candidates so the probability estimates are repeatable.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Layer layouts are sampled as string keys and mapped to the tuples sklearn expects.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        # Fail with a clear message instead of hitting an unbound `model` below.
        raise ValueError(f"Unknown model_type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    # Stored so the caller can reuse the threshold of the winning trial.
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

import threading

# Guards the shared metrics_scores / best_params dicts and the JSON files written from them.
_RESULTS_LOCK = threading.Lock()

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, persist and score one classifier on one (split, feature set) pair.

    Steps: downsample the training data, run a 15-trial Optuna study that maximizes the
    value returned by ``objective``, refit a fresh model with the best parameters, dump
    it with joblib, then record class-1 validation recall/precision/F1 and the chosen
    parameters in the shared result dicts and rewrite the JSON files.

    Any exception is logged with its traceback and swallowed, so one failing combination
    does not stop the remaining tasks.

    Args:
        ml_model_name: Classifier name understood by ``objective`` and ``get_model_instance``.
        split_name: Name of the data split; used in result keys and the model file name.
        model_name: Name of the feature set; used in result keys and the model file name.
        data: Dict providing ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_aug_down_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_downsampling(data["X_train"], data["y_train"])
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): trials run in parallel (n_jobs=18), so the seeded sampler may not
        # give repeatable studies - confirm against Optuna's reproducibility guidance.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # The study stores the MLP layout as a string key; turn it back into a tuple.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        # This function runs in several worker threads at once. Updating the shared dicts
        # and serializing them must happen atomically, otherwise a dump can iterate a dict
        # while another thread inserts into it, or two threads can write the same file.
        with _RESULTS_LOCK:
            metrics_scores[key_json] = {
                "recall": report["1"]["recall"],
                "precision": report["1"]["precision"],
                "f1_score": report["1"]["f1-score"]
            }

            if ml_model_name not in best_params:
                best_params[ml_model_name] = {}
            best_params[ml_model_name][f"{split_name}__{model_name}"] = {
                "params": best_hyperparams,
                "threshold": float(best_thresh)
            }

            save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#run all
# One task per (classifier, split, backbone); classifiers are processed in the listed order.
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
tasks = [(ml, split, model, split_data[(split, model)]) for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=18) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs its own failures; result() surfaces anything
        # that still escaped instead of dropping it silently.
        try:
            future.result()
        except Exception:
            logging.exception("[ERROR] worker task raised an unhandled exception")

In [None]:
#Downsampling 1:3 + data augmentation
log_file = "train_log_aug_down3.txt"
# basicConfig does nothing once the root logger already has handlers (e.g. when
# another experiment cell configured logging in this kernel); force=True replaces
# them so this experiment writes to its own log file.
logging.basicConfig(filename=log_file, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits and feature extractors evaluated in this experiment.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ml_model_names = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]

def get_model_instance(name):
    """Return an unfitted classifier configured with the experiment defaults.

    Args:
        name: one of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A fresh estimator instance.

    Raises:
        ValueError: if ``name`` is not a supported key (the function used to
            fall through and return None).
    """
    if name == "SVM":
        # random_state seeds the shuffling behind probability=True, making the
        # SVM reproducible like the other models.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    raise ValueError(f"Unknown model name: {name!r}")

def load_json(path):
    """Load a JSON file, returning an empty dict when ``path`` does not exist.

    The file is opened in a ``with`` block so the handle is closed right away.
    """
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        return json.load(f)

def convert_for_json(obj):
    """``json.dump`` fallback hook: turn tuples into lists, pass anything else through."""
    if isinstance(obj, tuple):
        return list(obj)
    return obj

def save_jsons():
    """Write ``metrics_scores`` and ``best_params`` to their JSON result files.

    Each payload is serialised in memory first and then moved into place with
    ``os.replace``. The previous version opened the target with "w" before
    encoding, so an encoding error - or another worker thread writing at the
    same time - could leave a truncated or interleaved file behind.
    """
    import threading  # local import: only needed for a per-thread temp-file name

    outputs = (
        ("metrics_scores_aug_down3.json", metrics_scores, {}),
        ("best_hyperparams_aug_down3.json", best_params, {"default": convert_for_json}),
    )
    for path, payload, extra in outputs:
        text = json.dumps(payload, indent=4, **extra)
        tmp_path = f"{path}.{threading.get_ident()}.tmp"
        with open(tmp_path, "w") as f:
            f.write(text)
        os.replace(tmp_path, path)

def find_best_threshold(y_true, probs):
    """Pick the decision threshold with the highest recall for class 1.

    Thresholds 0.10, 0.15, ..., 0.90 are scanned in increasing order and the
    first one that reaches the best recall wins; 0.5 is returned when no
    threshold gives a recall above 0.

    Recall is computed directly (TP / positives) instead of building a full
    classification report per threshold. This also avoids the KeyError the
    report lookup raised when class 1 was absent from labels and predictions,
    and lets ``probs`` be any array-like.

    Args:
        y_true: binary ground-truth labels (1 = positive class).
        probs: predicted probabilities of the positive class.

    Returns:
        The selected threshold.
    """
    # NOTE(review): recall can only stay equal or drop as the threshold rises,
    # so this returns 0.1 whenever any positive scores >= 0.1 - confirm that
    # recall (rather than e.g. F1) is the intended selection criterion.
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)
    positives = y_true == 1
    n_pos = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        hits = int(np.count_nonzero(positives & (probs >= thresh)))
        recall = hits / n_pos if n_pos else 0.0  # no positives -> recall 0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return best_thresh

def apply_downsampling(X, y, ratio=3):
    """Randomly undersample class 0 to at most ``ratio`` times the size of class 1.

    Args:
        X: feature DataFrame (copied, not modified).
        y: labels aligned with the rows of ``X`` (0 = majority, 1 = minority).
        ratio: maximum number of class-0 rows kept per class-1 row.

    Returns:
        (X_balanced, y_balanced): the shuffled feature frame without the helper
        "label" column, and the matching labels as a list.
    """
    labelled = X.copy()
    labelled["label"] = y

    majority = labelled[labelled["label"] == 0]
    minority = labelled[labelled["label"] == 1]

    # Never ask for more majority rows than exist.
    keep = min(len(majority), len(minority) * ratio)
    majority_kept = resample(majority, replace=False, n_samples=keep, random_state=123)

    shuffled = pd.concat([majority_kept, minority]).sample(frac=1, random_state=123)
    return shuffled.drop(columns=["label"]), shuffled["label"].tolist()

#load features
# For every (split, backbone) pair, read the pre-extracted train/val/test feature
# CSVs. Labels come from the filename: "proper" anywhere in it -> 0, otherwise 1.
# NOTE(review): a filename containing "improper" would also be labelled 0 - confirm
# no such names exist.
split_data = {}
for split in splits:
    for model_name in allowed_models:
        try:
            df_train = pd.read_csv(f"features_aug_{split}_{model_name}_train.csv")
            df_val = pd.read_csv(f"features_aug_{split}_{model_name}_val.csv")
            df_test = pd.read_csv(f"features_aug_{split}_{model_name}_test.csv")

            for frame in (df_train, df_val, df_test):
                frame["filename"] = frame["filename"].astype(str)

            y_train = [int("proper" not in fn.lower()) for fn in df_train["filename"]]
            y_val = [int("proper" not in fn.lower()) for fn in df_val["filename"]]
            y_test = [int("proper" not in fn.lower()) for fn in df_test["filename"]]

            X_train = df_train.drop(columns=["filename"])
            X_val = df_val.drop(columns=["filename"])
            X_test = df_test.drop(columns=["filename"])

            split_data[(split, model_name)] = dict(
                X_train=X_train, y_train=y_train,
                X_val=X_val, y_val=y_val,
                X_test=X_test, y_test=y_test,
            )
        except Exception as e:
            # A missing or malformed CSV only drops this pair; the rest still load.
            logging.warning(f"Skipping {split}-{model_name} due to error: {str(e)}")

# Previously saved results, or empty dicts on the first run.
metrics_scores = load_json("metrics_scores_aug_down3.json")
best_params = load_json("best_hyperparams_aug_down3.json")

def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate and return its validation F1 for class "1".

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The
    threshold chosen by ``find_best_threshold`` on the validation data is
    stored on the trial as the user attribute "threshold".

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        model_type: "SVM", "MLP", "LogisticRegression", "XGBoost" or "LightGBM".

    Returns:
        F1-score of class "1" on the validation data at the selected threshold.

    Raises:
        ValueError: if ``model_type`` is not supported (this used to surface as
            an UnboundLocalError at ``model.fit``).
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # random_state seeds the shuffling behind probability=True.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Optuna categoricals must be simple values, so layer layouts are
        # sampled as strings and mapped to tuples here.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        raise ValueError(f"Unknown model type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, save and score one (classifier, split, backbone) combination.

    An Optuna study is run on the 1:3 downsampled training data, the best
    configuration is refitted and dumped with joblib, and the validation
    recall / precision / F1 of class "1" are stored in ``metrics_scores`` and
    ``best_params`` before both JSON files are rewritten. Every exception is
    logged and swallowed, so a failing combination never stops the other
    pool tasks.

    Args:
        ml_model_name: classifier key passed to ``objective`` and ``get_model_instance``.
        split_name: data split name (part of the result key and model filename).
        model_name: feature-extractor name (part of the result key and model filename).
        data: mapping with "X_train", "y_train", "X_val" and "y_val".

    Returns:
        None.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_aug_down3_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_downsampling(data["X_train"], data["y_train"], ratio=3)
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): only 5 trials here while the sibling experiment cells
        # use 15 - confirm this is intentional. Trials also run in parallel
        # (n_jobs=18) inside an already threaded pool, which presumably makes
        # the seeded TPESampler non-reproducible.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=5, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # Optuna stores the categorical choice as a string; map it back to the
        # tuple that MLPClassifier expects.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        metrics_scores[key_json] = {
            "recall": report["1"]["recall"],
            "precision": report["1"]["precision"],
            "f1_score": report["1"]["f1-score"]
        }

        # setdefault is a single dict operation (atomic in CPython), so two
        # workers can no longer overwrite each other's per-classifier dict the
        # way a separate "check, then assign" could.
        best_params.setdefault(ml_model_name, {})[f"{split_name}__{model_name}"] = {
            "params": best_hyperparams,
            "threshold": float(best_thresh)
        }

        save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#run all
# One task per (classifier, split, backbone); classifiers are processed in the listed order.
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
tasks = [(ml, split, model, split_data[(split, model)]) for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=18) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs its own failures; result() surfaces anything
        # that still escaped instead of dropping it silently.
        try:
            future.result()
        except Exception:
            logging.exception("[ERROR] worker task raised an unhandled exception")

In [None]:
#Downsampling 1:5 + data augmentation
log_file = "train_log_aug_down5.txt"
# basicConfig does nothing once the root logger already has handlers (e.g. when
# another experiment cell configured logging in this kernel); force=True replaces
# them so this experiment writes to its own log file.
logging.basicConfig(filename=log_file, level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits and feature extractors evaluated in this experiment.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ml_model_names = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]

def get_model_instance(name):
    """Return an unfitted classifier configured with the experiment defaults.

    Args:
        name: one of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A fresh estimator instance.

    Raises:
        ValueError: if ``name`` is not a supported key (the function used to
            fall through and return None).
    """
    if name == "SVM":
        # random_state seeds the shuffling behind probability=True, making the
        # SVM reproducible like the other models.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    raise ValueError(f"Unknown model name: {name!r}")

def load_json(path):
    """Load a JSON file, returning an empty dict when ``path`` does not exist.

    The file is opened in a ``with`` block so the handle is closed right away.
    """
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        return json.load(f)

def convert_for_json(obj):
    """``json.dump`` fallback hook: turn tuples into lists, pass anything else through."""
    if isinstance(obj, tuple):
        return list(obj)
    return obj

def save_jsons():
    """Write ``metrics_scores`` and ``best_params`` to their JSON result files.

    Each payload is serialised in memory first and then moved into place with
    ``os.replace``. The previous version opened the target with "w" before
    encoding, so an encoding error - or another worker thread writing at the
    same time - could leave a truncated or interleaved file behind.
    """
    import threading  # local import: only needed for a per-thread temp-file name

    outputs = (
        ("metrics_scores_aug_down5.json", metrics_scores, {}),
        ("best_hyperparams_aug_down5.json", best_params, {"default": convert_for_json}),
    )
    for path, payload, extra in outputs:
        text = json.dumps(payload, indent=4, **extra)
        tmp_path = f"{path}.{threading.get_ident()}.tmp"
        with open(tmp_path, "w") as f:
            f.write(text)
        os.replace(tmp_path, path)

def find_best_threshold(y_true, probs):
    """Pick the decision threshold with the highest recall for class 1.

    Thresholds 0.10, 0.15, ..., 0.90 are scanned in increasing order and the
    first one that reaches the best recall wins; 0.5 is returned when no
    threshold gives a recall above 0.

    Recall is computed directly (TP / positives) instead of building a full
    classification report per threshold. This also avoids the KeyError the
    report lookup raised when class 1 was absent from labels and predictions,
    and lets ``probs`` be any array-like.

    Args:
        y_true: binary ground-truth labels (1 = positive class).
        probs: predicted probabilities of the positive class.

    Returns:
        The selected threshold.
    """
    # NOTE(review): recall can only stay equal or drop as the threshold rises,
    # so this returns 0.1 whenever any positive scores >= 0.1 - confirm that
    # recall (rather than e.g. F1) is the intended selection criterion.
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)
    positives = y_true == 1
    n_pos = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        hits = int(np.count_nonzero(positives & (probs >= thresh)))
        recall = hits / n_pos if n_pos else 0.0  # no positives -> recall 0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return best_thresh

def apply_downsampling(X, y, ratio=5):
    """Randomly undersample class 0 to at most ``ratio`` times the size of class 1.

    Args:
        X: feature DataFrame (copied, not modified).
        y: labels aligned with the rows of ``X`` (0 = majority, 1 = minority).
        ratio: maximum number of class-0 rows kept per class-1 row.

    Returns:
        (X_balanced, y_balanced): the shuffled feature frame without the helper
        "label" column, and the matching labels as a list.
    """
    labelled = X.copy()
    labelled["label"] = y

    majority = labelled[labelled["label"] == 0]
    minority = labelled[labelled["label"] == 1]

    # Never ask for more majority rows than exist.
    keep = min(len(majority), len(minority) * ratio)
    majority_kept = resample(majority, replace=False, n_samples=keep, random_state=123)

    shuffled = pd.concat([majority_kept, minority]).sample(frac=1, random_state=123)
    return shuffled.drop(columns=["label"]), shuffled["label"].tolist()

#load features
# For every (split, backbone) pair, read the pre-extracted train/val/test feature
# CSVs. Labels come from the filename: "proper" anywhere in it -> 0, otherwise 1.
# NOTE(review): a filename containing "improper" would also be labelled 0 - confirm
# no such names exist.
split_data = {}
for split in splits:
    for model_name in allowed_models:
        try:
            df_train = pd.read_csv(f"features_aug_{split}_{model_name}_train.csv")
            df_val = pd.read_csv(f"features_aug_{split}_{model_name}_val.csv")
            df_test = pd.read_csv(f"features_aug_{split}_{model_name}_test.csv")

            for frame in (df_train, df_val, df_test):
                frame["filename"] = frame["filename"].astype(str)

            y_train = [int("proper" not in fn.lower()) for fn in df_train["filename"]]
            y_val = [int("proper" not in fn.lower()) for fn in df_val["filename"]]
            y_test = [int("proper" not in fn.lower()) for fn in df_test["filename"]]

            X_train = df_train.drop(columns=["filename"])
            X_val = df_val.drop(columns=["filename"])
            X_test = df_test.drop(columns=["filename"])

            split_data[(split, model_name)] = dict(
                X_train=X_train, y_train=y_train,
                X_val=X_val, y_val=y_val,
                X_test=X_test, y_test=y_test,
            )
        except Exception as e:
            # A missing or malformed CSV only drops this pair; the rest still load.
            logging.warning(f"Skipping {split}-{model_name} due to error: {str(e)}")

# Previously saved results, or empty dicts on the first run.
metrics_scores = load_json("metrics_scores_aug_down5.json")
best_params = load_json("best_hyperparams_aug_down5.json")

def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate and return its validation F1 for class "1".

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The
    threshold chosen by ``find_best_threshold`` on the validation data is
    stored on the trial as the user attribute "threshold".

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        model_type: "SVM", "MLP", "LogisticRegression", "XGBoost" or "LightGBM".

    Returns:
        F1-score of class "1" on the validation data at the selected threshold.

    Raises:
        ValueError: if ``model_type`` is not supported (this used to surface as
            an UnboundLocalError at ``model.fit``).
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # random_state seeds the shuffling behind probability=True.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Optuna categoricals must be simple values, so layer layouts are
        # sampled as strings and mapped to tuples here.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        raise ValueError(f"Unknown model type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, save and score one (classifier, split, backbone) combination.

    An Optuna study is run on the 1:5 downsampled training data, the best
    configuration is refitted and dumped with joblib, and the validation
    recall / precision / F1 of class "1" are stored in ``metrics_scores`` and
    ``best_params`` before both JSON files are rewritten. Every exception is
    logged and swallowed, so a failing combination never stops the other
    pool tasks.

    Args:
        ml_model_name: classifier key passed to ``objective`` and ``get_model_instance``.
        split_name: data split name (part of the result key and model filename).
        model_name: feature-extractor name (part of the result key and model filename).
        data: mapping with "X_train", "y_train", "X_val" and "y_val".

    Returns:
        None.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_aug_down5_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_downsampling(data["X_train"], data["y_train"], ratio=5)
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): trials run in parallel (n_jobs=18) inside an already
        # threaded pool; a seeded TPESampler is presumably not reproducible
        # with parallel trials - confirm this is acceptable.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # Optuna stores the categorical choice as a string; map it back to the
        # tuple that MLPClassifier expects.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        metrics_scores[key_json] = {
            "recall": report["1"]["recall"],
            "precision": report["1"]["precision"],
            "f1_score": report["1"]["f1-score"]
        }

        # setdefault is a single dict operation (atomic in CPython), so two
        # workers can no longer overwrite each other's per-classifier dict the
        # way a separate "check, then assign" could.
        best_params.setdefault(ml_model_name, {})[f"{split_name}__{model_name}"] = {
            "params": best_hyperparams,
            "threshold": float(best_thresh)
        }

        save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#run all
# One task per (classifier, split, backbone); classifiers are processed in the listed order.
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
tasks = [(ml, split, model, split_data[(split, model)]) for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs its own failures; result() surfaces anything
        # that still escaped instead of dropping it silently.
        try:
            future.result()
        except Exception:
            logging.exception("[ERROR] worker task raised an unhandled exception")

In [None]:
#SMOTE 1:3
# basicConfig does nothing once the root logger already has handlers (e.g. when
# another experiment cell configured logging in this kernel); force=True replaces
# them so this experiment writes to its own log file.
logging.basicConfig(filename="train_log_smote3.txt", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits, feature extractors and classifiers evaluated in this experiment.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
split_info_dir = "splits_info"

def get_model_instance(name):
    """Return an unfitted classifier configured with the experiment defaults.

    Args:
        name: one of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A fresh estimator instance.

    Raises:
        ValueError: if ``name`` is not a supported key (the function used to
            fall through and return None).
    """
    if name == "SVM":
        # random_state seeds the shuffling behind probability=True, making the
        # SVM reproducible like the other models.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    raise ValueError(f"Unknown model name: {name!r}")

def convert_for_json(obj):
    """``json.dump`` fallback hook: turn tuples into lists, pass anything else through."""
    if isinstance(obj, tuple):
        return list(obj)
    return obj

def save_jsons():
    """Write ``metrics_scores`` and ``best_params`` to their JSON result files.

    Each payload is serialised in memory first and then moved into place with
    ``os.replace``. The previous version opened the target with "w" before
    encoding, so an encoding error - or another worker thread writing at the
    same time - could leave a truncated or interleaved file behind.
    """
    import threading  # local import: only needed for a per-thread temp-file name

    outputs = (
        ("metrics_scores_smote3.json", metrics_scores, {}),
        ("best_hyperparams_smote3.json", best_params, {"default": convert_for_json}),
    )
    for path, payload, extra in outputs:
        text = json.dumps(payload, indent=4, **extra)
        tmp_path = f"{path}.{threading.get_ident()}.tmp"
        with open(tmp_path, "w") as f:
            f.write(text)
        os.replace(tmp_path, path)

def find_best_threshold(y_true, probs):
    """Pick the decision threshold with the highest recall for class 1.

    Thresholds 0.10, 0.15, ..., 0.90 are scanned in increasing order and the
    first one that reaches the best recall wins; 0.5 is returned when no
    threshold gives a recall above 0.

    Recall is computed directly (TP / positives) instead of building a full
    classification report per threshold. This also avoids the KeyError the
    report lookup raised when class 1 was absent from labels and predictions,
    and lets ``probs`` be any array-like.

    Args:
        y_true: binary ground-truth labels (1 = positive class).
        probs: predicted probabilities of the positive class.

    Returns:
        The selected threshold.
    """
    # NOTE(review): recall can only stay equal or drop as the threshold rises,
    # so this returns 0.1 whenever any positive scores >= 0.1 - confirm that
    # recall (rather than e.g. F1) is the intended selection criterion.
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)
    positives = y_true == 1
    n_pos = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        hits = int(np.count_nonzero(positives & (probs >= thresh)))
        recall = hits / n_pos if n_pos else 0.0  # no positives -> recall 0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return best_thresh

def apply_smote_oversampling(X, y, sampling_strategy=0.33):
    """Oversample the minority class with SMOTE up to a minority/majority ratio.

    Args:
        X: feature matrix.
        y: binary labels aligned with ``X``.
        sampling_strategy: desired minority/majority ratio after resampling
            (the default 0.33 corresponds to roughly 1:3).

    Returns:
        (X_resampled, y_resampled). The inputs are returned unchanged when the
        classes already meet the requested ratio.
    """
    _, counts = np.unique(y, return_counts=True)
    if len(counts) == 2:
        # SMOTE raises a ValueError when no synthetic samples are needed to
        # reach the ratio, so such data is passed through untouched.
        n_to_generate = int(counts.max() * sampling_strategy - counts.min())
        if n_to_generate <= 0:
            return X, y

    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=123)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

#load features
# One feature table per backbone; missing CSVs are logged and skipped.
all_features = {}
for model_name in allowed_models:
    path = f"features_{model_name}.csv"
    if os.path.exists(path):
        df = pd.read_csv(path)
        df["filename"] = df["filename"].astype(str)
        all_features[model_name] = df
    else:
        logging.warning(f"Missing features: {path}")

#split_data
# Each split directory lists its train/val/test filenames; the feature rows are
# selected by filename. Labels come from the filename: "proper" anywhere in it
# -> 0, otherwise 1.
# NOTE(review): a filename containing "improper" would also be labelled 0 - confirm
# no such names exist.
split_data = {}
for split in splits:
    split_path = os.path.join(split_info_dir, split)
    try:
        with open(os.path.join(split_path, "train.txt")) as f:
            train_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "val.txt")) as f:
            val_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "test.txt")) as f:
            test_files = {line.strip() for line in f}
    except Exception as e:
        logging.warning(f"Skipping split {split}: {str(e)}")
        continue

    for model_name, df in all_features.items():
        df_train = df[df["filename"].isin(train_files)]
        df_val = df[df["filename"].isin(val_files)]
        df_test = df[df["filename"].isin(test_files)]

        X_train = df_train.drop(columns=["filename"])
        y_train = [0 if "proper" in fn.lower() else 1 for fn in df_train["filename"]]

        X_val = df_val.drop(columns=["filename"])
        y_val = [0 if "proper" in fn.lower() else 1 for fn in df_val["filename"]]

        X_test = df_test.drop(columns=["filename"])
        y_test = [0 if "proper" in fn.lower() else 1 for fn in df_test["filename"]]

        split_data[(split, model_name)] = {
            "X_train": X_train, "y_train": y_train,
            "X_val": X_val, "y_val": y_val,
            "X_test": X_test, "y_test": y_test
        }

#load jsons
# Previously saved results, or empty dicts on the first run. The files are read
# inside ``with`` blocks so the handles are closed right away.
metrics_scores = {}
if os.path.exists("metrics_scores_smote3.json"):
    with open("metrics_scores_smote3.json") as f:
        metrics_scores = json.load(f)
best_params = {}
if os.path.exists("best_hyperparams_smote3.json"):
    with open("best_hyperparams_smote3.json") as f:
        best_params = json.load(f)

#objective function 
def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate and return its validation F1 for class "1".

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The
    threshold chosen by ``find_best_threshold`` on the validation data is
    stored on the trial as the user attribute "threshold".

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        model_type: "SVM", "MLP", "LogisticRegression", "XGBoost" or "LightGBM".

    Returns:
        F1-score of class "1" on the validation data at the selected threshold.

    Raises:
        ValueError: if ``model_type`` is not supported (this used to surface as
            an UnboundLocalError at ``model.fit``).
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # random_state seeds the shuffling behind probability=True.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Optuna categoricals must be simple values, so layer layouts are
        # sampled as strings and mapped to tuples here.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        raise ValueError(f"Unknown model type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

#training wrapper
def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, refit, save and score one (classifier, split, backbone) combination.

    An Optuna study is run on the SMOTE-oversampled training data, the best
    configuration is refitted and dumped with joblib, and the validation
    recall / precision / F1 of class "1" are stored in ``metrics_scores`` and
    ``best_params`` before both JSON files are rewritten. Every exception is
    logged and swallowed, so a failing combination never stops the other
    pool tasks.

    Args:
        ml_model_name: classifier key passed to ``objective`` and ``get_model_instance``.
        split_name: data split name (part of the result key and model filename).
        model_name: feature-extractor name (part of the result key and model filename).
        data: mapping with "X_train", "y_train", "X_val" and "y_val".

    Returns:
        None.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_smote3_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_smote_oversampling(data["X_train"], data["y_train"])
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): trials run in parallel (n_jobs=18) inside an already
        # threaded pool; a seeded TPESampler is presumably not reproducible
        # with parallel trials - confirm this is acceptable.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # Optuna stores the categorical choice as a string; map it back to the
        # tuple that MLPClassifier expects.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        metrics_scores[key_json] = {
            "recall": report["1"]["recall"],
            "precision": report["1"]["precision"],
            "f1_score": report["1"]["f1-score"]
        }

        # setdefault is a single dict operation (atomic in CPython), so two
        # workers can no longer overwrite each other's per-classifier dict the
        # way a separate "check, then assign" could.
        best_params.setdefault(ml_model_name, {})[f"{split_name}__{model_name}"] = {
            "params": best_hyperparams,
            "threshold": float(best_thresh)
        }

        save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#training
# One task per (classifier, split, backbone); classifiers are processed in the listed order.
tasks = [(ml, split, model, split_data[(split, model)])
         for ml in ordered_ml_models for (split, model) in split_data]

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_combination, *task) for task in tasks]
    for future in as_completed(futures):
        # process_combination logs its own failures; result() surfaces anything
        # that still escaped instead of dropping it silently.
        try:
            future.result()
        except Exception:
            logging.exception("[ERROR] worker task raised an unhandled exception")

In [None]:
#SMOTE 1:5
# basicConfig does nothing once the root logger already has handlers (e.g. when
# another experiment cell configured logging in this kernel); force=True replaces
# them so this experiment writes to its own log file.
logging.basicConfig(filename="train_log_smote5.txt", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)

# Data splits, feature extractors and classifiers evaluated in this experiment.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]
allowed_models = [
    "convnext_base", "efficientnet_b3", "mobilenet_v3_large",
    "vit_b_16", "densenet121"
]
ordered_ml_models = ["MLP", "LogisticRegression", "XGBoost", "LightGBM", "SVM"]
split_info_dir = "splits_info"

def get_model_instance(name):
    """Return an unfitted classifier configured with the experiment defaults.

    Args:
        name: one of "SVM", "MLP", "LogisticRegression", "XGBoost", "LightGBM".

    Returns:
        A fresh estimator instance.

    Raises:
        ValueError: if ``name`` is not a supported key (the function used to
            fall through and return None).
    """
    if name == "SVM":
        # random_state seeds the shuffling behind probability=True, making the
        # SVM reproducible like the other models.
        return SVC(probability=True, random_state=123)
    elif name == "MLP":
        return MLPClassifier(max_iter=500, random_state=123)
    elif name == "LogisticRegression":
        return LogisticRegression(random_state=123)
    elif name == "XGBoost":
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif name == "LightGBM":
        return LGBMClassifier(random_state=123)
    raise ValueError(f"Unknown model name: {name!r}")

def convert_for_json(obj):
    """``json.dump`` fallback hook: turn tuples into lists, pass anything else through."""
    if isinstance(obj, tuple):
        return list(obj)
    return obj

def save_jsons():
    """Write ``metrics_scores`` and ``best_params`` to their JSON result files.

    Each payload is serialised in memory first and then moved into place with
    ``os.replace``. The previous version opened the target with "w" before
    encoding, so an encoding error - or another worker thread writing at the
    same time - could leave a truncated or interleaved file behind.
    """
    import threading  # local import: only needed for a per-thread temp-file name

    outputs = (
        ("metrics_scores_smote5.json", metrics_scores, {}),
        ("best_hyperparams_smote5.json", best_params, {"default": convert_for_json}),
    )
    for path, payload, extra in outputs:
        text = json.dumps(payload, indent=4, **extra)
        tmp_path = f"{path}.{threading.get_ident()}.tmp"
        with open(tmp_path, "w") as f:
            f.write(text)
        os.replace(tmp_path, path)

def find_best_threshold(y_true, probs):
    """Pick the decision threshold with the highest recall for class 1.

    Thresholds 0.10, 0.15, ..., 0.90 are scanned in increasing order and the
    first one that reaches the best recall wins; 0.5 is returned when no
    threshold gives a recall above 0.

    Recall is computed directly (TP / positives) instead of building a full
    classification report per threshold. This also avoids the KeyError the
    report lookup raised when class 1 was absent from labels and predictions,
    and lets ``probs`` be any array-like.

    Args:
        y_true: binary ground-truth labels (1 = positive class).
        probs: predicted probabilities of the positive class.

    Returns:
        The selected threshold.
    """
    # NOTE(review): recall can only stay equal or drop as the threshold rises,
    # so this returns 0.1 whenever any positive scores >= 0.1 - confirm that
    # recall (rather than e.g. F1) is the intended selection criterion.
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)
    positives = y_true == 1
    n_pos = int(positives.sum())

    best_recall = 0
    best_thresh = 0.5
    for thresh in np.arange(0.1, 0.91, 0.05):
        hits = int(np.count_nonzero(positives & (probs >= thresh)))
        recall = hits / n_pos if n_pos else 0.0  # no positives -> recall 0
        if recall > best_recall:
            best_recall = recall
            best_thresh = thresh
    return best_thresh

def apply_smote_oversampling(X, y, sampling_strategy=0.2):
    """Oversample the minority class with SMOTE up to a minority/majority ratio.

    Args:
        X: feature matrix.
        y: binary labels aligned with ``X``.
        sampling_strategy: desired minority/majority ratio after resampling
            (the default 0.2 corresponds to 1:5).

    Returns:
        (X_resampled, y_resampled). The inputs are returned unchanged when the
        classes already meet the requested ratio.
    """
    _, counts = np.unique(y, return_counts=True)
    if len(counts) == 2:
        # SMOTE raises a ValueError when no synthetic samples are needed to
        # reach the ratio, so such data is passed through untouched.
        n_to_generate = int(counts.max() * sampling_strategy - counts.min())
        if n_to_generate <= 0:
            return X, y

    smote = SMOTE(sampling_strategy=sampling_strategy, random_state=123)  # 1:5 ratio = 0.2
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

#load features
# One feature table per backbone; missing CSVs are logged and skipped.
all_features = {}
for model_name in allowed_models:
    path = f"features_{model_name}.csv"
    if os.path.exists(path):
        df = pd.read_csv(path)
        df["filename"] = df["filename"].astype(str)
        all_features[model_name] = df
    else:
        logging.warning(f"Missing features: {path}")

#split_data
# Each split directory lists its train/val/test filenames; the feature rows are
# selected by filename. Labels come from the filename: "proper" anywhere in it
# -> 0, otherwise 1.
# NOTE(review): a filename containing "improper" would also be labelled 0 - confirm
# no such names exist.
split_data = {}
for split in splits:
    split_path = os.path.join(split_info_dir, split)
    try:
        with open(os.path.join(split_path, "train.txt")) as f:
            train_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "val.txt")) as f:
            val_files = {line.strip() for line in f}
        with open(os.path.join(split_path, "test.txt")) as f:
            test_files = {line.strip() for line in f}
    except Exception as e:
        logging.warning(f"Skipping split {split}: {str(e)}")
        continue

    for model_name, df in all_features.items():
        df_train = df[df["filename"].isin(train_files)]
        df_val = df[df["filename"].isin(val_files)]
        df_test = df[df["filename"].isin(test_files)]

        X_train = df_train.drop(columns=["filename"])
        y_train = [0 if "proper" in fn.lower() else 1 for fn in df_train["filename"]]

        X_val = df_val.drop(columns=["filename"])
        y_val = [0 if "proper" in fn.lower() else 1 for fn in df_val["filename"]]

        X_test = df_test.drop(columns=["filename"])
        y_test = [0 if "proper" in fn.lower() else 1 for fn in df_test["filename"]]

        split_data[(split, model_name)] = {
            "X_train": X_train, "y_train": y_train,
            "X_val": X_val, "y_val": y_val,
            "X_test": X_test, "y_test": y_test
        }

#load jsons
# Previously saved results, or empty dicts on the first run. The files are read
# inside ``with`` blocks so the handles are closed right away.
metrics_scores = {}
if os.path.exists("metrics_scores_smote5.json"):
    with open("metrics_scores_smote5.json") as f:
        metrics_scores = json.load(f)
best_params = {}
if os.path.exists("best_hyperparams_smote5.json"):
    with open("best_hyperparams_smote5.json") as f:
        best_params = json.load(f)

#objective function 
def objective(trial, X_train, y_train, X_val, y_val, model_type):
    """Optuna objective: fit one candidate and return its validation F1 for class "1".

    Hyperparameters are sampled from ``trial`` according to ``model_type``. The
    threshold chosen by ``find_best_threshold`` on the validation data is
    stored on the trial as the user attribute "threshold".

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        model_type: "SVM", "MLP", "LogisticRegression", "XGBoost" or "LightGBM".

    Returns:
        F1-score of class "1" on the validation data at the selected threshold.

    Raises:
        ValueError: if ``model_type`` is not supported (this used to surface as
            an UnboundLocalError at ``model.fit``).
    """
    if model_type == "SVM":
        params = {
            "C": trial.suggest_float("C", 0.1, 10, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf"]),
        }
        # random_state seeds the shuffling behind probability=True.
        model = SVC(**params, probability=True, random_state=123)
    elif model_type == "MLP":
        # Optuna categoricals must be simple values, so layer layouts are
        # sampled as strings and mapped to tuples here.
        choice = trial.suggest_categorical("hidden_layer_sizes", ["100", "50_50", "200"])
        mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
        params = {
            "hidden_layer_sizes": mapping[choice],
            "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.1, log=True)
        }
        model = MLPClassifier(**params, max_iter=500, random_state=123)
    elif model_type == "LogisticRegression":
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        }
        model = LogisticRegression(**params, random_state=123)
    elif model_type == "XGBoost":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=123)
    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 15, 60),
        }
        model = LGBMClassifier(**params, random_state=123)
    else:
        raise ValueError(f"Unknown model type: {model_type!r}")

    model.fit(X_train, y_train)
    probs = model.predict_proba(X_val)[:, 1]
    best_thresh = find_best_threshold(y_val, probs)
    preds = (probs >= best_thresh).astype(int)
    report = classification_report(y_val, preds, output_dict=True, zero_division=0)
    trial.set_user_attr("threshold", best_thresh)
    return report["1"]["f1-score"]

#training wrapper
# Imported here (rather than in the first cell) so this cell stays self-contained.
import threading

# process_combination is executed from several worker threads at once; this lock
# serializes the updates to the shared result dicts and the JSON dump that follows.
_results_lock = threading.Lock()

def process_combination(ml_model_name, split_name, model_name, data):
    """Tune, fit, persist and score one (classifier, split, feature-extractor) combination.

    Steps: oversample the training set, run an Optuna study to pick
    hyperparameters and a decision threshold, refit a fresh model with the best
    hyperparameters, dump it to ``model_smote5_<clf>_<split>_<model>.pkl``,
    score it on the validation set, and record the results in the module-level
    ``metrics_scores`` and ``best_params`` dicts before calling ``save_jsons()``.

    Args:
        ml_model_name: Classifier name understood by ``objective`` and
            ``get_model_instance``.
        split_name: Name of the data split.
        model_name: Name of the feature-extractor model the features came from.
        data: Mapping with the keys ``"X_train"``, ``"y_train"``, ``"X_val"``
            and ``"y_val"``.

    Returns:
        None. Any exception raised while processing is logged (message and
        traceback) and swallowed, so one failing combination does not stop
        the others.
    """
    key_json = f"{ml_model_name}__{split_name}__{model_name}"
    model_path = f"model_smote5_{ml_model_name}_{split_name}_{model_name}.pkl"
    try:
        logging.info(f"[TRAINING] {key_json}")
        X_train, y_train = apply_smote_oversampling(data["X_train"], data["y_train"])
        X_val, y_val = data["X_val"], data["y_val"]

        # NOTE(review): n_jobs (18) exceeds n_trials (15), and running trials in
        # parallel presumably makes the seeded sampler order-dependent — confirm
        # this is intended if exact reproducibility matters.
        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=123))
        study.optimize(lambda trial: objective(trial, X_train, y_train, X_val, y_val, ml_model_name), n_trials=15, n_jobs=18)
        best_hyperparams = study.best_params
        best_thresh = study.best_trial.user_attrs["threshold"]

        # The MLP layer layout is searched as a string label; turn it back into
        # the tuple that the estimator expects.
        if ml_model_name == "MLP" and isinstance(best_hyperparams.get("hidden_layer_sizes"), str):
            mapping = {"100": (100,), "50_50": (50, 50), "200": (200,)}
            best_hyperparams["hidden_layer_sizes"] = mapping[best_hyperparams["hidden_layer_sizes"]]

        model = get_model_instance(ml_model_name)
        model.set_params(**best_hyperparams)
        model.fit(X_train, y_train)
        joblib.dump(model, model_path)

        # Score the refitted model on the validation set with the tuned threshold.
        probs = model.predict_proba(X_val)[:, 1]
        preds = (probs >= best_thresh).astype(int)
        report = classification_report(y_val, preds, output_dict=True, zero_division=0)

        with _results_lock:
            metrics_scores[key_json] = {
                "recall": report["1"]["recall"],
                "precision": report["1"]["precision"],
                "f1_score": report["1"]["f1-score"]
            }

            # setdefault avoids the check-then-assign race in which two workers
            # for the same classifier both create the inner dict and one of
            # them discards the other's entry.
            best_params.setdefault(ml_model_name, {})[f"{split_name}__{model_name}"] = {
                "params": best_hyperparams,
                "threshold": float(best_thresh)
            }

            # Dump while still holding the lock so the shared dicts cannot
            # change underneath the writer.
            save_jsons()
        logging.info(f"[DONE] {key_json} with F1: {report['1']['f1-score']:.3f}")

    except Exception as e:
        logging.error(f"[ERROR] {key_json} failed: {str(e)}")
        logging.error(traceback.format_exc())

#training
# One task per (classifier, split, feature-extractor) combination; every task
# carries the feature/label bundle stored in split_data under (split, model).
tasks = [
    (ml, split, model, split_data[(split, model)])
    for ml in ordered_ml_models
    for (split, model) in split_data
]

# Run the combinations on a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [
        executor.submit(process_combination, ml_name, split_name, cnn_name, bundle)
        for (ml_name, split_name, cnn_name, bundle) in tasks
    ]
    # Drain the completion iterator; process_combination logs its own outcome,
    # so nothing is read from the futures here.
    for future in as_completed(futures):
        pass

In [None]:
#test
# The notebook's first cell does `import glob`, which binds the *module*, so
# calling `glob(...)` directly raises TypeError. Import the module under a
# private alias so this cell works regardless of what `glob` is bound to.
import glob as _glob_module

# Saved threshold/hyperparameter JSONs and fitted models found in the working directory.
threshold_files = sorted(_glob_module.glob("best_hyperparams_*.json"))
model_files = sorted(_glob_module.glob("model_*.pkl"))

# One dict of test metrics is appended here per evaluated model.
metrics_all = []

# Only models trained on one of these splits are evaluated.
splits = [
    "customer1", "all_data", "broken", "capsules", "customer2",
    "double", "minor_major", "oval_round_oblong", "tablets"
]

#load features and labels for all splits and models
def load_test_features(split, model_name):
    """Read the test feature table for one (split, model_name) pair.

    The table is expected at ``features_<split>_<model_name>_test.csv`` in the
    working directory and must contain a ``filename`` column.

    Returns:
        ``(X, y)`` where ``X`` is the table without the ``filename`` column and
        ``y`` is a list holding 0 for every filename that contains "proper"
        (case-insensitive) and 1 otherwise; ``(None, None)`` when the file
        does not exist.
    """
    csv_path = f"features_{split}_{model_name}_test.csv"
    if not os.path.exists(csv_path):
        return None, None

    table = pd.read_csv(csv_path)

    # NOTE(review): this is a substring test, so a name containing "improper"
    # would also be labelled 0 — confirm against the file naming scheme.
    labels = []
    for name in table["filename"]:
        labels.append(0 if "proper" in name.lower() else 1)

    features = table.drop(columns=["filename"])
    return features, labels

def _parse_model_filename(filename, known_splits):
    """Split a saved-model filename into (strategy, clf, split, model_name).

    Filenames look like ``model_<strategy>_<clf>_<split>_<model_name>.pkl``.
    Split names (and possibly model names) contain underscores themselves, so
    the name cannot be cut by position; the split is located by searching for
    one of ``known_splits`` delimited by underscores, longest name first.

    Returns:
        A 4-tuple of strings, or None when no known split is found.
    """
    stem, _ext = os.path.splitext(filename)
    tokens = stem.split("_")
    if tokens[0] == "model":
        tokens = tokens[1:]
    # Need at least strategy, classifier, split and model name.
    if len(tokens) < 4:
        return None

    strategy = tokens[0]
    remainder = "_".join(tokens[1:])
    for candidate in sorted(known_splits, key=len, reverse=True):
        marker = f"_{candidate}_"
        pos = remainder.find(marker)
        # Require a non-empty classifier before and model name after the split.
        if pos > 0 and pos + len(marker) < len(remainder):
            return strategy, remainder[:pos], candidate, remainder[pos + len(marker):]
    return None


# Parsed threshold JSONs, keyed by path, so each file is read only once.
threshold_cache = {}

for model_file in model_files:
    filename = os.path.basename(model_file)

    parsed = _parse_model_filename(filename, splits)
    if parsed is None:
        continue
    strategy, clf, split, model_name = parsed

    #load model
    # joblib.load unpickles the file; only load model files produced by this notebook.
    try:
        model = joblib.load(model_file)
    except Exception as e:
        print(f"Failed to load model: {filename} — {e}")
        continue

    # find and load a threshold (falls back to 0.5 when none is stored)
    threshold = 0.5
    combo_key = f"{split}__{model_name}"
    for jfile in threshold_files:
        if strategy not in jfile:
            continue
        if jfile not in threshold_cache:
            with open(jfile) as f:
                threshold_cache[jfile] = json.load(f)
        # Kept under its own name so the global `best_params` dict filled by
        # the training cell is not overwritten.
        stored_params = threshold_cache[jfile]
        if clf in stored_params and combo_key in stored_params[clf]:
            threshold = stored_params[clf][combo_key].get("threshold", 0.5)
            break

    #load test data
    X_test, y_test = load_test_features(split, model_name)
    if X_test is None:
        continue

    try:
        probs = model.predict_proba(X_test)[:, 1]
        preds = (probs >= threshold).astype(int)
        report = classification_report(y_test, preds, output_dict=True, zero_division=0)
        metrics_all.append({
            "strategy": strategy,
            "split": split,
            "cnn": model_name,
            "classifier": clf,
            "precision": report["1"]["precision"],
            "recall": report["1"]["recall"],
            "f1_score": report["1"]["f1-score"],
            # Accuracy is a top-level entry of the report dict, not a per-class one.
            "accuracy": report["accuracy"]
        })
    except Exception as e:
        print(f"Failed prediction for {filename}: {e}")

df_metrics = pd.DataFrame(metrics_all)
df_metrics.to_csv("test_results_all_models.csv", index=False)