In [3]:
# Imports

import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.base import clone
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from scipy.stats import ttest_rel
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from scipy.optimize import differential_evolution
from IPython.display import display

RANDOM_STATE = 42
DATA_DIR = os.path.join("..", "data")



# Funktion f√∂r att utv√§rdera en modell

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba_raw = model.decision_function(X_test)
        y_proba = (y_proba_raw - y_proba_raw.min()) / (y_proba_raw.max() - y_proba_raw.min() + 1e-9)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc,
        "y_pred": y_pred,
        "y_proba": y_proba,
    }
def print_confusion_matrix(y_true, y_pred, title="Confusion matrix"):
    """
    Skriver ut confusion matrix i tabellform.
    """
    cm = confusion_matrix(y_true, y_pred)
    cm_df = pd.DataFrame(
        cm,
        index=["True 0 (clean)", "True 1 (defect)"],
        columns=["Pred 0", "Pred 1"],
    )
    print(title)
    display(cm_df)


POSSIBLE_TARGETS_GENERAL = [
    "defects", "Defects", "defect", "bug", "bugs", "problems", "problem",
    "class", "target"
]

# dataset-specifika kandidater (vi vet lite extra om vissa)
POSSIBLE_TARGETS_BY_DATASET = {
    "JM1": ["defects", "Defects"],
    "KC1": ["defects", "Defects"],
    "KC2": ["problems", "bug", "bugs", "defects", "Defects"],
    "PC1": ["defects", "Defects"],
    "CM1": ["defects", "Defects"],
}

DATASETS = {
    "JM1": {"filename": "jm1.csv", "target": "defects"},
    "KC1": {"filename": "kc1.csv", "target": "defects"},
    "KC2": {"filename": "kc2.csv", "target": "defects"},
    "PC1": {"filename": "pc1.csv", "target": "defects"},
    "CM1": {"filename": "cm1.csv", "target": "defects"},
}

def load_and_prepare_dataset(dataset_name):
    """
    L√§ser in valt dataset, f√∂rs√∂ker hitta r√§tt target-kolumn,
    delar i train/test och skalar features.
    """
    info = DATASETS[dataset_name]
    path = os.path.join(DATA_DIR, info["filename"])
    df = pd.read_csv(path)

    # 1) v√§lj kandidatlista f√∂r target-namn
    candidate_targets = POSSIBLE_TARGETS_BY_DATASET.get(
        dataset_name,
        POSSIBLE_TARGETS_GENERAL
    )

    # 2) hitta f√∂rsta kolumn som matchar en kandidat
    target_col = None
    for cand in candidate_targets:
        if cand in df.columns:
            target_col = cand
            break

    if target_col is None:
        raise ValueError(
            f"Kunde inte hitta target-kolumn i {dataset_name}.\n"
            f"F√∂rs√∂kte med: {candidate_targets}\n"
            f"Filen har kolumner: {list(df.columns)}"
        )

    print(f"Anv√§nder target-kolumn '{target_col}' f√∂r dataset {dataset_name}.")

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # s√§kerst√§ll 0/1
    if y.dtype == "bool":
        y = y.astype(int)
    elif y.dtype == "object":
        y = y.astype(str).str.lower().map({
            "yes": 1,
            "true": 1,
            "defective": 1,
            "bug": 1,
            "bugs": 1,
            "problem": 1,
            "problems": 1,
            "1": 1,
            "0": 0,
        }).fillna(0).astype(int)

    print(f"{dataset_name}: shape={df.shape}")
    print("Klassf√∂rdelning (hela datan):")
    print(y.value_counts(), "\n")
# H√ÑR SKER TRAIN- TEST SPLIT REDAN I B√ñRJAN F√ñR ATT HA SAMMA KLASSF√ñRDELNING
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2,
        stratify=y,
        random_state=RANDOM_STATE
    )
# Z-SCORE NORMALISWEERING. Den r√§knar ut medelv√§rdet och standard avvikelsen p√• tr√§ningsdata 
# g√∂r att alla features hamnar p√• samma skala. P√• s√• s√§tt slipper modellen bli p√•verkad av olika skalor.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Train klassf√∂rdelning:")
    print(y_train.value_counts())
    print("\nTest klassf√∂rdelning:")
    print(y_test.value_counts(), "\n")

    return X_train_scaled, X_test_scaled, y_train, y_test

# === Del 3.5: Feature selection med RFE ===

def apply_rfe(base_model, X_train, y_train, X_test, n_features_to_select=12):
    rfe_estimator = RandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    selector = RFE(
        estimator=rfe_estimator,
        n_features_to_select=n_features_to_select,
        step=1
    )

    # Anpassa X_train/X_test (RFE)
    X_train_rfe = selector.fit_transform(X_train, y_train)
    X_test_rfe = selector.transform(X_test)

    print(f"RFE: beh√•ller {n_features_to_select} features av totalt {X_train.shape[1]}")

    # üî¥ M√ÖSTE vara indenterad inuti funktionen
    return X_train_rfe, X_test_rfe, selector


# === Del 4: Modeller (alla basmodeller) ===

def get_base_models():
    """
    Skapar alla modeller vi vill testa.
    """
    log_reg = LogisticRegression(max_iter=1000, n_jobs=-1)

    rf = RandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    ann = MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        max_iter=200,
        random_state=RANDOM_STATE,
    )

    svc = SVC(
        kernel="rbf",
        probability=True,   # beh√∂vs f√∂r AUC
        random_state=RANDOM_STATE,
    )

    voting = VotingClassifier(
        estimators=[
            ("logreg", log_reg),
            ("rf", rf),
            ("xgb", xgb),
        ],
        voting="soft"  # anv√§nder sannolikheter
    )

    models = {
        "LogisticRegression": log_reg,
        "RandomForest": rf,
        "XGBoost": xgb,
        "ANN": ann,
        "SVC": svc,
        "Voting": voting,
    }
    return models


# === Del 5: SMOTE-varianter ===
#kollar hur m√•nga minoritet det finns och till√§mpar ratio
def apply_basic_smote(X_train, y_train):
    """
    Standard-SMOTE med default-parametrar.
    """
    smote = SMOTE(random_state=RANDOM_STATE)
    X_res, y_res = smote.fit_resample(X_train, y_train)
    print("Efter basic SMOTE:")
    print(pd.Series(y_res).value_counts(), "\n")
    return X_res, y_res


def smote_grid_search(model, X_train, y_train):
    """
    Enkel grid search p√• SMOTE-parametrar (inspirerad av SMOTUNED-id√©n).
    """
    pipe = Pipeline([
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", model),
    ])

    # OBS: dubbel underscore f√∂r pipeline-parametrar!
    param_grid = {
        "smote__k_neighbors": [3, 5, 7],
        "smote__sampling_strategy": [0.5, 0.75, 1.0],
    }

    grid = GridSearchCV(
        pipe,
        param_grid,
        scoring="f1",
        cv=3,
        n_jobs=-1,
        verbose=0,
    )

    grid.fit(X_train, y_train)
    print("GRID-SMOTE ‚Äì b√§sta parametrar:", grid.best_params_)
    return grid.best_estimator_


def smotuned_de(model, X_train, y_train):
    """
    F√∂renklad SMOTUNED-id√©:
    differential evolution optimerar SMOTE-parametrar (k_neighbors, sampling_strategy)
    f√∂r att maximera F1 med 3-fold CV.
    """

    def objective(params):
        # params = [k_neighbors, sampling_strategy]
        k = int(round(params[0]))
        k = max(2, min(k, 15))   # h√•ll k inom [2, 15]

        sampling = float(params[1])
        sampling = max(0.2, min(sampling, 1.0))  # sampling_strategy inom [0.2, 1.0]

        smote = SMOTE(
            k_neighbors=k,
            sampling_strategy=sampling,
            random_state=RANDOM_STATE,
        )

        X_res, y_res = smote.fit_resample(X_train, y_train)

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        scores = []

        for train_idx, val_idx in cv.split(X_res, y_res):
            X_tr, X_val = X_res[train_idx], X_res[val_idx]
            y_tr, y_val = y_res[train_idx], y_res[val_idx]

            m = clone(model)
            m.fit(X_tr, y_tr)
            y_pred = m.predict(X_val)
            scores.append(f1_score(y_val, y_pred, zero_division=0))

        # differential_evolution minimerar, s√• vi returnerar -F1
        return -np.mean(scores)

    bounds = [
        (2, 15),    # k_neighbors
        (0.2, 1.0), # sampling_strategy
    ]

    result = differential_evolution(
        objective,
        bounds,
        maxiter=15,
        popsize=10,
        tol=0.01,
        polish=True,
        disp=False,
    )

    best_k = int(round(result.x[0]))
    best_sampling = float(result.x[1])
    best_k = max(2, min(best_k, 15))
    best_sampling = max(0.2, min(best_sampling, 1.0))

    print("SMOTUNED-DE ‚Äì b√§sta parametrar:")
    print("k_neighbors:", best_k)
    print("sampling_strategy:", best_sampling)

    best_smote = SMOTE(
        k_neighbors=best_k,
        sampling_strategy=best_sampling,
        random_state=RANDOM_STATE,
    )
    X_res_best, y_res_best = best_smote.fit_resample(X_train, y_train)

    final_model = clone(model)
    final_model.fit(X_res_best, y_res_best)

    return final_model

def train_with_smote_mode(base_model, X_train, y_train, smote_mode): # Hj√§lpfunktion som hanterar smote_mode
    model = clone(base_model)
    
    if smote_mode == "NONE":
        print("Ingen SMOTE anv√§nds.\n")
        model.fit(X_train, y_train)
        used_model = model
        smote_label = "NONE"
    elif smote_mode == "BASIC":
        X_smote, y_smote = apply_basic_smote(X_train, y_train)
        model.fit(X_smote, y_smote)
        used_model = model
        smote_label = "BASIC"

    elif smote_mode == "GRID":
        # grid-funktionen tr√§nar sj√§lv och returnerar b√§sta estimatorn
        used_model = smote_grid_search(model, X_train, y_train)
        smote_label = "GRID"

    elif smote_mode == "SMOTUNED-DE":
        # smotuned_de tr√§nar ocks√• och returnerar en f√§rdig modell
        used_model = smotuned_de(model, X_train, y_train)
        smote_label = "SMOTUNED-DE"

    else:
        print("Ogiltigt SMOTE-l√§ge, anv√§nder NONE.")
        model.fit(X_train, y_train)
        used_model = model
        smote_label = "NONE"

    return used_model, smote_label    

def cross_val_f1_scores(dataset_name, model_name, smote_mode="NONE", use_rfe=False, n_features_to_select=12, n_splits=3):
    """
    K√∂r StratifiedKFold CV och returnerar en lista med F1-scores f√∂r vald modell.
    """
    X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(dataset_name)
    # Vi sl√•r ihop train+test h√§r f√∂r att g√∂ra CV p√• hela datasetet
    X_all = np.vstack([X_train_scaled, X_test_scaled])
    y_all = np.concatenate([y_train.values, y_test.values])

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    base_models = get_base_models()
    base_model = base_models[model_name]

    f1_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_all, y_all), start=1):
        X_tr, X_val = X_all[train_idx], X_all[val_idx]
        y_tr, y_val = y_all[train_idx], y_all[val_idx]

        # ev. RFE per fold
        if use_rfe:
            X_tr, X_val, _ = apply_rfe(
                base_model=base_model,
                X_train=X_tr,
                y_train=y_tr,
                X_test=X_val,
                n_features_to_select=n_features_to_select,
            )

        used_model, smote_label = train_with_smote_mode(
            base_model=base_model,
            X_train=X_tr,
            y_train=y_tr,
            smote_mode=smote_mode,
        )

        y_pred = used_model.predict(X_val)
        f1 = f1_score(y_val, y_pred, zero_division=0)
        f1_scores.append(f1)

        print(f"[{dataset_name}] Fold {fold_idx}: F1 = {f1:.4f} (SMOTE={smote_label})")

    return np.array(f1_scores)
   
# === Del 6: j√§mf√∂relsefunktion f√∂r EN modell + EN dataset ===

def compare_smote_variants(dataset_name, model_name, use_rfe=False, n_features_to_select=12):
    """
    K√∂r SAMMA dataset + SAMMA modell med:
    - ingen SMOTE
    - basic SMOTE
    - GRID-SMOTE
    - SMOTUNED-DE
    och returnerar en tabell med nyckeltal + pivot p√• F1.
    """

    # 1) Ladda och skala data
    X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(dataset_name)

    # 1b) Valfritt: Feature selection med RFE
    if use_rfe:
        print(f"\n>>> K√∂r RFE med {n_features_to_select} features f√∂r {dataset_name} / {model_name} <<<\n")
        temp_models = get_base_models()
        rfe_base_model = temp_models[model_name]

        X_train_used, X_test_used, rfe_selector = apply_rfe(
            base_model=rfe_base_model,
            X_train=X_train_scaled,
            y_train=y_train,
            X_test=X_test_scaled,
            n_features_to_select=n_features_to_select,
        )
    else:
        X_train_used = X_train_scaled
        X_test_used = X_test_scaled

    # 2) H√§mta vald basmodell
    base_models = get_base_models()
    if model_name not in base_models:
        raise ValueError(f"Modell '{model_name}' finns inte. Tillg√§ngliga: {list(base_models.keys())}")
    base_model = base_models[model_name]

    results = []

    # 3) Alla SMOTE-l√§gen vi vill j√§mf√∂ra
    smote_modes = ["NONE", "BASIC", "GRID", "SMOTUNED-DE"]

    for mode in smote_modes:
        print(f"\n--- Tr√§nar {model_name} p√• {dataset_name} med SMOTE-l√§ge: {mode} ---")

        used_model, smote_label = train_with_smote_mode(
            base_model=base_model,
            X_train=X_train_used,   # <-- RFE-version eller original
            y_train=y_train,
            smote_mode=mode,
        )

        # Utv√§rdera modellen
        res = evaluate_model(used_model, X_test_used, y_test)

        # Visa metrics i klartext
        print(f"\nResultat ‚Äì {dataset_name} ‚Äì {model_name} ‚Äì SMOTE={smote_label}")
        print(f"Recall   : {res['recall']:.4f}")
        print(f"Precision: {res['precision']:.4f}")
        print(f"F1-score : {res['f1']:.4f}")
        print(f"Accuracy : {res['accuracy']:.4f}")
        print(f"AUC      : {res['auc']:.4f}")

        # Confusion matrix EFTER scoren
        print_confusion_matrix(y_test, res["y_pred"], title=f"Confusion matrix ({dataset_name}, {model_name}, SMOTE={smote_label})")

        # Spara till tabell
        results.append({
            "dataset": dataset_name,
            "model": model_name,
            "smote_mode": smote_label,
            "accuracy": res["accuracy"],
            "precision": res["precision"],
            "recall": res["recall"],
            "f1": res["f1"],
            "auc": res["auc"],
        })

    # 4) L√§gg allt i en DataFrame
    df = pd.DataFrame(results)
    print(f"\n=== J√§mf√∂relse SMOTE-varianter ‚Äì dataset: {dataset_name}, modell: {model_name} ===")
    display(df)

    # 5) Pivot-tabell p√• F1 (som i dina tabeller)
    pivot_f1 = df.pivot_table(
        index=["dataset", "model"],
        columns="smote_mode",
        values="f1"
    )
    print("\nF1 per SMOTE-l√§ge:")
    display(pivot_f1)
        
    return df, pivot_f1


def cross_project_experiment(
    train_dataset,
    test_dataset,
    model_name,
    smote_mode="SMOTUNED-DE",
    use_rfe=False,
    n_features_to_select=12,
):
    """
    Train on one dataset (train_dataset) and test on another (test_dataset).
    - Skalar features baserat p√• TRAIN och anv√§nder samma transformation p√• TEST.
    - Alignar features: beh√•ller bara gemensamma kolumner med samma namn.
    - Kan anv√§nda SMOTE-l√§ge + ev. RFE (use_rfe=True).
    """
    # ===== 1) L√§s in TRAIN-data =====
    info_train = DATASETS[train_dataset]
    path_train = os.path.join(DATA_DIR, info_train["filename"])
    df_train = pd.read_csv(path_train)

    candidate_targets_train = POSSIBLE_TARGETS_BY_DATASET.get(
        train_dataset,
        POSSIBLE_TARGETS_GENERAL
    )

    target_col_train = None
    for cand in candidate_targets_train:
        if cand in df_train.columns:
            target_col_train = cand
            break

    if target_col_train is None:
        raise ValueError(f"Hittade ingen target-kolumn i {train_dataset}")

    X_train = df_train.drop(columns=[target_col_train])
    y_train = df_train[target_col_train]

    if y_train.dtype == "bool":
        y_train = y_train.astype(int)
    elif y_train.dtype == "object":
        y_train = y_train.astype(str).str.lower().map({
            "yes": 1, "true": 1, "defective": 1, "bug": 1, "bugs": 1,
            "problem": 1, "problems": 1, "1": 1, "0": 0,
        }).fillna(0).astype(int)

    # ===== 2) L√§s in TEST-data =====
    info_test = DATASETS[test_dataset]
    path_test = os.path.join(DATA_DIR, info_test["filename"])
    df_test = pd.read_csv(path_test)

    candidate_targets_test = POSSIBLE_TARGETS_BY_DATASET.get(
        test_dataset,
        POSSIBLE_TARGETS_GENERAL
    )

    target_col_test = None
    for cand in candidate_targets_test:
        if cand in df_test.columns:
            target_col_test = cand
            break

    if target_col_test is None:
        raise ValueError(f"Hittade ingen target-kolumn i {test_dataset}")

    X_test = df_test.drop(columns=[target_col_test])
    y_test = df_test[target_col_test]

    if y_test.dtype == "bool":
        y_test = y_test.astype(int)
    elif y_test.dtype == "object":
        y_test = y_test.astype(str).str.lower().map({
            "yes": 1, "true": 1, "defective": 1, "bug": 1, "bugs": 1,
            "problem": 1, "problems": 1, "1": 1, "0": 0,
        }).fillna(0).astype(int)

    #print(f"\n=== {train_dataset} ‚Üí {test_dataset} ===")
    #print("Train klassf√∂rdelning:")
    #print(y_train.value_counts())
    #print("\nTest klassf√∂rdelning:")
    #print(y_test.value_counts(), "\n")

    # ===== 3) Aligna features: beh√•ll bara gemensamma kolumner =====
    common_features = sorted(set(X_train.columns) & set(X_test.columns))

    if len(common_features) == 0:
        raise ValueError("Inga gemensamma features mellan train och test!")

    if len(common_features) < X_train.shape[1] or len(common_features) < X_test.shape[1]:
        dropped_train = set(X_train.columns) - set(common_features)
        dropped_test = set(X_test.columns) - set(common_features)
        print(f"Gemensamma features: {len(common_features)}")
        if dropped_train:
            print("Features som bara fanns i TRAIN och togs bort:", dropped_train)
        if dropped_test:
            print("Features som bara fanns i TEST och togs bort:", dropped_test)
        print()

    X_train = X_train[common_features].copy()
    X_test = X_test[common_features].copy()

    # ===== 4) Skala =====
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # ===== 5) Ev. RFE =====
    if use_rfe:
        print(f">>> Cross-project RFE: beh√•ller {n_features_to_select} features <<<")
        temp_models = get_base_models()
        rfe_base_model = temp_models[model_name]

        X_train_scaled, X_test_scaled, rfe_selector = apply_rfe(
            base_model=rfe_base_model,
            X_train=X_train_scaled,
            y_train=y_train,
            X_test=X_test_scaled,
            n_features_to_select=n_features_to_select,
        )

    # ===== 6) Modell + SMOTE-l√§ge =====
    base_models = get_base_models()
    base_model = base_models[model_name]

    used_model, smote_label = train_with_smote_mode(
        base_model=base_model,
        X_train=X_train_scaled,
        y_train=y_train,
        smote_mode=smote_mode,
    )

    # ===== 7) Utv√§rdera p√• TEST =====
    res = evaluate_model(used_model, X_test_scaled, y_test)

    print(f"Resultat ‚Äì {train_dataset} ‚Üí {test_dataset} ‚Äì {model_name} ‚Äì SMOTE: {smote_label}")
    for k, v in res.items():
        if k in ["y_pred", "y_proba"]:
            continue
        print(f"{k}: {v:.4f}")

    return res
def build_cpdp_table(train_dataset, test_dataset, model_name, n_features_to_select=12):
    """
    Bygger en tabell med metrics f√∂r tre CPDP-inst√§llningar:
    - Baseline (ingen SMOTE, ingen RFE)
    - SMOTUNED-DE (utan RFE)
    - SMOTUNED-DE + RFE (feature selection, t.ex. 12 features)

    Anv√§nder cross_project_experiment(...) under huven.
    """

    rows = []

    settings = [
        ("Baseline (NONE)",        "NONE",        False),
        ("SMOTUNED-DE",            "SMOTUNED-DE", False),
        ("SMOTUNED-DE + RFE(12)",  "SMOTUNED-DE", True),
    ]

    for setting_name, smote_mode, use_rfe in settings:
        print(f"\n>>> K√∂r CPDP: {train_dataset} ‚Üí {test_dataset}, "
              f"modell={model_name}, setting={setting_name} (SMOTE={smote_mode}, RFE={use_rfe})")

        res = cross_project_experiment(
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            model_name=model_name,
            smote_mode=smote_mode,
            use_rfe=use_rfe,
            n_features_to_select=n_features_to_select,
        )

        rows.append({
            "Train ‚Üí Test": f"{train_dataset} ‚Üí {test_dataset}",
            "Model":        model_name,
            "Setting":      setting_name,
            "Recall":       res["recall"],
            "Precision":    res["precision"],
            "F1":           res["f1"],
            "Accuracy":     res["accuracy"],
            "AUC":          res["auc"],
        })

    df = pd.DataFrame(rows)
    return df


# === Del 7: Meny f√∂r att k√∂ra experiment ===

def run_experiments_menu():
    # v√§lj dataset
    print("Tillg√§ngliga dataset:")
    dataset_names = list(DATASETS.keys())
    for idx, name in enumerate(dataset_names, start=1):
        print(f"{idx} = {name}")
    print("ALL = alla dataset")

    dataset_choice = input("V√§lj dataset (t.ex. 1, 2, 3 eller JM1/KC1/ALL): ").strip().upper()

    all_datasets_selected = False

    if dataset_choice == "ALL":
        datasets_to_run = dataset_names
        all_datasets_selected = True
    elif dataset_choice.isdigit():
        idx = int(dataset_choice) - 1
        if 0 <= idx < len(dataset_names):
            datasets_to_run = [dataset_names[idx]]
        else:
            print("Ogiltigt sifferval, anv√§nder f√∂rsta datasetet.")
            datasets_to_run = [dataset_names[0]]
    else:
        # anta att anv√§ndaren skrev namnet direkt, t.ex. JM1
        if dataset_choice in DATASETS:
            datasets_to_run = [dataset_choice]
        else:
            print("Ogiltigt namn, anv√§nder f√∂rsta datasetet.")
            datasets_to_run = [dataset_names[0]]

    # v√§lj modell(er)
    models = get_base_models()
    print("\nTillg√§ngliga modeller:")
    model_names = list(models.keys())
    for idx, name in enumerate(model_names, start=1):
        print(f"{idx} = {name}")
    print("ALL = alla modeller")

    model_choice = input("V√§lj modell (t.ex. 1, 2 eller RandomForest/ALL): ").strip()

    all_models_selected = False

    if model_choice.upper() == "ALL":
        model_names_to_run = model_names
        all_models_selected = True
    elif model_choice.isdigit():
        idx = int(model_choice) - 1
        if 0 <= idx < len(model_names):
            model_names_to_run = [model_names[idx]]
        else:
            print("Ogiltigt sifferval, anv√§nder f√∂rsta modellen.")
            model_names_to_run = [model_names[0]]
    else:
        if model_choice in models:
            model_names_to_run = [model_choice]
        else:
            print("Ogiltigt modellnamn, anv√§nder f√∂rsta modellen.")
            model_names_to_run = [model_names[0]]

    # v√§lj SMOTE-l√§ge
    print("\nSMOTE-l√§gen:")
    print("0 = Ingen SMOTE")
    print("1 = Basic SMOTE (standardparametrar)")
    print("2 = GRID-SMOTE (enkel tuning)")
    print("3 = SMOTUNED-DE (evolution√§r tuning)")
    print("4 = J√§mf√∂r ALLA SMOTE-varianter f√∂r vald dataset + modell")
    smote_mode = input("V√§lj 0 / 1 / 2 / 3 / 4: ").strip()

    # üî∏ Specialfall: smote_mode 4 = k√∂r compare_smote_variants f√∂r EN kombination
    if smote_mode == "4":
        if len(datasets_to_run) == 1 and len(model_names_to_run) == 1:
            ds = datasets_to_run[0]
            mn = model_names_to_run[0]
            df_compare, pivot_compare = compare_smote_variants(ds, mn)
            return df_compare
        else:
            print("\n‚ö† SMOTE-l√§ge 4 kr√§ver att du v√§ljer EXAKT ett dataset och en modell (inte ALL).")
            print("Byter till l√§ge 1 (Basic SMOTE) ist√§llet.\n")
            smote_mode = "1"

    all_results = []

    for ds in datasets_to_run:
        print("\n==============================")
        print(f"K√∂r dataset: {ds}")
        print("==============================\n")

        X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(ds)

        for model_name in model_names_to_run:
            base_models = get_base_models()  # nya instanser
            model = base_models[model_name]

            print(f"\n--- Modell: {model_name} ---")

            # v√§lj tr√§ningsstrategi beroende p√• smote_mode
            if smote_mode == "0":
                print("Ingen SMOTE anv√§nds.\n")
                model.fit(X_train_scaled, y_train)
                used_model = model
                smote_label = "NONE"

            elif smote_mode == "1":
                X_train_smote, y_train_smote = apply_basic_smote(X_train_scaled, y_train)
                model.fit(X_train_smote, y_train_smote)
                used_model = model
                smote_label = "BASIC"

            elif smote_mode == "2":
                used_model = smote_grid_search(model, X_train_scaled, y_train)
                smote_label = "GRID"

            elif smote_mode == "3":
                used_model = smotuned_de(model, X_train_scaled, y_train)
                smote_label = "SMOTUNED-DE"

            else:
                print("Ogiltigt SMOTE-val, anv√§nder ingen SMOTE.")
                model.fit(X_train_scaled, y_train)
                used_model = model
                smote_label = "NONE"

            # utv√§rdera
            eval_results = evaluate_model(used_model, X_test_scaled, y_test)
            print(f"Resultat ‚Äì {ds} ‚Äì {model_name} ‚Äì SMOTE-l√§ge {smote_label}")
            for k, v in eval_results.items():
                if k in ["y_pred", "y_proba"]:
                    continue

            all_results.append({
                "dataset": ds,
                "model": model_name,
                "smote_mode": smote_label,
                "accuracy": eval_results["accuracy"],
                "precision": eval_results["precision"],
                "recall": eval_results["recall"],
                "f1": eval_results["f1"],
                "auc": eval_results["auc"],
            })

    results_df = pd.DataFrame(all_results)
    print("\n=== Sammanfattning av alla k√∂rningar ===")
    display(results_df)

    # fortfarande: om du k√∂r ALL + ALL kan pivot-tabell vara nice
    if all_datasets_selected and all_models_selected and not results_df.empty:
        pivot_f1 = results_df.pivot_table(
            index=["dataset", "model"],
            columns="smote_mode",
            values="f1"
        )
        print("\n=== F1 per dataset/modell och SMOTE-l√§ge ===")
        display(pivot_f1)

    return results_df


# k√∂r menyn
#results_df = run_experiments_menu()


In [None]:
cross_project_experiment

# === CPDP: KC1 ‚Üí KC2 med RandomForest och olika SMOTE/RFE-l√§gen ===

# 1) Baseline: ingen SMOTE, ingen RFE
res_none = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="NONE",
    use_rfe=False
)

df_none = pd.DataFrame([{
    "Train ‚Üí Test": "KC1 ‚Üí KC2",
    "Model": "RandomForest",
    "Setting": "Baseline (NONE)",
    "Recall": res_none["recall"],
    "Precision": res_none["precision"],
    "F1": res_none["f1"],
    "Accuracy": res_none["accuracy"],
    "AUC": res_none["auc"],
}])

print("\n=== Tabell: KC1 ‚Üí KC2 ‚Äì Baseline (NONE) ===")
display(df_none)


# 2) SMOTUNED-DE utan RFE
res_smotuned = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=False
)

df_smotuned = pd.DataFrame([{
    "Train ‚Üí Test": "KC1 ‚Üí KC2",
    "Model": "RandomForest",
    "Setting": "SMOTUNED-DE",
    "Recall": res_smotuned["recall"],
    "Precision": res_smotuned["precision"],
    "F1": res_smotuned["f1"],
    "Accuracy": res_smotuned["accuracy"],
    "AUC": res_smotuned["auc"],
}])

print("\n=== Tabell: KC1 ‚Üí KC2 ‚Äì SMOTUNED-DE (utan RFE) ===")
display(df_smotuned)


# 3) SMOTUNED-DE + RFE (12 features)
res_smotuned_rfe = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)

df_smotuned_rfe = pd.DataFrame([{
    "Train ‚Üí Test": "KC1 ‚Üí KC2",
    "Model": "RandomForest",
    "Setting": "SMOTUNED-DE + RFE(12)",
    "Recall": res_smotuned_rfe["recall"],
    "Precision": res_smotuned_rfe["precision"],
    "F1": res_smotuned_rfe["f1"],
    "Accuracy": res_smotuned_rfe["accuracy"],
    "AUC": res_smotuned_rfe["auc"],
}])

print("\n=== Tabell: KC1 ‚Üí KC2 ‚Äì SMOTUNED-DE + RFE(12) ===")
display(df_smotuned_rfe)


# (Extra) Kombinera alla tre i EN tabell f√∂r rapporten
df_kc1_kc2_all = pd.concat([df_none, df_smotuned, df_smotuned_rfe], ignore_index=True)
metrics = ["Recall", "Precision", "F1", "Accuracy", "AUC"]
df_kc1_kc2_all[metrics] = df_kc1_kc2_all[metrics].round(4)

print("\n=== Samlad tabell ‚Äì KC1 ‚Üí KC2, RandomForest, tre inst√§llningar ===")
display(df_kc1_kc2_all)


Gemensamma features: 20
Features som bara fanns i TRAIN och togs bort: {'locCodeAndComment'}
Features som bara fanns i TEST och togs bort: {'lOCodeAndComment'}

Ingen SMOTE anv√§nds.

Resultat ‚Äì KC1 ‚Üí KC2 ‚Äì RandomForest ‚Äì SMOTE: NONE
accuracy: 0.7969
precision: 0.5085
recall: 0.2804
f1: 0.3614
auc: 0.7682

=== Tabell: KC1 ‚Üí KC2 ‚Äì Baseline (NONE) ===


Unnamed: 0,Train ‚Üí Test,Model,Setting,Recall,Precision,F1,Accuracy,AUC
0,KC1 ‚Üí KC2,RandomForest,Baseline (NONE),0.280374,0.508475,0.361446,0.796935,0.768236


Gemensamma features: 20
Features som bara fanns i TRAIN och togs bort: {'locCodeAndComment'}
Features som bara fanns i TEST och togs bort: {'lOCodeAndComment'}



In [None]:
def run_experiment(
    dataset_name: str,
    model_name: str,
    smote_mode: str = "NONE",
    use_rfe: bool = False,
    n_features_to_select: int = 12
):
    """
    K√∂r ETT experiment p√• ETT dataset med EN modell.

    - dataset_name: t.ex. "JM1", "KC1", "KC2"
    - model_name:   t.ex. "LogisticRegression", "SVC", "RandomForest", "XGBoost", "Voting"
    - smote_mode:   "NONE", "BASIC", "GRID", "SMOTUNED-DE"
    - use_rfe:      True/False (om du vill k√∂ra RFE feature selection)
    """

    # 1) Ladda & skala data
    X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(dataset_name)

    X_train_used = X_train_scaled
    X_test_used = X_test_scaled

    # 2) Ev. RFE
    if use_rfe:
        print(f">>> K√∂r RFE ({n_features_to_select} features) f√∂r {dataset_name} / {model_name} <<<")
        temp_models = get_base_models()
        rfe_base_model = temp_models[model_name]

        X_train_used, X_test_used, rfe_selector = apply_rfe(
            base_model=rfe_base_model,
            X_train=X_train_scaled,
            y_train=y_train,
            X_test=X_test_scaled,
            n_features_to_select=n_features_to_select,
        )

    # 3) H√§mta basmodell
    base_models = get_base_models()
    if model_name not in base_models:
        raise ValueError(f"Modell '{model_name}' finns inte. Tillg√§ngliga: {list(base_models.keys())}")
    base_model = base_models[model_name]

    # 4) Tr√§na med valt SMOTE-l√§ge
    used_model, smote_label = train_with_smote_mode(
        base_model=base_model,
        X_train=X_train_used,
        y_train=y_train,
        smote_mode=smote_mode,
    )

    # 5) Utv√§rdera p√• testdata
    res = evaluate_model(used_model, X_test_used, y_test)

    print(f"\nResultat ‚Äì {dataset_name} ‚Äì {model_name} ‚Äì SMOTE={smote_label} ‚Äì RFE={use_rfe}")
    print(f"Recall   : {res['recall']:.4f}")
    print(f"Precision: {res['precision']:.4f}")
    print(f"F1-score : {res['f1']:.4f}")
    print(f"Accuracy : {res['accuracy']:.4f}")
    print(f"AUC      : {res['auc']:.4f}")

    return res["recall"], res["precision"], res["f1"]


K√ñRINGSFUNKTIONER ANV√ÑND SAMMA MEN BYT UT MODELL OCH DATASET
Starta exprimentet med hj√§lpfunktionen ovan

In [None]:
# 1. LR + SVM -ingen SMOTE, ingen RFE Tabell 4.1

recall_lr_ns, prec_lr_ns, f1_lr_ns = run_experiment(
    dataset_name="JM1",
    model_name="LogisticRegression",
    smote_mode="NONE",
    use_rfe=False
)

recall_svm_ns, prec_svm_ns, f1_svm_ns = run_experiment(
    dataset_name="JM1",
    model_name="SVC",
    smote_mode="NONE",
    use_rfe=False
)


In [None]:
# 2. Logistic Regression & SVM ‚Äì BASIC SMOTE, NO RFE (JM1) Tabell 4.2

recall_lr_sm, prec_lr_sm, f1_lr_sm = run_experiment(
    dataset_name="JM1",
    model_name="LogisticRegression",
    smote_mode="BASIC",   # vanlig SMOTE
    use_rfe=False
)

recall_svm_sm, prec_svm_sm, f1_svm_sm = run_experiment(
    dataset_name="JM1",
    model_name="SVC",
    smote_mode="BASIC",   # vanlig SMOTE
    use_rfe=False
)


In [None]:
# 3. RandomForest, XGBoost, Voting ‚Äì NO SMOTE, NO RFE (JM1) Tabell4.3
#RF,XGBoost, Voting - ingen SMOTE, ingen RFE

recall_rf_ns, prec_rf_ns, f1_rf_ns = run_experiment(
    dataset_name="JM1",
    model_name="RandomForest",
    smote_mode="NONE",
    use_rfe=False
)

recall_xgb_ns, prec_xgb_ns, f1_xgb_ns = run_experiment(
    dataset_name="JM1",
    model_name="XGBoost",
    smote_mode="NONE",
    use_rfe=False
)

recall_vot_ns, prec_vot_ns, f1_vot_ns = run_experiment(
    dataset_name="JM1",
    model_name="Voting",
    smote_mode="NONE",
    use_rfe=False
)


In [None]:
# 4. RandomForest, XGBoost, Voting ‚Äì NO SMOTE, WITH RFE (JM1)
# N√§r RFE anv√§nds, beh√•ll 12 features (enligt texten) Tabell 4.4 vi anv√§nder Recrusive Feature Eliminaton
#den rangordnar features baserat p√• deras betydelse f√∂r en RandomForest-modell och v√§ljer de 12 mest betydelsefulla.
# Masken v√§ljer vilka features som modellen ska tr√§nas p√•
recall_rf_fs, prec_rf_fs, f1_rf_fs = run_experiment(
    dataset_name="JM1",
    model_name="RandomForest",
    smote_mode="NONE",
    use_rfe=True,
    n_features_to_select=12
)

recall_xgb_fs, prec_xgb_fs, f1_xgb_fs = run_experiment(
    dataset_name="JM1",
    model_name="XGBoost",
    smote_mode="NONE",
    use_rfe=True,
    n_features_to_select=12
)

recall_vot_fs, prec_vot_fs, f1_vot_fs = run_experiment(
    dataset_name="JM1",
    model_name="Voting",
    smote_mode="NONE",
    use_rfe=True,
    n_features_to_select=12
)


In [None]:
# 5. RandomForest, XGBoost, Votingcommand:workbench.action.openLargeOutput?4f46e60f-23e8-4021-ace8-cf139781a49f ‚Äì SMOTUNED-DE, WITH RFE (JM1)
_#Rf, Xgboost, Voting - Smote +RFE tabell 4.5
recall_rf_smfs, prec_rf_smfs, f1_rf_smfs = run_experiment(
    dataset_name="JM1",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)

recall_xgb_smfs, prec_xgb_smfs, f1_xgb_smfs = run_experiment(
    dataset_name="JM1",
    model_name="XGBoost",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)

recall_vot_smfs, prec_vot_smfs, f1_vot_smfs = run_experiment(
    dataset_name="JM1",
    model_name="Voting",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)


Sammanst√§ll i tabell

In [None]:
# 6. Sammanst√§ll alla resultat i en tabell

results_all = [
    # 1) LR & SVM ‚Äì no SMOTE, no RFE
    ("JM1", "Logistic Regression", "No",        "No",        recall_lr_ns,  prec_lr_ns,  f1_lr_ns),
    ("JM1", "SVM",                 "No",        "No",        recall_svm_ns, prec_svm_ns, f1_svm_ns),

    # 2) LR & SVM ‚Äì basic SMOTE, no RFE
    ("JM1", "Logistic Regression", "SMOTE",     "No",        recall_lr_sm,  prec_lr_sm,  f1_lr_sm),
    ("JM1", "SVM",                 "SMOTE",     "No",        recall_svm_sm, prec_svm_sm, f1_svm_sm),

    # 3) RF, XGB, Voting ‚Äì no SMOTE, no RFE
    ("JM1", "Random Forest",       "No",        "No",        recall_rf_ns,  prec_rf_ns,  f1_rf_ns),
    ("JM1", "XGBoost",             "No",        "No",        recall_xgb_ns, prec_xgb_ns, f1_xgb_ns),
    ("JM1", "Model Averaging",     "No",        "No",        recall_vot_ns, prec_vot_ns, f1_vot_ns),

    # 4) RF, XGB, Voting ‚Äì no SMOTE, with RFE
    ("JM1", "Random Forest",       "No",        "RFE",       recall_rf_fs,  prec_rf_fs,  f1_rf_fs),
    ("JM1", "XGBoost",             "No",        "RFE",       recall_xgb_fs, prec_xgb_fs, f1_xgb_fs),
    ("JM1", "Model Averaging",     "No",        "RFE",       recall_vot_fs, prec_vot_fs, f1_vot_fs),

    # 5) RF, XGB, Voting ‚Äì SMOTUNED-DE, with RFE
    ("JM1", "Random Forest",       "SMOTUNED",  "RFE",       recall_rf_smfs,  prec_rf_smfs,  f1_rf_smfs),
    ("JM1", "XGBoost",             "SMOTUNED",  "RFE",       recall_xgb_smfs, prec_xgb_smfs, f1_xgb_smfs),
    ("JM1", "Model Averaging",     "SMOTUNED",  "RFE",       recall_vot_smfs, prec_vot_smfs, f1_vot_smfs),
]

df_results = pd.DataFrame(
    results_all,
    columns=["Dataset", "Model", "Oversampling", "Feature Selection", "Recall", "Precision", "F1"]
)

df_results


In [None]:
# XGBoost + SMOTUNED, utan RFE
recall_xgb_no_fs, prec_xgb_no_fs, f1_xgb_no_fs = run_experiment(
    dataset_name="JM1",
    model_name="XGBoost",
    smote_mode="SMOTUNED-DE",
    use_rfe=False
)

# XGBoost + SMOTUNED, med RFE
recall_xgb_fs, prec_xgb_fs, f1_xgb_fs = run_experiment(
    dataset_name="JM1",
    model_name="XGBoost",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)


Kod f√∂r ttest och 3-fold CV

In [None]:
# JM1 ‚Äì baseline vs ensemble
f1_lr = cross_val_f1_scores(
    dataset_name="JM1",
    model_name="LogisticRegression",
    smote_mode="NONE",
    use_rfe=False,
    n_splits=3
)

f1_rf = cross_val_f1_scores(
    dataset_name="JM1",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=False,
    n_splits=3
)

print("LR F1-scores:", f1_lr)
print("RF F1-scores:", f1_rf)

t_stat, p_val = ttest_rel(f1_lr, f1_rf)
print(f"Paired t-test: t = {t_stat:.4f}, p = {p_val:.4f}")


Kod f√∂r cross-projekt. Dvs tr√§nar p√• KC1 och testar p√• KC2

In [None]:
res_cpdp = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=False  # b√∂rja utan RFE
)


Samma expriment men med och utan smotetuned och RFE

In [None]:
# === CPDP: KC1 ‚Üí KC2 med RandomForest och olika SMOTE/RFE-l√§gen ===

# 1) Baseline: ingen SMOTE, ingen RFE
res_none = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="NONE",
    use_rfe=False
)

df_none = pd.DataFrame([{
    "Train ‚Üí Test": "KC1 ‚Üí KC2",
    "Model": "RandomForest",
    "Setting": "Baseline (NONE)",
    "Recall": res_none["recall"],
    "Precision": res_none["precision"],
    "F1": res_none["f1"],
    "Accuracy": res_none["accuracy"],
    "AUC": res_none["auc"],
}])

print("\n=== Tabell: KC1 ‚Üí KC2 ‚Äì Baseline (NONE) ===")
display(df_none)


# 2) SMOTUNED-DE utan RFE
res_smotuned = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=False
)

df_smotuned = pd.DataFrame([{
    "Train ‚Üí Test": "KC1 ‚Üí KC2",
    "Model": "RandomForest",
    "Setting": "SMOTUNED-DE",
    "Recall": res_smotuned["recall"],
    "Precision": res_smotuned["precision"],
    "F1": res_smotuned["f1"],
    "Accuracy": res_smotuned["accuracy"],
    "AUC": res_smotuned["auc"],
}])

print("\n=== Tabell: KC1 ‚Üí KC2 ‚Äì SMOTUNED-DE (utan RFE) ===")
display(df_smotuned)


# 3) SMOTUNED-DE + RFE (12 features)
res_smotuned_rfe = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)

df_smotuned_rfe = pd.DataFrame([{
    "Train ‚Üí Test": "KC1 ‚Üí KC2",
    "Model": "RandomForest",
    "Setting": "SMOTUNED-DE + RFE(12)",
    "Recall": res_smotuned_rfe["recall"],
    "Precision": res_smotuned_rfe["precision"],
    "F1": res_smotuned_rfe["f1"],
    "Accuracy": res_smotuned_rfe["accuracy"],
    "AUC": res_smotuned_rfe["auc"],
}])

print("\n=== Tabell: KC1 ‚Üí KC2 ‚Äì SMOTUNED-DE + RFE(12) ===")
display(df_smotuned_rfe)


# (Extra) Kombinera alla tre i EN tabell om du vill klistra in i rapporten
df_kc1_kc2_all = pd.concat([df_none, df_smotuned, df_smotuned_rfe], ignore_index=True)
df_kc1_kc2_all[["Recall", "Precision", "F1", "Accuracy", "AUC"]] = df_kc1_kc2_all[["Recall", "Precision", "F1", "Accuracy", "AUC"]].round(4)

print("\n=== Samlad tabell ‚Äì KC1 ‚Üí KC2, RandomForest, tre inst√§llningar ===")
display(df_kc1_kc2_all)


In [None]:
# Baseline CPDP utan SMOTE
res_rf_cpdp_no = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="NONE",
    use_rfe=False
)

# Din nuvarande (SMOTUNED)
res_rf_cpdp_sm = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=False  # eller True om du vill RFE h√§r ocks√•
)

res_rf_cpdp_sm_rfe = cross_project_experiment(
    train_dataset="KC1",
    test_dataset="KC2",
    model_name="RandomForest",
    smote_mode="SMOTUNED-DE",
    use_rfe=True,
    n_features_to_select=12
)



In [None]:
# Experiment 1: JM1 + RandomForest utan RFE
df_no_rfe, pivot_no_rfe = compare_smote_variants(
    "JM1",
    "RandomForest",
    use_rfe=False  # ingen feature selection
)

df_no_rfe


Kod f√∂r smote varianter MED RFE

In [None]:
# Experiment 2: JM1 + RandomForest med RFE (t.ex. 12 features)
df_rfe, pivot_rfe = compare_smote_variants(
    "KC1",
    "RandomForest",
    use_rfe=True,          # aktivera RFE
    n_features_to_select=12  # antal features du beh√•ller
)

df_rfe


Kod f√∂r tabell j√§mf√∂relse av dataset + modell med/utan RFE

In [None]:
# J√§mf√∂r JM1 + RandomForest med och utan RFE

df_no_rfe_tagged = df_no_rfe.copy().set_index("smote_mode")
df_rfe_tagged = df_rfe.copy().set_index("smote_mode")

comparison = pd.DataFrame({
    "F1_no_rfe": df_no_rfe_tagged["f1"],
    "F1_rfe": df_rfe_tagged["f1"],
    "Recall_no_rfe": df_no_rfe_tagged["recall"],
    "Recall_rfe": df_rfe_tagged["recall"],
})

comparison["delta_F1"] = comparison["F1_rfe"] - comparison["F1_no_rfe"]
comparison["delta_recall"] = comparison["Recall_rfe"] - comparison["Recall_no_rfe"]

comparison


F√∂r att k√∂ra via menyn

In [None]:

results_df = run_experiments_menu()

S√• h√§r tolkar du den:
Varje rad = samma modell + samma dataset, men olika SMOTE-uppl√§gg
NONE: ingen SMOTE (original obalanserad tr√§ning).
BASIC: standard-SMOTE.
GRID: SMOTE med grid-tunade parametrar.
SMOTUNED-DE: SMOTE med DE-optimerade parametrar.

Kolumnerna:
accuracy ‚Üí andel r√§tt totalt.
‚Üí kan vara missvisande p√• obalanserad data, f√∂r att en modell kan f√• h√∂g accuracy genom att n√§stan alltid gissa 0.
precision (f√∂r klass 1) ‚Üí ‚Äún√§r modellen s√§ger defekt, hur ofta har den r√§tt?‚Äù
recall (f√∂r klass 1) ‚Üí ‚Äúhur stor andel av alla verkliga defekter hittar modellen?‚Äù
f1 ‚Üí balans mellan precision & recall (bra huvudm√•tt).
auc ‚Üí hur bra modellen rangordnar defekter vs icke-defekter (threshold-oberoende).