In [26]:
# === Del 1: Importera n√∂dv√§ndiga bibliotek ===

import os  # Hantera filer, mappar och s√∂kv√§gar

import pandas as pd  # Pandas: l√§sa och hantera tabell-data (CSV osv)

from sklearn.model_selection import train_test_split  # Dela upp data i train/test
from sklearn.preprocessing import StandardScaler      # Skala numeriska features

from sklearn.linear_model import LogisticRegression   # Logistisk regression (klassificering)
from sklearn.svm import SVC                           # Support Vector Classifier (SVM-modell)
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# RandomForestClassifier: ensemble av m√•nga decision trees
# VotingClassifier: kombinerar flera modeller och l√•ter dem "r√∂sta"

from xgboost import XGBClassifier                     # XGBoost-modell (gradient boosting)

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)
# Dessa funktioner r√§knar ut utv√§rderingsm√•tt:
# accuracy, precision, recall, F1, AUC, samt en sammanfattande rapport

from imblearn.over_sampling import SMOTE              # SMOTE: √∂versampling f√∂r obalanserade klasser



import os : ger funktion f√∂r att kunna jobba med filer
import pandas: standardbiblotek f√∂r att l√§sa/skriva och hantera tabell-data
train_test_split: delar upp data i tr√§ning/test med stratifiering dvs den beh√•ller samma ratio exepelvis 80/20 i b√•de test och tr√§ningsdata.
Standardscalar. Skalar kolumnerna (features) s√• att de f√•r medelv√§rdet 0 och standardavikelse 1.
Logistig Regression: Enkel linj√§r klassifieringsmodell
SVc: Support Vector Machine , enkel klassifierare (extra)
RandomForestClassifier: Ensemblemodell av m√•nga beslutstr√§d. Vanlig Baseline
VotingClassifier: en kombination av olika modeller ex: LogReg +RF+XGB och ta majoritetsr√∂st (extra)
XGBClassifier: gradient boosting modell

Accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report : funktioner f√∂r att r√§kna utv√§rderingsm√•tt

SMOTE: syntetiskt skapar fler minoritets-exempel i tr√§ningsdataset(endast p√• train delen INTE test) f√∂r att hantera obalanserade klasser.

In [27]:
# === Del 2: Grundinst√§llningar ===

RANDOM_STATE = 42  # F√∂r reproducerbara resultat (samma slump varje g√•ng)

# Om din notebook ligger i mappen "Thesis/notebooks":
DATA_DIR = os.path.join("..", "data")

# Om din notebook ligger direkt i "Thesis"-mappen ist√§llet, anv√§nd i st√§llet:
# DATA_DIR = "data"

print("Arbetskatalog:", os.getcwd())
print("Data-mapp:", DATA_DIR)

# === Del 3: Ladda in ett NASA-dataset (CM1) ===

cm1_path = os.path.join(DATA_DIR, "cm1.csv")
print("S√∂kv√§g till filen:", cm1_path)

# L√§sa in csv-filen med pandas
cm1 = pd.read_csv(cm1_path)

# Snabb √∂verblick
print("Shape (rader, kolumner):", cm1.shape)
print("\nKolumner:")
print(cm1.columns)

# Kolla hur m√•lkategorin ser ut (just nu gissar vi att den heter 'defects')
if "defects" in cm1.columns:
    print("\nKlassf√∂rdelning i 'defects':")
    print(cm1["defects"].value_counts())
else:
    print("\nHittade ingen kolumn med namnet 'defects' ‚Äì vi f√•r kolla vad target heter.")


Arbetskatalog: c:\Users\josef\OneDrive\Desktop\Thesis\notebooks
Data-mapp: ..\data
S√∂kv√§g till filen: ..\data\cm1.csv
Shape (rader, kolumner): (498, 22)

Kolumner:
Index(['loc', 'v(g)', 'ev(g)', 'iv(g)', 'n', 'v', 'l', 'd', 'i', 'e', 'b', 't',
       'lOCode', 'lOComment', 'lOBlank', 'locCodeAndComment', 'uniq_Op',
       'uniq_Opnd', 'total_Op', 'total_Opnd', 'branchCount', 'defects'],
      dtype='object')

Klassf√∂rdelning i 'defects':
defects
False    449
True      49
Name: count, dtype: int64


os.path.join(DATA_DIR, "cm1.csv") skapar r√§tt filv√§g
pb.read_csv(...) l√§ser in filen till en DataFrame vi skriver ut: Shape antal rader/kolumner, columnnamn och evetuell klassf√∂rdelning om target heter defect

In [28]:
# === Del 4: Dela upp data i X (features) och y (m√•lvariabel) ===

TARGET_COL = "defects"  # √§ndra h√§r om din kolumn heter n√•got annat

# X = alla kolumner utom target
X = cm1.drop(columns=[TARGET_COL])

# y = bara target-kolumnen
y = cm1[TARGET_COL]

print("X shape:", X.shape)  # (antal rader, antal features)
print("y shape:", y.shape)
print("\nKlassf√∂rdelning i y:")
print(y.value_counts())


X shape: (498, 21)
y shape: (498,)

Klassf√∂rdelning i y:
defects
False    449
True      49
Name: count, dtype: int64


drop(columns= [TARGET_COL]) vi plockar bort defects och sparar resten som indata.
y=cm1[TARGET_COL] vector med 0/1 dvs defekt/icke-defekt
value_counts () visar hur obalanserad datan √§r, viktig att notera f√∂re smote

In [29]:
# === Del 5: Train-test-split med stratifiering ===

from sklearn.model_selection import train_test_split  # redan importerat, men skadar inte

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,       # 80% tr√§ning, 20% test
    random_state=42,     # samma slump varje g√•ng
    stratify=y           # bevara klassf√∂rdelning i b√•de train och test
)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)
print("\nTrain klassf√∂rdelning:")
print(y_train.value_counts(normalize=True))
print("\nTest klassf√∂rdelning:")
print(y_test.value_counts(normalize=True))

# === Del 6: Skala features med StandardScaler ===

scaler = StandardScaler()

# fit p√• tr√§ningsdatan, transform b√•de train och test
# fit_tranform l√§r sig parametrar (medelv√§rde, std) fr√•n X_train och
# anv√§nder dessa parametrar f√∂r att skala datan
# detta g√∂r 

X_train_scaled = scaler.fit_transform(X_train)  #
X_test_scaled = scaler.transform(X_test)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)



Train shape: (398, 21) (398,)
Test shape: (100, 21) (100,)

Train klassf√∂rdelning:
defects
False    0.90201
True     0.09799
Name: proportion, dtype: float64

Test klassf√∂rdelning:
defects
False    0.9
True     0.1
Name: proportion, dtype: float64
X_train_scaled shape: (398, 21)
X_test_scaled shape: (100, 21)


N√§r vi normaliserar (t.ex. med StandardScaler) g√∂r vi s√• att alla features f√•r ungef√§r:

medelv√§rde ‚âà 0
spridning (standardavvikelse) ‚âà 1

Det betyder att:
alla kolumner hamnar p√• liknande skala
ingen feature ‚Äúdominerar‚Äù bara f√∂r att den r√•kar ha stora tal
Modeller som Logistisk regression, SVM och neurala n√§t fungerar d√•:
stabilare
snabbare
och ger oftast b√§ttre resultat.


Vi skalar om alla features s√• att de har liknande storlek ist√§llet f√∂r att vissa √§r s√§ttestora och andra j√§tteasm√•

test_size=0.2 betyder att 20% av datan sparas som test dvs den r√∂rs inte av SMOTE eller tr√§ning. 
stratify=y samma propwertion 0/1 i b√•de train data och test data
vi skriver ut storleken och dess klassf√∂rdelning f√∂r att verifiers.

eli5: 21 st kolumner √§r input och 1 √§r output = defects
N√§r vi tr√§nar en modell beh√∂ver den veta vad √§r input -> X och vad det r√§tta svaret √§r f√∂r varje rad -> y
d√§rf√∂r √§r x alla kolumner som modellen anv√§nder f√∂r att g√∂ra en prediction, kolumnen defects √§r facit.

Genom klassf√∂rdelning kontrollerar vi 3 grejer

A vi kontrollerar att vi har r√§tt kolumn som target det ser vi genom att vi ser tv√• v√§rden och verkar vara bin√§rt?

B Vi ser hur obalanserad datan √§r genom value_counts() som s√§ger hur m√•nga fler 0 √§n 1 vi har exempelvis 90% klass 0 och 10% klass 1 
Detta hj√§lper varf√∂r vi beh√∂ver smote och hj√§lper oss att f√∂rst√• resultatet

C Vi kan j√§mf√∂ra f√∂re och efter split och SMOTE

F√∂re split & SMOTE (originaldata)
Genom y.value_counts() ser vi snabbt hur m√•nga som √§r majoritetsklassen (0) och minoriteten (1).
Det visar hur obalanserad datan √§r innan vi g√∂r n√•got alls.


Efter train/test-split
Vi skriver ut y_train.value_counts() och y_test.value_counts() f√∂r att
se att f√∂rdelningen 0/1 √§r liknande i b√•de train och test.
Det bekr√§ftar att stratify=y fungerade.


Efter SMOTE (p√• tr√§ningen)
Vi k√∂r SMOTE p√• X_train, y_train och skriver sedan ut y_train_smote.value_counts().

Nu kan vi se att majoritetsklassen och minoritetsklassen √§r lika stora ‚Üí datan √§r balanserad f√∂r tr√§ningen.
Sedan j√§mf√∂r vi modellerna

Vi kan j√§mf√∂ra modellernas resultat:

p√• obalanserad data (f√∂re SMOTE)

mot balanserad tr√§ning (efter SMOTE)

D√• ser vi hur mycket SMOTE faktiskt f√∂rb√§ttrar t.ex. recall och F1 f√∂r minoritetsklassen.

In [30]:
# === Del 7: Hantera obalans med SMOTE (endast p√• train) ===

smote = SMOTE(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

print("F√∂re SMOTE:", X_train_scaled.shape, "klassf√∂rdelning:")
print(y_train.value_counts())

print("\nEfter SMOTE:", X_train_smote.shape, "klassf√∂rdelning:")
print(pd.Series(y_train_smote).value_counts())


F√∂re SMOTE: (398, 21) klassf√∂rdelning:
defects
False    359
True      39
Name: count, dtype: int64

Efter SMOTE: (718, 21) klassf√∂rdelning:
defects
False    359
True     359
Name: count, dtype: int64


Fit_resample skapar syntetiska minoritets exempel tills  b√•da klasserna har ungef√§r samma antal. Vi j√§mf√∂r sedan nshape och value counts f√∂r att observeras sdkillnaden f√∂re/efter

Vi skriver ut en funktion som hj√§lper oss att utv√§dera modellerna baserat p√• vad den har tr√§nar p√•, vad den gissar p√• och vad som √§r det r√§tta svaret.

steg f√∂r steg
1. delar datan i test och train i samma ratio
2. Vi skalar och anv√§nder SMOTE p√• tr√§ningsdatan
3. Vi tr√§nar modellen p√• tr√§ningsdatan 
4. Vi utv√§rderar modellen mot testdatan dvs y_test som √§r facit. Modellen gissar p√• testdatan med y_pred=model.predict(X_test)

In [None]:
# === Del 8: Hj√§lpfunktion f√∂r att utv√§rdera modeller ===

def evaluate_model(model, X_test, y_test):
    """
    Tr√§nad modell + testdata -> returnerar en dict med metrics.
    """
    y_pred = model.predict(X_test)

    # ber√§kna sannolikheter f√∂r klass 1 om m√∂jligt
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        # vissa modeller har decision_function ist√§llet
        y_proba = model.decision_function(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc,
        "y_pred": y_pred,
        "y_proba": y_proba,
    }
# === Del 9: Tr√§na och utv√§rdera olika modeller ===
# Logistisk Regression
log_reg = LogisticRegression(
    max_iter=1000,     # till√•ter fler iterationer s√• den hinner konvergera
    n_jobs=-1          # anv√§nd alla k√§rnor
)

log_reg.fit(X_train_smote, y_train_smote)

log_results = evaluate_model(log_reg, X_test_scaled, y_test)

print("Logistic Regression ‚Äì CM1 (med SMOTE)")
for k, v in log_results.items():
    if k in ["y_pred", "y_proba"]:
        continue
    print(f"{k}: {v:.4f}")



# === Del 10: Tr√§na Random Forest p√• CM1 (med SMOTE) ===

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_smote, y_train_smote)

rf_results = evaluate_model(rf, X_test_scaled, y_test)

print("Random Forest ‚Äì CM1 (med SMOTE)")
for k, v in rf_results.items():
    if k in ["y_pred", "y_proba"]:
        continue
    print(f"{k}: {v:.4f}")


In [None]:
F√∂rklaring:
Vi tr√§nar modellen p√• balanserad tr√§ning (X_train_smote, y_train_smote).
Vi utv√§rderar p√• originell, obalanserad testdata (X_test_scaled, y_test).
Sedan skriver vi ut accuracy, precision, recall, f1, auc.

KOMPLETT KOD INKLUSIVE MENYN

In [None]:
# === Del 1: Imports & grundinst√§llningar ===

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)
from sklearn.base import clone

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from scipy.optimize import differential_evolution
from IPython.display import display

RANDOM_STATE = 42

# Anpassa beroende p√• var din notebook ligger:
# - om notebook ligger i Thesis/notebooks ‚Üí ".. / data"
# - om notebook ligger direkt i Thesis ‚Üí "data"
DATA_DIR = os.path.join("..", "data")
# DATA_DIR = "data"  # anv√§nd denna ist√§llet om notebooken ligger i rotmappen


# === Del 2: utv√§rderingsfunktion ===

def evaluate_model(model, X_test, y_test):
    """
    Tar en TR√ÑNAD modell + testdata och r√§knar nyckeltal.
    Returnerar en dict med metrics + y_pred/y_proba.
    """
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc,
        "y_pred": y_pred,
        "y_proba": y_proba,
    }


# === Del 3: Dataset-konfiguration & f√∂rberedelse ===

DATASETS = {
    "JM1": {"filename": "jm1.csv", "target": "defects"},
    "KC1": {"filename": "kc1.csv", "target": "defects"},
    "KC2": {"filename": "kc2.csv", "target": "defects"},
    "PC1": {"filename": "pc1.csv", "target": "defects"},
    "CM1": {"filename": "cm1.csv", "target": "defects"},
}

def load_and_prepare_dataset(dataset_name):
    """
    L√§ser in valt dataset, delar i train/test, skalar features.
    SMOTE g√∂r vi separat beroende p√• menyval.
    """
    info = DATASETS[dataset_name]
    path = os.path.join(DATA_DIR, info["filename"])
    df = pd.read_csv(path)

    target_col = info["target"]
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # s√§kerst√§ll 0/1
    if y.dtype == "bool":
        y = y.astype(int)
    elif y.dtype == "object":
        y = y.str.lower().map({"yes": 1, "true": 1, "defective": 1}).fillna(0).astype(int)

    print(f"{dataset_name}: shape={df.shape}")
    print("Klassf√∂rdelning (hela datan):")
    print(y.value_counts(), "\n")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=RANDOM_STATE
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Train klassf√∂rdelning:")
    print(y_train.value_counts())
    print("\nTest klassf√∂rdelning:")
    print(y_test.value_counts(), "\n")

    return X_train_scaled, X_test_scaled, y_train, y_test


# === Del 4: Modeller (alla basmodeller) ===

def get_base_models():
    """
    Skapar alla modeller vi vill testa.
    """
    log_reg = LogisticRegression(max_iter=1000, n_jobs=-1)

    rf = RandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    ann = MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        max_iter=200,
        random_state=RANDOM_STATE,
    )

    svc = SVC(
        kernel="rbf",
        probability=True,   # beh√∂vs f√∂r AUC
        random_state=RANDOM_STATE,
    )

    voting = VotingClassifier(
        estimators=[
            ("logreg", log_reg),
            ("rf", rf),
            ("xgb", xgb),
        ],
        voting="soft"  # anv√§nder sannolikheter
    )

    models = {
        "LogisticRegression": log_reg,
        "RandomForest": rf,
        "XGBoost": xgb,
        "ANN": ann,
        "SVC": svc,
        "Voting": voting,
    }
    return models


# === Del 5: SMOTE-varianter ===

def apply_basic_smote(X_train, y_train):
    """
    Standard-SMOTE med default-parametrar.
    """
    smote = SMOTE(random_state=RANDOM_STATE)
    X_res, y_res = smote.fit_resample(X_train, y_train)
    print("Efter basic SMOTE:")
    print(pd.Series(y_res).value_counts(), "\n")
    return X_res, y_res


def smote_grid_search(model, X_train, y_train):
    """
    Enkel grid search p√• SMOTE-parametrar (inspirerad av SMOTUNED-id√©n).
    """
    pipe = Pipeline([
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", model),
    ])

    param_grid = {
        "smote.k_neighbors": [3, 5, 7],
        "smote.sampling_strategy": [0.5, 0.75, 1.0],
    }

    grid = GridSearchCV(
        pipe,
        param_grid,
        scoring="f1",
        cv=3,
        n_jobs=-1,
        verbose=0,
    )

    grid.fit(X_train, y_train)
    print("GRID-SMOTE ‚Äì b√§sta parametrar:", grid.best_params_)
    return grid.best_estimator_


def smotuned_de(model, X_train, y_train):
    """
    F√∂renklad SMOTUNED-id√©:
    differential evolution optimerar SMOTE-parametrar (k_neighbors, sampling_strategy)
    f√∂r att maximera F1 med 3-fold CV.
    """

    def objective(params):
        # params = [k_neighbors, sampling_strategy]
        k = int(round(params[0]))
        k = max(2, min(k, 15))   # h√•ll k inom [2, 15]

        sampling = float(params[1])
        sampling = max(0.2, min(sampling, 1.0))  # sampling_strategy inom [0.2, 1.0]

        smote = SMOTE(
            k_neighbors=k,
            sampling_strategy=sampling,
            random_state=RANDOM_STATE,
        )

        X_res, y_res = smote.fit_resample(X_train, y_train)

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        scores = []

        for train_idx, val_idx in cv.split(X_res, y_res):
            X_tr, X_val = X_res[train_idx], X_res[val_idx]
            y_tr, y_val = y_res[train_idx], y_res[val_idx]

            m = clone(model)
            m.fit(X_tr, y_tr)
            y_pred = m.predict(X_val)
            scores.append(f1_score(y_val, y_pred, zero_division=0))

        # differential_evolution minimerar, s√• vi returnerar -F1
        return -np.mean(scores)

    bounds = [
        (2, 15),    # k_neighbors
        (0.2, 1.0), # sampling_strategy
    ]

    result = differential_evolution(
        objective,
        bounds,
        maxiter=15,
        popsize=10,
        tol=0.01,
        polish=True,
        disp=False,
    )

    best_k = int(round(result.x[0]))
    best_sampling = float(result.x[1])
    best_k = max(2, min(best_k, 15))
    best_sampling = max(0.2, min(best_sampling, 1.0))

    print("SMOTUNED-DE ‚Äì b√§sta parametrar:")
    print("k_neighbors:", best_k)
    print("sampling_strategy:", best_sampling)

    best_smote = SMOTE(
        k_neighbors=best_k,
        sampling_strategy=best_sampling,
        random_state=RANDOM_STATE,
    )
    X_res_best, y_res_best = best_smote.fit_resample(X_train, y_train)

    final_model = clone(model)
    final_model.fit(X_res_best, y_res_best)

    return final_model


# === Del 6: Meny f√∂r att k√∂ra experiment ===

def run_experiments_menu():
    # v√§lj dataset
    print("Tillg√§ngliga dataset:")
    dataset_names = list(DATASETS.keys())
    for idx, name in enumerate(dataset_names, start=1):
        print(f"{idx} = {name}")
    print("ALL = alla dataset")

    dataset_choice = input("V√§lj dataset (t.ex. 1, 2, 3 eller JM1/KC1/ALL): ").strip().upper()

    all_datasets_selected = False

    if dataset_choice == "ALL":
        datasets_to_run = dataset_names
        all_datasets_selected = True
    elif dataset_choice.isdigit():
        idx = int(dataset_choice) - 1
        if 0 <= idx < len(dataset_names):
            datasets_to_run = [dataset_names[idx]]
        else:
            print("Ogiltigt sifferval, anv√§nder f√∂rsta datasetet.")
            datasets_to_run = [dataset_names[0]]
    else:
        # anta att anv√§ndaren skrev namnet direkt, t.ex. JM1
        if dataset_choice in DATASETS:
            datasets_to_run = [dataset_choice]
        else:
            print("Ogiltigt namn, anv√§nder f√∂rsta datasetet.")
            datasets_to_run = [dataset_names[0]]

    # v√§lj modell(er)
    models = get_base_models()
    print("\nTillg√§ngliga modeller:")
    model_names = list(models.keys())
    for idx, name in enumerate(model_names, start=1):
        print(f"{idx} = {name}")
    print("ALL = alla modeller")

    model_choice = input("V√§lj modell (t.ex. 1, 2 eller RandomForest/ALL): ").strip()

    all_models_selected = False

    if model_choice.upper() == "ALL":
        model_names_to_run = model_names
        all_models_selected = True
    elif model_choice.isdigit():
        idx = int(model_choice) - 1
        if 0 <= idx < len(model_names):
            model_names_to_run = [model_names[idx]]
        else:
            print("Ogiltigt sifferval, anv√§nder f√∂rsta modellen.")
            model_names_to_run = [model_names[0]]
    else:
        if model_choice in models:
            model_names_to_run = [model_choice]
        else:
            print("Ogiltigt modellnamn, anv√§nder f√∂rsta modellen.")
            model_names_to_run = [model_names[0]]

    # v√§lj SMOTE-l√§ge
    print("\nSMOTE-l√§gen:")
    print("0 = Ingen SMOTE")
    print("1 = Basic SMOTE (standardparametrar)")
    print("2 = GRID-SMOTE (enkel tuning)")
    print("3 = SMOTUNED-DE (evolution√§r tuning)")
    print("4 = J√§mf√∂r ALLA SMOTE-varianter f√∂r vald dataset + modell")
    smote_mode = input("V√§lj 0 / 1 / 2 / 3 / 4: ").strip()

    # üî∏ Specialfall: smote_mode 4 = k√∂r compare_smote_variants f√∂r EN kombination
    if smote_mode == "4":
        if len(datasets_to_run) == 1 and len(model_names_to_run) == 1:
            ds = datasets_to_run[0]
            mn = model_names_to_run[0]
            df_compare, pivot_compare = compare_smote_variants(ds, mn)
            return df_compare
        else:
            print("\n‚ö† SMOTE-l√§ge 4 kr√§ver att du v√§ljer EXAKT ett dataset och en modell (inte ALL).")
            print("Byter till l√§ge 1 (Basic SMOTE) ist√§llet.\n")
            smote_mode = "1"

    all_results = []

    for ds in datasets_to_run:
        print("\n==============================")
        print(f"K√∂r dataset: {ds}")
        print("==============================\n")

        X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(ds)

        for model_name in model_names_to_run:
            base_models = get_base_models()  # nya instanser
            model = base_models[model_name]

            print(f"\n--- Modell: {model_name} ---")

            # v√§lj tr√§ningsstrategi beroende p√• smote_mode
            if smote_mode == "0":
                print("Ingen SMOTE anv√§nds.\n")
                model.fit(X_train_scaled, y_train)
                used_model = model
                smote_label = "NONE"

            elif smote_mode == "1":
                X_train_smote, y_train_smote = apply_basic_smote(X_train_scaled, y_train)
                model.fit(X_train_smote, y_train_smote)
                used_model = model
                smote_label = "BASIC"

            elif smote_mode == "2":
                used_model = smote_grid_search(model, X_train_scaled, y_train)
                smote_label = "GRID"

            elif smote_mode == "3":
                used_model = smotuned_de(model, X_train_scaled, y_train)
                smote_label = "SMOTUNED-DE"

            else:
                print("Ogiltigt SMOTE-val, anv√§nder ingen SMOTE.")
                model.fit(X_train_scaled, y_train)
                used_model = model
                smote_label = "NONE"

            # utv√§rdera
            eval_results = evaluate_model(used_model, X_test_scaled, y_test)
            print(f"Resultat ‚Äì {ds} ‚Äì {model_name} ‚Äì SMOTE-l√§ge {smote_label}")
            for k, v in eval_results.items():
                if k in ["y_pred", "y_proba"]:
                    continue
                print(f"{k}: {v:.4f}")

            all_results.append({
                "dataset": ds,
                "model": model_name,
                "smote_mode": smote_label,
                "accuracy": eval_results["accuracy"],
                "precision": eval_results["precision"],
                "recall": eval_results["recall"],
                "f1": eval_results["f1"],
                "auc": eval_results["auc"],
            })

    results_df = pd.DataFrame(all_results)
    print("\n=== Sammanfattning av alla k√∂rningar ===")
    display(results_df)

    # fortfarande: om du k√∂r ALL + ALL kan pivot-tabell vara nice
    if all_datasets_selected and all_models_selected and not results_df.empty:
        pivot_f1 = results_df.pivot_table(
            index=["dataset", "model"],
            columns="smote_mode",
            values="f1"
        )
        print("\n=== F1 per dataset/modell och SMOTE-l√§ge ===")
        display(pivot_f1)

    return results_df
results_df = run_experiments_menu()


KOMPLETT 2

In [34]:
# === Del 1: Imports & grundinst√§llningar ===

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.base import clone

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from scipy.optimize import differential_evolution
from IPython.display import display

RANDOM_STATE = 42

# Anpassa beroende p√• var din notebook ligger:
# - om notebook ligger i Thesis/notebooks ‚Üí ".. / data"
# - om notebook ligger direkt i Thesis ‚Üí "data"
DATA_DIR = os.path.join("..", "data")
# DATA_DIR = "data"  # anv√§nd denna ist√§llet om notebooken ligger i rotmappen


# === Del 2: utv√§rderingsfunktion ===

def evaluate_model(model, X_test, y_test):
    """
    Tar en TR√ÑNAD modell + testdata och r√§knar nyckeltal.
    Returnerar en dict med metrics + y_pred/y_proba.
    """
    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        # t.ex. SVC utan predict_proba men med decision_function
        y_proba_raw = model.decision_function(X_test)
        # skala om till [0,1] om det beh√∂vs
        y_proba = (y_proba_raw - y_proba_raw.min()) / (y_proba_raw.max() - y_proba_raw.min() + 1e-9)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_proba)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc,
        "y_pred": y_pred,
        "y_proba": y_proba,
    }


# === Del 3: Dataset-konfiguration & f√∂rberedelse ===

DATASETS = {
    "JM1": {"filename": "jm1.csv", "target": "defects"},
    "KC1": {"filename": "kc1.csv", "target": "defects"},
    "KC2": {"filename": "kc2.csv", "target": "defects"},
    "PC1": {"filename": "pc1.csv", "target": "defects"},
    "CM1": {"filename": "cm1.csv", "target": "defects"},
}

def load_and_prepare_dataset(dataset_name):
    """
    L√§ser in valt dataset, delar i train/test, skalar features.
    SMOTE g√∂r vi separat beroende p√• menyval.
    """
    info = DATASETS[dataset_name]
    path = os.path.join(DATA_DIR, info["filename"])
    df = pd.read_csv(path)

    target_col = info["target"]
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # s√§kerst√§ll 0/1
    if y.dtype == "bool":
        y = y.astype(int)
    elif y.dtype == "object":
        y = y.str.lower().map({"yes": 1, "true": 1, "defective": 1}).fillna(0).astype(int)

    print(f"{dataset_name}: shape={df.shape}")
    print("Klassf√∂rdelning (hela datan):")
    print(y.value_counts(), "\n")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=RANDOM_STATE
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("Train klassf√∂rdelning:")
    print(y_train.value_counts())
    print("\nTest klassf√∂rdelning:")
    print(y_test.value_counts(), "\n")

    return X_train_scaled, X_test_scaled, y_train, y_test


# === Del 4: Modeller (alla basmodeller) ===

def get_base_models():
    """
    Skapar alla modeller vi vill testa.
    """
    log_reg = LogisticRegression(max_iter=1000, n_jobs=-1)

    rf = RandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    xgb = XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    ann = MLPClassifier(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        max_iter=200,
        random_state=RANDOM_STATE,
    )

    svc = SVC(
        kernel="rbf",
        probability=True,   # beh√∂vs f√∂r AUC
        random_state=RANDOM_STATE,
    )

    voting = VotingClassifier(
        estimators=[
            ("logreg", log_reg),
            ("rf", rf),
            ("xgb", xgb),
        ],
        voting="soft"  # anv√§nder sannolikheter
    )

    models = {
        "LogisticRegression": log_reg,
        "RandomForest": rf,
        "XGBoost": xgb,
        "ANN": ann,
        "SVC": svc,
        "Voting": voting,
    }
    return models


# === Del 5: SMOTE-varianter ===

def apply_basic_smote(X_train, y_train):
    """
    Standard-SMOTE med default-parametrar.
    """
    smote = SMOTE(random_state=RANDOM_STATE)
    X_res, y_res = smote.fit_resample(X_train, y_train)
    print("Efter basic SMOTE:")
    print(pd.Series(y_res).value_counts(), "\n")
    return X_res, y_res


def smote_grid_search(model, X_train, y_train):
    """
    Enkel grid search p√• SMOTE-parametrar (inspirerad av SMOTUNED-id√©n).
    """
    pipe = Pipeline([
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", model),
    ])

    # OBS: dubbel underscore f√∂r pipeline-parametrar!
    param_grid = {
        "smote__k_neighbors": [3, 5, 7],
        "smote__sampling_strategy": [0.5, 0.75, 1.0],
    }

    grid = GridSearchCV(
        pipe,
        param_grid,
        scoring="f1",
        cv=3,
        n_jobs=-1,
        verbose=0,
    )

    grid.fit(X_train, y_train)
    print("GRID-SMOTE ‚Äì b√§sta parametrar:", grid.best_params_)
    return grid.best_estimator_


def smotuned_de(model, X_train, y_train):
    """
    F√∂renklad SMOTUNED-id√©:
    differential evolution optimerar SMOTE-parametrar (k_neighbors, sampling_strategy)
    f√∂r att maximera F1 med 3-fold CV.
    """

    def objective(params):
        # params = [k_neighbors, sampling_strategy]
        k = int(round(params[0]))
        k = max(2, min(k, 15))   # h√•ll k inom [2, 15]

        sampling = float(params[1])
        sampling = max(0.2, min(sampling, 1.0))  # sampling_strategy inom [0.2, 1.0]

        smote = SMOTE(
            k_neighbors=k,
            sampling_strategy=sampling,
            random_state=RANDOM_STATE,
        )

        X_res, y_res = smote.fit_resample(X_train, y_train)

        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        scores = []

        for train_idx, val_idx in cv.split(X_res, y_res):
            X_tr, X_val = X_res[train_idx], X_res[val_idx]
            y_tr, y_val = y_res[train_idx], y_res[val_idx]

            m = clone(model)
            m.fit(X_tr, y_tr)
            y_pred = m.predict(X_val)
            scores.append(f1_score(y_val, y_pred, zero_division=0))

        # differential_evolution minimerar, s√• vi returnerar -F1
        return -np.mean(scores)

    bounds = [
        (2, 15),    # k_neighbors
        (0.2, 1.0), # sampling_strategy
    ]

    result = differential_evolution(
        objective,
        bounds,
        maxiter=15,
        popsize=10,
        tol=0.01,
        polish=True,
        disp=False,
    )

    best_k = int(round(result.x[0]))
    best_sampling = float(result.x[1])
    best_k = max(2, min(best_k, 15))
    best_sampling = max(0.2, min(best_sampling, 1.0))

    print("SMOTUNED-DE ‚Äì b√§sta parametrar:")
    print("k_neighbors:", best_k)
    print("sampling_strategy:", best_sampling)

    best_smote = SMOTE(
        k_neighbors=best_k,
        sampling_strategy=best_sampling,
        random_state=RANDOM_STATE,
    )
    X_res_best, y_res_best = best_smote.fit_resample(X_train, y_train)

    final_model = clone(model)
    final_model.fit(X_res_best, y_res_best)

    return final_model


# === Del 6: j√§mf√∂relsefunktion f√∂r EN modell + EN dataset ===

def compare_smote_variants(dataset_name, model_name):
    """
    K√∂r SAMMA dataset + SAMMA modell med:
    - ingen SMOTE
    - basic SMOTE
    - GRID-SMOTE
    - SMOTUNED-DE
    och returnerar en liten tabell med nyckeltal.
    """

    # 1) Ladda och skala data
    X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(dataset_name)

    # 2) H√§mta vald basmodell
    base_models = get_base_models()
    if model_name not in base_models:
        raise ValueError(f"Modell '{model_name}' finns inte. Tillg√§ngliga: {list(base_models.keys())}")
    base_model = base_models[model_name]

    results = []

    # --- 0: Ingen SMOTE ---
    m_none = clone(base_model)
    m_none.fit(X_train_scaled, y_train)
    res_none = evaluate_model(m_none, X_test_scaled, y_test)
    results.append({
        "dataset": dataset_name,
        "model": model_name,
        "smote_mode": "NONE",
        "accuracy": res_none["accuracy"],
        "precision": res_none["precision"],
        "recall": res_none["recall"],
        "f1": res_none["f1"],
        "auc": res_none["auc"],
    })

    # --- 1: Basic SMOTE ---
    X_train_smote, y_train_smote = apply_basic_smote(X_train_scaled, y_train)
    m_basic = clone(base_model)
    m_basic.fit(X_train_smote, y_train_smote)
    res_basic = evaluate_model(m_basic, X_test_scaled, y_test)
    results.append({
        "dataset": dataset_name,
        "model": model_name,
        "smote_mode": "BASIC",
        "accuracy": res_basic["accuracy"],
        "precision": res_basic["precision"],
        "recall": res_basic["recall"],
        "f1": res_basic["f1"],
        "auc": res_basic["auc"],
    })

    # --- 2: GRID-SMOTE ---
    m_grid = smote_grid_search(base_model, X_train_scaled, y_train)
    res_grid = evaluate_model(m_grid, X_test_scaled, y_test)
    results.append({
        "dataset": dataset_name,
        "model": model_name,
        "smote_mode": "GRID",
        "accuracy": res_grid["accuracy"],
        "precision": res_grid["precision"],
        "recall": res_grid["recall"],
        "f1": res_grid["f1"],
        "auc": res_grid["auc"],
    })

    # --- 3: SMOTUNED-DE ---
    m_de = smotuned_de(base_model, X_train_scaled, y_train)
    res_de = evaluate_model(m_de, X_test_scaled, y_test)
    results.append({
        "dataset": dataset_name,
        "model": model_name,
        "smote_mode": "SMOTUNED-DE",
        "accuracy": res_de["accuracy"],
        "precision": res_de["precision"],
        "recall": res_de["recall"],
        "f1": res_de["f1"],
        "auc": res_de["auc"],
    })

    df = pd.DataFrame(results)
    print(f"\n=== J√§mf√∂relse SMOTE-varianter ‚Äì dataset: {dataset_name}, modell: {model_name} ===")
    display(df)

    # liten pivot p√• F1 f√∂r att se snabbt vilken som vinner
    pivot_f1 = df.pivot_table(
        index=["dataset", "model"],
        columns="smote_mode",
        values="f1"
    )
    print("\nF1 per SMOTE-l√§ge:")
    display(pivot_f1)

    return df, pivot_f1


# === Del 7: Meny f√∂r att k√∂ra experiment ===

def run_experiments_menu():
    # v√§lj dataset
    print("Tillg√§ngliga dataset:")
    dataset_names = list(DATASETS.keys())
    for idx, name in enumerate(dataset_names, start=1):
        print(f"{idx} = {name}")
    print("ALL = alla dataset")

    dataset_choice = input("V√§lj dataset (t.ex. 1, 2, 3 eller JM1/KC1/ALL): ").strip().upper()

    all_datasets_selected = False

    if dataset_choice == "ALL":
        datasets_to_run = dataset_names
        all_datasets_selected = True
    elif dataset_choice.isdigit():
        idx = int(dataset_choice) - 1
        if 0 <= idx < len(dataset_names):
            datasets_to_run = [dataset_names[idx]]
        else:
            print("Ogiltigt sifferval, anv√§nder f√∂rsta datasetet.")
            datasets_to_run = [dataset_names[0]]
    else:
        # anta att anv√§ndaren skrev namnet direkt, t.ex. JM1
        if dataset_choice in DATASETS:
            datasets_to_run = [dataset_choice]
        else:
            print("Ogiltigt namn, anv√§nder f√∂rsta datasetet.")
            datasets_to_run = [dataset_names[0]]

    # v√§lj modell(er)
    models = get_base_models()
    print("\nTillg√§ngliga modeller:")
    model_names = list(models.keys())
    for idx, name in enumerate(model_names, start=1):
        print(f"{idx} = {name}")
    print("ALL = alla modeller")

    model_choice = input("V√§lj modell (t.ex. 1, 2 eller RandomForest/ALL): ").strip()

    all_models_selected = False

    if model_choice.upper() == "ALL":
        model_names_to_run = model_names
        all_models_selected = True
    elif model_choice.isdigit():
        idx = int(model_choice) - 1
        if 0 <= idx < len(model_names):
            model_names_to_run = [model_names[idx]]
        else:
            print("Ogiltigt sifferval, anv√§nder f√∂rsta modellen.")
            model_names_to_run = [model_names[0]]
    else:
        if model_choice in models:
            model_names_to_run = [model_choice]
        else:
            print("Ogiltigt modellnamn, anv√§nder f√∂rsta modellen.")
            model_names_to_run = [model_names[0]]

    # v√§lj SMOTE-l√§ge
    print("\nSMOTE-l√§gen:")
    print("0 = Ingen SMOTE")
    print("1 = Basic SMOTE (standardparametrar)")
    print("2 = GRID-SMOTE (enkel tuning)")
    print("3 = SMOTUNED-DE (evolution√§r tuning)")
    print("4 = J√§mf√∂r ALLA SMOTE-varianter f√∂r vald dataset + modell")
    smote_mode = input("V√§lj 0 / 1 / 2 / 3 / 4: ").strip()

    # üî∏ Specialfall: smote_mode 4 = k√∂r compare_smote_variants f√∂r EN kombination
    if smote_mode == "4":
        if len(datasets_to_run) == 1 and len(model_names_to_run) == 1:
            ds = datasets_to_run[0]
            mn = model_names_to_run[0]
            df_compare, pivot_compare = compare_smote_variants(ds, mn)
            return df_compare
        else:
            print("\n‚ö† SMOTE-l√§ge 4 kr√§ver att du v√§ljer EXAKT ett dataset och en modell (inte ALL).")
            print("Byter till l√§ge 1 (Basic SMOTE) ist√§llet.\n")
            smote_mode = "1"

    all_results = []

    for ds in datasets_to_run:
        print("\n==============================")
        print(f"K√∂r dataset: {ds}")
        print("==============================\n")

        X_train_scaled, X_test_scaled, y_train, y_test = load_and_prepare_dataset(ds)

        for model_name in model_names_to_run:
            base_models = get_base_models()  # nya instanser
            model = base_models[model_name]

            print(f"\n--- Modell: {model_name} ---")

            # v√§lj tr√§ningsstrategi beroende p√• smote_mode
            if smote_mode == "0":
                print("Ingen SMOTE anv√§nds.\n")
                model.fit(X_train_scaled, y_train)
                used_model = model
                smote_label = "NONE"

            elif smote_mode == "1":
                X_train_smote, y_train_smote = apply_basic_smote(X_train_scaled, y_train)
                model.fit(X_train_smote, y_train_smote)
                used_model = model
                smote_label = "BASIC"

            elif smote_mode == "2":
                used_model = smote_grid_search(model, X_train_scaled, y_train)
                smote_label = "GRID"

            elif smote_mode == "3":
                used_model = smotuned_de(model, X_train_scaled, y_train)
                smote_label = "SMOTUNED-DE"

            else:
                print("Ogiltigt SMOTE-val, anv√§nder ingen SMOTE.")
                model.fit(X_train_scaled, y_train)
                used_model = model
                smote_label = "NONE"

            # utv√§rdera
            eval_results = evaluate_model(used_model, X_test_scaled, y_test)
            print(f"Resultat ‚Äì {ds} ‚Äì {model_name} ‚Äì SMOTE-l√§ge {smote_label}")
            for k, v in eval_results.items():
                if k in ["y_pred", "y_proba"]:
                    continue
                print(f"{k}: {v:.4f}")

            all_results.append({
                "dataset": ds,
                "model": model_name,
                "smote_mode": smote_label,
                "accuracy": eval_results["accuracy"],
                "precision": eval_results["precision"],
                "recall": eval_results["recall"],
                "f1": eval_results["f1"],
                "auc": eval_results["auc"],
            })

    results_df = pd.DataFrame(all_results)
    print("\n=== Sammanfattning av alla k√∂rningar ===")
    display(results_df)

    # Om du k√∂r ALL + ALL: visa pivot-tabell p√• F1
    if all_datasets_selected and all_models_selected and not results_df.empty:
        pivot_f1 = results_df.pivot_table(
            index=["dataset", "model"],
            columns="smote_mode",
            values="f1"
        )
        print("\n=== F1 per dataset/modell och SMOTE-l√§ge ===")
        display(pivot_f1)

    return results_df
results_df = run_experiments_menu()


Tillg√§ngliga dataset:
1 = JM1
2 = KC1
3 = KC2
4 = PC1
5 = CM1
ALL = alla dataset

Tillg√§ngliga modeller:
1 = LogisticRegression
2 = RandomForest
3 = XGBoost
4 = ANN
5 = SVC
6 = Voting
ALL = alla modeller

SMOTE-l√§gen:
0 = Ingen SMOTE
1 = Basic SMOTE (standardparametrar)
2 = GRID-SMOTE (enkel tuning)
3 = SMOTUNED-DE (evolution√§r tuning)
4 = J√§mf√∂r ALLA SMOTE-varianter f√∂r vald dataset + modell
JM1: shape=(13204, 22)
Klassf√∂rdelning (hela datan):
defects
0    11101
1     2103
Name: count, dtype: int64 

Train klassf√∂rdelning:
defects
0    8881
1    1682
Name: count, dtype: int64

Test klassf√∂rdelning:
defects
0    2220
1     421
Name: count, dtype: int64 

Efter basic SMOTE:
defects
0    8881
1    8881
Name: count, dtype: int64 

GRID-SMOTE ‚Äì b√§sta parametrar: {'smote__k_neighbors': 5, 'smote__sampling_strategy': 1.0}
SMOTUNED-DE ‚Äì b√§sta parametrar:
k_neighbors: 14
sampling_strategy: 0.9998369460786775

=== J√§mf√∂relse SMOTE-varianter ‚Äì dataset: JM1, modell: LogisticRe

Unnamed: 0,dataset,model,smote_mode,accuracy,precision,recall,f1,auc
0,JM1,LogisticRegression,NONE,0.842484,0.532468,0.097387,0.164659,0.719646
1,JM1,LogisticRegression,BASIC,0.711094,0.291971,0.570071,0.386163,0.720185
2,JM1,LogisticRegression,GRID,0.711094,0.291971,0.570071,0.386163,0.720185
3,JM1,LogisticRegression,SMOTUNED-DE,0.70958,0.290049,0.567696,0.383936,0.719507



F1 per SMOTE-l√§ge:


Unnamed: 0_level_0,smote_mode,BASIC,GRID,NONE,SMOTUNED-DE
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JM1,LogisticRegression,0.386163,0.386163,0.164659,0.383936
