# CROSS-VALIDATION — Ringkasan per Fold (dari artefak)

Notebook ini bertujuan untuk mengolah hasil *cross-validation* (CV) yang tersimpan sebagai artefak. Prosesnya adalah membaca semua file `cv_*.csv`, mengidentifikasi kombinasi hyperparameter terbaik, lalu merangkum skornya ke dalam sebuah file Excel yang mudah dibaca dan siap untuk presentasi.

---

### Tahapan Proses

1.  **Membaca Data**: Membaca semua file dengan pola `cv_*.csv` yang berada di direktori `./artefacts_2024_window_full`.

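2.  **Identifikasi Kombinasi Terbaik**: Untuk setiap file, notebook mencari baris dengan `rank_test_score` terkecil (**rank teratas**); jika kolom tersebut tidak ada, dipilih baris dengan `mean_test_score` tertinggi (**rata-rata skor tertinggi**). Baris inilah yang mewakili kombinasi hyperparameter terbaik.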

3.  **Ekstraksi Skor**: Dari baris terbaik tersebut, notebook akan mengekstrak skor dari setiap *fold* (`split0_test_score`, `split1_test_score`, ..., `splitK_test_score`) beserta nilai rata-ratanya (`mean_test_score`).

4.  **Menulis Output**: Hasil ekstraksi disusun rapi dan ditulis ke dalam sebuah file **Excel** (`cross_validation_for_slide.xlsx` di direktori `./reports_2024_window_full`).

In [1]:
from pathlib import Path
import re, json
import numpy as np
import pandas as pd

# Input artefacts are read from ARTE_DIR; reports are written to OUT_DIR,
# which is created here if it does not exist yet.
ARTE_DIR = Path("./artefacts_2024_window_full")
OUT_DIR = Path("./reports_2024_window_full")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Display order of the rows (kept tidy, like on the slide).
ORDER = ["Naïve Bayes", "SVM (linear)", "Logistic Regression", "Stacking"]
FEAT_ORDER = ["CountVectorizer", "TF-IDF"]

# Name each label once so LABEL_MAP cannot drift from ORDER / FEAT_ORDER.
_NB, _SVM, _LR, _STACK = ORDER
_COUNT, _TFIDF = FEAT_ORDER

# Mapping: pipeline name -> (algorithm label, feature-extraction label).
LABEL_MAP = {
    "count_nb": (_NB, _COUNT),
    "tfidf_nb": (_NB, _TFIDF),
    "count_svm": (_SVM, _COUNT),
    "tfidf_svm": (_SVM, _TFIDF),
    "count_lr": (_LR, _COUNT),
    "tfidf_lr": (_LR, _TFIDF),
    "stack_count": (_STACK, _COUNT),
    "stack_tfidf": (_STACK, _TFIDF),
}



In [2]:
def read_best_folds(path: Path):
    """Return the per-fold scores of the best row in a CV results CSV.

    The best row is chosen as follows:

    - if a ``rank_test_score`` column exists, the row with the minimum rank;
    - otherwise, if ``mean_test_score`` exists, the row with the maximum mean;
    - otherwise, the row whose ``split<i>_test_score`` columns have the
      highest average.

    Args:
        path: CSV file to read with ``pandas.read_csv``.

    Returns:
        A tuple ``(folds, mean, split_cols)`` where ``folds`` is the list of
        per-split scores of the best row (ordered by split index), ``mean`` is
        its ``mean_test_score`` (or the average of ``folds`` when that column
        is absent), and ``split_cols`` is the ordered list of split column
        names.

    Raises:
        ValueError: if the CSV contains no rows.
        KeyError: if the CSV has none of ``rank_test_score``,
            ``mean_test_score`` or ``split<i>_test_score`` columns.
    """
    df = pd.read_csv(path)
    if df.empty:
        raise ValueError(f"{path}: CV results file contains no rows")

    # Collect split columns together with their numeric index so that
    # split10 sorts after split9 (a plain string sort would not).
    split_pattern = re.compile(r"split(\d+)_test_score$")
    indexed = []
    for col in df.columns:
        found = split_pattern.match(col)
        if found:
            indexed.append((int(found.group(1)), col))
    indexed.sort(key=lambda pair: pair[0])
    split_cols = [col for _, col in indexed]

    has_mean = "mean_test_score" in df.columns
    if "rank_test_score" in df.columns:
        best_idx = df["rank_test_score"].astype(int).idxmin()
    elif has_mean:
        best_idx = df["mean_test_score"].astype(float).idxmax()
    elif split_cols:
        # No precomputed ranking at all: rank rows by their own fold average.
        best_idx = df[split_cols].astype(float).mean(axis=1).idxmax()
    else:
        raise KeyError(
            "rank_test_score / mean_test_score / split<i>_test_score"
        )

    best = df.loc[best_idx]
    folds = [float(best[col]) for col in split_cols]
    mean = float(best["mean_test_score"]) if has_mean else float(np.mean(folds))
    return folds, mean, split_cols


In [3]:
# Read the JSON summary to learn each pipeline's best_params.
cv_json = ARTE_DIR / "cv_results_2024_window.json"
if cv_json.exists():
    summary = json.loads(cv_json.read_text(encoding="utf-8"))
else:
    # No summary file: fall back to an empty list so best_map is empty.
    summary = []
# Index the dict entries by their "name"; anything else is ignored.
best_map = {
    entry["name"]: entry
    for entry in summary
    if isinstance(entry, dict) and "name" in entry
}

def is_balanced(name: str) -> bool:
    """Tell whether the best parameters stored for ``name`` use balanced weights.

    Looks up ``name`` in the module-level ``best_map`` and inspects its
    ``"best_params"`` mapping. Returns True when any parameter whose key
    contains ``"class_weight"`` has the value ``"balanced"`` or is a list
    containing ``"balanced"``. Returns False otherwise, including when the
    pipeline or its ``best_params`` is missing or empty.
    """
    entry = best_map.get(name) or {}
    params = entry.get("best_params", {}) or {}
    return any(
        "class_weight" in key
        and (
            value == "balanced"
            or (isinstance(value, list) and "balanced" in value)
        )
        for key, value in params.items()
    )


In [4]:
# Build one summary row per known pipeline from its cv_<pipeline>.csv file.
rows = []
max_folds = 0

for cv_file in sorted(ARTE_DIR.glob("cv_*.csv")):
    # Strip only the leading "cv_" (guaranteed by the glob pattern);
    # str.replace would also remove any later "cv_" inside the name.
    pipe = cv_file.stem[len("cv_"):]  # e.g., 'count_nb'
    if pipe not in LABEL_MAP:
        continue
    folds, mean, split_cols = read_best_folds(cv_file)
    max_folds = max(max_folds, len(folds))
    algo, feat = LABEL_MAP[pipe]
    rows.append({
        "Pipeline": pipe,
        "Algoritma": algo,
        "Feature Extraction": feat,
        "Balanced?": is_balanced(pipe),
        **{f"Fold {i}": score for i, score in enumerate(folds, start=1)},
        "Mean": mean,
    })

if not rows:
    print("No cv_*.csv file in", ARTE_DIR, "matched a pipeline in LABEL_MAP")

# Fix the column layout explicitly: this keeps "Mean" last even when files
# have different fold counts, and yields a frame with the expected columns
# (instead of a KeyError below) when no file matched.
id_cols = ["Pipeline", "Algoritma", "Feature Extraction", "Balanced?"]
fold_cols = [f"Fold {i}" for i in range(1, max_folds + 1)]
df_cv = pd.DataFrame(rows, columns=id_cols + fold_cols + ["Mean"])

# Sort rows: imbalanced first, then by algorithm and feature display order.
df_cv["AlgoOrder"]  = df_cv["Algoritma"].map({a: i for i, a in enumerate(ORDER)})
df_cv["FeatOrder"]  = df_cv["Feature Extraction"].map({f: i for i, f in enumerate(FEAT_ORDER)})
df_cv = df_cv.sort_values(["Balanced?", "AlgoOrder", "FeatOrder"]).drop(columns=["AlgoOrder", "FeatOrder"])
df_cv


Unnamed: 0,Pipeline,Algoritma,Feature Extraction,Balanced?,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean
1,count_nb,Naïve Bayes,CountVectorizer,False,0.73922,0.742743,0.741772,0.741133,0.740777,0.741129
6,tfidf_nb,Naïve Bayes,TF-IDF,False,0.784355,0.791308,0.790564,0.790213,0.787679,0.788824
3,stack_count,Stacking,CountVectorizer,False,0.945853,0.949326,0.949855,0.94986,0.947964,0.948572
4,stack_tfidf,Stacking,TF-IDF,False,0.94284,0.946462,0.946518,0.947092,0.94535,0.945652
2,count_svm,SVM (linear),CountVectorizer,True,0.920855,0.925108,0.925547,0.926224,0.924115,0.92437
7,tfidf_svm,SVM (linear),TF-IDF,True,0.926178,0.930621,0.931044,0.931297,0.928449,0.929518
0,count_lr,Logistic Regression,CountVectorizer,True,0.944141,0.948123,0.948374,0.948224,0.94647,0.947066
5,tfidf_lr,Logistic Regression,TF-IDF,True,0.914222,0.918514,0.91885,0.919346,0.916813,0.917549


In [5]:
# === Satu sheet gabungan (ada baris judul "Imbalanced" dan "Balanced") ===
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side
from openpyxl.utils import get_column_letter
import numpy as np
import pandas as pd

def _write_group(ws, title, frame, k_folds, start_row):
    """Write one titled score table into ``ws`` and return the next free row.

    Layout, starting at ``start_row``:

    - one merged title row spanning all table columns;
    - a two-level header ("Algoritma", "Feature Extraction", "Fold" spanning
      the fold columns with the fold numbers underneath, and "Mean");
    - one row per record of ``frame``, with scores formatted as percentages
      and missing scores shown as "–".

    Args:
        ws: openpyxl worksheet to write into.
        title: text of the group title row.
        frame: DataFrame with "Algoritma", "Feature Extraction", "Mean" and
            optionally "Fold 1" .. "Fold k" columns.
        k_folds: number of fold columns to render (must be >= 1).
        start_row: 1-based worksheet row where the title is written.

    Returns:
        The row index to use as ``start_row`` for the next group (one blank
        row is left after the last data row).

    Raises:
        ValueError: if ``k_folds`` is smaller than 1.
    """
    if k_folds < 1:
        raise ValueError("k_folds must be >= 1")

    first_fold_col = 3
    mean_col = first_fold_col + k_folds  # also the last column of the table

    # Group title row.
    ws.merge_cells(start_row=start_row, start_column=1,
                   end_row=start_row, end_column=mean_col)
    title_cell = ws.cell(row=start_row, column=1, value=title)
    title_cell.font = Font(bold=True)
    title_cell.alignment = Alignment(horizontal="left", vertical="center")

    # Two-level header. Cells are addressed explicitly so that placement
    # depends only on start_row, not on the worksheet's append cursor.
    top = start_row + 1
    sub = top + 1
    ws.cell(row=top, column=1, value="Algoritma")
    ws.cell(row=top, column=2, value="Feature Extraction")
    ws.cell(row=top, column=first_fold_col, value="Fold")
    ws.cell(row=top, column=mean_col, value="Mean")
    for i in range(k_folds):
        ws.cell(row=sub, column=first_fold_col + i, value=str(i + 1))

    ws.merge_cells(start_row=top, start_column=1, end_row=sub, end_column=1)
    ws.merge_cells(start_row=top, start_column=2, end_row=sub, end_column=2)
    if k_folds > 1:
        # With a single fold there is nothing to span; skip the 1x1 merge.
        ws.merge_cells(start_row=top, start_column=first_fold_col,
                       end_row=top, end_column=mean_col - 1)
    ws.merge_cells(start_row=top, start_column=mean_col,
                   end_row=sub, end_column=mean_col)

    thin = Side(style="thin", color="000000")
    border = Border(left=thin, right=thin, top=thin, bottom=thin)
    bold = Font(bold=True)
    center = Alignment(horizontal="center", vertical="center", wrap_text=True)
    left = Alignment(horizontal="left", vertical="center", wrap_text=True)

    # One fill per header row, built once instead of once per cell.
    header_fills = (
        (top, PatternFill("solid", fgColor="D9D9D9")),
        (sub, PatternFill("solid", fgColor="EEEEEE")),
    )
    for header_row, fill in header_fills:
        for cc in range(1, mean_col + 1):
            c = ws.cell(row=header_row, column=cc)
            c.font = bold
            c.alignment = center
            c.border = border
            c.fill = fill

    def put_score(row_idx, col_idx, value):
        # Write a score as a percentage, or "–" when it is missing.
        cell = ws.cell(row=row_idx, column=col_idx)
        if pd.isna(value):
            cell.value = "–"
        else:
            cell.value = float(value)
            cell.number_format = "0.00%"
        cell.alignment = center

    r = sub + 1
    for _, rec in frame.iterrows():
        ws.cell(row=r, column=1, value=rec["Algoritma"]).alignment = left
        ws.cell(row=r, column=2, value=rec["Feature Extraction"]).alignment = left
        for i in range(k_folds):
            put_score(r, first_fold_col + i, rec.get(f"Fold {i+1}", np.nan))
        # Treat a missing mean like a missing fold instead of writing NaN.
        put_score(r, mean_col, rec["Mean"])
        for cc in range(1, mean_col + 1):
            ws.cell(row=r, column=cc).border = border
        r += 1

    return r + 1  # next start row

# Prepare the data (uses df_cv from the previous step): split into the
# imbalanced and balanced groups and drop the helper columns.
_helper_cols = ["Balanced?", "Pipeline"]
df_imbal = (
    df_cv[df_cv["Balanced?"].eq(False)]
    .drop(columns=_helper_cols)
    .reset_index(drop=True)
)
df_bal = (
    df_cv[df_cv["Balanced?"].eq(True)]
    .drop(columns=_helper_cols)
    .reset_index(drop=True)
)

# Number of fold columns: derived from the data (at least 1), or force a
# fixed value (e.g. 5) by un-commenting the override below.
K = max(sum(1 for c in df_cv.columns if c.startswith("Fold ")), 1)
# K = 5  # <- un-comment to always render 5 fold columns

wb = Workbook()
ws = wb.active
ws.title = "CV_for_Slide"

# Both groups go on the same sheet, one below the other.
row = 1
for group_title, group_frame in (("Imbalanced", df_imbal), ("Balanced", df_bal)):
    row = _write_group(ws, group_title, group_frame, K, row)

# Column widths: two label columns, K fold columns, then the mean column.
widths = [22, 20] + [10] * K + [12]
from openpyxl.utils import get_column_letter
for col_idx, col_width in enumerate(widths, start=1):
    letter = get_column_letter(col_idx)
    ws.column_dimensions[letter].width = col_width

from pathlib import Path
out_combined = OUT_DIR / "cross_validation_for_slide.xlsx"
wb.save(out_combined)
print("Saved ->", out_combined)


Saved -> reports_2024_window_full\cross_validation_for_slide.xlsx
