In [1]:
import pandas as pd
import numpy as np
import time, gc
from itertools import combinations
from memory_profiler import memory_usage
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score
import scikit_posthocs as sp
from scipy.stats import friedmanchisquare


In [3]:
# Ganti path sesuai file kamu
dataset_paths = {
    "BreastCancer": "medis_breast_cancer.csv",
    "MNIST": "citra_mnist.csv",
    "NewsGroup": "teks_20newsgroups.csv"
}

datasets = {}
for name, path in dataset_paths.items():
    df = pd.read_csv(path)
    df.dropna(inplace=True)
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    datasets[name] = (X, y)


In [4]:
TOP_K = 10
RFE_STEP = 50
N_SPLITS = 3

fs_methods = {
    "Chi2": SelectKBest(chi2, k=TOP_K),
    "RFE": RFE(LogisticRegression(solver="liblinear"), n_features_to_select=TOP_K, step=RFE_STEP),
    "LASSO": SelectFromModel(LogisticRegression(penalty="l1", solver="liblinear"), max_features=TOP_K)
}

models = {
    "LR": LogisticRegression(solver="liblinear"),
    "RF": RandomForestClassifier(n_estimators=100, random_state=42)
}


In [5]:
def jaccard_index(set1, set2):
    return len(set1 & set2) / len(set1 | set2)

def kuncheva_index(sets, k, N):
    R = len(sets)
    sum_k = 0
    for (s1, s2) in combinations(sets, 2):
        sum_k += len(s1 & s2)
    expected_overlap = (k**2)/N
    kuncheva = (2 / (R * (R - 1))) * sum_k - expected_overlap
    kuncheva /= (k - expected_overlap)
    return kuncheva

def run_eval_with_memory(fs_selector, model, X_train, X_test, y_train, y_test):
    def train_and_eval():
        fs_selector.fit(X_train, y_train)
        X_train_fs = fs_selector.transform(X_train)
        X_test_fs = fs_selector.transform(X_test)
        model.fit(X_train_fs, y_train)
        y_pred = model.predict(X_test_fs)
        return f1_score(y_test, y_pred, average='macro'), accuracy_score(y_test, y_pred)
    mem_usage, (f1, acc) = memory_usage(proc=train_and_eval, retval=True, max_usage=True)
    return f1, acc, mem_usage


In [6]:
all_results = []

for domain, (X, y) in datasets.items():
    print(f"[DEBUG] Processing domain: {domain}")
    X = MinMaxScaler().fit_transform(X)
    results = []
    feature_sets = {fs: [] for fs in fs_methods}
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    for fs_name, selector in fs_methods.items():
        for model_name, model in models.items():
            f1_scores, acc_scores, times, memories = [], [], [], []
            for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]
                gc.collect()
                start_time = time.time()
                f1, acc, mem = run_eval_with_memory(selector, model, X_train, X_test, y_train, y_test)
                end_time = time.time()
                selector.fit(X_train, y_train)
                selected_idx = set(np.where(selector.get_support())[0])
                feature_sets[fs_name].append(selected_idx)
                f1_scores.append(f1)
                acc_scores.append(acc)
                times.append(end_time - start_time)
                memories.append(mem)
                print(f"[DEBUG] {domain} | {fs_name}+{model_name} | Fold-{fold+1} Done")
            results.append({
                "Domain": domain,
                "FS_Method": fs_name,
                "Model": model_name,
                "F1_macro": np.mean(f1_scores),
                "Accuracy": np.mean(acc_scores),
                "Time_sec": np.mean(times),
                "Memory_MB": np.mean(memories)
            })

    results_df = pd.DataFrame(results)

    # Jaccard & Kuncheva
    jaccard_scores = []
    kuncheva_scores = []
    k = TOP_K
    N = X.shape[1]

    for fs_name in fs_methods.keys():
        sets = feature_sets[fs_name]
        jaccards = [jaccard_index(s1, s2) for s1, s2 in combinations(sets, 2)]
        jaccard_scores.append(np.mean(jaccards))
        kuncheva_scores.append(kuncheva_index(sets, k, N))

    results_df["Jaccard_Index"] = np.repeat(jaccard_scores, len(models))
    results_df["Kuncheva_Index"] = np.repeat(kuncheva_scores, len(models))

    # Friedman & Nemenyi
    pivot = results_df.pivot(index="Model", columns="FS_Method", values="F1_macro")
    friedman_stat, friedman_p = friedmanchisquare(*[pivot[c] for c in pivot.columns])
    results_df["Friedman_pvalue"] = friedman_p

    all_results.append(results_df)

[DEBUG] Processing domain: BreastCancer
[DEBUG] BreastCancer | Chi2+LR | Fold-1 Done
[DEBUG] BreastCancer | Chi2+LR | Fold-2 Done
[DEBUG] BreastCancer | Chi2+LR | Fold-3 Done
[DEBUG] BreastCancer | Chi2+RF | Fold-1 Done
[DEBUG] BreastCancer | Chi2+RF | Fold-2 Done
[DEBUG] BreastCancer | Chi2+RF | Fold-3 Done
[DEBUG] BreastCancer | RFE+LR | Fold-1 Done
[DEBUG] BreastCancer | RFE+LR | Fold-2 Done
[DEBUG] BreastCancer | RFE+LR | Fold-3 Done
[DEBUG] BreastCancer | RFE+RF | Fold-1 Done
[DEBUG] BreastCancer | RFE+RF | Fold-2 Done
[DEBUG] BreastCancer | RFE+RF | Fold-3 Done
[DEBUG] BreastCancer | LASSO+LR | Fold-1 Done
[DEBUG] BreastCancer | LASSO+LR | Fold-2 Done
[DEBUG] BreastCancer | LASSO+LR | Fold-3 Done
[DEBUG] BreastCancer | LASSO+RF | Fold-1 Done
[DEBUG] BreastCancer | LASSO+RF | Fold-2 Done
[DEBUG] BreastCancer | LASSO+RF | Fold-3 Done
[DEBUG] Processing domain: MNIST
[DEBUG] MNIST | Chi2+LR | Fold-1 Done
[DEBUG] MNIST | Chi2+LR | Fold-2 Done
[DEBUG] MNIST | Chi2+LR | Fold-3 Done
[DE

In [7]:
final_df = pd.concat(all_results, ignore_index=True)
final_df.to_csv("fs_evaluation_3domain_full.csv", index=False)
print("[INFO] Saved to fs_evaluation_3domain_full.csv")
final_df.head()


[INFO] Saved to fs_evaluation_3domain_full.csv


Unnamed: 0,Domain,FS_Method,Model,F1_macro,Accuracy,Time_sec,Memory_MB,Jaccard_Index,Kuncheva_Index,Friedman_pvalue
0,BreastCancer,Chi2,LR,0.931952,0.938522,4.789171,808.565104,0.90303,0.92,0.135335
1,BreastCancer,Chi2,RF,0.939022,0.943785,2.343114,808.89974,0.90303,0.92,0.135335
2,BreastCancer,RFE,LR,0.947087,0.952594,4.266086,809.252604,0.90303,0.92,0.135335
3,BreastCancer,RFE,RF,0.940883,0.945558,1.630952,809.313802,0.90303,0.92,0.135335
4,BreastCancer,LASSO,LR,0.951679,0.956094,4.044001,809.604167,0.893333,0.11,0.135335
