In [1]:
import sys, os

src = r"C:\Users\user\Desktop\Coding mo\AutoML proj1\task 1\AutoML Project Molham\src" # copy the location of the folder that has paths.py file using (ctrl + shift + C ) and paste it here    
sys.path.append(src)

from paths import *

get_paths()

sys.path.append(SRC_PATH)

from feature_engineering import *



In [4]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import pickle

def run_ablation_study_with_save(
    ablation_dict,
    pipeline_data_dict,
    timeout_per_fold=60,
    cpu_limit=1
):
    import pickle, gc, shutil, tempfile, torch

    all_results = []
    models_dict = {}
    X_dict = {}
    feature_names_dict = {}

    for dataset_name, folds_dict in pipeline_data_dict.items():
        target_col = TARGET_COLS[dataset_name]

        models_dict[dataset_name] = {}
        X_dict[dataset_name] = {}
        feature_names_dict[dataset_name] = {}

        dataset_results = []
        dataset_base = dataset_name.replace(".csv", "")

        for fold_name, split in folds_dict.items():
            df_train = split["train"]
            df_val = split["val"]
            X_train = df_train.drop(columns=[target_col])
            y_train = df_train[target_col]
            X_val = df_val.drop(columns=[target_col])
            y_val = df_val[target_col]

            X_dict[dataset_name][fold_name] = X_val.copy()
            models_dict[dataset_name][fold_name] = {}
            feature_names_dict[dataset_name][fold_name] = {}

            for ablation_name, config in ablation_dict.items():


                pipeline = pipeline_builder(dataset_name, X=X_train, enable_steps=config)
                pipeline.fit(X_train, y_train)

                X_train_proc = pipeline.transform(X_train)
                X_val_proc = pipeline.transform(X_val)

                feature_names = [f"f_{i}" for i in range(X_val_proc.shape[1])]
                X_train_proc = pd.DataFrame(X_train_proc, columns=feature_names)
                X_val_proc = pd.DataFrame(X_val_proc, columns=feature_names)

                feature_names_dict[dataset_name][fold_name][ablation_name] = {
                    "before": X_train.columns.tolist(),
                    "after": feature_names
                }

                del pipeline
                gc.collect()


                train_df = pd.concat(
                    [X_train_proc.reset_index(drop=True),
                     pd.Series(y_train.values, name=target_col)],
                    axis=1
                )
                val_df = pd.concat(
                    [X_val_proc.reset_index(drop=True),
                     pd.Series(y_val.values, name=target_col)],
                    axis=1
                )

                del X_train_proc
                gc.collect()

                task = Task(tasks_dict_classification_only[dataset_name])

                temp_dir = tempfile.mkdtemp()
                start = time.time()

                automl = TabularAutoML(
                    task=task,
                    timeout=timeout_per_fold,
                    cpu_limit=cpu_limit,
                    reader_params={"n_jobs": cpu_limit},
                    general_params={"use_algos": [["lgb", "lgb_tuned"]]},
                )

                _ = automl.fit_predict(train_df, roles={"target": target_col})


                preds_raw = automl.predict(val_df).data
                preds = (preds_raw[:, 0] > 0.5)

                runtime = round(time.time() - start, 3)


                model_path = os.path.join(
                    ABLATION_MODELS_FOR_SHAP_PATH,
                    f"{dataset_base}_{fold_name}_{ablation_name}.pkl"
                )
                os.makedirs(ABLATION_MODELS_FOR_SHAP_PATH, exist_ok=True)

                with open(model_path, "wb") as f:
                    pickle.dump(automl, f)

                models_dict[dataset_name][fold_name][ablation_name] = model_path

                del automl, preds_raw
                shutil.rmtree(temp_dir, ignore_errors=True)
                gc.collect()

                try:
                    torch.cuda.empty_cache()
                except:
                    pass


                fold_idx = int(fold_name.split("_")[1])

                r = {
                    "dataset": dataset_name,
                    "fold": fold_idx,
                    "ablation": ablation_name,
                    "accuracy": accuracy_score(y_val, preds),
                    "f1": f1_score(y_val, preds, average="weighted"),
                    "precision": precision_score(y_val, preds, average="weighted"),
                    "recall": recall_score(y_val, preds, average="weighted"),
                    "runtime": runtime
                }

                all_results.append(r)
                dataset_results.append(r)

            del X_val_proc
            gc.collect()


        df_dataset = pd.DataFrame(dataset_results)
        os.makedirs(ABLATION_METRICS_PATH, exist_ok=True)
        df_dataset.to_csv(
            os.path.join(ABLATION_METRICS_PATH, f"ablation_{dataset_base}.csv"),
            index=False
        )

        df_plot = df_dataset.groupby("ablation")["f1"].mean().sort_values()
        plt.figure(figsize=(8,5))
        df_plot.plot(kind="bar")
        plt.title(f"Ablation Study - {dataset_name}")
        plt.ylabel("F1 Score")
        plt.xticks(rotation=45)
        plt.tight_layout()

        os.makedirs(ABLATION_PLOTS_PATH, exist_ok=True)
        plt.savefig(os.path.join(ABLATION_PLOTS_PATH, f"ablation_{dataset_base}_all.png"))
        plt.close()

    with open(os.path.join(ABLATION_MODELS_FOR_SHAP_PATH, "all_results.pkl"), "wb") as f:
        pickle.dump(all_results, f)

    with open(os.path.join(ABLATION_MODELS_FOR_SHAP_PATH, "best_models_dict.pkl"), "wb") as f:
        pickle.dump(models_dict, f)

    with open(os.path.join(ABLATION_MODELS_FOR_SHAP_PATH, "X_dict.pkl"), "wb") as f:
        pickle.dump(X_dict, f)

    with open(os.path.join(ABLATION_MODELS_FOR_SHAP_PATH, "feature_names_dict.pkl"), "wb") as f:
        pickle.dump(feature_names_dict, f)

    return all_results, models_dict, X_dict, feature_names_dict


all_results, best_models_dict, X_dict, feature_names_dict = run_ablation_study_with_save(
    ablation_dict=ablation_dict,
    pipeline_data_dict=pipeline_data_dict,
    timeout_per_fold=ABLATION_TIME_BUDGET,   
    cpu_limit=4                               
)


In [9]:
for dataset_name in data_dict_classification_only.keys():
    dataset_base = dataset_name.replace('.csv', '')
    
    csv_file1  = os.path.join(ABLATION_METRICS_PATH, f"ablation_{dataset_base}.csv")
    plot_file1 = os.path.join(ABLATION_PLOTS_PATH, f"ablation_{dataset_base}_all.png")

    print(f"CSV for {dataset_base}: ablation_{dataset_base}.csv")
    df1 = pd.read_csv(csv_file1)
    display(df1)

    # print(f"Plot for {dataset_base}: ablation_{dataset_base}_all.png")
    # display(Image(plot_file1))

CSV for modeldata: ablation_modeldata.csv


Unnamed: 0,dataset,fold,ablation,accuracy,f1,precision,recall,runtime
0,modeldata.csv,1,no_feature_engineering,0.930397,0.930529,0.930737,0.930397,16.247
1,modeldata.csv,1,polynomial_only,0.930397,0.930529,0.930737,0.930397,16.37
2,modeldata.csv,1,binning_only,0.816423,0.811754,0.814383,0.816423,17.365
3,modeldata.csv,1,polynomial_and_binning,0.816423,0.811754,0.814383,0.816423,16.861
4,modeldata.csv,1,vif_and_binning,0.816423,0.811754,0.814383,0.816423,17.581
5,modeldata.csv,2,no_feature_engineering,0.931631,0.931747,0.931923,0.931631,17.186
6,modeldata.csv,2,polynomial_only,0.931631,0.931747,0.931923,0.931631,13.753
7,modeldata.csv,2,binning_only,0.822605,0.81867,0.82057,0.822605,13.508
8,modeldata.csv,2,polynomial_and_binning,0.822605,0.81867,0.82057,0.822605,13.374
9,modeldata.csv,2,vif_and_binning,0.822605,0.81867,0.82057,0.822605,13.08


CSV for titanic: ablation_titanic.csv


Unnamed: 0,dataset,fold,ablation,accuracy,f1,precision,recall,runtime
0,titanic.csv,1,no_feature_engineering,0.769058,0.742632,0.811221,0.769058,9.24
1,titanic.csv,1,polynomial_only,0.769058,0.742632,0.811221,0.769058,5.357
2,titanic.csv,1,binning_only,0.670404,0.609892,0.691669,0.670404,5.49
3,titanic.csv,1,polynomial_and_binning,0.670404,0.609892,0.691669,0.670404,5.429
4,titanic.csv,1,vif_and_binning,0.641256,0.533561,0.706687,0.641256,5.172
5,titanic.csv,2,no_feature_engineering,0.831461,0.828491,0.831235,0.831461,10.584
6,titanic.csv,2,polynomial_only,0.831461,0.828491,0.831235,0.831461,10.607
7,titanic.csv,2,binning_only,0.680899,0.64512,0.681564,0.680899,6.728
8,titanic.csv,2,polynomial_and_binning,0.680899,0.64512,0.681564,0.680899,6.556
9,titanic.csv,2,vif_and_binning,0.667416,0.611113,0.679811,0.667416,6.312


CSV for train: ablation_train.csv


Unnamed: 0,dataset,fold,ablation,accuracy,f1,precision,recall,runtime
0,train.csv,1,no_feature_engineering,0.0,0.0,0.0,0.0,13.351
1,train.csv,1,polynomial_only,0.0,0.0,0.0,0.0,13.36
2,train.csv,1,binning_only,0.0,0.0,0.0,0.0,18.014
3,train.csv,1,polynomial_and_binning,0.0,0.0,0.0,0.0,18.006
4,train.csv,1,vif_and_binning,0.0,0.0,0.0,0.0,17.746
5,train.csv,2,no_feature_engineering,0.0,0.0,0.0,0.0,17.954
6,train.csv,2,polynomial_only,0.0,0.0,0.0,0.0,18.186
7,train.csv,2,binning_only,0.0,0.0,0.0,0.0,17.925
8,train.csv,2,polynomial_and_binning,0.0,0.0,0.0,0.0,17.513
9,train.csv,2,vif_and_binning,0.0,0.0,0.0,0.0,17.231


CSV for wine: ablation_wine.csv


Unnamed: 0,dataset,fold,ablation,accuracy,f1,precision,recall,runtime
0,wine.csv,1,no_feature_engineering,0.011236,0.007491,0.005618,0.011236,16.807
1,wine.csv,1,polynomial_only,0.011236,0.007491,0.005618,0.011236,16.996
2,wine.csv,1,binning_only,0.044944,0.034709,0.030623,0.044944,16.177
3,wine.csv,1,polynomial_and_binning,0.044944,0.034709,0.030623,0.044944,16.0
4,wine.csv,1,vif_and_binning,0.044944,0.034709,0.030623,0.044944,16.248
5,wine.csv,2,no_feature_engineering,0.022472,0.019768,0.019379,0.022472,17.009
6,wine.csv,2,polynomial_only,0.022472,0.019768,0.019379,0.022472,16.913
7,wine.csv,2,binning_only,0.033708,0.027122,0.02513,0.033708,17.105
8,wine.csv,2,polynomial_and_binning,0.033708,0.027122,0.02513,0.033708,15.91
9,wine.csv,2,vif_and_binning,0.033708,0.027122,0.02513,0.033708,16.636
