In [13]:

import time
import torch
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random
from experiments.pytorch_tablu.utils import preprocess
# import numpy as np
import pandas as pd
# import wandb
from rich import print

from typing import Callable
from datetime import datetime

from time import time
from pytorch_tabular import model_sweep
import warnings

# from pytorch_tabular import TabularModel
# from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)
from pytorch_tabular.models.common.heads import LinearHeadConfig


from pytorch_tabular.utils import (
    make_mixed_dataset,
    print_metrics,
    load_covertype_dataset,
)
from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy

# import os
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:

# Feature dictionary shipped with the dataset. Its "Feature Name",
# "Extended description", "Feature Type" and "Variable Type" columns are used
# below to decide which role (target / categorical / numeric) each column plays.
Feature_Explain = pd.read_excel("data/Amex_ori/Amex Campus.xlsx")
Feature_target_cols = Feature_Explain["Feature Name"][
    Feature_Explain["Extended description"] == "Target"
].to_list()
Feature_cat_col_names = Feature_Explain["Feature Name"][
    Feature_Explain["Feature Type"] == "categorical"
].to_list()
# NOTE(review): numeric features are selected on "Variable Type" whereas
# categorical ones are selected on "Feature Type" -- confirm this asymmetry
# matches the layout of the spreadsheet.
Feature_num_col_names = Feature_Explain["Feature Name"][
    Feature_Explain["Variable Type"] == "numeric"
].to_list()

# Preferred accelerator name: MPS when available, otherwise "gpu" when CUDA is
# available, otherwise "cpu". The value is only printed in this cell.
# NOTE(review): "gpu" is not a valid torch.device string ("cuda" is) -- confirm
# the naming expected by whatever consumes DEVICE before passing it on.
DEVICE = (
    "mps"
    if torch.backends.mps.is_available()
    else ("gpu" if torch.cuda.is_available() else "cpu")
)
print(DEVICE)

# %%
data = pd.read_parquet("data/Amex_ori/Amex Campus Challenge Train 3.parquet")

data = preprocess(data)

# Keep only the columns that the feature dictionary knows about, preserving the
# column order of the preprocessed frame.
cols = data.columns.to_list()
target_cols = [col for col in cols if col in Feature_target_cols]
if len(target_cols) == 0:
    # Raise instead of calling exit(): inside a notebook exit() shuts the kernel
    # down, which hides the reason for the failure and discards all state.
    raise ValueError("No target column found")
cat_col_names = [col for col in cols if col in Feature_cat_col_names]
num_col_names = [col for col in cols if col in Feature_num_col_names]

print(f"{target_cols}")

# %%

# Column order of the modelling frame: categorical, numeric, then targets.
needed_cols = cat_col_names + num_col_names + target_cols
data = data[needed_cols]

print(
    f"target_cols={target_cols}, cat_col_names={cat_col_names}, num_col_names={num_col_names}"
)

# %%
# Data: DF, cat_col_names, num_col_names

# 1. activate(0.57%) X recom(12.7%) -> 0~3

# _,data=train_test_split(data,test_size=0.05)




In [16]:
# Hold out 20% of the rows for testing. The seed is fixed so that a fresh
# "Restart & Run All" reproduces the same split and therefore comparable
# metrics between runs.
SPLIT_SEED = 42
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Drop the reference to the full frame to free memory. As a consequence this
# cell can only be re-run after the loading cell has been re-run.
del data
print(f"train.shape={train.shape}, test.shape={test.shape}")


In [17]:
# Summary statistics of the training split, shown as the cell output.
train.describe()


Unnamed: 0,merchant_profile_01,customer_spend_06,customer_spend_07,customer_profile_01,customer_profile_02,customer_profile_03,customer_digital_activity_02,customer_profile_04,distance_05,ind_recommended,activation
count,2638385.0,2667987.0,2667987.0,2784362.0,2784362.0,2767502.0,2777636.0,2783800.0,2784724.0,2784724.0,2784724.0
mean,260.3839,250.6789,119.1629,2941.422,2657.957,49.98276,8.476522,180.9109,5.830544,0.4999465,0.005997004
std,138.849,353.9141,98.20016,21283.97,19334.83,35.51472,18.50103,139.627,8.262838,0.5000001,0.07720778
min,101.0,1.0,1.0,-999.0,-33031.22,0.0,0.0,1.0,0.00106013,0.0,0.0
25%,101.0,41.0,31.0,129.63,107.72,15.19437,1.166667,65.0,1.75227,0.0,0.0
50%,319.0,142.0,95.0,769.03,623.0,48.98091,2.833333,157.0,3.581526,0.0,0.0
75%,406.0,360.0,195.0,2503.83,2150.94,85.32891,7.833333,257.0,7.402355,1.0,0.0
max,507.0,121583.0,360.0,4186172.0,3401788.0,100.0,2756.5,763.0,2540.838,1.0,1.0


In [18]:

# Summary statistics of the two label columns in the held-out split, for a
# side-by-side check against the training split's statistics.
test[['activation', 'ind_recommended']].describe()


Unnamed: 0,activation,ind_recommended
count,309414.0,309414.0
mean,0.006263,0.500482
std,0.078894,0.500001
min,0.0,0.0
25%,0.0,0.0
50%,0.0,1.0
75%,0.0,1.0
max,1.0,1.0


In [19]:

# The sweep is trained on the second detected target column. Fail with a clear
# message when fewer than two targets were found, instead of a bare
# "list index out of range".
if len(target_cols) < 2:
    raise IndexError(
        f"Expected at least two target columns to select target_cols[1], got {target_cols}"
    )
target_col = target_cols[1]

data_config = DataConfig(
    # target should always be a list. Multi-targets are only supported for
    # regression. Multi-Task Classification is not implemented
    target=[target_col],
    continuous_cols=num_col_names,
    categorical_cols=cat_col_names,
    num_workers=0,
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=1024,
    max_epochs=125,
    # Early stopping is disabled; to enable it, replace the None below with the
    # commented-out settings.
    early_stopping=None,
    # early_stopping="valid_loss",  # Monitor valid_loss for early stopping
    # early_stopping_mode="min",  # Set the mode as min because for val_loss, lower is better
    # early_stopping_patience=5,  # No. of epochs of degradation training will wait before terminating
    checkpoints="valid_loss",  # Save best checkpoint monitoring val_loss
    checkpoints_path="checkpoints",  # Directory the checkpoints are written to
    # checkpoints_save_top_k=5,
    # progress_bar="simple",
    load_best=True,  # After training, load the best checkpoint
    # accelerator=DEVICE,
    trainer_kwargs=dict(enable_model_summary=False),  # Turning off model summary
)
# Library defaults for the optimizer.
optimizer_config = OptimizerConfig()

head_config = LinearHeadConfig(
    layers="",  # No additional layer in head, just a mapping layer to output_dim
    dropout=0.1,
    initialization="kaiming",
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)



In [20]:
from pathlib import Path


def save_sweep_results(sweep_df, save_dir, best_model):
    """Write the sweep table and the best model under ``save_dir``.

    Args:
        sweep_df: Sweep result table. It must contain the "params",
            "time_taken" and "epochs" columns, which are removed before the
            table is written.
        save_dir: Output directory (``str`` or path-like). It is created,
            including missing parents, when it does not exist yet.
        best_model: Object exposing ``save_model(path)``; it is called with
            ``"<save_dir>/best_model.ckpt"``.

    Side effects:
        Creates ``save_dir`` if needed, writes ``<save_dir>/sweep_result.xlsx``
        and calls ``best_model.save_model``.
    """
    # exist_ok=True makes the call a no-op when the directory is already there,
    # so no separate (and previously never-executed) existence check is needed.
    Path(save_dir).mkdir(parents=True, exist_ok=True)

    # drop() returns a new frame; its result has to be the one that is written,
    # otherwise the columns are still present in the saved file.
    trimmed = sweep_df.drop(columns=["params", "time_taken", "epochs"])
    trimmed.to_excel(f"{save_dir}/sweep_result.xlsx", index=False)
    best_model.save_model(f"{save_dir}/best_model.ckpt")


def visualize_sweep_results(sweep_df):
    """Build a colour-graded view of the sweep table.

    The "params", "time_taken" and "epochs" columns are dropped. The
    "test_accuracy" and "test_f1_score" columns get the "RdYlGn" gradient, and
    "time_taken_per_epoch" and "test_loss" get the reversed "RdYlGn_r" gradient.

    Args:
        sweep_df: Sweep result table containing the columns named above.

    Returns:
        The styled object. It has to be returned so that the caller can display
        it, for example by making the call the last expression of a cell.
    """
    return (
        sweep_df.drop(columns=["params", "time_taken", "epochs"])
        .style.background_gradient(
            subset=["test_accuracy", "test_f1_score"], cmap="RdYlGn"
        )
        .background_gradient(
            subset=["time_taken_per_epoch", "test_loss"], cmap="RdYlGn_r"
        )
    )



In [21]:
%%time
# Run the model sweep on the train/test split and time the whole cell.
# All Python warnings raised inside the `with` block are silenced.

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sweep_df, best_model = model_sweep(
        task="classification",  # One of "classification", "regression"
        train=train,
        test=test,
        data_config=data_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        model_list="lite",  # presumably a library-defined preset of models -- confirm in the docs
        common_model_args=dict(head="LinearHead", head_config=head_config),
        # The three metrics_* lists are positional: entry i of each list
        # belongs to metrics[i] (macro averaging and probability input apply
        # to f1_score only).
        metrics=["accuracy", "f1_score"],
        metrics_params=[{}, {"average": "macro"}],
        metrics_prob_input=[False, True],
        # NOTE(review): models are ranked on accuracy. The describe() output
        # of the training split shows a mean of about 0.006 for `activation`,
        # so if that is the target, accuracy is dominated by the majority
        # class -- consider ranking on f1_score instead.
        rank_metric=("accuracy", "higher_is_better"),
        progress_bar=True,
        verbose=True,
        suppress_lightning_logger=True,
    )
    



Output()

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

AssertionError: 

In [None]:

# Persist the sweep table and the best model first. Jupyter renders only the
# value of a cell's last expression, so the visualisation call goes last.
save_sweep_results(sweep_df, f"sweep_results for {target_col}", best_model)
visualize_sweep_results(sweep_df)


NameError: name 'sweep_df' is not defined