In [None]:
from pytorch_tabular import TabularModel
from pytorch_tabular.categorical_encoders import CategoricalEmbeddingTransformer
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models import TabTransformerConfig
import pandas as pd
import numpy as np
import first_analysis
import matplotlib.pyplot as plt

In [None]:
def test_regression(
    regression_data,
    multi_target,
    continuous_cols,
    categorical_cols,
    continuous_feature_transform,
    normalize_continuous_features,
    target_range,
    batch_size,
    epoch,
    num_heads,
    num_attn_blocks
):
    """Fit a TabTransformer regressor on ``train`` and evaluate it on ``test``.

    Parameters
    ----------
    regression_data : tuple
        ``(train, test, target)``: the frame passed to ``fit``, the frame
        passed to ``evaluate``/``predict``, and the list of target column
        names.
    multi_target : bool-like
        When truthy, ``"MedInc"`` is appended to the target list.
    continuous_cols, categorical_cols : list
        Forwarded unchanged to ``DataConfig``.
    continuous_feature_transform, normalize_continuous_features
        Forwarded unchanged to ``DataConfig``.
    target_range : bool-like
        When truthy, the ``(min, max)`` of every target column in ``train``
        is passed to the model configuration as ``target_range``.
    batch_size : int
        Forwarded to ``TrainerConfig``.
    epoch : int
        Used as ``max_epochs`` in ``TrainerConfig``.
    num_heads, num_attn_blocks : int
        Forwarded to ``TabTransformerConfig``; the input embedding dimension
        is set to ``num_heads * 4``.

    Returns
    -------
    The value returned by ``TabularModel.evaluate(test)``.

    Raises
    ------
    AssertionError
        If the prediction frame does not have one row per row of ``test``.
    """
    train_df, test_df, target_cols = regression_data
    if multi_target:
        target_cols = target_cols + ["MedInc"]

    data_config = DataConfig(
        target=target_cols,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
        continuous_feature_transform=continuous_feature_transform,
        normalize_continuous_features=normalize_continuous_features,
    )

    model_kwargs = dict(
        task="regression",
        # Four embedding dimensions per head, so always a multiple of num_heads.
        input_embed_dim=num_heads * 4,
        num_attn_blocks=num_attn_blocks,
        num_heads=num_heads,
        metrics=["mean_absolute_percentage_error", "mean_absolute_error", "r2_score"],
    )
    if target_range:
        # One (min, max) pair per configured target, measured on the training frame.
        model_kwargs["target_range"] = [
            (float(train_df[col].min()), float(train_df[col].max()))
            for col in data_config.target
        ]

    tabular_model = TabularModel(
        data_config=data_config,
        model_config=TabTransformerConfig(**model_kwargs),
        optimizer_config=OptimizerConfig(),
        trainer_config=TrainerConfig(
            max_epochs=epoch,
            checkpoints=None,
            early_stopping=None,
            accelerator="cpu",
            fast_dev_run=False,
            batch_size=batch_size,
        ),
    )
    tabular_model.fit(train=train_df)

    result = tabular_model.evaluate(test_df)
    # Sanity check: exactly one prediction row per evaluated row.
    pred_df = tabular_model.predict(test_df)
    assert pred_df.shape[0] == test_df.shape[0]
    return result

In [None]:
def input_Tab_tests(df_train, df_test, y_train, y_test):
    """Build the train/test frames and column metadata used by the model runs.

    Parameters
    ----------
    df_train, df_test : array-like
        Feature matrices; each one is wrapped in a ``pandas.DataFrame``.
    y_train, y_test : DataFrame or mapping
        Must provide a ``'Year'`` entry holding one value per row of the
        matching feature matrix, in the same row order.

    Returns
    -------
    tuple
        ``(ref_df_train, ref_df_test, lista, lista_target_range, target_column)``
        where ``lista`` is the list of all column names of the training frame
        (the ``'Year'`` column included), ``lista_target_range`` is the list
        of integers 1900..2023 and ``target_column`` is the name of the last
        column of the training frame.

    Raises
    ------
    ValueError
        If the number of ``'Year'`` values differs from the number of rows of
        the corresponding feature matrix.
    """
    ref_df_train = pd.DataFrame(df_train)
    ref_df_test = pd.DataFrame(df_test)

    # Cast the column labels to strings (an ndarray input yields integer labels).
    ref_df_train.columns = ref_df_train.columns.astype(str)
    ref_df_test.columns = ref_df_test.columns.astype(str)

    # Attach the target by position, not by index label: assigning a Series
    # aligns on the index, so labels that do not match the frame's index would
    # silently turn into NaN (or reordered) targets.
    ref_df_train['Year'] = np.asarray(y_train['Year'])
    ref_df_test['Year'] = np.asarray(y_test['Year'])

    lista = list(ref_df_train.columns)
    lista_target_range = list(range(1900, 2024))

    target_column = str(ref_df_train.columns[-1])

    return (ref_df_train, ref_df_test, lista, lista_target_range, target_column)

In [None]:
# To be tried with different datasets.
# Scaled feature matrices for the training and validation splits.
X_train, X_val = (
    np.load(npy_path)
    for npy_path in (
        "./Datasets/Train_Val_Test/Scaled/X_train_scaled.npy",
        "./Datasets/Train_Val_Test/Scaled/X_val_scaled.npy",
    )
)
# Matching target tables, read through the project's own loader.
y_train, y_val = first_analysis.load_df(
    "./Datasets/Train_Val_Test/y_train.csv",
    "./Datasets/Train_Val_Test/y_val.csv",
)

In [None]:
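# Optional: uncomment to keep only the first 200 training rows and the first
# 40 validation rows (e.g. for a quick trial run of the cells below).
# X_train = X_train[:200]
# X_val = X_val[:40]
# y_train = y_train.head(200)
# y_val = y_val.head(40)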

In [None]:
# Turn the loaded arrays into the frames and column metadata used by the runs.
(
    ref_df_train,
    ref_df_test,
    lista,
    lista_target_range,
    target_column,
) = input_Tab_tests(
    df_train=X_train,
    df_test=X_val,
    y_train=y_train,
    y_test=y_val,
)

In [None]:
# Hyperparameter grid, explored exhaustively below (one training run per combination).
epochs = list(range(5, 11))
batch_size = list(range(50, 450, 50))
num_heads = list(range(1, 6))
num_attn_blocks = list(range(1, 6))

# `lista` lists every column of the training frame, the target included.
# Drop the target so it is not fed back to the model as an input feature
# (otherwise the reported metrics are inflated by target leakage).
feature_cols = [col for col in lista if col != target_column]

results = []
for batch in batch_size:
    for epoch in epochs:
        for heads in num_heads:
            for attn_blocks in num_attn_blocks:
                result = test_regression(
                    regression_data=(ref_df_train, ref_df_test, [target_column]),
                    multi_target=None,
                    continuous_cols=feature_cols,
                    categorical_cols=[],
                    continuous_feature_transform=None,
                    normalize_continuous_features=False,
                    target_range=True,
                    batch_size=batch,
                    epoch=epoch,
                    num_heads=heads,
                    num_attn_blocks=attn_blocks,
                )
                # Record the hyperparameters next to the metrics of this run.
                result[0]["batch_size"] = batch
                result[0]["epochs"] = epoch
                result[0]["num_heads"] = heads
                result[0]["num_attn_blocks"] = attn_blocks
                results.append(result)

In [None]:
# Every grid-search entry is the list returned by the evaluation step; keep its
# first element, i.e. the metrics dict extended with the hyperparameters.
list_dict = [run_output[0] for run_output in results]

df_res = pd.DataFrame(data=list_dict)
df_res.to_csv("./df_Res_tabtransformer.csv")

# Display the runs ordered by ascending test loss.
df_res.sort_values(by="test_loss")

In [None]:
# Scatter of the test MAPE against the number of epochs, coloured by batch size.
fig, ax = plt.subplots(figsize=(10, 6))

scatter = ax.scatter(
    df_res['epochs'],
    df_res['test_mean_absolute_percentage_error'],
    c=df_res['batch_size'],
    cmap='Blues',
    s=100,
    label='MAPE',  # the plotted column is the MAPE, not the loss
    alpha=0.6,
    edgecolors='w',
)
# Other metric columns that can be plotted the same way:
# 'test_mean_absolute_error' (MAE) and 'test_r2_score' (R2).

cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label('batch_Size')
ax.set_xlabel('epochs')
ax.set_ylabel('Errore')
ax.set_title('Variazione degli errori al variare delle epoche')
ax.legend()

ax.grid(True)
plt.show()

In [None]:
# TODO: save the model once the best hyperparameters have been found.