In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
print(2)

2


In [2]:
from src.data_loading import load_data
from skrub import MinHashEncoder
from sklearn.decomposition import PCA
from src.utils import FeaturesExtractor, FixedSizeSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from tabpfn import TabPFNClassifier
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import time
from sentence_transformers import SentenceTransformer

In [4]:
def encode(X, encoder_name):
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        X = np.array(X)
    encoder_type, encoder_params = encoder_name.split("__", 1)
    if encoder_type == "lm":
        encoder = SentenceTransformer(encoder_params)
        return encoder.encode(X)
    elif encoder_type == "skrub":
        if encoder_params.startswith("minhash"):
            n_components = int(encoder_params.split("_")[1])
            encoder = MinHashEncoder(n_components=n_components)
            # reshape to 2d array
            # if pandas dataframe, convert to numpy array
            X = X.reshape(-1, 1)
            return encoder.fit_transform(X)
        else:
            raise ValueError(f"Unknown skrub encoder {encoder_params}")

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
class FeaturesExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, n_features=1, method="first"):
        self.n_features = n_features
        self.method = method

    def fit(self, X, y=None):
        return self  # Nothing to fit, so return self

    def transform(self, X):
        # Extract the first n_features
        # choose features to keep
        if self.method == "first":
            res = X[:, :self.n_features]
        elif self.method == "last":
            res = X[:, -self.n_features:]
        elif self.method == "middle":
            res = X[:, self.n_features//2:self.n_features//2+self.n_features]
        elif self.method == "random":
            res = X[:, np.random.choice(X.shape[1], self.n_features, replace=False)]
        elif self.method == "biggest_variance":
            indices = np.argsort(np.var(X, axis=0))[-self.n_features:]
            res = X[:, indices]
        elif self.method == "smallest_variance":
            indices = np.argsort(np.var(X, axis=0))[:self.n_features]
            res = X[:, indices]
        
        assert res.shape == (X.shape[0], self.n_features)
        return res

In [6]:
from skrub import TableVectorizer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [9]:
load_data("spotify", max_rows=10000, include_all_columns=True)

TypeError: load_data() got an unexpected keyword argument 'include_all_columns'

In [7]:
#encodings = ["skrub__minhash_10", "skrub__minhash_30", "lm__all-MiniLM-L12-v2", "lm__whaleloops/phrase-bert"]
encodings = ["lm__all-distilroberta-v1"]
datasets = ["journal_jcr_cls", "movies", "michelin", "spotify", "employee_salary", "drug_directory", "museums", "fifa_footballplayers_22", "jp_anime"]
#dim_reductions = {"PCA_10": PCA(n_components=10), "PCA_30": PCA(n_components=10), "subset_10": FeaturesExtractor(method="first", n_features=10), "subset_30": FeaturesExtractor(method="first", n_features=30),
#                  "subset_biggest_10": FeaturesExtractor(method="biggest_variance", n_features=10), "subset_biggest_30": FeaturesExtractor(method="biggest_variance", n_features=30),
#                  "passthrough": "passthrough"}
#models = {"LogisticRegression": LogisticRegression(), "GradientBoostingClassifier": GradientBoostingClassifier(), "TabPFNClassifier": TabPFNClassifier(device="cpu")}
dim_reductions = {"subset_100": FeaturesExtractor(method="first", n_features=100),
                  "subset_smallest_50": FeaturesExtractor(method="smallest_variance", n_features=50),
                  "subset_smallest_100": FeaturesExtractor(method="smallest_variance", n_features=100),
                  "PCA_100": PCA(n_components=100)}

models = {"TabPFNClassifier": TabPFNClassifier(device="cpu")}
print("Number of iterations: ", len(datasets) * len(encodings) * len(dim_reductions) * len(models))

def run_on_encoded_data(X_enc, X_rest, y, dim_reduction_name, dim_reduction, model_name, model):
    if dim_reduction_name == "passthrough" and model_name != "LogisticRegression" and not encoding.startswith("skrub"):
        return None
    if dim_reduction_name != "passthrough" and encoding.startswith("skrub"):
        return None
    # encode X_rest with the TableVectorizer
    if model_name == "TabPFNClassifier":
        # ordinal encoding for low_cardinality columns
        low_card_cat_transformer = OrdinalEncoder()
    else:
        low_card_cat_transformer = OneHotEncoder(handle_unknown="ignore")
    if model_name == "LogisticRegression":
        numerical_transformer = StandardScaler()
    else:
        numerical_transformer = "passthrough"
    
    rest_trans = TableVectorizer(high_card_cat_transformer=MinHashEncoder(),
                                low_card_cat_transformer=low_card_cat_transformer,
                                numerical_transformer=numerical_transformer)
    
    # Assuming X_enc and X_rest are numpy arrays, you can get their shapes
    n_enc_columns = X_enc.shape[1]
    n_rest_columns = X_rest.shape[1]

    # Create column indices for X_enc and X_rest
    enc_indices = np.arange(n_enc_columns)
    rest_indices = np.arange(n_enc_columns, n_enc_columns + n_rest_columns)

    # Create the ColumnTransformer
    complete_trans = ColumnTransformer(
        transformers=[
            ('dim_reduction', dim_reduction, enc_indices),  # Apply dimensionality reduction to X_enc
            ('rest_trans', rest_trans, rest_indices)  # Apply TableVectorizer to X_rest
        ])
    

    full_X = np.concatenate([X_enc, X_rest], axis=1)


    pipeline = Pipeline([("encoding", complete_trans), ("model", model)])
    cv = FixedSizeSplit(n_splits=5, n_train=1000, n_test=4000)
    scores = cross_val_score(pipeline, full_X, y, scoring="accuracy", cv=cv)
    return scores

results = pd.DataFrame(columns=["dataset", "encoding", "dim_reduction", "model", "accuracy"])

for dataset in tqdm(datasets):
    for encoding in tqdm(encodings, leave=False):
        #TODO do a proper sklearn transform (not a problem for sentence transformers)
        # right now not done for speed reasons and gpu handling
        print(f"Dataset: {dataset}, Encoding: {encoding}")
        X_text, X_rest, y = load_data(dataset, max_rows=10000, include_all_columns=True)
        # encode X_text
        X_enc = encode(X_text, encoding)



        # # run with joblib
        results_data_enc = Parallel(n_jobs=-1)(delayed(run_on_encoded_data)(X_enc, X_rest, y, dim_reduction_name, dim_reduction,model_name, model) for (dim_reduction_name, dim_reduction) in dim_reductions.items() for (model_name, model) in models.items())
        start_time = time.time()
        for dim_reduction_name, dim_reduction in dim_reductions.items():
            for model_name, model in models.items():
                scores = results_data_enc.pop(0)
                if scores is None:
                    continue
                results = pd.concat([results, pd.DataFrame({"dataset": dataset, "encoding": encoding, "dim_reduction": dim_reduction_name, "model": model_name, "accuracy": scores})])
        print(f"Time elapsed: {time.time() - start_time}")



Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Number of iterations:  36


  0%|          | 0/9 [00:00<?, ?it/s]

Dataset: journal_jcr_cls, Encoding: lm__all-distilroberta-v1





TypeError: load_data() got an unexpected keyword argument 'include_all_columns'

In [26]:
# save results
#results.to_csv("results.csv", index=False)
results.to_csv("results2.csv", index=False)

In [27]:
results

Unnamed: 0,dataset,encoding,dim_reduction,model,accuracy
0,journal_jcr_cls,lm__all-distilroberta-v1,subset_100,TabPFNClassifier,0.58700
1,journal_jcr_cls,lm__all-distilroberta-v1,subset_100,TabPFNClassifier,0.58875
2,journal_jcr_cls,lm__all-distilroberta-v1,subset_100,TabPFNClassifier,0.58350
3,journal_jcr_cls,lm__all-distilroberta-v1,subset_100,TabPFNClassifier,0.58575
4,journal_jcr_cls,lm__all-distilroberta-v1,subset_100,TabPFNClassifier,0.59400
...,...,...,...,...,...
0,jp_anime,lm__all-distilroberta-v1,PCA_100,TabPFNClassifier,0.58375
1,jp_anime,lm__all-distilroberta-v1,PCA_100,TabPFNClassifier,0.58775
2,jp_anime,lm__all-distilroberta-v1,PCA_100,TabPFNClassifier,0.56575
3,jp_anime,lm__all-distilroberta-v1,PCA_100,TabPFNClassifier,0.57850


In [3]:
%cd lm_tab/scripts

/scratch/lgrinszt/lm_tab/scripts


In [29]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
#results = pd.read_csv("../results/results.csv")
#esults['mean_accuracy'] = results['accuracy'].apply(np.mean)
#melted_results = results
# "Melting" the data to make it suitable for a swarmplot
#melted_results = results.explode('accuracy')
#melted_results['accuracy'] = melted_results['accuracy'].astype(float)
# average the accuracy scores in the accuracy column

# filter for encoding =lm__all-MiniLM-L12-v2
# take the mean of accuracy
melted_results = results.groupby(['dataset', 'model', 'dim_reduction', 'encoding']).mean().reset_index()
#melted_results = results.explode('accuracy')
#melted_results['accuracy'] = melted_results['accuracy'].astype(float)
#melted_results = melted_results[melted_results['encoding'] == 'lm__all-MiniLM-L12-v2']
print(len(melted_results))

# Creating the swarmplot
# plt.figure(figsize=(15, 20))
# sns.swarmplot(data=melted_results, x='accuracy', y='dataset', hue='model', dodge=True)
# plt.title('Swarm Plot of Model Accuracies Across Datasets')
# plt.xlabel('Accuracy')
# plt.ylabel('Dataset')
# plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
# plt.show()
# Create the plot

# Create the plot
fig = px.strip(
    data_frame=melted_results,
    x="accuracy",
    y="dataset",
    #color="model",
    color="dim_reduction",
    title="Swarm Plot of Model Accuracies Across Datasets",
    labels={"accuracy": "Accuracy", "dataset": "Dataset", "model": "Model"},
    height=600,
    width=900,
)

# Update hover information for each trace (grouped by 'color' or 'model' in this case)
for i, trace in enumerate(fig.data):
    subset_df = melted_results[melted_results['model'] == trace.name]
    hover_template = "<br>".join([f"{col}: %{{customdata[{i}]}}" for i, col in enumerate(subset_df.columns)])
    trace.customdata = subset_df.values
    trace.hovertemplate = hover_template

# Show the figure
fig.show()

36


In [6]:
%cd lm_tab/scripts

/scratch/lgrinszt/lm_tab/scripts


In [7]:
!pwd

/scratch/lgrinszt/lm_tab/scripts


In [8]:
# compare speed on cpu and gpu
X, y = load_data("spotify", max_rows=10000)
# label encoding
#y = y.astype('category').cat.codes
y = y.astype(np.int64)
#X_enc = encode(X, "lm__all-MiniLM-L12-v2")
X, y = X[:500], y[:500]

Original task: classification for spotify


In [7]:
# Choose a tokenizer and BERT model that work together
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from skorch import NeuralNetClassifier
from skorch.callbacks import LRScheduler, ProgressBar
from skorch.hf import HuggingfacePretrainedTokenizer
# import LambdaLR from torch.optim.lr_scheduler
from torch.optim.lr_scheduler import LambdaLR
from src.models import BertAndTabPFN

#TOKENIZER = "distilbert-base-uncased"
#PRETRAINED_MODEL = "distilbert-base-uncased"
TOKENIZER = 'distilroberta-base'
PRETRAINED_MODEL = 'distilroberta-base'

# model hyper-parameters
OPTMIZER = torch.optim.AdamW
LR = 5e-5
MAX_EPOCHS = 3
CRITERION = nn.CrossEntropyLoss
BATCH_SIZE = 8

# device
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

num_training_steps = MAX_EPOCHS * (len(X) // BATCH_SIZE + 1)

def lr_schedule(current_step):
    factor = float(num_training_steps - current_step) / float(max(1, num_training_steps))
    assert factor > 0
    return factor

class BertModule(nn.Module):
    def __init__(self, name, num_labels):
        super().__init__()
        self.name = name
        self.num_labels = num_labels
        
        self.reset_weights()
        
    def reset_weights(self):
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            self.name, num_labels=self.num_labels
        )
        
    def forward(self, **kwargs):
        pred = self.bert(**kwargs)
        return pred.logits
    
class BertAndTabPFNWrapper(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, X, **fit_params):
        y = fit_params.get("y", None)
        return self.model(X, y)
    
#wrapped_model = BertAndTabPFNWrapper(BertAndTabPFN)

class HuggingfacePretrainedTokenizerWithY(HuggingfacePretrainedTokenizer):
    def transform(self, X):
        res = super().transform(X)
        res["y"] = y
        return res

pipeline = Pipeline([
    ('tokenizer', HuggingfacePretrainedTokenizer(TOKENIZER)),
    ('net', NeuralNetClassifier(
        BertAndTabPFN,
        module__dim_tabpfn=50, 
        module__preprocess_before_tabpfn=False,
        #module__name=PRETRAINED_MODEL,
        #module__num_labels=len(set(y)),
        optimizer=OPTMIZER,
        lr=LR,
        max_epochs=MAX_EPOCHS,
        criterion=CRITERION,
        batch_size=BATCH_SIZE,
        iterator_train__shuffle=True,
        device=DEVICE,
        callbacks=[
            LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every='batch'),
            ProgressBar(),
        ],
    )),
])

In [8]:
step1 = HuggingfacePretrainedTokenizer(TOKENIZER).fit_transform(X, y)

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class MergeDictTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def transform(self, X, y=None):
        # Make a copy of X to avoid modifying the original dictionary
        merged_dict = X.copy()
        # Merge y into the copied dictionary
        if y is not None:
            merged_dict.update({"y": y.astype(np.float32)})
            
        return merged_dict

    def fit_transform(self, X, y=None):
        return self.transform(X, y)
    
step2 = MergeDictTransformer().fit_transform(step1, y)

In [10]:
step2["y"].shape

(500,)

In [9]:
import torch.nn as nn
import torch
from tabpfn import TabPFNClassifier
from src.utils import preprocess_input
from transformers import AutoModel
class BertAndTabPFN(nn.Module):
    def __init__(self, linear_translator=False, dim_tabpfn=100, preprocess_before_tabpfn=False,
                 train_tabpfn=False):
        super().__init__()
        #self.bert = BertModel.from_pretrained('bert-base-uncased')
        #self.bert = BertModel.from_pretrained('distilbert-base-uncased')
        self.bert = AutoModel.from_pretrained('distilroberta-base')
        self.tabpfn = TabPFNClassifier().model[2]
        if not train_tabpfn:
            # no requires_grad for the tabpfn
            for param in self.tabpfn.parameters():
                param.requires_grad = False
        self.dim_tabpfn = dim_tabpfn
        self.preprocess_before_tabpfn = preprocess_before_tabpfn
        if linear_translator:
            self.linear_translator = nn.Linear(768, dim_tabpfn)
    
    def forward(self, input_ids, attention_mask, y, tabular_data=None, single_eval_pos=100, **fit_params):
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state[:, 0, :]
        if hasattr(self, 'linear_translator'):
            tabpfn_input = self.linear_translator(bert_embeddings)
        else:
            tabpfn_input = bert_embeddings[:, :self.dim_tabpfn]
        tabpfn_input = tabpfn_input.reshape(tabpfn_input.shape[0], 1, tabpfn_input.shape[1])
        if self.preprocess_before_tabpfn:
            tabpfn_input = preprocess_input(tabpfn_input, y, single_eval_pos, preprocess_transform="none", device=input_ids.device)
        # print shapes
        print("bert_embeddings.shape", bert_embeddings.shape)
        print("tabpfn_input.shape", tabpfn_input.shape)
        print("y.shape", y.shape)
        y = y.reshape(y.shape[0], 1)
        tabpfn_outputs = self.tabpfn((tabpfn_input, y), single_eval_pos=single_eval_pos)
        return tabpfn_outputs
    

In [12]:
#step2_cuda = {k: torch.tensor(v).to("cuda")[:200] for k, v in step2.items()}
#model = BertAndTabPFN().to('cuda')
#model(**step2_cuda)

In [13]:
clf = NeuralNetClassifier(
        BertAndTabPFN,
        module__dim_tabpfn=100, 
        module__preprocess_before_tabpfn=False,
        #module__name=PRETRAINED_MODEL,
        #module__num_labels=len(set(y)),
        optimizer=OPTMIZER,
        lr=LR,
        max_epochs=MAX_EPOCHS,
        criterion=CRITERION,
        batch_size=200,
        iterator_train__shuffle=True,
        device=DEVICE,
        callbacks=[
            LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every='batch'),
            ProgressBar(),
        ],
    )


In [14]:
clf.fit(step2, y)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


  0%|          | 0/3 [00:00<?, ?it/s]

bert_embeddings.shape torch.Size([200, 768])
tabpfn_input.shape torch.Size([200, 1, 100])
y.shape torch.Size([200])


ValueError: Expected input batch_size (100) to match target batch_size (200).

In [20]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklea

class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self  # return self for compatibility reasons

    def transform(self, X, y=None):
        # Your code here that uses both X and y
        print("X", X.shape)
        print("y", y.shape)
        return X
    
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('custom_transformer', CustomTransformer())
])

SyntaxError: invalid syntax (1303619598.py, line 4)

In [None]:
X, y = np.random.rand(100, 10), np.random.randint(0, 2, 100)

pipeline.fit(X, y)

TypeError: float() argument must be a string or a real number, not 'ellipsis'

In [13]:
from transformers import BertForSequenceClassification, BertTokenizer, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import numpy as np
from sklearn.metrics import accuracy_score
from src.utils import preprocess_input

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Your text data and labels (replace these with your actual data and labels)
# Your text data and labels (replace these with your actual data and labels)
#texts = X_original[column_to_consider].tolist()
texts = X.tolist()
#labels = (y_original > np.median(y_original)).tolist()
#labels = y_original.tolist()
labels = y.tolist()

# Tokenize the text data
encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }
        return item

    def __len__(self):
        return len(self.labels)

# Create dataset and dataloaders
#dataset = CustomDataset(encoding, torch.tensor(labels).float().reshape(-1, 1))
#print(f"Dataset size: {len(dataset)}")
train_size = 1000
val_size = 1000 #TODO
#test_size = len(dataset) - train_size - val_size
#print(f"Train size: {train_size}, Val size: {val_size}, Test size: {test_size}")
#train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
#train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
#val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
#test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
train_dataset = CustomDataset(encoding[:train_size], torch.tensor(labels).float().reshape(-1, 1)[:train_size])
val_dataset = CustomDataset(encoding[train_size:train_size+val_size], torch.tensor(labels).float().reshape(-1, 1)[train_size:train_size+val_size])
test_dataset = CustomDataset(encoding[train_size+val_size:], torch.tensor(labels).float().reshape(-1, 1)[train_size+val_size:])
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

# Initialize model and optimizer
#model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = BertAndTabPFN(preprocess_before_tabpfn=True, linear_translator=False).to('cuda')
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    ###########
    # Train loop
    model.train()
    for batch in train_loader:
        input_ids_train = batch['input_ids']
        attention_mask_train = batch['attention_mask']
        labels_train = batch['labels']
        # move the inputs to GPU
        input_ids_train = input_ids_train.to('cuda')
        attention_mask_train = attention_mask_train.to('cuda')
        labels_train = labels_train.to('cuda')
        single_eval_pos = 500
        output = model(input_ids_train, attention_mask=attention_mask_train, y=labels_train, single_eval_pos=single_eval_pos).squeeze()
        loss = nn.CrossEntropyLoss()(output, labels_train[single_eval_pos:].long().reshape(-1))
        if epoch > 0:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # compute train accuracy
        preds = torch.argmax(output, axis=-1).cpu().detach().numpy()
        train_accuracy = accuracy_score(labels_train[single_eval_pos:].cpu().detach().numpy().reshape(-1), preds)
        print(f"Epoch {epoch + 1} - Training loss: {loss}, Training accuracy: {train_accuracy}")
    # Validation loop
    model.eval()
    val_preds, val_labels, val_losses = [], [], []
    best_val_loss = np.inf
    with torch.no_grad():
        for batch in val_loader: #TODO: remove the useless for loop
            input_ids_val = batch['input_ids']
            attention_mask_val = batch['attention_mask']
            labels_val = batch['labels']
            # move the inputs to GPU
            input_ids_val = input_ids_val.to('cuda')
            attention_mask_val = attention_mask_val.to('cuda')
            labels_val = labels_val.to('cuda')
            # concatenate train and val
            #TODO: make sure this is correct, no leak etc
            # maybe safer to create a TabPFNClassifier with the same parameters as the one in BertAndTabPFN
            input_ids = torch.cat((input_ids_train, input_ids_val), axis=0)
            attention_mask = torch.cat((attention_mask_train, attention_mask_val), axis=0)
            labels = torch.cat((labels_train, labels_val), axis=0)
            single_eval_pos = 1000
            output = model(input_ids, attention_mask=attention_mask, y=labels, single_eval_pos=single_eval_pos).squeeze()
            val_loss = nn.CrossEntropyLoss()(output, labels[single_eval_pos:].long().reshape(-1))
            if val_loss < best_val_loss:
                print(f"New best validation loss: {val_loss}")
                best_val_loss = val_loss
                # save the model
                torch.save(model.state_dict(), "checkpoints/model.pt")
                # save input_ids_train, attention_mask_train, labels_train
                torch.save(input_ids_train, "checkpoints/input_ids_train.pt")
                torch.save(attention_mask_train, "checkpoints/attention_mask_train.pt")
                torch.save(labels_train, "checkpoints/labels_train.pt")


            val_losses.append(val_loss.cpu())
            preds = torch.argmax(output, axis=-1).cpu().detach().numpy()
            val_preds.append(preds)
            val_labels.append(labels[single_eval_pos:].cpu().detach().numpy().reshape(-1))
    val_preds = np.concatenate(val_preds)
    val_labels = np.concatenate(val_labels)
    val_losses = np.mean(val_losses)
    print(f"Epoch {epoch + 1} - Validation loss: {val_losses}")
    # Compute accuracy
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch + 1} - Validation accuracy: {val_accuracy}")

# Load the best model
model.load_state_dict(torch.load("checkpoints/model.pt"))
input_ids_train = torch.load("checkpoints/input_ids_train.pt")
attention_mask_train = torch.load("checkpoints/attention_mask_train.pt")
labels_train = torch.load("checkpoints/labels_train.pt")

# Test loop
model.eval()
test_preds, test_labels, test_losses = [], [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids_test = batch['input_ids']
        attention_mask_test = batch['attention_mask']
        labels_test = batch['labels']
        # move the inputs to GPU
        input_ids_test = input_ids_test.to('cuda')
        attention_mask_test = attention_mask_test.to('cuda')
        labels_test = labels_test.to('cuda')
        # concatenate train and val
        #TODO put this back
        #input_ids = torch.cat((input_ids_train, input_ids_test), axis=0)
        #attention_mask = torch.cat((attention_mask_train, attention_mask_test), axis=0)
        #labels = torch.cat((labels_train, labels_test), axis=0)
        single_eval_pos = 1000

        output = model(input_ids_test, attention_mask=attention_mask_test, y=labels_test, single_eval_pos=single_eval_pos).squeeze()
        test_loss = nn.CrossEntropyLoss()(output, labels_test[single_eval_pos:].long().reshape(-1))
        test_losses.append(test_loss.cpu())
        preds = torch.argmax(output, axis=-1).cpu().detach().numpy()
        test_preds.append(preds)
        test_labels.append(labels_test[single_eval_pos:].cpu().detach().numpy().reshape(-1))

test_preds = np.concatenate(test_preds)
test_labels = np.concatenate(test_labels)
test_losses = np.mean(test_losses)
print(f"Test loss: {test_losses}")
print(f"Test accuracy: {accuracy_score(test_labels, test_preds)}")

# Save the model
# model.save_pretrained("./fine_tuned_bert")


ValueError: batch_size should be a positive integer value, but got batch_size=0