In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from src.data_loading import load_data
from skrub import MinHashEncoder
from sklearn.decomposition import PCA
from src.utils import FeaturesExtractor, FixedSizeSplit
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from tabpfn import TabPFNClassifier
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import time
from sentence_transformers import SentenceTransformer

In [4]:
def encode(X, encoder_name):
    """Encode text entries with the encoder described by ``encoder_name``.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray or sequence
        Entries to encode. pandas containers are converted to a numpy array first.
    encoder_name : str
        ``"<type>__<params>"``, split on the first ``"__"``. Supported forms:

        * ``"lm__<model name>"``: ``<model name>`` is passed to ``SentenceTransformer``.
        * ``"skrub__minhash_<n>"``: ``MinHashEncoder(n_components=<n>)`` fitted on ``X``.

    Returns
    -------
    The value returned by the selected encoder's ``encode`` / ``fit_transform``.

    Raises
    ------
    ValueError
        If ``encoder_name`` has no ``"__"`` separator, or names an unknown encoder
        type or an unknown skrub encoder.
    """
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = np.array(X)

    encoder_type, separator, encoder_params = encoder_name.partition("__")
    if not separator:
        raise ValueError(
            f"encoder_name must look like '<type>__<params>', got {encoder_name!r}"
        )

    if encoder_type == "lm":
        encoder = SentenceTransformer(encoder_params)
        return encoder.encode(X)

    if encoder_type == "skrub":
        if not encoder_params.startswith("minhash"):
            raise ValueError(f"Unknown skrub encoder {encoder_params}")
        n_components = int(encoder_params.split("_")[1])
        encoder = MinHashEncoder(n_components=n_components)
        # The encoder is fed a single 2D column; np.asarray lets plain lists through too.
        X = np.asarray(X).reshape(-1, 1)
        return encoder.fit_transform(X)

    # Previously an unknown type fell through and silently returned None.
    raise ValueError(f"Unknown encoder type {encoder_type}")

In [5]:
%cd lm_tab/scripts

/scratch/lgrinszt/lm_tab/scripts


In [6]:
!pwd

/scratch/lgrinszt/lm_tab/scripts


In [7]:
# Load the "spotify" dataset; max_rows=10000 presumably caps the number of rows (verify in src.data_loading).
X, y = load_data("spotify", max_rows=10000)
# Cast the targets to int64 (an alternative category-code encoding is kept below for reference)
#y = y.astype('category').cat.codes
y = y.astype(np.int64)
# Disabled: pre-computing sentence-transformer embeddings with encode()
#X_enc = encode(X, "lm__all-MiniLM-L12-v2")

Original task: classification for spotify


In [8]:
import torch.nn as nn
import torch
from tabpfn import TabPFNClassifier
from src.utils import preprocess_input
from transformers import AutoModel
class BertAndTabPFN(nn.Module):
    """Text encoder whose per-sample embedding is fed to a TabPFN network as tabular features.

    Parameters
    ----------
    linear_translator : bool
        If True, a trainable ``nn.Linear`` maps the encoder embedding to ``dim_tabpfn``
        features. If False, the first ``dim_tabpfn`` embedding coordinates are used.
    dim_tabpfn : int
        Number of features handed to the TabPFN network.
    preprocess_before_tabpfn : bool
        If True, ``preprocess_input`` (from ``src.utils``) is applied to the features first.
    train_tabpfn : bool
        If False, the TabPFN parameters are frozen (``requires_grad = False``).
    transformer_name : str
        Model name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, linear_translator=False, dim_tabpfn=100, preprocess_before_tabpfn=False,
                 train_tabpfn=False, transformer_name="distilroberta-base"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(transformer_name)
        self.raw_tabpfn = TabPFNClassifier()
        # Element 2 of the classifier's `model` is used as the torch module;
        # presumably the underlying transformer -- TODO confirm against the installed tabpfn version.
        self.tabpfn = self.raw_tabpfn.model[2]
        if not train_tabpfn:
            # Only the text encoder (and the optional translator) receive gradients.
            for param in self.tabpfn.parameters():
                param.requires_grad = False
        self.dim_tabpfn = dim_tabpfn
        self.preprocess_before_tabpfn = preprocess_before_tabpfn
        if linear_translator:
            # Size the translator from the loaded encoder instead of assuming 768,
            # so encoders with another hidden size work; 768 remains the fallback.
            hidden_size = getattr(self.bert.config, "hidden_size", 768)
            self.linear_translator = nn.Linear(hidden_size, dim_tabpfn)

    def forward(self, input_ids, attention_mask, y, tabular_data=None, single_eval_pos=100, return_tabpfn_input=False):
        """Encode the texts and run the TabPFN network on the resulting features.

        Parameters
        ----------
        input_ids, attention_mask : torch.Tensor
            Tokenizer output for all rows, passed to the text encoder.
        y : torch.Tensor
            One label per row; reshaped to ``(n_rows, 1)`` before the TabPFN call.
        tabular_data : unused
            Accepted but currently ignored.
        single_eval_pos : int
            Forwarded to the TabPFN network. Callers in this notebook compare the
            output against ``y[single_eval_pos:]``.
        return_tabpfn_input : bool
            If True, return the ``(n_rows, n_features)`` features and skip TabPFN.

        Returns
        -------
        torch.Tensor
            The TabPFN output, or the feature matrix when ``return_tabpfn_input`` is True.
        """
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Embedding of the first token position of every sequence.
        bert_embeddings = bert_outputs.last_hidden_state[:, 0, :]
        if hasattr(self, 'linear_translator'):
            tabpfn_input = self.linear_translator(bert_embeddings)
        else:
            tabpfn_input = bert_embeddings[:, :self.dim_tabpfn]
        if return_tabpfn_input:
            return tabpfn_input
        # Insert a singleton middle axis: (n_rows, 1, n_features).
        tabpfn_input = tabpfn_input.reshape(tabpfn_input.shape[0], 1, tabpfn_input.shape[1])
        if self.preprocess_before_tabpfn:
            tabpfn_input = preprocess_input(tabpfn_input, y, single_eval_pos, preprocess_transform="none", device=input_ids.device)
        y = y.reshape(y.shape[0], 1)
        tabpfn_outputs = self.tabpfn((tabpfn_input, y), single_eval_pos=single_eval_pos)
        return tabpfn_outputs
    

In [9]:
from transformers import BertForSequenceClassification, BertTokenizer, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import numpy as np
from sklearn.metrics import accuracy_score
from src.utils import preprocess_input
from icecream import ic

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per sample."""

    def __init__(self, encodings, labels):
        # `encodings` must expose indexable 'input_ids' and 'attention_mask' entries.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is driven by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {field: self.encodings[field][idx] for field in ('input_ids', 'attention_mask')}
        sample['labels'] = self.labels[idx]
        return sample
    

def evaluate_model(dataloader, model, input_ids_train, attention_mask_train, labels_train):
    """Score ``model`` on ``dataloader``, using the training batch as labelled context.

    Every evaluation batch is appended to the training tensors and the model is
    called with ``single_eval_pos = len(input_ids_train)``; loss and accuracy are
    computed on the rows after that position (the evaluation rows).

    Parameters
    ----------
    dataloader : iterable of dict
        Batches with 'input_ids', 'attention_mask' and 'labels' tensors.
    model : torch.nn.Module
        Called as ``model(input_ids, attention_mask=..., y=..., single_eval_pos=...)``.
    input_ids_train, attention_mask_train, labels_train : torch.Tensor
        Training context. Evaluation batches are moved to the device of
        ``input_ids_train`` before concatenation.

    Returns
    -------
    (loss, accuracy)
        Unweighted mean of the per-batch cross-entropy losses, and the accuracy
        over all evaluation rows.

    Raises
    ------
    ValueError
        If ``dataloader`` yields no batch.

    Notes
    -----
    Prints progress lines. The model is switched to eval mode for the duration of
    the call and then returned to the mode it was in before.
    """
    print("Evaluating model")
    # Follow the context tensors instead of assuming a GPU is present.
    device = input_ids_train.device
    single_eval_pos = len(input_ids_train)
    criterion = nn.CrossEntropyLoss()
    was_training = model.training
    model.eval()
    val_preds, val_labels, val_losses = [], [], []
    try:
        with torch.no_grad():
            for batch in dataloader:
                input_ids_val = batch['input_ids'].to(device)
                attention_mask_val = batch['attention_mask'].to(device)
                labels_val = batch['labels'].to(device)
                # Context rows first, evaluation rows after single_eval_pos.
                #TODO: make sure this is correct, no leak etc
                # maybe safer to create a TabPFNClassifier with the same parameters as the one in BertAndTabPFN
                input_ids = torch.cat((input_ids_train, input_ids_val), dim=0)
                attention_mask = torch.cat((attention_mask_train, attention_mask_val), dim=0)
                labels = torch.cat((labels_train, labels_val), dim=0)
                print(f"Train size: {len(input_ids_train)}, Val size: {len(input_ids_val)}")
                output = model(input_ids, attention_mask=attention_mask, y=labels, single_eval_pos=single_eval_pos)
                targets = labels[single_eval_pos:].long().reshape(-1)
                # One row of logits per target; unlike a bare squeeze() this also
                # survives a batch that holds a single evaluation row.
                logits = output.reshape(targets.shape[0], -1)
                val_losses.append(criterion(logits, targets).item())
                val_preds.append(torch.argmax(logits, dim=-1).cpu().numpy())
                val_labels.append(targets.cpu().numpy())
    finally:
        # Put the model back in the mode the caller had it in, even on failure.
        model.train(was_training)

    if not val_preds:
        raise ValueError("evaluate_model received a dataloader that yielded no batches")

    val_preds = np.concatenate(val_preds)
    val_labels = np.concatenate(val_labels)
    loss = np.mean(val_losses)
    accuracy = accuracy_score(val_labels, val_preds)
    return loss, accuracy
    

def train_model(X_enc_dic, y, transformer_name="distilroberta-base"):
    """Fine-tune a ``BertAndTabPFN`` model with early stopping on a held-out split.

    The first 80% of the samples (in the given order) are used for training and
    the rest for validation. Each epoch processes the training set as one
    shuffled batch: the first 400 rows are passed as ``single_eval_pos`` context
    and the loss is computed on the remaining rows. The weights with the best
    validation loss are written to ``checkpoints/model.pt`` and reloaded at the end.

    Parameters
    ----------
    X_enc_dic : dict
        Tokenizer output with sliceable 'input_ids' and 'attention_mask' entries.
    y : array-like
        Labels aligned with ``X_enc_dic``; sliced and passed to ``torch.tensor``.
    transformer_name : str
        Forwarded to ``BertAndTabPFN``.

    Returns
    -------
    (model, input_ids, attention_mask, labels)
        The model (best checkpoint, if one was written) and the training batch
        of the last epoch concatenated with the validation batch, on the GPU.

    Raises
    ------
    ValueError
        If the training split has 400 rows or fewer, which would leave no row to
        compute the training loss on.
    """
    import os  # local import: only needed to create the checkpoint directory

    device = 'cuda'
    checkpoint_path = "checkpoints/model.pt"
    single_eval_pos = 400  # training rows before this index are context, the rest are scored

    n_samples = len(X_enc_dic["input_ids"])
    train_size = int(0.8 * n_samples)
    val_size = n_samples - train_size
    print(f"Train size: {train_size}, Val size: {val_size}")
    if train_size <= single_eval_pos:
        raise ValueError(
            f"train_model needs more than {single_eval_pos} training rows, got {train_size}"
        )

    X_train = {k: v[:train_size] for k, v in X_enc_dic.items()}
    X_val = {k: v[train_size:] for k, v in X_enc_dic.items()}
    y_train, y_val = y[:train_size], y[train_size:]
    train_dataset = CustomDataset(X_train, torch.tensor(y_train).float().reshape(-1, 1))
    val_dataset = CustomDataset(X_val, torch.tensor(y_val).float().reshape(-1, 1))
    # Full-batch loaders: each pass over a loader yields exactly one batch.
    train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)

    # Initialize model and optimizer
    model = BertAndTabPFN(preprocess_before_tabpfn=True, linear_translator=False, transformer_name=transformer_name).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    es_patience = 3
    es_tolerance = 1e-4
    es_counter = 0

    num_epochs = 20
    best_val_loss = np.inf
    checkpoint_saved = False
    # torch.save does not create missing directories.
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    for epoch in range(num_epochs):
        ###########
        # Train loop
        model.train()
        train_preds, train_labels, train_losses = [], [], []
        for batch in train_loader:
            input_ids_train = batch['input_ids'].to(device)
            attention_mask_train = batch['attention_mask'].to(device)
            labels_train = batch['labels'].to(device)
            output = model(input_ids_train, attention_mask=attention_mask_train, y=labels_train, single_eval_pos=single_eval_pos).squeeze()
            query_labels = labels_train[single_eval_pos:]
            loss = criterion(output, query_labels.long().reshape(-1))
            # The first epoch only measures the un-tuned model; no weight update.
            if epoch > 0: #TODO: remove this, this is just for testing
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            train_losses.append(loss.item())
            train_preds.append(torch.argmax(output, dim=-1).cpu().detach().numpy())
            train_labels.append(query_labels.cpu().detach().numpy().reshape(-1))
        train_preds = np.concatenate(train_preds)
        train_labels = np.concatenate(train_labels)
        train_losses = np.mean(train_losses)
        print(f"Epoch {epoch + 1} - Training loss: {train_losses}, Training accuracy: {accuracy_score(train_labels, train_preds)}")

        # Validation: the whole training batch of this epoch is the context.
        val_loss, val_accuracy = evaluate_model(val_loader, model, input_ids_train, attention_mask_train, labels_train)
        print(f"Epoch {epoch + 1} - Validation loss: {val_loss}, Validation accuracy: {val_accuracy}")
        if val_loss < best_val_loss - es_tolerance:
            print(f"Validation loss decreased from {best_val_loss} to {val_loss}")
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            es_counter += 1
            print(f"Early stopping counter: {es_counter}")
            if es_counter >= es_patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    # Reload the best weights, but never a stale file left by an earlier run or fold
    # (possible when no epoch improved, e.g. the validation loss was NaN throughout).
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path))
    else:
        print("No checkpoint was written during training; keeping the last weights")

    # The validation loader is full-batch, so its single batch is the whole split.
    val_batch = next(iter(val_loader))
    input_ids_val = val_batch['input_ids'].to(device)
    attention_mask_val = val_batch['attention_mask'].to(device)
    labels_val = val_batch['labels'].to(device)
    # Train + validation rows together form the context used at test time.
    input_ids = torch.cat((input_ids_train, input_ids_val), dim=0)
    attention_mask = torch.cat((attention_mask_train, attention_mask_val), dim=0)
    labels = torch.cat((labels_train, labels_val), dim=0)

    return model, input_ids, attention_mask, labels




In [10]:
from src.utils import FixedSizeSplit
# Cross-validated fine-tuning: tokenize the corpus once, then slice the encoded
# tensors for every fold.
cv = FixedSizeSplit(n_splits=5, n_train=1000, n_test=4000)
transformer_name = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(transformer_name)
texts = X.tolist()
all_encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
test_losses, test_accuracies = [], []

for train_index, test_index in cv.split(X, y):
    X_train = {k: v[train_index] for k, v in all_encoding.items()}
    X_test = {k: v[test_index] for k, v in all_encoding.items()}
    y_train = y[train_index]
    y_test = y[test_index]

    # train_model hands back the model plus the tensors later used as context.
    model, input_ids_train, attention_mask_train, labels_train = train_model(
        X_train, y_train, transformer_name=transformer_name
    )
    print("Finished training")

    # Evaluate the whole test fold as a single batch.
    test_labels_column = torch.tensor(y_test).float().reshape(-1, 1)
    test_dataset = CustomDataset(X_test, test_labels_column)
    test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
    test_loss, test_accuracy = evaluate_model(test_loader, model, input_ids_train, attention_mask_train, labels_train)
    print(f"Test loss: {test_loss}, Test accuracy: {test_accuracy}")
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)

    # Drop the fold's GPU tensors and model before the next fold starts.
    del input_ids_train, attention_mask_train, labels_train, model
    torch.cuda.empty_cache()

Train size: 800, Val size: 200
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Epoch 1 - Training loss: 0.6252211928367615, Training accuracy: 0.655
Evaluating model
Train size: 800, Val size: 200
Epoch 1 - Validation loss: 0.5349120497703552, Validation accuracy: 0.75
Validation loss decreased from inf to 0.5349120497703552
Epoch 2 - Training loss: 0.6445856690406799, Training accuracy: 0.6425
Evaluating model
Train size: 800, Val size: 200
Epoch 2 - Validation loss: 0.525595486164093, Validation accuracy: 0.735
Validation loss decreased from 0.5349120497703552 to 0.525595486164093
Epoch 3 - Training loss: 0.5289170742034912, Training accuracy: 0.7075
Evaluating model
Train size: 800, Val size: 200
Epoch 3 - Validation loss: 0.5299437642097473, Validation accuracy: 0.725
Early stopping counter: 1
Epoch 4 - Training loss: 0.5535533428192139, Training accuracy: 0.7
Evaluating model
Train size: 800, Val size: 200
Epoch 4 - Validation loss: 0.

In [17]:
print(test_accuracies)

[0.76675, 0.743, 0.75425, 0.74525, 0.75025]


In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Compare the fine-tuned model's per-fold test accuracies with the stored results.
df = pd.read_csv("../results/results.csv")
df = df[df["dataset"] == "spotify"]

# One new row per fold, sized from the data instead of a hard-coded fold count.
# NOTE(review): the rows are labelled with the "lm__all-MiniLM-L12-v2" encoding although the
# fine-tuning cell uses "distilroberta-base" -- confirm this label is intended.
n_folds = len(test_accuracies)
new_rows = {
    "dataset": ["spotify"] * n_folds,
    "model": ["bert"] * n_folds,
    "dim_reduction": ["none"] * n_folds,
    "encoding": ["lm__all-MiniLM-L12-v2"] * n_folds,
    "accuracy": test_accuracies,
}
df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Average per configuration. numeric_only=True keeps this working on pandas >= 2,
# where mean() raises on non-numeric columns instead of silently dropping them.
group_keys = ['dataset', 'model', 'dim_reduction', 'encoding']
melted_results = df.groupby(group_keys).mean(numeric_only=True).reset_index()

# Create the plot
fig = px.strip(
    data_frame=melted_results,
    x="accuracy",
    y="dataset",
    color="model",
    title="Swarm Plot of Model Accuracies Across Datasets",
    labels={"accuracy": "Accuracy", "dataset": "Dataset", "model": "Model"},
    height=600,
    width=900,
)

# Show every column of the averaged table when hovering over a point
# (one trace per value of the colour column, i.e. per model).
for trace in fig.data:
    subset_df = melted_results[melted_results['model'] == trace.name]
    hover_template = "<br>".join(
        [f"{col}: %{{customdata[{col_idx}]}}" for col_idx, col in enumerate(subset_df.columns)]
    )
    trace.customdata = subset_df.values
    trace.hovertemplate = hover_template

fig.show()

In [13]:
from transformers import BertForSequenceClassification, BertTokenizer, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import numpy as np
from sklearn.metrics import accuracy_score
from src.utils import preprocess_input

# Initialize the tokenizer matching the default transformer of BertAndTabPFN
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Raw texts and labels as plain Python lists, taken from the X / y loaded earlier in the notebook
#texts = X_original[column_to_consider].tolist()
texts = X.tolist()
#labels = (y_original > np.median(y_original)).tolist()
#labels = y_original.tolist()
labels = y.tolist()

# Tokenize the text data
#encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    """Indexable view over tokenized inputs and their labels (one item per sample)."""

    def __init__(self, encodings, labels):
        # Stored as given; items are only looked up lazily in __getitem__.
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        input_ids = self.encodings['input_ids'][idx]
        attention_mask = self.encodings['attention_mask'][idx]
        return dict(input_ids=input_ids, attention_mask=attention_mask, labels=self.labels[idx])

# Create dataset and dataloaders
#dataset = CustomDataset(encoding, torch.tensor(labels).float().reshape(-1, 1))
#print(f"Dataset size: {len(dataset)}")
# Order-based split: the first train_size rows train, the next val_size rows
# validate, everything after that is the test set.
train_size = 700
val_size = 1000 #TODO
val_end = train_size + val_size
split_slices = {
    "train": slice(None, train_size),
    "val": slice(train_size, val_end),
    "test": slice(val_end, None),
}

all_encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
# Labels as a float column vector, built once and sliced per split.
labels_column = torch.tensor(labels).float().reshape(-1, 1)

encoding_train = {k: v[split_slices["train"]] for k, v in all_encoding.items()}
encoding_val = {k: v[split_slices["val"]] for k, v in all_encoding.items()}
encoding_test = {k: v[split_slices["test"]] for k, v in all_encoding.items()}

train_dataset = CustomDataset(encoding_train, labels_column[split_slices["train"]])
val_dataset = CustomDataset(encoding_val, labels_column[split_slices["val"]])
test_dataset = CustomDataset(encoding_test, labels_column[split_slices["test"]])

# Every loader serves its whole split as one batch; only training is shuffled.
train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

# Model and optimizer
model = BertAndTabPFN(preprocess_before_tabpfn=True, linear_translator=False).to('cuda')
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# Early-stopping state
es_patience, es_tolerance, es_counter = 5, 1e-4, 0

# Training loop
# Fine-tuning loop with early stopping on the validation loss.
# Relies on model, optimizer, the data loaders, train_size and the es_* variables
# defined earlier in this cell.
num_epochs = 20
best_val_loss = np.inf
for epoch in range(num_epochs):
    # Also catches an early stop triggered inside the validation loop below,
    # whose `break` only leaves the inner batch loop.
    if es_counter >= es_patience:
        break
    ###########
    # Train loop
    model.train()
    train_preds, train_labels, train_losses = [], [], []
    for batch in train_loader:
        input_ids_train = batch['input_ids']
        attention_mask_train = batch['attention_mask']
        labels_train = batch['labels']
        # move the inputs to GPU
        input_ids_train = input_ids_train.to('cuda')
        attention_mask_train = attention_mask_train.to('cuda')
        labels_train = labels_train.to('cuda')
        # The loss below is computed only on the rows from index 400 onward;
        # the value is forwarded to the model as single_eval_pos.
        single_eval_pos = 400
        output = model(input_ids_train, attention_mask=attention_mask_train, y=labels_train, single_eval_pos=single_eval_pos).squeeze()
        loss = nn.CrossEntropyLoss()(output, labels_train[single_eval_pos:].long().reshape(-1))
        # No weight update during the first epoch: it only reports the starting metrics.
        if epoch > 0:
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # compute train accuracy
        preds = torch.argmax(output, axis=-1).cpu().detach().numpy()
        train_losses.append(loss.cpu().item())
        train_preds.append(preds)
        train_labels.append(labels_train[single_eval_pos:].cpu().detach().numpy().reshape(-1))
    train_preds = np.concatenate(train_preds)
    train_labels = np.concatenate(train_labels)
    train_losses = np.mean(train_losses)
    print(f"Epoch {epoch + 1} - Training loss: {train_losses}, Training accuracy: {accuracy_score(train_labels, train_preds)}")
    # Validation loop
    model.eval()
    val_preds, val_labels, val_losses = [], [], []
    with torch.no_grad():
        for batch in val_loader: #TODO: remove the useless for loop
            input_ids_val = batch['input_ids']
            attention_mask_val = batch['attention_mask']
            labels_val = batch['labels']
            # move the inputs to GPU
            input_ids_val = input_ids_val.to('cuda')
            attention_mask_val = attention_mask_val.to('cuda')
            labels_val = labels_val.to('cuda')
            # concatenate train and val: the training batch comes first, the validation rows after it
            #TODO: make sure this is correct, no leak etc
            # maybe safer to create a TabPFNClassifier with the same parameters as the one in BertAndTabPFN
            input_ids = torch.cat((input_ids_train, input_ids_val), axis=0)
            attention_mask = torch.cat((attention_mask_train, attention_mask_val), axis=0)
            # NOTE: `labels` (a Python list earlier in this cell) is rebound to a tensor here.
            labels = torch.cat((labels_train, labels_val), axis=0)
            # Rows from train_size onward are the validation rows that get scored.
            single_eval_pos = train_size
            output = model(input_ids, attention_mask=attention_mask, y=labels, single_eval_pos=single_eval_pos).squeeze()
            val_loss = nn.CrossEntropyLoss()(output, labels[single_eval_pos:].long().reshape(-1))
            # Early stopping is evaluated per validation batch (there is one batch per epoch here).
            if val_loss < best_val_loss - es_tolerance:
                print(f"Validation loss decreased from {best_val_loss} to {val_loss}")
                best_val_loss = val_loss
                # save the model
                torch.save(model.state_dict(), "checkpoints/model.pt")
                # save input_ids_train, attention_mask_train, labels_train
                # (the training batch of this epoch, reloaded after training)
                torch.save(input_ids_train, "checkpoints/input_ids_train.pt")
                torch.save(attention_mask_train, "checkpoints/attention_mask_train.pt")
                torch.save(labels_train, "checkpoints/labels_train.pt")
            else:
                es_counter += 1
                print(f"Early stopping counter: {es_counter}")
                if es_counter >= es_patience:
                    print(f"Early stopping at epoch {epoch + 1}")
                    # Leaves the batch loop only: the for-else summary below is skipped
                    # and the outer loop exits through the check at its top.
                    break


            val_losses.append(val_loss.cpu())
            preds = torch.argmax(output, axis=-1).cpu().detach().numpy()
            val_preds.append(preds)
            val_labels.append(labels[single_eval_pos:].cpu().detach().numpy().reshape(-1))
        else:
            # for-else: runs only when the batch loop finished without `break`.
            val_preds = np.concatenate(val_preds)
            val_labels = np.concatenate(val_labels)
            val_losses = np.mean(val_losses)
            print(f"Epoch {epoch + 1} - Validation loss: {val_losses}")
            # Compute accuracy
            val_accuracy = accuracy_score(val_labels, val_preds)
            print(f"Epoch {epoch + 1} - Validation accuracy: {val_accuracy}")

# Load the best model
# Restore the best checkpoint together with the training batch saved alongside it;
# that batch is the labelled context for the test-time forward pass.
model.load_state_dict(torch.load("checkpoints/model.pt"))
input_ids_train = torch.load("checkpoints/input_ids_train.pt")
attention_mask_train = torch.load("checkpoints/attention_mask_train.pt")
labels_train = torch.load("checkpoints/labels_train.pt")

# Test loop
model.eval()
test_preds, test_labels, test_losses = [], [], []
with torch.no_grad():
    for batch in test_loader:
        # move the inputs to GPU
        input_ids_test = batch['input_ids'].to('cuda')
        attention_mask_test = batch['attention_mask'].to('cuda')
        labels_test = batch['labels'].to('cuda')
        # Training rows first, test rows after them. Previously this concatenation was
        # commented out, so the first `train_size` test rows (with their labels) served
        # as context and only the remaining test rows were scored.
        input_ids = torch.cat((input_ids_train, input_ids_test), axis=0)
        attention_mask = torch.cat((attention_mask_train, attention_mask_test), axis=0)
        labels = torch.cat((labels_train, labels_test), axis=0)
        # Everything after the training context is scored, i.e. every test row.
        single_eval_pos = len(input_ids_train)

        output = model(input_ids, attention_mask=attention_mask, y=labels, single_eval_pos=single_eval_pos).squeeze()
        test_loss = nn.CrossEntropyLoss()(output, labels[single_eval_pos:].long().reshape(-1))
        test_losses.append(test_loss.item())
        preds = torch.argmax(output, axis=-1).cpu().detach().numpy()
        test_preds.append(preds)
        test_labels.append(labels[single_eval_pos:].cpu().detach().numpy().reshape(-1))

test_preds = np.concatenate(test_preds)
test_labels = np.concatenate(test_labels)
test_losses = np.mean(test_losses)
print(f"Test loss: {test_losses}")
print(f"Test accuracy: {accuracy_score(test_labels, test_preds)}")

# Save the model
# model.save_pretrained("./fine_tuned_bert")


Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Epoch 1 - Training loss: 0.6134790182113647, Training accuracy: 0.6433333333333333
Validation loss decreased from inf to 0.5111220479011536
Epoch 1 - Validation loss: 0.5111220479011536
Epoch 1 - Validation accuracy: 0.732
Epoch 2 - Training loss: 0.6185330748558044, Training accuracy: 0.6633333333333333
Validation loss decreased from 0.5111220479011536 to 0.5057411789894104
Epoch 2 - Validation loss: 0.5057411789894104
Epoch 2 - Validation accuracy: 0.728
Epoch 3 - Training loss: 0.48609843850135803, Training accuracy: 0.7466666666666667
Early stopping counter: 1
Epoch 3 - Validation loss: 0.5087280869483948
Epoch 3 - Validation accuracy: 0.735
Epoch 4 - Training loss: 0.483054518699646, Training accuracy: 0.7733333333333333
Early stopping counter: 2
Epoch 4 - Validation loss: 0.520322859287262
Epoch 4 - Validation accuracy: 0.74
Epoch 5 - Training loss: 0.4642616808414459, Training accuracy:

In [14]:
torch.save(input_ids_train, "checkpoints/input_ids_train.pt")

In [15]:
print(2)

2


In [16]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
checkpoints  launch.ipynb  results2.csv  test  tests.ipynb
