## Print Start time

In [1]:
from utils import print_time

print_time.print_("Start-Time")

------------------------------------------------
Start-Time
2024-09-24 16:29:58
------------------------------------------------


## Hyperparameters

In [2]:
# Constants
epochs = 30
batch_size = 16
weight_decay = 0.01
learning_rate = 2e-5
warmup_steps = 1000
metric_for_best_model = "f1"
early_stopping_patience = 6
max_length = 512

hyperparameters = {
    'epochs': epochs,     # 1. Baseline
    'batch_size': batch_size,
    'weight_decay': weight_decay,
    'learning_rate': learning_rate,
    'warmup_steps': warmup_steps,
    'metric_for_best_model': metric_for_best_model,
    'early_stopping_patience': early_stopping_patience,
    'max_length': max_length,
    'use_weighted_loss': False
    }

## Specify Model

In [3]:
# model_checkpoint = 'mrm8488/longformer-base-4096-spanish-finetuned-squad'
# model_checkpoint = 'state-spaces/mamba2-130m'
model_checkpoint = 'Narrativa/legal-longformer-base-4096-spanish'
# model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'
# model_checkpoint = 'bert-large-uncased'
# model_checkpoint = 'xlnet-base-cased'
# model_checkpoint = 'xlnet-large-cased'
# model_checkpoint = 'xlm-roberta-large'
# model_checkpoint = 'microsoft/deberta-v2-xxlarge'

## Load df

In [4]:
import pandas as pd

corpus_path='corpus/corpus_final_corregido.txt'
df = pd.read_csv(corpus_path, sep='\t', usecols=['Contenido Txt', 'Resultado binario de la acción'])

# rename columns
df.rename(columns = {'Contenido Txt':'text', 'Resultado binario de la acción':'label'}, inplace = True)

In [5]:
# # Separate the entries with label 1
# df_label_1 = df[df['label'] == 1]

# # Randomly sample the same number of entries from label 0
# df_label_0 = df[df['label'] == 0].sample(n=len(df_label_1), random_state=42)

# # Combine both balanced subsets
# df = pd.concat([df_label_1, df_label_0])

# # Shuffle the combined DataFrame to mix label 0 and 1
# df = df.sample(frac=1, random_state=42)

In [6]:
# cut df to X rows
df = df[:1000]

In [7]:
print(df.head())

                                                text  label
0  1 expediente recurso extraordinario de casacio...      0
1  expediente recurso extraordinario de casación ...      0
2  lowe oc corte expte nestor valentin gonzalez s...      0
3  expediente recurso extraordinario de re cx cas...      0
4  expediente recurso de casacion interpuesto por...      0


In [9]:
print(df['text'][0])

# print True or False if word "declarar" is in the text
print('declarar' in df['text'][0])

# print the word that comes after declarar
print(df['text'][0].split('declarar')[1].split()[0])

1 expediente recurso extraordinario de casacion interpuesto por el sr hans friedich schuchardt en la causa ivan yegros y otros s defraudacion, i falsificacion de instrumentos publicos yy cuerdo y sentencia numero noyecventos, sesenta 4 ocho ta en aaggincién del paraguay, a los tyece, seeeseees del mes ee reno cece del afio dos mil yec it ig reunidos en la sala de acuerdos los excelentisimos doctores alicia beatriz pucheta de correa, sindulfo blanco y josé ratl torres k., quien integra la sala penal en reemplazo del dr wildo rienzi galeano, por ante mi, el secretaria autorizante, ser trajo para acuerdo el expediente caratulado recurso extraordinario de casacion interpuesto por el sr hans friedich schuchardt en la causa ivan yegros y otros s defraudacion, falsificacion de instrumentos publicos y otros , a fin de resolver el recurso extraordinario de casacién interpuesto por ei sr hans friedich schuchardt por derecho propio y bajo patrociniol del abogado fabio cuevas storm en contra de la

## Split data

In [23]:
from sklearn.model_selection import train_test_split

train_texts, temp_texts, y_train, y_temp = train_test_split(
    df['text'], df['label'],
    test_size=0.3, random_state=42
)

val_texts, test_texts, y_val, y_test = train_test_split(
    temp_texts, y_temp,
    test_size=0.5, random_state=42
)

In [24]:
print('Train samples:', train_texts.shape[0])
print('Validation samples:', val_texts.shape[0])
print('Test samples:', test_texts.shape[0])
print()

# print labels distribution in train
print(y_train.value_counts())

Train samples: 700
Validation samples: 150
Test samples: 150

label
0    573
1    127
Name: count, dtype: int64


## Run Model

In [13]:
print("Converting train, val and test texts to csv...")
train_texts.to_csv('corpus/train_texts.csv', index=False, header=False)
val_texts.to_csv('corpus/val_texts.csv', index=False, header=False)
test_texts.to_csv('corpus/test_texts.csv', index=False, header=False)

Converting train, val and test texts to csv...


In [14]:
from models import tune_transformer
import torch
from transformers import BertTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AutoModel
import numpy as np

print("------------------------------------")
print("Model:", model_checkpoint)
print("------------------------------------")

test_pred_labels = tune_transformer.run(model_checkpoint, 2,
                                        train_texts, val_texts, test_texts,
                                        y_train, y_val, y_test,
                                        hyperparameters=hyperparameters)

# # replace original test labels with predicted labels
# df_test['label'] = test_pred_labels

# # save the dataframe with predicted labels to a csv file
# print("Saving predictions to csv...")
# df_test.to_csv('corpus/prediction_task3.tsv', sep='\t', index=False)

2024-09-20 12:46:02.122303: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-20 12:46:02.136328: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-20 12:46:02.152797: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-20 12:46:02.157802: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-20 12:46:02.171000: I tensorflow/core/platform/cpu_feature_guar

---------------------------------------------
---------------------------------------------
Number of GPUs: 2
---------------------------------------------
---------------------------------------------
------------------------------------
Model: Narrativa/legal-longformer-base-4096-spanish
------------------------------------




Sample train input_ids: [101, 4654, 5669, 25099, 28667, 9236, 2080, 2139, 14124, 10446, 6970, 14289, 4355, 2080, 18499, 3449, 5034, 1046, 1039, 1039, 1061, 18499, 4315, 15937, 2080, 17678, 3695, 1061, 8670, 5558, 6986, 3217, 27085, 2080, 3972, 11113, 8649, 1050, 1046, 1054, 1012, 1010, 4372, 2474, 6187, 10383, 1037, 1041, 1055, 2572, 8189, 4143, 1050, 5890, 1012, 5890, 1012, 6021, 1012, 5890, 1012, 2268, 1012, 23475, 2019, 2080, 2249, 1010, 1050, 5354, 2620, 1010, 1042, 29401, 6421, 1058, 23223, 9353, 13094, 3527, 1061, 2741, 27742, 16371, 5017, 2080, 21864, 8034, 13663, 2015, 7367, 5054, 2696, 1061, 16371, 18697, 4372, 2474, 20759, 2139, 2004, 4609, 10446, 1010, 3007, 2139, 2474, 3072, 2050, 3972, 13884, 1010, 1037, 3050, 3280, 2480, 1061, 16371, 18697, 22939, 2015, 1010, 3972, 2033, 2015, 2139, 12022, 3695, 3972, 2019, 2080, 9998, 23689, 9986, 2063, 9765, 28574, 2128, 19496, 12269, 4372, 2474, 16183, 2050, 2139, 9353, 13094, 12269, 3050, 12411, 16610, 7163, 3367, 7352, 2139, 2474, 24

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Narrativa/legal-longformer-base-4096-spanish and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using automodel
Training arguments
Batch size: 16
Weight decay: 0.01
Learning rate: 2e-05
Warmup steps: 1000
Metric for best model: f1




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4568,0.499759,0.802667,0.445266,0.401333,0.5
2,0.4849,0.505483,0.802667,0.445266,0.401333,0.5
3,0.4549,0.508455,0.802667,0.445266,0.401333,0.5
4,0.4677,0.493187,0.802667,0.445266,0.401333,0.5
5,0.544,0.481291,0.802667,0.445266,0.401333,0.5
6,0.4561,0.472552,0.802667,0.445266,0.401333,0.5
7,0.4695,0.498149,0.802667,0.445266,0.401333,0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Predicted Labels [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Mamba

In [26]:
from transformers import MambaForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
# import f1_score from sklearn
from sklearn.metrics import f1_score
# import Loading Bar
from tqdm import tqdm

# Hyperparameters
epochs = 10
batch_size = 16
learning_rate = 2e-5
max_length = 512

# Create a SummaryWriter to log metrics
writer = SummaryWriter()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. Define Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        inputs = {key: val.squeeze(0) for key, val in inputs.items()}  # Remove batch dimension
        inputs["labels"] = torch.tensor(label, dtype=torch.long)
        return inputs

# 2. Modify the model to add a classification head
class MambaForTextClassification(nn.Module):
    def __init__(self, model, num_labels):
        super(MambaForTextClassification, self).__init__()
        self.mamba_model = model
        self.classifier = nn.Linear(self.mamba_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        # Get hidden states from the language model
        outputs = self.mamba_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1]  # Get the last hidden state

        # Pool the hidden states (take the hidden state corresponding to [CLS] token or mean pooling)
        pooled_output = hidden_states[:, 0, :]  # Using the first token's embedding (usually [CLS] token)

        # Pass the pooled output through the classifier
        logits = self.classifier(pooled_output)
        return logits

# 3. Initialize model, tokenizer, and dataset
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
mamba_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
classification_model = MambaForTextClassification(mamba_model, num_labels=2)
classification_model = nn.DataParallel(classification_model)
classification_model.to(device)

freeze_mamba = False
if freeze_mamba:
    for param in classification_model.module.mamba_model.parameters():
        param.requires_grad = False

# Tokenize and create dataset
train_dataset = TextDataset(train_texts.tolist(), y_train.tolist(), tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TextDataset(val_texts.tolist(), y_val.tolist(), tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TextDataset(test_texts.tolist(), y_test.tolist(), tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# 4. Define optimizer and loss function
optimizer = optim.AdamW(classification_model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# 6. Evaluation function
def evaluate_model(model, dataloader, epoch, phase='val'):
    if phase == 'test':
        print("\nEvaluating on test data...")

    model.eval()
    correct_predictions = 0
    total_predictions = 0
    total_loss = 0

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch.get('attention_mask').to(device)
            labels = batch['labels'].to(device)
            
            logits = model(input_ids, attention_mask=attention_mask)
            probs = F.softmax(logits, dim=-1)
            predicted_class = torch.argmax(probs, dim=-1)
            
            correct_predictions += (predicted_class == labels).sum().item()
            total_predictions += labels.size(0)

            loss = criterion(logits, labels)
            total_loss += loss.item()

            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted_class.cpu().numpy())
            
    accuracy = correct_predictions / total_predictions
    f1 = f1_score(all_labels, all_predictions, average='macro')
    average_loss = total_loss / len(dataloader)
    # Log validation loss, accuracy, and f1 score
    writer.add_scalar(f'{phase}/Loss', average_loss, epoch)
    writer.add_scalar(f'{phase}/Accuracy', accuracy, epoch)
    writer.add_scalar(f'{phase}/F1_Score', f1, epoch)

    print(f"{phase.capitalize()} Accuracy: {accuracy * 100:.2f}% | F1 Score: {f1:.4f} | Loss: {average_loss:.4f}")
    if phase == 'test':
        # print predictions
        print("Predictions:", all_predictions)

# 5. Training loop
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, epochs):
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        model.train()
        loss_sum = 0
        for batch_idx, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch.get('attention_mask').to(device)
            labels = batch['labels'].to(device)
            
            logits = model(input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)
            # Log loss
            loss_sum += loss.item()

            # Backpropagation
            loss.backward()
            optimizer.step()

            # Print batch progress
            if batch_idx % 10 == 0:  # Print every 10 batches
                print(f"Batch {batch_idx}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

            writer.add_scalar('Loss/train', loss.item(), epoch * len(train_dataloader) + batch_idx)

        print(f"Train Loss: {loss_sum/len(train_dataloader):.4f}")
        
        # calculate validation accuracy after each epoch
        evaluate_model(model, val_dataloader, epoch, phase='val')

# Train the model
train_model(classification_model, train_dataloader, val_dataloader, optimizer, criterion, epochs)

# Evaluate the model on test data
evaluate_model(classification_model, test_dataloader, epoch=-1, phase='test')

# Close TensorBoard writer
writer.close()


Using device: cuda

Epoch 1/10
Batch 0/44, Loss: 0.5739
Batch 10/44, Loss: 0.6282
Batch 20/44, Loss: 0.4389
Batch 30/44, Loss: 0.4573
Batch 40/44, Loss: 0.4556
Train Loss: 0.4886
Val Accuracy: 81.33% | F1 Score: 0.4485 | Loss: 0.5164

Epoch 2/10
Batch 0/44, Loss: 0.3976
Batch 10/44, Loss: 0.4092
Batch 20/44, Loss: 0.2421
Batch 30/44, Loss: 0.3954
Batch 40/44, Loss: 0.6082
Train Loss: 0.4609
Val Accuracy: 81.33% | F1 Score: 0.4485 | Loss: 0.4896

Epoch 3/10
Batch 0/44, Loss: 0.1922
Batch 10/44, Loss: 0.5348
Batch 20/44, Loss: 0.4201
Batch 30/44, Loss: 0.2930
Batch 40/44, Loss: 0.3846
Train Loss: 0.4511
Val Accuracy: 81.33% | F1 Score: 0.4485 | Loss: 0.4758

Epoch 4/10
Batch 0/44, Loss: 0.4785
Batch 10/44, Loss: 0.6792
Batch 20/44, Loss: 0.1791
Batch 30/44, Loss: 0.3837
Batch 40/44, Loss: 0.6539
Train Loss: 0.4451
Val Accuracy: 81.33% | F1 Score: 0.4485 | Loss: 0.4834

Epoch 5/10
Batch 0/44, Loss: 0.2782
Batch 10/44, Loss: 0.5488
Batch 20/44, Loss: 0.4874
Batch 30/44, Loss: 0.3738
Batch 

## Print End Time

In [21]:
print_time.print_("End-Time")

------------------------------------------------
End-Time
2024-09-17 01:07:12
------------------------------------------------
