## Print Start time

In [1]:
from utils import print_time

print_time.print_("Start-Time")

------------------------------------------------
Start-Time
2024-10-21 17:46:10
------------------------------------------------


## Hyperparameters

In [2]:
# Constants
epochs = 5
batch_size = 4
weight_decay = 0.01
learning_rate = 2e-5
warmup_steps = 1000
metric_for_best_model = "f1"
early_stopping_patience = 4
max_length = 1024
stride = 512

hyperparameters = {
    'epochs': epochs,     # 1. Baseline
    'batch_size': batch_size,
    'weight_decay': weight_decay,
    'learning_rate': learning_rate,
    'warmup_steps': warmup_steps,
    'metric_for_best_model': metric_for_best_model,
    'early_stopping_patience': early_stopping_patience,
    'max_length': max_length,
    'stride': stride,
    'use_weighted_loss': False
    }

## Specify Model

In [3]:
# model_checkpoint = 'mrm8488/longformer-base-4096-spanish-finetuned-squad'
# model_checkpoint = 'state-spaces/mamba2-130m'
model_checkpoint = 'Narrativa/legal-longformer-base-4096-spanish'
# model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base'
# model_checkpoint = 'bert-large-uncased'
# model_checkpoint = 'xlnet-base-cased'
# model_checkpoint = 'xlnet-large-cased'
# model_checkpoint = 'xlm-roberta-large'
# model_checkpoint = 'microsoft/deberta-v2-xxlarge'

## Load df

In [4]:
import pandas as pd

# corpus_path='corpus/cleaned_corpus_google_sin_resuelve.csv'
# corpus_path='corpus/corpus.csv'
corpus_path='corpus/corpus_google_min_line_len4_min_par_len2.csv'
# df = pd.read_csv(corpus_path, sep='\t', usecols=['Contenido Txt', 'Resultado binario de la acción'])
df = pd.read_csv(corpus_path, usecols=['text', 'label'])

# rename columns
# df.rename(columns = {'Contenido Txt':'text', 'Resultado binario de la acción':'label'}, inplace = True)

In [5]:
# # Separate the entries with label 1
# df_label_1 = df[df['label'] == 1]

# # Randomly sample the same number of entries from label 0
# df_label_0 = df[df['label'] == 0].sample(n=len(df_label_1), random_state=42)

# # Combine both balanced subsets
# df = pd.concat([df_label_1, df_label_0])

# # Shuffle the combined DataFrame to mix label 0 and 1
# df = df.sample(frac=1, random_state=42)

In [6]:
# cut df to X rows
# df = df[:100]

In [7]:
print(df.head())

   label                                               text
0      0  EXPEDIENTE: "RECURSO EXTRAORDINARIO DE\nCASACI...
1      0  EXPEDIENTE: RECURSO EXTRAORDINARIO \nDE CASACI...
2      0  S HOMICIDIO DOLOSO -TENTATIVA".\nACUERDO Y SEN...
3      0  EXPEDIENTE: RECURSO EXTRAORDINARIO DE\nCASACIÓ...
4      0  Bicentenario de la lodependencia Nacional: 181...


In [8]:
print(df['text'][0])

EXPEDIENTE: "RECURSO EXTRAORDINARIO DE
CASACIÓN INTERPUESTO POR EL SR. HANS
FRIEDICH SCHUCHARDT en la causa: IVAN
FALSIFICACION DE INSTRUMENTOS PUBLICOS
Corte Suprema de Justicia
 ACUERDO Y SENTENCIA NÚMERO: Novecientos sesenta y ocho.-;
En Asunción del Paraguay, a los.
del año dos mil.
estando reunidos en la Sala de Acuerdos los Excelentísimos
10 Senores Ministros de la Corte Suprema de Justicia, Sala Penal,
Doctores Alicia Beatriz Pucheta de Correa, Sindulfo Blanco y José
Raúl Torres K., quien integra la Sala Penal en reemplazo del Dr.
Wildo Rienzi Galeano, por ante mí, el Secretaria Autorizante,
trajo para acuerdo el expediente caratulado: "RECURSO
EXTRAORDINARIO DE CASACIÓN INTERPUESTO POR EL SR. HANS FRIEDICH
SCHUCHARDT en la causa: IVAN YEGROS Y OTROS S DEFRAUDACION,
FALSIFICACION DE
PUBLICOS Y OTROS", a fin de
resolver el recurso extraordinario de casación interpuesto por el
SR. HANS FRIEDICH SCHUCHARDT por derecho propio y bajo patrocinio.
del Abogado Fabio Cuevas Storm en cont

In [None]:
df.describe()

Unnamed: 0,label
count,5000.0
mean,0.1906
std,0.392814
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


## Split data

In [10]:
from sklearn.model_selection import train_test_split

train_texts, temp_texts, y_train, y_temp = train_test_split(
    df['text'], df['label'],
    test_size=0.3, random_state=42
)

val_texts, test_texts, y_val, y_test = train_test_split(
    temp_texts, y_temp,
    test_size=0.5, random_state=42
)

In [11]:
print('Train samples:', train_texts.shape[0])
print('Validation samples:', val_texts.shape[0])
print('Test samples:', test_texts.shape[0])
print()

# print labels distribution in train
print(y_train.value_counts())

Train samples: 3500
Validation samples: 750
Test samples: 750

label
0    2843
1     657
Name: count, dtype: int64


## Run Model

In [12]:
print("Converting train, val and test texts to csv...")
train_texts.to_csv('corpus/train_texts.csv', index=False, header=False)
val_texts.to_csv('corpus/val_texts.csv', index=False, header=False)
test_texts.to_csv('corpus/test_texts.csv', index=False, header=False)

Converting train, val and test texts to csv...


In [13]:
from models import tune_transformer

print("------------------------------------")
print("Model:", model_checkpoint)
print("------------------------------------")

test_pred_labels = tune_transformer.run(model_checkpoint, 2,
                                        train_texts, val_texts, test_texts,
                                        y_train, y_val, y_test,
                                        hyperparameters=hyperparameters)

# # replace original test labels with predicted labels
# df_test['label'] = test_pred_labels

# # save the dataframe with predicted labels to a csv file
# print("Saving predictions to csv...")
# df_test.to_csv('corpus/prediction_task3.tsv', sep='\t', index=False)

2024-10-21 17:46:15.696233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-21 17:46:15.710145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-21 17:46:15.727184: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-21 17:46:15.732345: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-21 17:46:15.745816: I tensorflow/core/platform/cpu_feature_guar

---------------------------------------------
---------------------------------------------
Number of GPUs: 2
---------------------------------------------
---------------------------------------------
------------------------------------
Model: Narrativa/legal-longformer-base-4096-spanish
------------------------------------
Max length: 1024
Stride 512




Reading google datasets from disk
Type of classes: <class 'numpy.ndarray'>
Classes: [0 1]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Narrativa/legal-longformer-base-4096-spanish and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


using automodel
Training arguments
Batch size: 10
Weight decay: 0.01
Learning rate: 2e-05
Warmup steps: 1000
Metric for best model: f1


OutOfMemoryError: Caught OutOfMemoryError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 1195, in forward
    outputs = self.roberta(
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 832, in forward
    encoder_outputs = self.encoder(
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 521, in forward
    layer_outputs = layer_module(
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 410, in forward
    self_attention_outputs = self.attention(
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 337, in forward
    self_outputs = self.self(
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/transformers/models/roberta/modeling_roberta.py", line 260, in forward
    attention_probs = self.dropout(attention_probs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/modules/dropout.py", line 70, in forward
    return F.dropout(input, self.p, self.training, self.inplace)
  File "/home/leon/.local/lib/python3.10/site-packages/torch/nn/functional.py", line 1425, in dropout
    _VF.dropout_(input, p, training) if inplace else _VF.dropout(input, p, training)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 480.00 MiB. GPU 1 has a total capacity of 23.68 GiB of which 181.12 MiB is free. Process 2381833 has 8.88 GiB memory in use. Including non-PyTorch memory, this process has 14.59 GiB memory in use. Of the allocated memory 14.07 GiB is allocated by PyTorch, and 100.32 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


## Mamba

In [15]:
# from transformers import MambaForCausalLM, AutoTokenizer
# import torch
# import torch.nn as nn
# from torch.utils.data import DataLoader
# from torch.utils.tensorboard import SummaryWriter
# # import Loading Bar
# from peft import LoraConfig, get_peft_model, TaskType

# from utils.train import MambaForTextClassification, TextDataset, train_model, evaluate_model


# # Hyperparameters
# epochs = 1
# batch_size = 16
# learning_rate = 2e-5
# max_length = 512

# # Create a SummaryWriter to log metrics
# # writer = SummaryWriter()

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # 3. Initialize model, tokenizer, and dataset
# tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
# mamba_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
# classification_model = MambaForTextClassification(mamba_model, num_labels=2)

# print(classification_model)

# lora_config = LoraConfig(
#         target_modules=[
#             "mamba_model.backbone.layers.*.mixer.in_proj",
#             "mamba_model.backbone.layers.*.mixer.x_proj",
#             "mamba_model.backbone.layers.*.mixer.dt_proj",
#             "mamba_model.backbone.layers.*.mixer.out_proj"
#         ],
#         r=8,
#         # task_type="SEQ_CLS",
#         task_type=TaskType.SEQ_CLS,
#         lora_alpha=32,
#         lora_dropout=0.05,       # 0.05
#         use_rslora=True
#     )

# classification_model = get_peft_model(classification_model, lora_config)
# classification_model.print_trainable_parameters()

# classification_model = nn.DataParallel(classification_model)
# classification_model.to(device)

# freeze_mamba = False
# if freeze_mamba:
#     for param in classification_model.module.mamba_model.parameters():
#         param.requires_grad = False

# # Tokenize and create dataset
# train_dataset = TextDataset(train_texts.tolist(), y_train.tolist(), tokenizer, max_length)
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# val_dataset = TextDataset(val_texts.tolist(), y_val.tolist(), tokenizer, max_length)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# test_dataset = TextDataset(test_texts.tolist(), y_test.tolist(), tokenizer, max_length)
# test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# # Train the model
# train_model(classification_model, train_dataloader, val_dataloader, learning_rate, epochs, device)

# # Evaluate the model on test data
# evaluate_model(classification_model, test_dataloader, epoch=-1, device=device, phase='test')

# # # Close TensorBoard writer
# # writer.close()


Using device: cuda
MambaForTextClassification(
  (mamba_model): MambaForCausalLM(
    (backbone): MambaModel(
      (embeddings): Embedding(50280, 768)
      (layers): ModuleList(
        (0-23): 24 x MambaBlock(
          (norm): MambaRMSNorm(768, eps=1e-05)
          (mixer): MambaMixer(
            (conv1d): Conv1d(1536, 1536, kernel_size=(4,), stride=(1,), padding=(3,), groups=1536)
            (act): SiLU()
            (in_proj): Linear(in_features=768, out_features=3072, bias=False)
            (x_proj): Linear(in_features=1536, out_features=80, bias=False)
            (dt_proj): Linear(in_features=48, out_features=1536, bias=True)
            (out_proj): Linear(in_features=1536, out_features=768, bias=False)
          )
        )
      )
      (norm_f): MambaRMSNorm(768, eps=1e-05)
    )
    (lm_head): Linear(in_features=768, out_features=50280, bias=False)
  )
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)


ValueError: Target modules {'mamba_model.backbone.layers.*.mixer.out_proj', 'mamba_model.backbone.layers.*.mixer.in_proj', 'mamba_model.backbone.layers.*.mixer.dt_proj', 'mamba_model.backbone.layers.*.mixer.x_proj'} not found in the base model. Please check the target modules and try again.

## Print End Time

In [21]:
print_time.print_("End-Time")

------------------------------------------------
End-Time
2024-09-17 01:07:12
------------------------------------------------
