## Transfer Learning

### preprocessing

In [1]:
import ast
import torch # type: ignore
import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split
import evaluate # type: ignore
from transformers import( # type: ignore
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments
)
from TokenClassificationEncoderModels import(
    RobertaTokenClassifier_With_GRU,
    RobertaTokenClassifier_With_LSTM,
    BertTokenClassifier_With_GRU,
    BertTokenClassifier_With_LSTM 
) 
from utilities_ner_functions import(
    tokenize_and_align_labels_black_box,
    baseDataset,
) 
import warnings
warnings.filterwarnings("ignore")

In [2]:

# Absolute, machine-specific location of the NER dataset CSV.
dataset_csv_path = r'/home/tensorboard/Documentos/1. D4R/3. NER/training/dataset_NER_completo_23-05-24.csv'
df = pd.read_csv(dataset_csv_path)


In [3]:
def _parse_literal(cell):
    """Turn a stringified Python literal into its value; leave lists that are already parsed alone."""
    # Anything that is not a list still goes through `ast.literal_eval`, so malformed
    # cells keep raising exactly as before.
    return cell if isinstance(cell, list) else ast.literal_eval(cell)


# `errors='ignore'` lets the cell be re-run after the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# These columns are stored in the CSV as the text form of Python lists; parse them back.
for _column in ('sentence', 'tags', 'encoded_tags'):
    df[_column] = df[_column].apply(_parse_literal)

In [4]:
# Tag inventory: a tag's position in this list is its integer id.
labels_list = ['O', 'GOD', 'JUS', 'CHRI', 'SACRA', 'HERESY', 'PER', 'PLACE', 'ORG']

# Forward (tag -> id) and reverse (id -> tag) lookups derived from the list above.
labels_to_ids = {tag: tag_id for tag_id, tag in enumerate(labels_list)}
ids_to_labels = dict(enumerate(labels_list))

In [5]:
# Keep only the two columns that are passed to the tokenization helper later on.
model_columns = ['sentence', 'encoded_tags']
df_prepared = df[model_columns]

In [6]:
# 80 % of the rows go to training; the held-out 20 % is then halved into
# evaluation and test sets. Both splits are shuffled with the same fixed seed.
train_df, holdout_df = train_test_split(df_prepared, train_size=0.8, shuffle=True, random_state=42)
eval_df, test_df = train_test_split(holdout_df, test_size=0.5, shuffle=True, random_state=42)

In [7]:
# Tokenizer is loaded from the same local checkpoint directory that is used as MODEL_CHECKPOINT below.
tokenizer_checkpoint = "/media/tensorboard/PRODUCCIÓN Y ACADÉMICO/TRABAJO/D4R_models/NER_models/General_models/roberta-base-bne-Linear-NER/checkpoint-50000"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, add_prefix_space=True)

In [8]:
# Both splits go through the imported helper with the same column mapping.
_column_mapping = {'token_column': 'sentence', 'tag_column': 'encoded_tags'}
tokenized_inputs_train = tokenize_and_align_labels_black_box(train_df, tokenizer, **_column_mapping)
tokenized_inputs_eval = tokenize_and_align_labels_black_box(eval_df, tokenizer, **_column_mapping)

### Basic Linear classifier

In [9]:
MODEL_CHECKPOINT = '/media/tensorboard/PRODUCCIÓN Y ACADÉMICO/TRABAJO/D4R_models/NER_models/General_models/roberta-base-bne-Linear-NER/checkpoint-50000'

# Load the model that has already been fine-tuned.
old_model = AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT)

# Load a new model (Bi-LSTM, RNN or GRU heads can be tried here in custom configurations).
new_model = AutoModelForTokenClassification.from_pretrained(
    'PlanTL-GOB-ES/roberta-base-bne',
    num_labels=len(labels_list),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)

# Only the learned weights of the fine-tuned model's `roberta` sub-module are reused.
new_model.roberta = old_model.roberta


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Freeze every parameter currently registered on the model ...
for param in new_model.parameters():
    param.requires_grad = False

# ... then attach a freshly created linear NER head. It is built after the freeze
# loop, so the loop above never touches its parameters.
new_classifier = torch.nn.Linear(
    in_features=new_model.config.hidden_size,
    out_features=new_model.config.num_labels,
)
new_model.classifier = new_classifier

In [11]:
from torchinfo import summary

batch_size = 16
# Dummy batch of random token ids drawn from [0, vocab_size). Its sequence length is
# copied from the first tokenized training example (460 in the author's run).
sequence_length = tokenized_inputs_train['input_ids'][0].shape[0]
input_ids = torch.randint(0, new_model.config.vocab_size, (batch_size, sequence_length))

# Build the summary once: each `summary` call runs the model on `input_ids`,
# so the previous second call only repeated that work.
model_summary = summary(
    new_model,
    input_data=input_ids,
    col_names=["input_size", "output_size", "num_params", "trainable"],
)
text_summary = str(model_summary)

# The `with` block closes the file on exit; no explicit close() is needed.
with open('sumary of TF linear.txt', 'w', encoding='utf-8') as f:
    f.write(text_summary)

In [12]:
# Batch collator for token classification, built around the tokenizer loaded earlier;
# it is handed to the Trainer as `data_collator` further down.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [13]:
# Wrap the tokenized splits for the Trainer.
# NOTE: despite its name, `dataset_test` holds the *evaluation* split here.
dataset_train, dataset_test = (
    baseDataset(encodings)
    for encodings in (tokenized_inputs_train, tokenized_inputs_eval)
)

In [None]:
import numpy as np

# `compute(...)` is a method of the metric object returned by `evaluate.load`;
# the bare `seqeval` package has no top-level `compute`, so `import seqeval` cannot work here.
seqeval = evaluate.load("seqeval")


def compute_metrics(p):
    """
    Computes the precision, recall, F1, and accuracy scores for the given predictions and labels.

    Positions whose label id is -100 are excluded from both sequences before
    scoring. Label ids are mapped to tag names through the notebook-level
    `labels_list`.

    Args:
        p (tuple): A pair `(predictions, labels)`. `predictions` holds per-class
            scores and is arg-maxed over axis 2; `labels` holds the reference
            label ids.

    Returns:
        dict: A dictionary containing the precision, recall, F1, and accuracy scores.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Keep only the positions that carry a real label.
        kept = [(pred_id, label_id) for pred_id, label_id in zip(prediction, label) if label_id != -100]
        true_predictions.append([labels_list[pred_id] for pred_id, _ in kept])
        true_labels.append([labels_list[label_id] for _, label_id in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [14]:
# Evaluation and checkpointing share the same 500-step cadence.
training_args = TrainingArguments(
    output_dir='roberta-base-bne-NER-TF-100-epochs-2-4e',
    num_train_epochs=10,
    learning_rate=2e-6,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
)

# Tie together the frozen-encoder model, the wrapped datasets and the metric function.
trainer = Trainer(
    model=new_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)


In [15]:

# Start training with the Trainer configured in the previous cell.
# NOTE(review): the recorded output of this cell ends with
# `NameError: name 'labels_list' is not defined` during the first evaluation pass —
# presumably `compute_metrics` could not see `labels_list`; verify on a fresh kernel.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhector-investigacion-sociocul[0m ([33mhlhdatascience[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2290 [00:00<?, ?it/s]

{'loss': 1.8592, 'grad_norm': 10.82496166229248, 'learning_rate': 1.5633187772925764e-06, 'epoch': 2.18}


  0%|          | 0/15 [00:00<?, ?it/s]

NameError: name 'labels_list' is not defined

## Using custom models

### preprocessing.

In [None]:
from TokenClassificationEncoderModels import RobertaTokenClassifier_With_GRU, RobertaTokenClassifier_With_LSTM


In [None]:
# `RobertaTokenizerFast` is never imported in this notebook, so the original line raised
# NameError on a fresh kernel. `AutoTokenizer` is already imported at the top and is
# what the first section uses to load its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("hlhdatscience/roberta-base-bne-NER", add_prefix_space=True)

In [None]:
def tokenize_and_align_labels(dataframe, tokenizer, token_column='tokens', tag_column='ner_tags', max_length=200):
    """Tokenize pre-split sentences and build a label tensor aligned to the tokens.

    Every token position receives exactly one label id, so all rows have the
    same length as the padded token sequence:

    * the first token of each word gets that word's tag;
    * positions with no word (``word_ids`` entry is ``None``), non-first tokens
      of a word, and words that have no tag (word index beyond the tag list)
      get ``-100``.

    Args:
        dataframe: Table with one row per sentence.
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.
        token_column: Column holding the list of words of each sentence.
        tag_column: Column holding the list of integer tags of each sentence.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        The tokenizer output with an added ``"labels"`` tensor.
    """
    tokenized_inputs = tokenizer(
        dataframe[token_column].tolist(),
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )

    labels = []
    for i, label in enumerate(dataframe[tag_column].tolist()):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            starts_new_word = word_idx is not None and word_idx != previous_word_idx
            if starts_new_word and word_idx < len(label):
                label_ids.append(label[word_idx])
            else:
                # Always emit a placeholder: skipping a position (as happened before for
                # words without a tag) leaves rows of unequal length and makes
                # `torch.tensor` fail.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    # Convert lists to PyTorch tensors
    tokenized_inputs["labels"] = torch.tensor(labels)

    return tokenized_inputs



In [None]:
# The split frames only carry the 'sentence' and 'encoded_tags' columns, while
# `tokenize_and_align_labels` reads 'tokens' and 'ner_tags' by default. Renaming on the
# fly avoids the KeyError and leaves `train_df` / `test_df` themselves untouched.
_ner_column_names = {'sentence': 'tokens', 'encoded_tags': 'ner_tags'}

tokenized_inputs_train = tokenize_and_align_labels(train_df.rename(columns=_ner_column_names), tokenizer=tokenizer)

tokenized_inputs_test = tokenize_and_align_labels(test_df.rename(columns=_ner_column_names), tokenizer)

### LSTM architecture

In [None]:
# Load the model that has already been fine-tuned. `AutoModelForTokenClassification`
# is used because `RobertaForTokenClassification` is never imported in this notebook.
old_model = AutoModelForTokenClassification.from_pretrained("hlhdatscience/roberta-base-bne-NER")

# Load a new custom model. The class is imported as `RobertaTokenClassifier_With_LSTM`
# (with underscores) and the id -> label map is named `ids_to_labels`; the previous
# spellings raised NameError.
new_model = RobertaTokenClassifier_With_LSTM.from_pretrained(
    'PlanTL-GOB-ES/roberta-base-bne',
    num_labels=len(labels_list),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)

# Only the learned weights of the fine-tuned model's `roberta` sub-module are reused.
new_model.roberta = old_model.roberta


In [None]:
from torch import nn
# Freeze every parameter currently registered on the model; the layers created
# below are new objects and are therefore not affected by this loop.
for param in new_model.parameters():
    param.requires_grad = False

# Create the new layers
new_leaky_relu = nn.LeakyReLU(negative_slope=0.01)  # Adjust the negative slope as needed


# Linear layers for feature transformation: hidden_size -> 512 and 512 -> 256.
new_dense_1 = nn.Linear(new_model.config.hidden_size, 512)
new_dense_2 = nn.Linear(512, 256)
# Bidirectional LSTM layer: 2 stacked layers, 512 input features, 256 hidden units
# per direction.
# NOTE(review): presumably `dense_2` (in_features=512) consumes the concatenated
# 2 x 256 LSTM output — confirm against the custom model's forward().
new_bidirectional = nn.LSTM(
            input_size=512,
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

# Dropout for LSTM regularization
new_dropout_lstm = nn.Dropout(0.1)  # Adjust dropout rate as needed

# Linear layer for classification: 256 features -> one score per label.
new_classifier = nn.Linear(256, new_model.config.num_labels)

# Attach the new (unfrozen) layers to the model under these attribute names.
# NOTE(review): this only takes effect if the custom class reads exactly these
# attribute names in its forward pass — verify in TokenClassificationEncoderModels.

new_model.leaky_relu = new_leaky_relu
new_model.dense_1 = new_dense_1
new_model.bidirectional = new_bidirectional
new_model.dropout_lstm = new_dropout_lstm
new_model.dense_2 = new_dense_2
new_model.classifier = new_classifier

In [None]:
# Bare expression: the notebook renders the model's repr so the attached layers can be inspected.
new_model

In [None]:
# Batch collator for token classification, rebuilt around the tokenizer loaded in this section.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Metric object whose `compute(predictions=..., references=...)` is called by `compute_metrics`.
seqeval = evaluate.load("seqeval")


In [None]:
import numpy as np



def compute_metrics(p):
    """Score predictions against reference labels with the `seqeval` metric object.

    `p` unpacks into `(predictions, labels)`. Predictions are arg-maxed over
    axis 2, positions whose label id is -100 are dropped from both sequences,
    and the remaining ids are mapped to tag names through `labels_list`.

    Returns a dict with the overall precision, recall, f1 and accuracy.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Pair up each position and keep only those with a real label.
        kept_pairs = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([labels_list[pred] for pred, _ in kept_pairs])
        true_labels.append([labels_list[gold] for _, gold in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ("precision", "recall", "f1", "accuracy")
    return {metric: results[f"overall_{metric}"] for metric in metric_names}

In [None]:
class baseDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long tensors.

    NOTE: this definition shadows the `baseDataset` imported from
    `utilities_ner_functions` at the top of the notebook.

    Args:
        encodings: Mapping (plain dict or tokenizer output) from field name to a
            tensor indexed by example; it must contain an "input_ids" entry.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Return detached copies so callers cannot modify the stored tensors.
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

    def __len__(self):
        # Key lookup (rather than attribute access) also works when `encodings`
        # is a plain dict, matching what `__getitem__` already requires.
        return len(self.encodings["input_ids"])
    


In [None]:
# Wrap the tokenized train and test splits for the Trainer.
dataset_train, dataset_test = (
    baseDataset(encodings)
    for encodings in (tokenized_inputs_train, tokenized_inputs_test)
)

In [None]:
# Evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='roberta-base-bne-NER-TF-Bi-LSTM-100-epochs-2-4e',
    num_train_epochs=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=new_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

trainer.train()

### GRU architecture

In [None]:
# Load the model that has already been fine-tuned. `AutoModelForTokenClassification`
# is used because `RobertaForTokenClassification` is never imported in this notebook.
old_model = AutoModelForTokenClassification.from_pretrained("hlhdatscience/roberta-base-bne-NER")

# Load a new custom model. The class is imported as `RobertaTokenClassifier_With_GRU`
# (with underscores) and the id -> label map is named `ids_to_labels`; the previous
# spellings raised NameError.
new_model = RobertaTokenClassifier_With_GRU.from_pretrained(
    'PlanTL-GOB-ES/roberta-base-bne',
    num_labels=len(labels_list),
    id2label=ids_to_labels,
    label2id=labels_to_ids,
)

# Only the learned weights of the fine-tuned model's `roberta` sub-module are reused.
new_model.roberta = old_model.roberta


In [None]:
from torch import nn
# Freeze every parameter currently registered on the model; the layers created
# below are new objects and are therefore not affected by this loop.
for param in new_model.parameters():
    param.requires_grad = False

# Create the new layers


new_leaky_relu = nn.LeakyReLU(negative_slope=0.01)  # Adjust the negative slope as needed

# Linear layer for feature transformation: hidden_size -> 512.
new_dense_1 = nn.Linear(new_model.config.hidden_size, 512)

# Bidirectional GRU layer: 2 stacked layers, 512 input features, 256 hidden units
# per direction.
new_bidirectional = nn.GRU(
            input_size=512,
            hidden_size=256,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

# Dropout for GRU regularization
new_dropout_GRU = nn.Dropout(0.1)  # Adjust dropout rate as needed

# Second Linear layer for feature transformation: 512 -> 256.
# NOTE(review): presumably this consumes the concatenated 2 x 256 GRU output —
# confirm against the custom model's forward().
new_dense_2 = nn.Linear(512, 256)

# Linear layer for classification: 256 features -> one score per label.
new_classifier = nn.Linear(256, new_model.config.num_labels)


# Attach the new (unfrozen) layers to the model under these attribute names.
# NOTE(review): this only takes effect if the custom class reads exactly these
# attribute names in its forward pass — verify in TokenClassificationEncoderModels.

new_model.leaky_relu = new_leaky_relu
new_model.dense_1 = new_dense_1
new_model.bidirectional = new_bidirectional
new_model.dropout_GRU = new_dropout_GRU
new_model.dense_2 = new_dense_2
new_model.classifier = new_classifier

In [None]:
# Bare expression: the notebook renders the model's repr so the attached layers can be inspected.
new_model

In [None]:
# Batch collator for token classification, rebuilt around the current tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Metric object whose `compute(predictions=..., references=...)` is called by `compute_metrics`.
seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np



def compute_metrics(p):
    """Compute overall precision, recall, f1 and accuracy with the `seqeval` metric object.

    `p` unpacks into `(predictions, labels)`. Predictions are arg-maxed over
    axis 2; positions whose label id is -100 are left out of both the predicted
    and the reference tag sequences. Ids are translated with `labels_list`.
    """
    raw_scores, reference_ids = p
    best_ids = np.argmax(raw_scores, axis=2)

    def _tags(row_ids, reference_row):
        # Tag names for `row_ids`, restricted to positions with a real reference label.
        return [
            labels_list[tag_id]
            for tag_id, reference_id in zip(row_ids, reference_row)
            if reference_id != -100
        ]

    true_predictions = [_tags(best_row, ref_row) for best_row, ref_row in zip(best_ids, reference_ids)]
    true_labels = [_tags(ref_row, ref_row) for _, ref_row in zip(best_ids, reference_ids)]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    report = {}
    for short_name in ("precision", "recall", "f1", "accuracy"):
        report[short_name] = results["overall_" + short_name]
    return report

In [None]:
class baseDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long tensors.

    NOTE: this class is defined more than once in the notebook and also shadows
    the `baseDataset` imported from `utilities_ner_functions`.

    Args:
        encodings: Mapping (plain dict or tokenizer output) from field name to a
            tensor indexed by example; it must contain an "input_ids" entry.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Return detached copies so callers cannot modify the stored tensors.
        return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

    def __len__(self):
        # Key lookup (rather than attribute access) also works when `encodings`
        # is a plain dict, matching what `__getitem__` already requires.
        return len(self.encodings["input_ids"])
    

In [None]:
# Wrap the tokenized train and test splits for the Trainer.
dataset_train, dataset_test = (
    baseDataset(encodings)
    for encodings in (tokenized_inputs_train, tokenized_inputs_test)
)

In [None]:
# Evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='roberta-base-bne-NER-TF-Bi-GRU-100-epochs-2-4e',
    num_train_epochs=100,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=new_model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

trainer.train()

## Important link for understanding why validation loss can rise together with the other validation metrics (a positive correlation instead of the expected inverse one)

https://stats.stackexchange.com/questions/282160/how-is-it-possible-that-validation-loss-is-increasing-while-validation-accuracy