# TRL Draft



In [1]:
!pip install trl



## V2 : On PPV

In [1]:
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import json

In [14]:
def turn_to_dataset(df):
    return Dataset.from_pandas(df)

def extract_label(string:str, target_string:str):
     return 1 if target_string in string else 0

def split_train_test(df, train_prct:float, val_test_prct:float, random_state=42):
    """
    This function splits thet dataframe into 3 subsets: train, validation and test. 

    Args: 
        df : pandas Dataframe
        train_prct:float :percentage of the total sample to be trained
        val_test_prct:float : percentage of the remaining data to be trained
        random_state : seed
    """
    df_train = df.sample(frac=train_prct, random_state=random_state)
    df_val_temp = df.drop(df_train.index)
    df_val = df_val_temp.sample(frac=val_test_prct, random_state=random_state)
    df_test = df_val_temp.drop(df_val.index)
    return df_train, df_val, df_test

In [15]:
df = pd.read_csv(r"../../data/raw/data449_cats.csv")[["text", "label"]]
type(df.index)

pandas.core.indexes.range.RangeIndex

In [16]:
df["label_text"] = df.apply(lambda x: extract_label(x["label"], "PPV"), axis=1)

In [17]:
df["label_text"].values.sum()

94

In [18]:
df = df[["text", "label_text"]].copy()
df = df.rename(columns={"label_text":"label"})
df_train, df_eval, df_test = split_train_test(df, train_prct=0.8, val_test_prct=0.5,random_state=42)
print("shape of the train data:", df_train.shape, "shape of the eval data:", df_eval.shape, "shape of the test data:", df_test.shape)
print("number of labels in train data:", df_train["label"].values.sum(), "number of labels in eval data:", df_eval["label"].values.sum(), "number of labels in train data:", df_test["label"].values.sum())

shape of the train data: (359, 2) shape of the eval data: (45, 2) shape of the test data: (45, 2)
number of labels in train data: 74 number of labels in eval data: 12 number of labels in train data: 8


In [19]:
train_dataset = turn_to_dataset(df_train)
eval_dataset = turn_to_dataset(df_eval)
test_dataset = turn_to_dataset(df_test)

In [27]:
model_name = "gpt2"
batch_size = 8

args = TrainingArguments(
    f"../models/{model_name}-finetuned-ner",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    # push_to_hub=True,
)

trainer = SFTTrainer(
    args = args,
    model=model_name,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=512,
)

Using pad_token, but it is not set yet.


Map:   0%|          | 0/359 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [28]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.571958
2,No log,2.370533
3,No log,2.318974


TrainOutput(global_step=135, training_loss=2.7465876826533564, metrics={'train_runtime': 268.5205, 'train_samples_per_second': 4.011, 'train_steps_per_second': 0.503, 'total_flos': 281411518464000.0, 'train_loss': 2.7465876826533564, 'epoch': 3.0})

In [29]:
trainer.save_model()

In [30]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

results = []

for _ in range(1000):
    # Load the trained model and tokenizer
    model_name = "./tmp_trainer"  # Replace with the actual path to your trained model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model = GPT2ForSequenceClassification.from_pretrained(model_name)
        
    # Define your input text
    input_text = "Les négociations sur l'attribution d'une prime pepa"
    
    # Preprocess the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Perform classification
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Assuming you have two classes (0 for non-négociation and 1 for négociation)
    predicted_class = torch.argmax(logits, dim=1).item()
    results.append(predicted_class)
    
    # # Map the predicted class to a label
    # class_labels = [ "Non-PPV","PPV"]
    # predicted_label = class_labels[predicted_class]

    
    # print(f"Predicted Class: {predicted_class}")
    # print(f"Predicted Label: {predicted_label}")

prob = sum(results)/1000.0
print(f"la probabilité est:{prob}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./tmp_trainer and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./tmp_trainer and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./tmp_trainer and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./tmp_trainer and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stre

la probabilité est:0.489


In [29]:
results

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


## V1 : On a random element of text

In [13]:
from datasets import load_dataset, Dataset
from trl import SFTTrainer
import pandas as pd

df = pd.read_csv(r"../../data/raw/data449_cats.csv")[["text"]]
df["label"] = df.apply(lambda x: 1 if "négociation" in x["text"] or "negociation" in x["text"] else 0, axis=1)
# df["label"].values.sum()
dataset = turn_to_dataset(df)
dataset['label']

[0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,


In [None]:
def turn_to_dataset(data:dict):
    """
    It turn dictionnary of conll into Dataset object of the datasets library from Huggingface
    """
    return Dataset.from_dict(data)

In [14]:
trainer = SFTTrainer(
    model="gpt2",
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    dataset_batch_size=20
)
trainer.train()

Using pad_token, but it is not set yet.


Map:   0%|          | 0/449 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=171, training_loss=2.900948150813231, metrics={'train_runtime': 322.7232, 'train_samples_per_second': 4.174, 'train_steps_per_second': 0.53, 'total_flos': 351960367104000.0, 'train_loss': 2.900948150813231, 'epoch': 3.0})

In [15]:
trainer.save_model()

In [61]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

# Load the trained model and tokenizer
model_name = "./tmp_trainer"  # Replace with the actual path to your trained model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained(model_name)

# Define your input text
input_text = "Bonjour monsieur, votre baguette est délicieuse"

# Preprocess the input text
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Perform classification
with torch.no_grad():
    logits = model(**inputs).logits

# Assuming you have two classes (0 for non-négociation and 1 for négociation)
predicted_class = torch.argmax(logits, dim=1).item()

# Map the predicted class to a label
class_labels = ["Non-Négociation", "Négociation"]
predicted_label = class_labels[predicted_class]

print(f"Predicted Class: {predicted_class}")
print(f"Predicted Label: {predicted_label}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ./tmp_trainer and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted Class: 0
Predicted Label: Non-Négociation


In [79]:
# dir(trainer)
from transformers import AutoTokenizer, DefaultDataCollator

tokenizer = AutoTokenizer.from_pretrained("./tmp_trainer")

example = tokenizer("Les négociations se sont bien passées", return_tensors="pt")
print(example, "type:", type(example))

# example_dc = data_collator(example)
trainer.predict([example]).predictions.shape

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': tensor([[35882,   299,  2634,    70,  1733,   602,   384,   264,   756,   275,
          2013,  1208,  2634,   274]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])} type: <class 'transformers.tokenization_utils_base.BatchEncoding'>


(1, 1, 14, 50257)

In [76]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("./tmp_trainer")
model = AutoModelForCausalLM.from_pretrained("./tmp_trainer")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
