In [2]:
import ast
import sys

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer

import torch

from transformers import (
    AutoTokenizer,
)

from peft import (
    PeftConfig,
    PeftModel
)

import bitsandbytes as bnb
from datasets import Dataset
from huggingface_hub import notebook_login

sys.path.insert(0, '../..')
import utils as u

# --- Adapter under evaluation (Hugging Face Hub coordinates) ---
HF_USER = "GianniCatBug"
MODEL_ID = "flan-base-4bit-005-gender-debias-spanish"
REVISION = "main"

# Model family and quantisation tag; both are handed to the helpers in `utils`.
MODEL_TYPE = "SEQ_2_SEQ"
BIT = "4bit"

# Name of the prompt column to feed the model, keyed by MODEL_TYPE.
INPUT_COL = {
    "SEQ_2_SEQ": "seq2seq_document",
    "CAUSAL": "causal_document",
}

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(DEVICE)

# Opens the Hugging Face Hub login widget in the notebook.
notebook_login()

cuda


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Data

In [3]:
# Load the labelled test split.
df = pd.read_csv("../../data/processed/20231210_test_data_labeled.csv")

# "target" is stored as the string form of a Python list of reference texts.
# ast.literal_eval parses literals only, whereas eval() would execute any
# expression that happened to be present in the CSV.
df["target"] = df["target"].map(ast.literal_eval)
print(df.shape)

(853, 6)


In [4]:
# Peek at the first two rows to confirm the columns loaded as expected.
df.head(2)

Unnamed: 0,input,target,sesgo_pronombre,sesgo_otro,seq2seq_document,causal_document
0,Estimada comunidad beauchefiana: ¿Tienes papel...,[Estimada comunidad beauchefiana: ¿Tienes pape...,NO,NO,Eliminar sesgo de género del siguiente texto:\...,<human>: ¿Puedes reescribir el siguiente texto...
1,Desde hoy y hasta el 19 de diciembre puedes de...,[Desde hoy y hasta el 19 de diciembre puedes d...,,,Eliminar sesgo de género del siguiente texto:\...,<human>: ¿Puedes reescribir el siguiente texto...


# Model & Tokenizer

In [4]:
# Fetch the adapter's PEFT config from the Hub and print the base checkpoint name it stores.
config = PeftConfig.from_pretrained(f"{HF_USER}/{MODEL_ID}", revision=REVISION)
print(config.base_model_name_or_path)

# Build the base model through the project helper.
# NOTE(review): presumably loads config.base_model_name_or_path with BIT quantisation — confirm in utils.
model = u.get_gender_model(config, MODEL_TYPE, BIT)

# Wrap the base model with the LoRA adapter weights from the Hub, then print the
# embedding size before any resizing (32128 in the recorded run).
model = PeftModel.from_pretrained(model, f"{HF_USER}/{MODEL_ID}", revision=REVISION, device_map="auto")
print(model.get_input_embeddings().num_embeddings)

# Tokenizer of the base checkpoint; the fast implementation is requested only for CAUSAL models.
# The project-specific extra tokens for this MODEL_TYPE are appended to its vocabulary.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=MODEL_TYPE == "CAUSAL")
_ = tokenizer.add_tokens(new_tokens = u.new_tokens[MODEL_TYPE])

# Make the embedding matrix match the tokenizer length after the extra tokens were added.
# NOTE(review): in the recorded run this shrinks the matrix from 32128 to 32110 rows — confirm it
# mirrors what was done at training time.
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings().num_embeddings) # 32110 in the recorded run

# Switch to evaluation mode before running inference.
model.eval()
print("Peft model loaded")

google/flan-t5-base
32128


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


32110
Peft model loaded


In [5]:
# Build the generation settings through the project helper.
# NOTE(review): the helper lives in utils; the resolved values are shown by the next cell.
generation_config = u.get_model_generation_config(model, MODEL_TYPE, tokenizer)

In [6]:
# Display the resolved generation settings.
generation_config

GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "max_new_tokens": 1000,
  "pad_token_id": 0
}

# Inference

In [7]:
%%time
df["generation"] = [
    u.generate_output(model, i, tokenizer, generation_config, DEVICE, 1.2)
    for i in tqdm(df[INPUT_COL[MODEL_TYPE]], total=df.shape[0])
]

  0%|          | 0/853 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors


CPU times: user 26min 8s, sys: 319 ms, total: 26min 8s
Wall time: 26min 8s


In [8]:
# Spot-check two random rows, transposed so input, target and generation read top to bottom.
# random_state pins the draw so a fresh Restart & Run All displays the same two rows.
df[["input", "target", "generation"]].sample(2, random_state=0).T

Unnamed: 0,492,563
input,serán realizadas en modalidad remota o reagend...,"En nombre del Departamento de Pregrado, invita..."
target,[serán realizadas en modalidad remota o reagen...,"[En nombre del Departamento de Pregrado, invit..."
generation,serán realizadas en modalidad remota o reagend...,"En nombre del Departamento de Pregrado, invita..."


```python
# To avoid new generation: reload previously saved generations instead of re-running inference.
# NOTE(review): the path now uses the "../../data/processed/" prefix shared by the other
# cells of this notebook — confirm where the generation CSV actually lives.

df = pd.read_csv(f"../../data/processed/20231220_test_generation_{MODEL_TYPE}.csv")
# "target" holds stringified lists; literal_eval parses them without executing code
# (eval would run any expression stored in the file).
df["target"] = df["target"].map(lambda raw: list(ast.literal_eval(raw)))
df.shape
```

## BLEU

* When computing the metrics, targets with fewer than 5 tokens are discarded to avoid zero counts of 4-gram overlaps.

In [10]:
# References shorter than this many tokens are dropped (see the note above this cell's section).
MIN_REF_TOKENS = 5

# Tokenize every reference exactly once and keep only the long-enough ones.
df["reference_tokens"] = [
    [
        tokens
        for tokens in map(word_tokenize, references)
        if len(tokens) >= MIN_REF_TOKENS
    ]
    for references in df["target"]
]

# Length of the longest surviving reference per row; 0 when none survived the filter.
df["max_ref_len"] = [
    max(map(len, kept_references), default=0)
    for kept_references in df["reference_tokens"]
]

# Rows with no usable reference are excluded from the metrics; .copy() detaches the
# result so later column assignments do not touch df.
df_metric = df[df["max_ref_len"] >= MIN_REF_TOKENS].copy()
df_metric.shape, df.shape

((782, 9), (853, 9))

In [11]:
def _tokenize_generation(raw_generation):
    """Clean a raw model output with the project helper, then word-tokenize it."""
    return word_tokenize(u.get_processed_generation(MODEL_TYPE, raw_generation))


# Token lists for the two hypotheses compared against the references:
# the model generation and the unmodified input (used as a baseline).
df_metric["generated_tokens"] = list(map(_tokenize_generation, df_metric["generation"]))
df_metric["input_tokens"] = list(map(word_tokenize, df_metric["input"]))

In [12]:
def _sentence_bleu_column(hypothesis_column):
    """Sentence-level BLEU of each row's hypothesis tokens against that row's references."""
    return list(
        map(sentence_bleu, df_metric["reference_tokens"], df_metric[hypothesis_column])
    )


# Score the generation and, as a baseline, the unmodified input.
df_metric["bleu_gen"] = _sentence_bleu_column("generated_tokens")
df_metric["bleu_input"] = _sentence_bleu_column("input_tokens")

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [13]:
# Summary statistics (rounded to 5 decimals) for generation BLEU and input BLEU, shown as a pair.
tuple(
    df_metric[bleu_column].describe().round(5)
    for bleu_column in ("bleu_gen", "bleu_input")
)

(count    782.00000
 mean       0.94544
 std        0.13693
 min        0.00000
 25%        0.95686
 50%        1.00000
 75%        1.00000
 max        1.00000
 Name: bleu_gen, dtype: float64,
 count    782.00000
 mean       0.99495
 std        0.02723
 min        0.59460
 25%        1.00000
 50%        1.00000
 75%        1.00000
 max        1.00000
 Name: bleu_input, dtype: float64)

In [14]:
# Corpus-level BLEU: first for the generations, then for the unmodified inputs.
all_references = list(df_metric["reference_tokens"])
for hypothesis_column in ("generated_tokens", "input_tokens"):
    corpus_score = corpus_bleu(all_references, df_metric[hypothesis_column])
    print(round(corpus_score, 5))

0.96223
0.99454


In [15]:
# Inspect the rows whose sentence-level BLEU is below 0.7 ("bajos" = low scores).
low_bleu_mask = df_metric["bleu_gen"] < 0.7
bajos = df_metric[low_bleu_mask]
print(bajos.shape)

for _, row in bajos.iterrows():
    cleaned_generation = u.get_processed_generation(MODEL_TYPE, row["generation"])
    # One line per field, followed by a blank separator line.
    print(
        f"Input: {row['input']}",
        f"Target: {row['target']}",
        f"Generation: {cleaned_generation}",
        f"BLEU: {round(row['bleu_gen'], 10)}",
        sep="\n",
        end="\n\n",
    )

(40, 13)
Input: ¡No te lo pierdas!
Target: ['¡No te lo pierdas!']
Generation: ¿No te lo pierdas!
BLEU: 0.668740305

Input: Cualquier duda que tengas, nos pueden mandar un mensaje 📩 por nuestro Instagram  @redes.beauchef Áreas a las que puedes postular 📑:  EstudiosLogísticaFeria LaboralExtensiónClub de ConsultoríaFinanzas✨Mucho éxito a todos quienes postulen este año✨   Redes Beauchef
Target: ['Cualquier duda que tengas, nos pueden mandar un mensaje 📩 por nuestro Instagram  @redes.beauchef Áreas a las que puedes postular 📑:  EstudiosLogísticaFeria LaboralExtensiónClub de ConsultoríaFinanzas✨Mucho éxito a tod@s quienes postulen este año✨   Redes Beauchef', 'Cualquier duda que tengas, nos pueden mandar un mensaje 📩 por nuestro Instagram  @redes.beauchef Áreas a las que puedes postular 📑:  EstudiosLogísticaFeria LaboralExtensiónClub de ConsultoríaFinanzas✨Mucho éxito a todos/as quienes postulen este año✨   Redes Beauchef', 'Cualquier duda que tengas, nos pueden mandar un mensaje 📩 por nues

### Diffs

In [16]:
# Per-row BLEU of the generation minus BLEU of the unmodified input
# (negative values mean the generation scored lower than the input).
df_metric["bleu_dif"] = df_metric["bleu_gen"].sub(df_metric["bleu_input"])
df_metric["bleu_dif"].describe()

count    782.000000
mean      -0.049510
std        0.135411
min       -1.000000
25%       -0.019916
50%        0.000000
75%        0.000000
max        0.204729
Name: bleu_dif, dtype: float64

## ROUGE

In [17]:
# Scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming switched on.
# NOTE(review): the evaluated text is Spanish — confirm that the scorer's stemmer and
# default tokenizer are appropriate for it.
rouge_variants = ["rouge1", "rouge2", "rougeL"]
scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

In [18]:
def get_rouge_f_mean(scorer, target, generation):
    """Average the F-measure over every ROUGE variant the scorer reports.

    Args:
        scorer: object whose ``score(target, generation)`` returns a mapping
            from variant name to a score tuple.
        target: reference text.
        generation: candidate text scored against ``target``.

    Returns:
        The mean of the third element (index 2, the F-measure) of each score tuple.
    """
    per_variant = scorer.score(target, generation)
    f_measures = [score_triple[2] for score_triple in per_variant.values()]
    return np.mean(f_measures)

In [19]:
def _best_rouge(references, raw_generation):
    """Highest mean ROUGE F-measure of the cleaned generation over all references of a row.

    The generation is cleaned once per row instead of once per reference, because the
    cleaning step does not depend on the reference being scored.
    NOTE(review): assumes u.get_processed_generation is deterministic — confirm in utils.
    """
    hypothesis = u.get_processed_generation(MODEL_TYPE, raw_generation)
    return np.array([
        get_rouge_f_mean(scorer, reference, hypothesis)
        for reference in references
    ]).max()


# Every reference in "target" is scored (no length filter here); the best match is kept.
df_metric["rouge"] = [
    _best_rouge(references, raw_generation)
    for references, raw_generation in zip(df_metric["target"], df_metric["generation"])
]

In [20]:
# Summary statistics of the per-row best ROUGE score.
df_metric["rouge"].describe()

count    782.000000
mean       0.993797
std        0.041277
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: rouge, dtype: float64

In [21]:
# Inspect the rows whose ROUGE score is below 0.7 ("bajos" = low scores).
low_rouge_mask = df_metric["rouge"] < 0.7
bajos = df_metric[low_rouge_mask]

for _, row in bajos.iterrows():
    cleaned_generation = u.get_processed_generation(MODEL_TYPE, row["generation"])
    # One line per field, followed by a blank separator line.
    print(
        f"Input: {row['input']}",
        f"Target: {row['target']}",
        f"Generation: {cleaned_generation}",
        sep="\n",
        end="\n\n",
    )

Input: ----------------------------------------------- Ignacio J. González Coordinador HÉLICEEscuela de Ingeniería y Ciencias | Facultad de Ciencias Físicas y Matemáticas helice@ing.uchile.cl
Target: ['----------------------------------------------- Ignacio J. González Coordinador HÉLICEEscuela de Ingeniería y Ciencias | Facultad de Ciencias Físicas y Matemáticas helice@ing.uchile.cl']
Generation: -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



# Explore inputs with bias

In [22]:
# The two bias labels: "SI", "NO", or missing.
pronoun_label = df_metric["sesgo_pronombre"]
other_label = df_metric["sesgo_otro"]

# Biased: at least one label is "SI".
df_with_bias = df_metric[pronoun_label.eq("SI") | other_label.eq("SI")]
# Not biased: both labels are "NO".
df_without_bias = df_metric[pronoun_label.eq("NO") & other_label.eq("NO")]
# Cannot be biased: both labels are missing.
df_not_able_to_bias = df_metric[pronoun_label.isna() & other_label.isna()]

# Sanity check: the three group sizes should add up to the number of rows in df_metric.
group_total = sum(
    len(group) for group in (df_with_bias, df_without_bias, df_not_able_to_bias)
)
print(df_metric.shape, group_total)


def _rounded_mean(frame, column):
    """Mean of `column` in `frame`, rounded to 5 decimals."""
    return round(frame[column].mean(), 5)


print(
    f"Mean BLEU on biased inputs: {_rounded_mean(df_with_bias, 'bleu_gen')}\n"
    f"Mean ROUGE on biased inputs: {_rounded_mean(df_with_bias, 'rouge')}\n\n"
    f"Mean BLEU on not biased inputs: {_rounded_mean(df_without_bias, 'bleu_gen')}\n"
    f"Mean ROUGE on not biased inputs: {_rounded_mean(df_without_bias, 'rouge')}\n\n"
    f"Mean BLEU on inputs that can't be biased: {_rounded_mean(df_not_able_to_bias, 'bleu_gen')}\n"
    f"Mean ROUGE on inputs that can't be biased: {_rounded_mean(df_not_able_to_bias, 'rouge')}\n"
)

(782, 15) 782
Mean BLEU on biased inputs: 0.90088
Mean ROUGE on biased inputs: 0.96363

Mean BLEU on not biased inputs: 0.95554
Mean ROUGE on not biased inputs: 0.99571

Mean BLEU on inputs that can't be biased: 0.94511
Mean ROUGE on inputs that can't be biased: 0.99652



In [23]:
# Print every biased input next to its cleaned generation and scores, numbered from 1.
shown_columns = ["input", "generation", "bleu_gen", "rouge"]
for position, (_, row) in enumerate(df_with_bias[shown_columns].iterrows(), start=1):
    cleaned_generation = u.get_processed_generation(MODEL_TYPE, row["generation"])
    print(
        position,
        f"Input: {row['input']}",
        f"Generation: {cleaned_generation}",
        f"BLEU: {row['bleu_gen']}. ROUGE: {row['rouge']}",
        sep="\n",
        end="\n\n",
    )

1
Input: Estimados estudiantes de Pregrado,Junto con saludar les invitamos al OPEN MDS, charla Abierta para conocer los detalles del Magíster en Ciencia de Datos MDS de nuestra Facultad el cual es articulable con las carreras de pregrado FCFM.
Generation: Estimados/as estudiantes de Pregrado,Junto con saludar les invitamos al OPEN MDS, charla Abierta para conocer los detalles del Magíster en Ciencia de Datos MDS de nuestra Facultad el cual es articulable con las carreras de pregrado FCFM.
BLEU: 1.0. ROUGE: 1.0

2
Input: Los estudiantes que inscriban Prácticas Profesionales deberán pagar 2 créditos (arancel mínimo).La fecha de pago del arancel del Semestre de Verano será desde el 02/01/2024 al 05/01/2024, el detalle de cómo realizar el pago del arancel se informará durante los próximos días.
Generation: Las/os estudiantes que inscriban Prácticas Profesionales deberán pagar 2 créditos (arancel mínimo).La fecha de pago del arancel del Semestre de Verano será desde el 02/01/2024 al 05/01/2

# Save

In [24]:
# Write df_metric to the processed-data folder, leaving the index out of the file.
metrics_csv_path = f"../../data/processed/20231220_metrics_{MODEL_TYPE}.csv"
df_metric.to_csv(metrics_csv_path, index=False)