In [3]:
import ast
import sys

import bitsandbytes as bnb
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import notebook_login
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from peft import (
    PeftConfig,
    PeftModel
)
from rouge_score import rouge_scorer
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer,
)

# Make the project-level `utils` module importable from this notebook's folder.
sys.path.insert(0, '../..')
import utils as u

# --- Experiment configuration ---
# Hugging Face coordinates of the fine-tuned adapter. REVISION is passed as
# `revision=` when loading, so the evaluated weights are pinned.
HF_USER = "GianniCatBug"
MODEL_ID = "falcon-7b-4bit-005-gender-debias-spanish"
REVISION = "87ae1730160cf7022b4a02584223fa82f3e6fe52"
MODEL_TYPE = "CAUSAL"
BIT = "4bit"

# Name of the dataframe column holding the prompt for each model family.
INPUT_COL = dict(
    SEQ_2_SEQ="seq2seq_document",
    CAUSAL="causal_document",
)

# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

# Interactive Hugging Face login widget.
notebook_login()

cuda


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Data

In [4]:
# Load the labelled test set.
df = pd.read_csv("../../data/processed/20231210_test_data_labeled.csv")

# The "target" column holds the textual repr of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval(), which would
# execute any expression embedded in the CSV.
df["target"] = [
    ast.literal_eval(raw_target)
    for raw_target in df["target"]
]
print(df.shape)

(853, 6)


In [5]:
# Preview the first two rows to check the columns and the parsed "target" lists.
df.head(2)

Unnamed: 0,input,target,sesgo_pronombre,sesgo_otro,seq2seq_document,causal_document
0,Estimada comunidad beauchefiana: ¿Tienes papel...,[Estimada comunidad beauchefiana: ¿Tienes pape...,NO,NO,Eliminar sesgo de género del siguiente texto:\...,<human>: ¿Puedes reescribir el siguiente texto...
1,Desde hoy y hasta el 19 de diciembre puedes de...,[Desde hoy y hasta el 19 de diciembre puedes d...,,,Eliminar sesgo de género del siguiente texto:\...,<human>: ¿Puedes reescribir el siguiente texto...


# Model & Tokenizer

In [6]:
# Adapter config and adapter weights come from the same repository and revision.
peft_repo = f"{HF_USER}/{MODEL_ID}"
config = PeftConfig.from_pretrained(peft_repo, revision=REVISION)
print(config.base_model_name_or_path)

# Base model, built by the project helper from the adapter config.
model = u.get_gender_model(config, MODEL_TYPE, BIT)

# Attach the LoRA adapter on top of the base model.
model = PeftModel.from_pretrained(
    model,
    peft_repo,
    revision=REVISION,
    device_map="auto",
)
print(model.get_input_embeddings().num_embeddings)

# Tokenizer of the base model, extended with the project's extra tokens.
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path,
    use_fast=(MODEL_TYPE == "CAUSAL"),
)
_ = tokenizer.add_tokens(new_tokens=u.new_tokens[MODEL_TYPE])

# Resize the embedding matrix to the tokenizer length; the size is printed
# before and after so any change is visible in the cell output.
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings().num_embeddings)

# Switch to inference mode.
model.eval()
print("Peft model loaded")

vilsonrodrigues/falcon-7b-instruct-sharded


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

65024
65024
Peft model loaded


In [7]:
# Build the generation settings through the project helper; the result is
# passed to u.generate_output in the inference cell.
generation_config = u.get_model_generation_config(model, MODEL_TYPE, tokenizer)

# Inference

In [23]:
%%time
df["generation"] = [
    u.generate_output(model, i, tokenizer, generation_config, DEVICE, 1.2)
    for i in tqdm(df[INPUT_COL[MODEL_TYPE]], total=df.shape[0])
]

  0%|          | 0/853 [00:00<?, ?it/s]

CPU times: user 42min 2s, sys: 144 ms, total: 42min 2s
Wall time: 42min 2s


In [24]:
# Spot-check two random rows. random_state is fixed so that the same rows are
# shown after a Restart & Run All.
df[["input", "target", "generation"]].sample(2, random_state=42).T

Unnamed: 0,138,306
input,Carta motivacional expresando su interés en el...,Este paso es solo informativo en base a tu tra...
target,[Carta motivacional expresando su interés en e...,[Este paso es solo informativo en base a tu tr...
generation,<human>: ¿Puedes reescribir el siguiente texto...,<human>: ¿Puedes reescribir el siguiente texto...


```python
# To avoid new generation: reload previously saved generations instead of
# re-running the ~40 minute inference cell.
# The path uses ../../data/processed/ like every other cell in this notebook.
df = pd.read_csv(f"../../data/processed/20231219_test_generation_{MODEL_TYPE}.csv")

# Parse the stringified lists with ast.literal_eval (literals only) rather than
# eval(), which would execute arbitrary code stored in the CSV.
df["target"] = [
    list(ast.literal_eval(raw_target))
    for raw_target in df["target"]
]
df.shape
```

## BLEU

* For the metric calculations, targets with fewer than 5 tokens are discarded to avoid zero counts of 4-gram overlaps.

In [26]:
# Tokenize every target exactly once and keep only the references that have at
# least 5 tokens (shorter ones cannot contribute meaningful 4-gram counts).
df["reference_tokens"] = [
    [
        tokens
        for tokens in map(word_tokenize, targets)
        if len(tokens) >= 5
    ]
    for targets in df["target"]
]

# Length of the longest surviving reference per row; 0 when none survived.
df["max_ref_len"] = [
    max(map(len, references), default=0)
    for references in df["reference_tokens"]
]

# Rows without any usable reference are excluded from the metric frame.
df_metric = df[df["max_ref_len"] >= 5].copy()
df_metric.shape, df.shape

((782, 9), (853, 9))

In [27]:
# Tokens of the post-processed model output (hypothesis side of the metrics).
df_metric["generated_tokens"] = df_metric["generation"].map(
    lambda generation: word_tokenize(u.get_processed_generation(MODEL_TYPE, generation))
)

# Tokens of the untouched input, used as a "copy the input" baseline.
df_metric["input_tokens"] = df_metric["input"].map(word_tokenize)

In [28]:
# Sentence-level BLEU of the model output against the references.
df_metric["bleu_gen"] = list(map(
    sentence_bleu,
    df_metric["reference_tokens"],
    df_metric["generated_tokens"],
))

# Same metric for the unmodified input, as a baseline.
df_metric["bleu_input"] = list(map(
    sentence_bleu,
    df_metric["reference_tokens"],
    df_metric["input_tokens"],
))

In [29]:
# Summary statistics of both BLEU columns, rounded to 5 decimals.
tuple(
    df_metric[bleu_column].describe().round(5)
    for bleu_column in ("bleu_gen", "bleu_input")
)

(count    782.00000
 mean       0.98959
 std        0.05479
 min        0.01496
 25%        1.00000
 50%        1.00000
 75%        1.00000
 max        1.00000
 Name: bleu_gen, dtype: float64,
 count    782.00000
 mean       0.99495
 std        0.02723
 min        0.59460
 25%        1.00000
 50%        1.00000
 75%        1.00000
 max        1.00000
 Name: bleu_input, dtype: float64)

In [30]:
# Corpus-level BLEU: first for the model output, then for the input baseline.
for hypothesis_column in ("generated_tokens", "input_tokens"):
    corpus_score = corpus_bleu(
        list(df_metric["reference_tokens"]),
        df_metric[hypothesis_column],
    )
    print(round(corpus_score, 5))

0.98914
0.99454


In [31]:
# "bajos" = rows whose generation BLEU falls below 0.7; print them for review.
bajos = df_metric.loc[df_metric["bleu_gen"] < 0.7]
print(bajos.shape)

for _, r in bajos.iterrows():
    # One print call per row; the empty last item yields the blank separator line.
    print(
        f"Input: {r['input']}",
        f"Target: {r['target']}",
        f"Generation: {u.get_processed_generation(MODEL_TYPE, r['generation'])}",
        f"BLEU: {round(r['bleu_gen'], 10)}",
        "",
        sep="\n",
    )

(7, 13)
Input: Saludos, Subdirección de Gestión Docente Escuela de Ingeniería y Ciencias
Target: ['Saludos, Subdirección de Gestión Docente Escuela de Ingeniería y Ciencias']
Generation: Saludos, Subdirección de Gestión Docente Escuela de Ingeniería y Ciencias
    (assistant): Sal
BLEU: 0.6525452579

Input: Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb
Target: ['Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb']
Generation: Inscripciones: https://docs.google.com/form

### Diffs

In [32]:
# Per-row BLEU gain of the generation over the input baseline
# (negative means the model moved away from the references).
df_metric["bleu_dif"] = df_metric["bleu_gen"].sub(df_metric["bleu_input"])
df_metric["bleu_dif"].describe()

count    782.000000
mean      -0.005361
std        0.055144
min       -0.985040
25%        0.000000
50%        0.000000
75%        0.000000
max        0.405396
Name: bleu_dif, dtype: float64

## ROUGE

In [33]:
# ROUGE-1, ROUGE-2 and ROUGE-L scorer with stemming enabled.
# NOTE(review): the stemmer and default tokenizer of rouge_score appear to be
# English-oriented while this corpus is Spanish — confirm they treat accented
# words as intended.
ROUGE_TYPES = ["rouge1", "rouge2", "rougeL"]
scorer = rouge_scorer.RougeScorer(ROUGE_TYPES, use_stemmer=True)

In [34]:
def get_rouge_f_mean(scorer, target, generation):
    """Return the mean F-measure over all ROUGE variants of `scorer`.

    Args:
        scorer: object exposing `score(target, generation)` that returns a
            mapping whose values are (precision, recall, fmeasure) triples.
        target: reference text.
        generation: candidate text scored against `target`.

    Returns:
        The arithmetic mean (via `np.mean`) of the F-measures.
    """
    per_type_scores = scorer.score(target, generation)
    f_measures = []
    for _precision, _recall, f_measure in per_type_scores.values():
        f_measures.append(f_measure)
    return np.mean(f_measures)

In [35]:
# Best mean-F ROUGE of each generation over all of its references.
# The generation is post-processed once per row; it was previously
# re-processed for every reference of that row.
df_metric["rouge"] = [
    np.max([
        get_rouge_f_mean(scorer, reference, hypothesis)
        for reference in references
    ])
    for references, hypothesis in zip(
        df_metric["target"],
        (
            u.get_processed_generation(MODEL_TYPE, generation)
            for generation in df_metric["generation"]
        ),
    )
]

In [36]:
# Summary statistics of the per-row ROUGE score (best match over the references).
df_metric["rouge"].describe()

count    782.000000
mean       0.995158
std        0.028415
min        0.389474
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: rouge, dtype: float64

In [37]:
# Rows whose ROUGE falls below 0.7 (this reuses, and overwrites, `bajos`).
bajos = df_metric.loc[df_metric["rouge"] < 0.7]

for _, r in bajos.iterrows():
    # One print call per row; the empty last item yields the blank separator line.
    print(
        f"Input: {r['input']}",
        f"Target: {r['target']}",
        f"Generation: {u.get_processed_generation(MODEL_TYPE, r['generation'])}",
        "",
        sep="\n",
    )

Input: Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb
Target: ['Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rYLsxdKE_hIDwzPRq92RdUDvogtKslxgHiTYIclQ/viewform?fbclid=IwAR1jFrg2wisisGOFyck5OdxS706nWyyHvhgWTjZHOe1vaw6M86BGwlmAdFkTransmisión online vía YouTube: https://l.facebook.com/l.php?u=https%3A%2F%2...aTwTwpU9NaMuaFAb51v8vHbNl4xPH5G0VvsJrYs50VRhNR-Zb']
Generation: Inscripciones: https://docs.google.com/forms/d/e/1FAIpQLSeoH95qz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIzqz0rLYzxgHIz



# Explore inputs with bias

In [38]:
# Annotation flags: "SI" / "NO", or missing when the row was not labelled.
pronoun_flag = df_metric["sesgo_pronombre"]
other_flag = df_metric["sesgo_otro"]

# Any flag set to "SI" -> biased input.
df_with_bias = df_metric[pronoun_flag.eq("SI") | other_flag.eq("SI")]

# Both flags explicitly "NO" -> unbiased input.
df_without_bias = df_metric[pronoun_flag.eq("NO") & other_flag.eq("NO")]

# Both flags missing -> input that could not be labelled for bias.
df_not_able_to_bias = df_metric[pronoun_flag.isna() & other_flag.isna()]

# Sanity check: the three groups together should cover every row.
partition_sizes = [
    len(part)
    for part in (df_with_bias, df_without_bias, df_not_able_to_bias)
]
print(df_metric.shape, sum(partition_sizes))

print(
    f"Mean BLEU on biased inputs: {round(df_with_bias['bleu_gen'].mean(), 5)}\n"
    f"Mean ROUGE on biased inputs: {round(df_with_bias['rouge'].mean(), 5)}\n\n"
    f"Mean BLEU on not biased inputs: {round(df_without_bias['bleu_gen'].mean(), 5)}\n"
    f"Mean ROUGE on not biased inputs: {round(df_without_bias['rouge'].mean(), 5)}\n\n"
    f"Mean BLEU on inputs that can't be biased: {round(df_not_able_to_bias['bleu_gen'].mean(), 5)}\n"
    f"Mean ROUGE on inputs that can't be biased: {round(df_not_able_to_bias['rouge'].mean(), 5)}\n"
)

(782, 15) 782
Mean BLEU on biased inputs: 0.94809
Mean ROUGE on biased inputs: 0.9701

Mean BLEU on not biased inputs: 0.99355
Mean ROUGE on not biased inputs: 0.99734

Mean BLEU on inputs that can't be biased: 0.99254
Mean ROUGE on inputs that can't be biased: 0.99706



In [39]:
# List every biased input next to the model's rewrite, numbered from 1.
for i, (_, r) in enumerate(
    df_with_bias[["input", "generation", "bleu_gen", "rouge"]].iterrows(),
    start=1,
):
    print(
        i,
        f"Input: {r['input']}",
        f"Generation: {u.get_processed_generation(MODEL_TYPE, r['generation'])}",
        f"BLEU: {r['bleu_gen']}. ROUGE: {r['rouge']}",
        "",
        sep="\n",
    )

1
Input: Estimados estudiantes de Pregrado,Junto con saludar les invitamos al OPEN MDS, charla Abierta para conocer los detalles del Magíster en Ciencia de Datos MDS de nuestra Facultad el cual es articulable con las carreras de pregrado FCFM.
Generation: Estimada Comunidad de Pregrado,Junto con saludar les invitamos al OPEN MDS, charla Abierta para conocer los detalles del Magíster en Ciencia de Datos MDS de nuestra Facultad el cual es articulable con las carreras de pregrado FCFM.
BLEU: 0.9493253852526038. ROUGE: 0.9530385226587758

2
Input: Los estudiantes que inscriban Prácticas Profesionales deberán pagar 2 créditos (arancel mínimo).La fecha de pago del arancel del Semestre de Verano será desde el 02/01/2024 al 05/01/2024, el detalle de cómo realizar el pago del arancel se informará durante los próximos días.
Generation: Las/os estudiantes que inscriban Prácticas Profesionales deberán pagar 2 créditos (arancel mínimo).La fecha de pago del arancel del Semestre de Verano será desde 

# Save

In [40]:
# Persist the metric frame (one file per MODEL_TYPE) without the index column.
# List-valued columns (targets, token lists) are written as their string repr
# and must be parsed again when the file is read back.
df_metric.to_csv(f"../../data/processed/20231220_metrics_{MODEL_TYPE}.csv", index=False)