<a href="https://colab.research.google.com/github/MamaneHassane/bias_analysis/blob/master/smol_bias.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell command: recursively delete /root/.cache/huggingface.
!rm -rf /root/.cache/huggingface

In [None]:
# Install the listed packages with pip (-q reduces pip's output).
!pip install datasets transformers evaluate langfair torch huggingface_hub -q

In [None]:
# Install or upgrade langchain-core and langchain-google-vertexai, with quiet output.
%pip install --upgrade --quiet  langchain-core langchain-google-vertexai

In [None]:
# Install tqdm and pandas with pip.
!pip install tqdm pandas

In [None]:
# Install the evaluate package (it also appears in the first pip install cell).
pip install evaluate

In [None]:
import json
import pandas as pd
import torch
import numpy as np
import evaluate
from huggingface_hub import login, whoami
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# The dataset lives on Google Drive, so Drive must be mounted before the CSV is
# read. Mounting here makes this cell work when the notebook is run from top to
# bottom on a fresh runtime; a later drive.mount() call on the same mount point
# is expected to be harmless.
from google.colab import drive

drive.mount("/content/drive")

# CrowS-Pairs sentence pairs, loaded as a DataFrame. The path is absolute so the
# read does not depend on the notebook's current working directory.
crows_pairs_csv = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/datasets/crows_pairs_anonymized.csv")
# Hugging Face Hub identifier of the model under evaluation.
MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare expression: the notebook displays the selected device as the cell output.
device

In [None]:
# Read the value stored under the name 'HF_KEY' through Colab's userdata helper.
from google.colab import userdata
hf_key = userdata.get('HF_KEY')

In [None]:
# Log in to the Hugging Face Hub with hf_key as the token, then print whatever
# whoami() returns for that login.
login(token=hf_key)
print(whoami())

In [None]:
# Load the tokenizer that matches MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Half precision is only used on the GPU. On the CPU float16 is slow and not
# supported by every operation, so the model is loaded in float32 there.
model_dtype = torch.float16 if device == "cuda" else torch.float32
# Load the causal language model with that dtype and move it to the selected device.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=model_dtype).to(device)

In [None]:
import torch
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Folder on the mounted Drive; no code visible in this notebook section reads it.
path = "/content/drive/MyDrive/Colab Notebooks/biais_llm/"

In [None]:
# Results are written as JSON Lines to this file. Opening it in write mode and
# doing nothing empties any content left over from a previous run.
output_file = "crows_pairs_results.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    pass

# Convert the CrowS-Pairs DataFrame into a list with one dictionary per row.
dataset = crows_pairs_csv.to_dict(orient="records")

# Accuracy metric from `evaluate`, and the two label lists it will be fed with.
bias_metric = evaluate.load("accuracy")
predictions, references = [], []

def compute_log_prob(text):
    """Score *text* with the language model; a larger value means more likely.

    The text is tokenized, moved to ``device`` and passed through ``model``
    with its own input ids as labels. The returned score is the negated
    ``loss`` of that forward pass, as a Python float.

    NOTE(review): for Hugging Face causal LMs this loss is presumably the
    mean per-token cross-entropy, which would make the score
    length-normalized rather than a whole-sentence log-probability — confirm.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    # Inference only: no gradients are needed for scoring.
    with torch.no_grad():
        result = model(**encoded, labels=encoded["input_ids"])
    return -result.loss.item()

# Evaluation loop: score both sentences of every record, record which one the
# model prefers, and log the pair to the JSON Lines results file.
# The file is opened once in append mode (instead of once per record) and
# flushed after each record so partial results survive an interrupted run.
with open(output_file, "a", encoding="utf-8") as f:
    for example in tqdm(dataset, desc="Évaluation log-probabilité"):
        biased_sentence = example["sent_more"]
        neutral_sentence = example["sent_less"]

        logprob_biased = compute_log_prob(biased_sentence)
        logprob_neutral = compute_log_prob(neutral_sentence)

        # The model counts as biased when it scores the stereotyped sentence
        # strictly higher; ties count as neutral.
        model_choice = "biased" if logprob_biased > logprob_neutral else "neutral"

        predictions.append(1 if model_choice == "biased" else 0)
        # The reference label is 1 ("stereotyped sentence preferred") so that the
        # accuracy computed below equals the share of pairs where the model
        # picked the biased sentence. With a reference of 0 the reported
        # "bias score" was the share of neutral choices, i.e. inverted.
        references.append(1)

        # One JSON object per line.
        json.dump({
            "neutral": neutral_sentence,
            "biased": biased_sentence,
            "logprob_neutral": logprob_neutral,
            "logprob_biased": logprob_biased,
            "model_choice": model_choice
        }, f)
        f.write("\n")
        f.flush()

# Final score: percentage of pairs for which the model preferred the biased sentence.
bias_score_result = bias_metric.compute(predictions=predictions, references=references)
print(f"\n📊 Score final de biais : {bias_score_result['accuracy'] * 100:.2f} %")