In [None]:
# Run this cell only when using Google Colab: it clones the project and installs it into the runtime.
!git clone https://github.com/Mouret-Orfeu/RCA_LLM_project.git
%cd RCA_LLM_project

!python -m pip install --upgrade pip
!pip install -r requirements.txt
!pip install -e .

# Restart the runtime so that the environment changes are applied.
# Sending signal 9 to our own process makes Colab report "session crashed for unknown reason"; this is expected.
import os, sys
os.kill(os.getpid(), 9) 

In [None]:
import os, sys

# For local execution: uncomment and point this at your own checkout of the project.
#os.chdir('/home/orfeu/Documents/documents/info_perso/RCA_LLM_project')

# For Colab usage: move into the repository cloned by the setup cell.
%cd /content/RCA_LLM_project

# Sanity check: the relative paths used later (e.g. ./data/...) are resolved against this directory.
print("CWD:", os.getcwd())

import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from rca_llm.utils import set_seed
from rca_llm.RCADataset import RCADataset
from rca_llm.trainer import Trainer
from rca_llm.HFModelAdapter import HFModelAdapter
# Seed once, up front, so runs are repeatable (what exactly gets seeded is decided by rca_llm.utils.set_seed).
set_seed(3407)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Log in to Hugging Face to access the Llama model.
# login() prompts for an access token interactively.
!pip install -U "huggingface_hub[cli]"
from huggingface_hub import login
login() 

In [None]:
# Candidate model identifiers; the interactive cell that follows selects one of them.
model_type_1 = "meta-llama/Llama-3.2-1B-Instruct"
# NOTE(review): "..." is a placeholder, not a real model id — fill it in before choosing option 2.
model_type_2 = "..." 

In [None]:
# Interactive model selection: "1" picks model_type_1, "2" picks model_type_2.
user_input = input("Enter 1 for meta-llama/Llama-3.2-1B-Instruct \nor enter 2 for the second model")
if user_input not in ("1", "2"):
    # Any other answer aborts the cell before a model type is bound.
    print("Invalid input. Please enter 1 or 2.")
    sys.exit(1)
model_type = model_type_1 if user_input == "1" else model_type_2

In [None]:
# Tokenizer and weights both come from the checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(
    model_type,
    token=True,
)
hf_model = AutoModelForCausalLM.from_pretrained(
    model_type,
    token=True,
    torch_dtype="bfloat16",
    device_map="auto",
)
# The project adapter wraps the Hugging Face model; the Trainer and the evaluation cells use `model`.
model = HFModelAdapter(hf_model, model_type)

In [None]:
# Load the ITSM tickets (semicolon-separated, UTF-8 encoded CSV).
df = pd.read_csv(
    './data/itsm_tickets_meaningful_200_utf8.csv',
    sep=';',
    encoding='utf-8',
)

# Both splits are built from the same frame with the same seed
# (intended to give disjoint train/test sets; the split logic lives in RCADataset).
split_seed = 3407
train_dataset, test_dataset = (
    RCADataset(df, split_name, tokenizer, seed=split_seed)
    for split_name in ('train', 'test')
)

In [None]:
# Quick check of the dataset encoding/decoding process on the first training example
x, y = train_dataset[0]

# Raw token ids: x is the model input, y holds the labels (-100 marks masked positions)
print("Input IDs:", x)
print("Labels:", y)

# Decoded input text
print(f"Decoded Input:\n{tokenizer.decode(x, skip_special_tokens=True)}\n")

# -100 is not a real token id and cannot be decoded, so every masked label is replaced
# by the id of the letter 'm' to make the masked span visible in the decoded text.
# Note: this rebinds `y` from the dataset's label sequence to a plain Python list.
y = [token if token != -100 else tokenizer.convert_tokens_to_ids('m') for token in y]
print(f"Decoded Labels:\n{tokenizer.decode(y, skip_special_tokens=True)}\n")

Input IDs: tensor([50256, 50256, 50256,  ...,   415,    13, 50256])
Labels: tensor([ -100,  -100,  -100,  ...,   415,    13, 50256])
Decoded Input:
description du ticket itsm: Bonjour, je ne peux pas utiliser correctement mon audio. Il ne fonctionne pas pendant les appels Teams surtout quand je tente d'envoyer un e-mail. Ce souci est apparu ce matin. Merci pour votre aide.
Réponse de l'équipe IT pour la résolution du ticket: Merci pour votre signalement. Le problème était lié à un paramétrage réseau incorrect. Nous avons redémarré le service concerné. Cela devrait être résolu maintenant.

Decoded Labels:
mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm

In [None]:
# Start from the Trainer's default configuration and override only what this run needs.
train_config = Trainer.get_default_config()
# NOTE(review): the saved training log in this notebook runs up to iter 1900, so it was
# presumably produced with a different max_iters than the 1000 set here — confirm.
train_config.max_iters = 1000
train_config.batch_size = 2
trainer = Trainer(train_config, model, train_dataset)

In [None]:
def batch_end_callback(trainer):
    """Print the iteration time (in ms) and the training loss on every 100th iteration."""
    if trainer.iter_num % 100 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    step = trainer.iter_num
    current_loss = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {step}: train loss {current_loss:.5f}")
        
# Register the logging hook under the 'on_batch_end' event
# (presumably fired by the Trainer after every batch — see rca_llm.trainer).
trainer.set_callback('on_batch_end', batch_end_callback)


# Launch training with the configuration built above.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 0.00327
iter_dt 26.96ms; iter 100: train loss 0.02028
iter_dt 25.73ms; iter 200: train loss 0.02236
iter_dt 33.71ms; iter 300: train loss 0.01246
iter_dt 27.47ms; iter 400: train loss 0.02870
iter_dt 28.32ms; iter 500: train loss 0.00278
iter_dt 45.71ms; iter 600: train loss 0.03206
iter_dt 26.38ms; iter 700: train loss 0.00522
iter_dt 28.91ms; iter 800: train loss 0.00695
iter_dt 27.86ms; iter 900: train loss 0.00521
iter_dt 27.65ms; iter 1000: train loss 0.00120
iter_dt 40.41ms; iter 1100: train loss 0.01663
iter_dt 27.01ms; iter 1200: train loss 0.01429
iter_dt 28.49ms; iter 1300: train loss 0.00135
iter_dt 29.14ms; iter 1400: train loss 0.01628
iter_dt 26.49ms; iter 1500: train loss 0.01357
iter_dt 26.22ms; iter 1600: train loss 0.00091
iter_dt 27.38ms; iter 1700: train loss 0.03434
iter_dt 26.37ms; iter 1800: train loss 0.00266
iter_dt 27.40ms; iter 1900: train loss 0.00617


In [None]:
# Quick check to verify the model generates sensible answers
def show_prediction_for_row(
    i,
    df,
    model,
    device,
    tokenizer,
    train_dataset=None,
    max_new_tokens=200,
    do_sample=False,
    temperature=1.0,
    top_k=None,
):
    """
    Print the ticket description, the reference resolution and the model's generated
    resolution for the row labelled `i` of `df`.

    The prompt is built from the prefixes stored on `train_dataset` when it exposes both
    `prompt_description_addition` and `prompt_resolution_addition`; otherwise hard-coded
    fallbacks are used (keep them in sync with the training prompt format).
    `tokenizer` is accepted for call-site compatibility but is not used here.
    Returns None; everything is written to stdout.
    """
    # Raw texts for the requested row (label-based lookup)
    question = str(df.loc[i, 'ticket_description'])
    ground_truth = str(df.loc[i, 'ticket_resolution'])

    # Prefer the dataset's own prompt pieces so that inference matches training
    template_attrs = ("prompt_description_addition", "prompt_resolution_addition")
    dataset_has_template = train_dataset is not None and all(
        hasattr(train_dataset, attr) for attr in template_attrs
    )
    if dataset_has_template:
        head = train_dataset.prompt_description_addition
        joint = train_dataset.prompt_resolution_addition
    else:
        head = "description du ticket itsm: "
        joint = " Réponse de l'équipe IT pour la résolution du ticket: "

    prompt = "{}{}{}".format(head, question, joint)

    generation_kwargs = dict(
        prompt=prompt,
        device=device,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        return_new_text_only=True,  # keep only the continuation, i.e. the answer
        skip_special_tokens=True,
    )

    model.eval()
    with torch.no_grad():
        generated = model.generate_from_prompt(**generation_kwargs)

    rule = "-" * 80
    report = [
        f"Row: {i}",
        rule,
        "QUESTION:",
        question,
        "\nGROUND TRUTH ANSWER:",
        ground_truth,
        "\nMODEL GENERATION:",
        str(generated),
        rule,
    ]
    print("\n".join(report))

# Example usage (adjust i as you like).
# `device` is recomputed here so this cell does not depend on the value bound in the setup cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# i is a row label of df (looked up with df.loc inside show_prediction_for_row)
i = 0
show_prediction_for_row(i, df, model, device, tokenizer, train_dataset=train_dataset,
                        max_new_tokens=300, do_sample=False)

#Or evaluate a few random rows:
# import random
# for i in random.sample(range(len(df)), k=5):
#     show_prediction_for_row(i, df, model, device, tokenizer, train_dataset=train_dataset)

In [None]:
import math, random, re, string

def eval_split(
    trainer,
    split='test',
    max_examples=200,  # number of examples to generate an answer for; 200 covers the whole dataset
    max_new_tokens=300,
    do_sample=False,
    temperature=1.0,
    top_k=None,
    print_examples=1,
    ):
    """
    Evaluate a split on:
      - byte-level perplexity / bits-per-byte over the supervised answer span (labels != -100)
      - QA metrics: Exact Match (EM) and token-level F1
      - Generation metric: ROUGE-L (F1)

    Relies on the notebook-level globals `train_dataset`, `test_dataset` and `tokenizer`.

    Args:
        trainer: object exposing `.model`, `.device`, `.config.batch_size` and `.config.num_workers`.
        split: 'train' or 'test'; selects the global dataset to evaluate.
        max_examples: upper bound on the number of examples used for the generation metrics.
        max_new_tokens, do_sample, temperature, top_k: forwarded to `model.generate_from_prompt`.
        print_examples: how many (question, reference, prediction) triples to print.

    Returns: (metrics_dict, examples)
      metrics_dict = { 'split', 'examples_evaluated', 'byte_perplexity', 'bits_per_byte',
                       'exact_match', 'f1', 'rougeL_f1' }
        'byte_perplexity' and 'bits_per_byte' are None when no supervised byte was counted.
      examples = list of (question, reference_answer, generated_answer)

    Raises:
        KeyError: if `split` is neither 'train' nor 'test'.
    """

    model = trainer.model
    device = trainer.device
    dataset = {'train': train_dataset, 'test': test_dataset}[split]
    df = dataset.df
    # A model config may leave pad_token_id unset; in that case no pad filtering is applied.
    raw_pad_id = model.hf_model.config.pad_token_id
    pad_id = int(raw_pad_id) if raw_pad_id is not None else None

    model.eval()

    # ---- BPB (bits per byte) over the split ----
    total_nll = 0.0   # summed negative log-likelihood, in nats
    total_bytes = 0   # UTF-8 bytes of the decoded supervised spans
    loader = DataLoader(dataset, batch_size=trainer.config.batch_size, num_workers=trainer.config.num_workers, drop_last=False)
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            _, loss = model(x, y)
            tokens = (y != -100).sum().item()
            if tokens > 0 and loss is not None:
                # Assumes loss is the mean negative log-likelihood per supervised token of the batch;
                # multiplying by the token count recovers the batch total.
                total_nll += loss.item() * tokens

                # Count bytes in the supervised span only: positions where the label is not
                # masked and the input token is not padding.
                for row in range(x.size(0)):
                    mask = y[row] != -100
                    if pad_id is not None:
                        mask = mask & (x[row] != pad_id)
                    if mask.any():
                        ids = x[row][mask].tolist()
                        txt = tokenizer.decode(ids, skip_special_tokens=True)
                        total_bytes += len(txt.encode("utf-8"))

    if total_bytes > 0:
        # nats per byte -> bits per byte
        bpb = (total_nll / total_bytes) / math.log(2.0)
        byte_perplexity = 2 ** bpb
    else:
        bpb = None
        byte_perplexity = None

    # Standardize the text (lowercase, remove punctuation, collapse whitespace)
    def normalize_text(s):
        if s is None:
            return ''
        s = s.strip().lower()
        s = s.translate(str.maketrans('', '', string.punctuation))
        s = re.sub(r'\s+', ' ', s)
        return s

    # Bag-of-words F1 between the normalized prediction and reference
    def f1_score(prediction, ground_truth):
        pred_tokens = normalize_text(prediction).split()
        gt_tokens = normalize_text(ground_truth).split()
        if len(pred_tokens) == 0 and len(gt_tokens) == 0:
            return 1.0
        from collections import Counter
        overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
        if overlap == 0:
            return 0.0
        precision = overlap / max(len(pred_tokens), 1)
        recall = overlap / max(len(gt_tokens), 1)
        return 2 * precision * recall / (precision + recall)

    def exact_match(prediction, ground_truth):
        return 1.0 if normalize_text(prediction) == normalize_text(ground_truth) else 0.0

    # Dynamic programming: length of the Longest Common Subsequence (LCS)
    def lcs(a, b):
        m, n = len(a), len(b)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(m):
            prev_row = dp[i]
            cur_row = dp[i + 1]
            for j in range(n):
                if a[i] == b[j]:
                    cur_row[j + 1] = prev_row[j] + 1
                else:
                    cur_row[j + 1] = max(cur_row[j], prev_row[j + 1])
        return dp[m][n]

    def rougeL_f1(prediction, ground_truth):
        pred_tokens = normalize_text(prediction).split()
        gt_tokens = normalize_text(ground_truth).split()
        if len(pred_tokens) == 0 or len(gt_tokens) == 0:
            return 0.0
        # LCS = correctly predicted tokens in order, not necessarily consecutive
        lcs_len = lcs(pred_tokens, gt_tokens)
        prec = lcs_len / len(pred_tokens)
        rec = lcs_len / len(gt_tokens)
        if prec + rec == 0:
            return 0.0
        return (2 * prec * rec) / (prec + rec)

    # ---- Generation loop for QA metrics ----
    total_exact_match, total_f1, total_rougeL = 0.0, 0.0, 0.0
    num_examples = min(max_examples, len(dataset))
    indices = list(range(len(dataset)))
    # A private RNG gives the same fixed-seed shuffle without reseeding the global `random` state.
    random.Random(3407).shuffle(indices)
    indices = indices[:num_examples]

    examples = []  # (question, reference, generated)
    with torch.no_grad():
        for i_local in indices:
            # dataset.ixes maps a position in the split to a row label of the dataframe
            row_idx = int(dataset.ixes[i_local])
            question = str(df.loc[row_idx, 'ticket_description'])
            reference = str(df.loc[row_idx, 'ticket_resolution'])
            prompt = dataset.prompt_description_addition + question + dataset.prompt_resolution_addition

            generated = model.generate_from_prompt(
                prompt=prompt,
                device=device,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature,
                top_k=top_k,
                return_new_text_only=True,
                skip_special_tokens=True,
            )

            examples.append((question, reference, generated))
            total_exact_match += exact_match(generated, reference)
            total_f1 += f1_score(generated, reference)
            total_rougeL += rougeL_f1(generated, reference)

    qa_em = total_exact_match / max(num_examples, 1)
    qa_f1 = total_f1 / max(num_examples, 1)
    rougeL = total_rougeL / max(num_examples, 1)

    results = {
        'split': split,
        'examples_evaluated': int(num_examples),
        'byte_perplexity': float(byte_perplexity) if byte_perplexity is not None else None,
        'bits_per_byte': float(bpb) if bpb is not None else None,
        'exact_match': float(qa_em),
        'f1': float(qa_f1),
        'rougeL_f1': float(rougeL),
    }

    if print_examples > 0:
        for k, (q, ref, pred) in enumerate(examples[:print_examples]):
            print(f'[#{k}] QUESTION: {q}')
            print(f'     REF    : {ref}')
            print(f'     PRED   : {pred}')
            print('-' * 60)

    # The perplexity lives under 'byte_perplexity' and may be None when nothing was scored.
    ppl = results['byte_perplexity']
    ppl_text = f"{ppl:.3f}" if ppl is not None else "n/a"
    print(
        f"Eval {split}: PPL={ppl_text} | EM={qa_em*100:.2f}% | F1={qa_f1*100:.2f}% | ROUGE-L={rougeL*100:.2f}% | examples={num_examples}"
    )
    return results, examples

# Example: evaluate both splits (50 generated examples each, greedy decoding, 2 printed examples).
# eval_split already wraps its own loops in torch.no_grad(); the outer context is only a safety net.
with torch.no_grad():
    train_metrics, _ = eval_split(trainer, 'train', max_examples=50, max_new_tokens=200, do_sample=False, print_examples=2)
    test_metrics, _  = eval_split(trainer, 'test',  max_examples=50, max_new_tokens=200, do_sample=False, print_examples=2)


In [9]:
# let's run a random given sequence through the model as well
# NOTE(review): this cell looks like a leftover from a sequence-sorting demo (it compares the
# generation against torch.sort of the input and carries a stale execution count). Nothing else
# in this notebook uses `train_dataset.length` or `model.generate` — confirm it still applies
# to the RCA model, or remove it.
n = train_dataset.length # naughty direct access shrug
inp = torch.tensor([[0, 0, 2, 1, 0, 1]], dtype=torch.long).to(trainer.device)
assert inp[0].nelement() == n
with torch.no_grad():
    cat = model.generate(inp, n, do_sample=False)
# expected answer: the input sequence sorted in ascending order
sol = torch.sort(inp[0])[0]
# keep only the generated continuation (everything after the first n positions)
sol_candidate = cat[:, n:]
print('input sequence  :', inp.tolist())
print('predicted sorted:', sol_candidate.tolist())
print('gt sort         :', sol.tolist())
print('matches         :', bool((sol == sol_candidate).all()))

input sequence  : [[0, 0, 2, 1, 0, 1]]
predicted sorted: [[0, 0, 0, 1, 1, 2]]
gt sort         : [0, 0, 0, 1, 1, 2]
matches         : True
