In [None]:
# Run this cell only when using Google Colab: it clones the project, installs its
# dependencies, and then restarts the runtime. Skip it for a local checkout.
!git clone https://github.com/Mouret-Orfeu/RCA_LLM_project.git
%cd RCA_LLM_project

!python -m pip install --upgrade pip
!pip install -r requirements.txt
!pip install -e .

# Restart the runtime so that the environment changes are applied.
# This kills the current kernel process (signal 9), so nothing placed after it in
# this cell would run; continue with the next cell once the runtime is back.
import os, sys
os.kill(os.getpid(), 9) 

In [None]:
import os, sys

# For local execution: uncomment and point this at your own checkout of the project
#os.chdir('/home/orfeu/Documents/documents/info_perso/RCA_LLM_project')

# For Colab usage: move into the cloned repository
%cd /content/RCA_LLM_project

# Sanity check: the working directory must be the project root, because the data
# file is later read through a relative path (./data/...)
print("CWD:", os.getcwd())

import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from rca_llm.utils import set_seed
from rca_llm.RCADataset import RCADataset
from rca_llm.trainer import Trainer
from rca_llm.HFModelAdapter import HFModelAdapter
# Fixed seed passed to the project's helper (see rca_llm.utils.set_seed for what it seeds)
set_seed(3407)
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Connect to Hugging Face to access the Llama model
!pip install -U "huggingface_hub[cli]"
from huggingface_hub import login
login() 

In [None]:
# Checkpoint used for fine-tuning. `token=True` is passed to both loaders so the
# download is authenticated (see the Hugging Face login cell).
model_type = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_type, token=True)

hf_model = AutoModelForCausalLM.from_pretrained(
    model_type,
    token=True,
    torch_dtype="bfloat16",
    device_map="auto",
)

# Wrap the Hugging Face model in the project's adapter; `model` is the object that
# the Trainer and the generation/evaluation cells work with.
model = HFModelAdapter(hf_model, model_type)

In [None]:
# Load the ticket CSV, build the train/test datasets, and print one training example
df = pd.read_csv('./data/itsm_tickets_meaningful_200_utf8.csv', sep=';', encoding='utf-8')

train_dataset = RCADataset(df, 'train', tokenizer)
test_dataset = RCADataset(df, 'test', tokenizer)

x, y = train_dataset[0]

# token ids
print("Input IDs:", x)
print("Labels:", y)

# decoded text
print(f"Decoded Input:\n{tokenizer.decode(x, skip_special_tokens=True)}\n")

# For y, every masked position (id = -100) is replaced by the id of the letter 'm',
# since -100 is not a decodable token id.
# The replacement id is looked up once instead of once per masked token, and the
# labels are converted to plain Python ints so that decode() receives a homogeneous
# list rather than a mix of 0-dim tensors and ints.
mask_display_id = tokenizer.convert_tokens_to_ids('m')
y = [token if token != -100 else mask_display_id for token in torch.as_tensor(y).tolist()]
print(f"Decoded Labels:\n{tokenizer.decode(y, skip_special_tokens=True)}\n")

Input IDs: tensor([50256, 50256, 50256,  ...,   415,    13, 50256])
Labels: tensor([ -100,  -100,  -100,  ...,   415,    13, 50256])
Decoded Input:
description du ticket itsm: Bonjour, je ne peux pas utiliser correctement mon audio. Il ne fonctionne pas pendant les appels Teams surtout quand je tente d'envoyer un e-mail. Ce souci est apparu ce matin. Merci pour votre aide.
Réponse de l'équipe IT pour la résolution du ticket: Merci pour votre signalement. Le problème était lié à un paramétrage réseau incorrect. Nous avons redémarré le service concerné. Cela devrait être résolu maintenant.

Decoded Labels:
mmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmmm

In [None]:
# Start from the Trainer's default configuration, override the iteration budget and
# the batch size, then build the trainer for the adapted model and the train dataset.
# NOTE(review): the saved output of the training cell logs iterations up to 1900,
# beyond max_iters = 1000 — presumably it comes from an earlier run with a different
# setting; confirm by re-running.
train_config = Trainer.get_default_config()
train_config.max_iters = 1000
train_config.batch_size = 2
trainer = Trainer(train_config, model, train_dataset)

In [None]:
def batch_end_callback(trainer):
    """Print iteration time (in ms) and training loss once every 100 iterations.

    Reads `iter_num`, `iter_dt` and `loss` from `trainer`. On iterations that are
    not a multiple of 100 nothing is printed and `loss` is not touched.
    """
    if trainer.iter_num % 100 != 0:
        return
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
        
# Register the logging function on the trainer's 'on_batch_end' hook
trainer.set_callback('on_batch_end', batch_end_callback)


# Launch training
trainer.run()

iter_dt 0.00ms; iter 0: train loss 0.00327
iter_dt 26.96ms; iter 100: train loss 0.02028
iter_dt 25.73ms; iter 200: train loss 0.02236
iter_dt 33.71ms; iter 300: train loss 0.01246
iter_dt 27.47ms; iter 400: train loss 0.02870
iter_dt 28.32ms; iter 500: train loss 0.00278
iter_dt 45.71ms; iter 600: train loss 0.03206
iter_dt 26.38ms; iter 700: train loss 0.00522
iter_dt 28.91ms; iter 800: train loss 0.00695
iter_dt 27.86ms; iter 900: train loss 0.00521
iter_dt 27.65ms; iter 1000: train loss 0.00120
iter_dt 40.41ms; iter 1100: train loss 0.01663
iter_dt 27.01ms; iter 1200: train loss 0.01429
iter_dt 28.49ms; iter 1300: train loss 0.00135
iter_dt 29.14ms; iter 1400: train loss 0.01628
iter_dt 26.49ms; iter 1500: train loss 0.01357
iter_dt 26.22ms; iter 1600: train loss 0.00091
iter_dt 27.38ms; iter 1700: train loss 0.03434
iter_dt 26.37ms; iter 1800: train loss 0.00266
iter_dt 27.40ms; iter 1900: train loss 0.00617


In [None]:
def show_prediction_for_row(i, df, model, device, tokenizer, train_dataset=None,
                            max_new_tokens=200, do_sample=False, temperature=1.0, top_k=None):
    """
    Print the ticket description, the reference resolution and the model's
    generated resolution for the row labelled `i` of `df`.

    The prompt is `<description prefix><ticket description><resolution prefix>`.
    Both prefixes come from `train_dataset` when it is given and exposes
    `prompt_description_addition` and `prompt_resolution_addition`; otherwise both
    hard-coded fallbacks are used (never a mix of the two sources).

    Args:
        i: row label, looked up with `df.loc`.
        df: DataFrame with 'ticket_description' and 'ticket_resolution' columns.
        model: object providing `eval()` and `generate_from_prompt(...)`.
        device: forwarded unchanged to `generate_from_prompt`.
        tokenizer: not used by this function; kept in the signature for callers.
        train_dataset: optional source of the two prompt prefixes.
        max_new_tokens, do_sample, temperature, top_k: forwarded unchanged to
            `generate_from_prompt`.

    Returns:
        None. The model is left in eval mode.
    """
    ticket_text = str(df.loc[i, 'ticket_description'])
    reference_answer = str(df.loc[i, 'ticket_resolution'])

    # Take the prompt pieces from the dataset only when it offers both of them.
    dataset_has_prompt_parts = (
        train_dataset is not None
        and hasattr(train_dataset, "prompt_description_addition")
        and hasattr(train_dataset, "prompt_resolution_addition")
    )
    if dataset_has_prompt_parts:
        lead_in = train_dataset.prompt_description_addition
        answer_cue = train_dataset.prompt_resolution_addition
    else:
        # Fallbacks for when no suitable dataset is passed.
        # NOTE(review): the saved dataset example in this notebook shows the resolution
        # cue starting on a new line, while this fallback starts with a space — confirm
        # the fallback still matches the format used for training.
        lead_in = "description du ticket itsm: "
        answer_cue = " Réponse de l'équipe IT pour la résolution du ticket: "

    generation_kwargs = dict(
        prompt=f"{lead_in}{ticket_text}{answer_cue}",
        device=device,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        return_new_text_only=True,      # ask for the continuation (answer) only
        skip_special_tokens=True,
    )

    model.eval()
    with torch.no_grad():
        model_answer = model.generate_from_prompt(**generation_kwargs)

    separator = "-" * 80
    report = (
        f"Row: {i}",
        separator,
        "QUESTION:",
        ticket_text,
        "\nGROUND TRUTH ANSWER:",
        reference_answer,
        "\nMODEL GENERATION:",
        model_answer,
        separator,
    )
    for entry in report:
        print(entry)

# Example usage: show the prediction for a single row (adjust i as you like).
# `device` is recomputed here with the same expression as in the setup cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
i = 0
show_prediction_for_row(i, df, model, device, tokenizer, train_dataset=train_dataset,
                        max_new_tokens=300, do_sample=False)

#Or evaluate a few random rows:
# import random
# for i in random.sample(range(len(df)), k=5):
#     show_prediction_for_row(i, df, model, device, tokenizer, train_dataset=train_dataset)

In [11]:
# Now let's perform some evaluation: switch the model to eval mode first
# (the trailing ';' suppresses the cell's output)
model.eval();

In [None]:
def eval_split(trainer, split, max_batches):
    """
    Exact-match evaluation of the notebook-level `model` on one dataset split.

    For each batch of up to 100 examples, the first `n` input tokens are used as the
    prompt (`n` is `train_dataset.length`, for both splits), `model.generate` is
    asked for `n` more tokens with `do_sample=False`, and the generated tail is
    compared with the last `n` label tokens. An example is correct only if all `n`
    tokens match. Up to 3 wrong examples are printed to give a sense of the errors.

    Relies on the globals `train_dataset`, `test_dataset` and `model`.

    NOTE(review): the mistake message talks about sorting and the function needs
    `train_dataset.length` plus a `generate(inp, n, do_sample=...)` method —
    presumably inherited from a sorting demo; confirm that the ticket dataset and the
    model adapter used in this notebook actually provide them.

    Args:
        trainer: object whose `device` attribute is where batches are moved.
        split: 'train' or 'test' (any other value raises KeyError).
        max_batches: stop after this many batches; None goes through the whole split.

    Returns:
        0-dim float tensor with the number of correct examples.
    """
    datasets_by_split = {'train': train_dataset, 'test': test_dataset}
    dataset = datasets_by_split[split]
    n = train_dataset.length  # direct attribute access on the train dataset
    outcomes = []
    shown_mistakes = 0
    batches = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for batches_done, (x, y) in enumerate(batches, start=1):
        x, y = x.to(trainer.device), y.to(trainer.device)
        # the input pattern alone, and the expected continuation
        prompt = x[:, :n]
        expected = y[:, -n:]
        # let the model fill in the rest of the sequence (no sampling)
        generated = model.generate(prompt, n, do_sample=False)
        predicted = generated[:, n:]
        # one boolean per example: did every generated token match?
        is_correct = (expected == predicted).all(1).cpu()
        for row in range(x.size(0)):
            outcomes.append(int(is_correct[row]))
            if is_correct[row] or shown_mistakes >= 3:
                continue
            shown_mistakes += 1
            print("GPT claims that %s sorted is %s but gt is %s" % (prompt[row].tolist(), predicted[row].tolist(), expected[row].tolist()))
        if max_batches is not None and batches_done >= max_batches:
            break
    rt = torch.tensor(outcomes, dtype=torch.float)
    n_correct = rt.sum()
    print("%s final score: %d/%d = %.2f%% correct" % (split, n_correct, len(outcomes), 100*rt.mean()))
    return n_correct

# Run a lot of examples from both train and test through the model and verify the
# output correctness. Gradients are disabled; each call covers at most 50 batches,
# i.e. up to 5000 examples per split given eval_split's batch size of 100.
with torch.no_grad():
    train_score = eval_split(trainer, 'train', max_batches=50)
    test_score  = eval_split(trainer, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


In [9]:
# Let's run one hand-written sequence through the model as well and compare the
# generated continuation with torch.sort of the input.
# NOTE(review): this cell feeds raw integer tokens and expects a sorted copy back,
# which does not match the ticket-resolution task set up earlier in this notebook —
# presumably left over from a sorting demo; confirm whether it still applies.
n = train_dataset.length # naughty direct access to the dataset's `length` attribute
inp = torch.tensor([[0, 0, 2, 1, 0, 1]], dtype=torch.long).to(trainer.device)
# the hand-written input must contain exactly n tokens
assert inp[0].nelement() == n
with torch.no_grad():
    cat = model.generate(inp, n, do_sample=False)
# ground truth: the input sorted in ascending order
sol = torch.sort(inp[0])[0]
# keep only what comes after the first n positions of the model output
sol_candidate = cat[:, n:]
print('input sequence  :', inp.tolist())
print('predicted sorted:', sol_candidate.tolist())
print('gt sort         :', sol.tolist())
print('matches         :', bool((sol == sol_candidate).all()))

input sequence  : [[0, 0, 2, 1, 0, 1]]
predicted sorted: [[0, 0, 0, 1, 1, 2]]
gt sort         : [0, 0, 0, 1, 1, 2]
matches         : True
