#**Step 1: installazione delle librerie**

In [None]:
!pip install -q accelerate
!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install trl
!pip install huggingface_hub
!pip install accelerate
#!pip install --upgrade torch
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

#**Step 2: caricamento delle librerie**

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

import pandas as pd
from datasets import Dataset
import re
import random
import gc
from huggingface_hub import login

In [None]:
import json

In [None]:
# Hugging Face authentication.
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook. When the variable is unset, token=None is passed
# and `login` asks for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

#**Step 3: impostazione delle variabili**

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# NOTE(review): the './drive/MyDrive/...' paths used by this notebook are relative,
# so they presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Paths for datasets, models and generations, keyed by language code.
# All three folders live under one project directory, defined once here.
# The paths are relative, so they resolve against the current working directory.
_PROJECT_DIR = './drive/MyDrive/Colab Notebooks/NoVAGraphS/Slot filling'

data_path_datasets = {
    'en': _PROJECT_DIR + '/dataset'
}
data_path_models = {
    'en': _PROJECT_DIR + '/models'
}
data_path_generations = {
    'en': _PROJECT_DIR + '/generations'
}

In [None]:
# Languages to process, and for each language the Hugging Face model ids to run.
langs = ['en']

models_name = {
    'en': [
        # https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407
        'mistralai/Mistral-Nemo-Instruct-2407',
    ],
}

In [None]:
device_map = {"": 0} # Place the entire model on GPU 0 (passed as device_map when the model is loaded)

## load model

In [None]:
def load_model(model_name, fine_tuned_model_path):
    """Load a causal language model in FP16 together with its tokenizer.

    Args:
        model_name: Hugging Face model id (or local path) of the base model.
        fine_tuned_model_path: Optional location of a LoRA adapter. When given,
            the adapter is loaded on top of the base model and merged into its
            weights. When None or empty, the plain base model is returned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Load the base model in FP16, placed according to the module-level device_map.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
    )

    # This argument used to be accepted but ignored; apply the LoRA weights only
    # when a path is actually supplied, so existing calls passing None are unchanged.
    if fine_tuned_model_path:
        model = PeftModel.from_pretrained(model, fine_tuned_model_path)
        model = model.merge_and_unload()

    # The tokenizer always comes from the base model; EOS doubles as the pad token.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

## evaluation

In [None]:
def clear_output(output):
    """Extract the text enclosed between ``[ANW]`` and ``[/ANW]``.

    Only the first tagged span is considered; it may extend over several lines.
    Returns the string "Output not found" when the text contains no such span.
    """
    found = re.search(r'\[ANW\](.*?)\[/ANW\]', output, re.DOTALL)
    if not found:
        return "Output not found"
    return found.group(1)



---



In [None]:
def get_dialogue_act(pipe, sentence):
  """Ask the chat pipeline for the dialogue act of ``sentence``.

  A few-shot system prompt listing the allowed dialogue-act values is sent
  together with ``sentence`` as the user message.

  Args:
      pipe: Callable invoked as ``pipe(messages, max_new_tokens=...)``.
      sentence: The user utterance to classify.

  Returns:
      The ``'generated_text'`` entry of the first pipeline result, or the
      string "None" when the pipeline yields no usable result.
  """
  system_prompt = '''
    Given the label "dialogue act" and the following possible values:
      - AutoF:autoNegative, DS:opening, SOM:initGreeting, DS:suggest, OCM:selfCorrection, SOM:initGoodbye, SOM:returnGreeting, SOM:thanking, Ta:answer, Ta:checkQuestion, Ta:propositionalQuestion, Ta:request, Ta:setQuestion, TuM:turnAccept

  You must extract the dialogue act from the user input.

  Examples:

  - Example 1:
      User input: "How many transitions are there in the automaton?"
      System response: "Ta:setQuestion"

  - Example 2:
      User input: "There are a total of 3 states: q0, q1, and q2. q0 is both initial and final state."
      System response: "Ta:answer"

  - Example 3:
      User input: "Is q4 the final state?"
      System response: "Ta:propositionalQuestion"

  Respond **only** with a the value, without any additional text.
  '''

  # Generation budget: 1.2 times the character length of the longest label.
  token_budget = int(len('Ta:propositionalQuestion') * 1.2)

  outputs = pipe(
      [
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": sentence},
      ],
      max_new_tokens=token_budget,
  )

  # Guard clause: missing, empty or malformed results map to the "None" marker.
  if not outputs or 'generated_text' not in outputs[0]:
      return "None"
  return outputs[0]['generated_text']

In [None]:
def get_argument(pipe, sentences):
  """Ask the chat pipeline for the "argument" label of the user input.

  A few-shot system prompt listing the allowed argument values is sent
  together with ``sentences`` as the user message.

  Args:
      pipe: Callable invoked as ``pipe(messages, max_new_tokens=...)``.
      sentences: The user utterance to classify.

  Returns:
      The ``'generated_text'`` entry of the first pipeline result, or the
      string "None" when the pipeline yields no usable result.
  """
  system_prompt = '''
    Given the label "argument" and the following possible values:
      - Automata, Language, Pattern, State, Transition, Null

    You must extract the argument from the user input.

    Examples:

    - Example 1:
        User input: "How many transitions are there in the automaton?"
        System response: "Transition"

    - Example 2:
        User input: "There are a total of 3 states: q0, q1, and q2. q0 is both initial and final state."
        System response: "State"

    - Example 3:
        User input: "What is the language of an automaton?"
        System response: "Language"

    Respond **only** with a the value, without any additional text.
  '''

  # Generation budget: 1.2 times the character length of the label 'Transition'.
  token_budget = int(len('Transition') * 1.2)

  outputs = pipe(
      [
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": sentences},
      ],
      max_new_tokens=token_budget,
  )

  # Guard clause: missing, empty or malformed results map to the "None" marker.
  if not outputs or 'generated_text' not in outputs[0]:
      return "None"
  return outputs[0]['generated_text']

In [None]:
def get_intent(pipe, sentences):
  """Ask the chat pipeline for the "intent" label of the user input.

  A few-shot system prompt listing the allowed intent values is sent together
  with ``sentences`` as the user message.

  Args:
      pipe: Callable invoked as ``pipe(messages, max_new_tokens=...)``.
      sentences: The user utterance to classify.

  Returns:
      The ``'generated_text'`` entry of the first pipeline result, or the
      string "None" when the pipeline yields no usable result.
  """
  # Prompt fixes relative to the previous version: the instruction now names the
  # "intent" label (it said "argument", copied from get_argument), and the examples
  # no longer carry trailing commas or a stray triple-quote that the model could imitate.
  system_prompt = '''
    Given the label "intent" and the following possible values:
      - fsa-theoretical, fsa-practical, Null

    You must extract the intent from the user input.

    Examples:

    - Example 1:
        User input: "How many transitions are there in the automaton?"
        System response: "fsa-practical"

    - Example 2:
        User input: "There are a total of 3 states: q0, q1, and q2. q0 is both initial and final state."
        System response: "fsa-practical"

    - Example 3:
        User input: "What is the final state of an automaton?"
        System response: "fsa-theoretical"

    Respond **only** with a the value, without any additional text.
  '''

  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": sentences},
  ]

  # Generation budget: 1.2 times the character length of the longest label.
  max_new_tokens = int(len('fsa-theoretical') * 1.2)

  result = pipe(
      messages,
      max_new_tokens=max_new_tokens,
  )

  # Missing, empty or malformed results map to the "None" marker string.
  if result and 'generated_text' in result[0]:
      return result[0]['generated_text']
  return "None"

In [None]:
def get_frame(pipe, sentences):
  """Ask the chat pipeline for the slot names and values found in the user input.

  The system prompt describes every supported slot and shows three examples of
  the expected JSON answer (keys "slot names" and "slot values").

  Args:
      pipe: Callable invoked as ``pipe(messages, max_new_tokens=...)``.
      sentences: The user utterance to analyse.

  Returns:
      The ``'generated_text'`` entry of the first pipeline result, or the
      string "None" when the pipeline yields no usable result.
  """
  # Every slot example uses straight double quotes. Seven of them previously used
  # LaTeX-style quoting, which is not valid JSON and contradicted the other examples.
  system_prompt = '''
    Given the following slot names:
          alphabet: The system or user asks or provides information about the alphabet of the automaton (e.g. "alphabet":["1","0"] or "alphabet":"?")
          automataType: The system or user asks or provides information about the typology of the automaton (e.g. "automataType": "deterministic" or "automataType":"?")
          finalStates: The system or user asks or provides information about the final states of the automaton (e.g. "finalStates": ["Q1", "Q2"] or "finalStates":"?")
          graphicRepresentation: The system or user asks or provides information about the graphical representation of the automaton (e.g. "graphicRepresentation": "pentagon" or "graphicRepresentation":"?")
          initialState: The system or user asks or provides information about the initial state of the automaton (e.g. "initialState": "Q0" or "initialState":"?")
          input: The system or user asks or provides information about the input of the automaton (e.g. "input": ["11000","1100011000"] or "input":"?")
          languageType: The system or user asks or provides information about the language type of the automaton (e.g. "languageType": "regular" or "languageType":"?")
          numberOfFinalStates: The system or user asks or provides information about the number of final states of the automaton (e.g. "numberOfFinalStates": "2" or "numberOfFinalStates":"?")
          numberOfStates: The system or user asks or provides information about the number of states of the automaton (e.g. "numberOfStates": "5" or "numberOfStates":"?")
          numberOfTransitions: The system or user asks or provides information about the number of transitions of the automaton (e.g. "numberOfTransitions": "7" or "numberOfTransitions":"?")
          optimalSpatialRepresentation: The system or user asks or provides information about the optimal spatial representation of the automaton (e.g. "optimalSpatialRepresentation":"?")
          output: The system or user asks or provides information about the output of the automaton (e.g. "output": "accepted/denied" or "output":"?")
          patternType: The system or user asks or provides information about the pattern type of the automaton (e.g. "patternType": "clockwise" or "patternType":"?")
          stateFrom: The system or user requests or provides information about a specific starting state (e.g. "stateFrom": "q3" or "stateFrom":"?")
          stateTo: The system or user requests or provides information about a specific ending state (e.g. "stateTo": "q3" or "stateTo":"?")
          stateWithMostTransitions: The system or user asks or provides information about the state with most transitions (e.g. "stateWithMostTransitions": "q3" or "stateWithMostTransitions":"?")
          stateWithoutTransitions: The system or user asks or provides information about the states without transitions (e.g. "stateWithoutTransitions": "q3" or "stateWithoutTransitions":"?")
          states: The system or user asks or provides information about the states of the automaton (e.g. "states": ["Q1","Q2"] or "states":"?")
          transitions: The system or user asks or provides information about the transitions of the automaton (e.g. "transitions": [["Q0","Q1","1"],["Q1","Q2","0"]] or "transitions":"?")
          

  You must respond with a JSON containing information extracted from the user input.

  Examples:

  - Example 1:
      User input: "How many transitions are there in the automaton?"
      System response:
      """
      {
          "slot names": ["numberOfTransitions"],
          "slot values": ["?"]
      }
      """

  - Example 2:
      User input: "There are a total of 3 states: q0, q1, and q2. q0 is both initial and final state."
      System response:
      """
      {
          "slot names": ["numberOfStates", "states", "initialState", "finalStates"],
          "slot values": ["3", ["q0", "q1", "q2"], "q0", ["q0"]]
      }
      """

  - Example 3:
      User input: "What is the final state of an automaton?"
      System response:
      """
      {
          "slot names": ["finalStates"],
          "slot values": ["?"]
      }
      """

  Respond **only** with a valid JSON, without any additional text.
  '''

  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": sentences},
  ]

  # Fixed generation budget for the JSON answer.
  max_new_tokens = 200

  result = pipe(
      messages,
      max_new_tokens=max_new_tokens,
  )

  # Missing, empty or malformed results map to the "None" marker string.
  if result and 'generated_text' in result[0]:
      return result[0]['generated_text']
  return "None"

In [None]:
import ast

def _assistant_reply(prediction):
    """Return the assistant message text from a ``get_*`` helper result.

    The helpers normally return a message list whose third entry is the generated
    reply, but they return the plain string "None" when generation fails. Anything
    that is not such a list maps to "None" here instead of raising.
    """
    if isinstance(prediction, list) and len(prediction) > 2 and isinstance(prediction[2], dict):
        return prediction[2].get('content', "None")
    return "None"


def text_to_data(model, tokenizer, testset, file_name):
    """Run the four extraction prompts on every test row and save the results as CSV.

    For each row, the dialogue act, argument, intent and frame are predicted with
    a text-generation pipeline built from ``model`` and ``tokenizer``. Both the raw
    pipeline output and the extracted assistant reply are stored.

    Args:
        model: Loaded language model handed to ``pipeline``.
        tokenizer: Tokenizer handed to ``pipeline``.
        testset: Table with the columns 'id', 'input' and 'output', indexed 0..n-1.
        file_name: Base name (without extension) of the CSV written into
            ``data_path_generations['en']``.

    The CSV is rewritten after every row so that partial results survive an
    interruption, and once more at the end.
    """
    rows = []
    output_path = f"{data_path_generations['en']}/{file_name}.csv"

    # NOTE(review): torch_dtype/device_map are passed although `model` is already
    # instantiated, and temperature is set without enabling sampling explicitly —
    # confirm both have the intended effect.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        temperature=0.1,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    for i in range(len(testset)):
        # Look the row up once instead of re-indexing the columns for every use.
        sentence = testset['input'][i]
        example_id = testset['id'][i]

        dialogue_act_prediction = get_dialogue_act(pipe, sentence)
        cleared_dialogue_act_prediction = _assistant_reply(dialogue_act_prediction)

        argument_prediction = get_argument(pipe, sentence)
        cleared_argument_prediction = _assistant_reply(argument_prediction)

        intent_prediction = get_intent(pipe, sentence)
        cleared_intent_prediction = _assistant_reply(intent_prediction)

        frame_prediction = get_frame(pipe, sentence)
        cleared_frame_prediction = _assistant_reply(frame_prediction)

        print(f"===== STEP {i}")
        print(f"eid: {example_id}")
        print(f"input: {sentence}")
        print(f"cleared_dialogue_act_prediction: {cleared_dialogue_act_prediction}")
        print(f"cleared_argument_prediction: {cleared_argument_prediction}")
        print(f"cleared_intent_prediction: {cleared_intent_prediction}")
        print(f"cleared_frame_prediction: {cleared_frame_prediction}")
        print('\n')

        # Key order defines the CSV column order.
        rows.append({
            'id' : example_id,
            'input' : sentence,
            'cleared_dialogue_act_prediction' : cleared_dialogue_act_prediction,
            'cleared_argument_prediction' : cleared_argument_prediction,
            'cleared_intent_prediction' : cleared_intent_prediction,
            'cleared_frame_prediction' : cleared_frame_prediction,
            'actual' : testset['output'][i],
            'dialogue_act_prediction' : dialogue_act_prediction,
            'argument_prediction' : argument_prediction,
            'intent_prediction' : intent_prediction,
            'frame_prediction' : frame_prediction,
        })

        # Partial save
        pd.DataFrame(rows).to_csv(output_path, index=False)

    # Final save (also produces the file when the test set is empty)
    pd.DataFrame(rows).to_csv(output_path, index=False)


#**Step 5: main**


In [None]:
# Run Python garbage collection, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Second cleanup pass with the same two calls as the previous cell:
# Python garbage collection followed by emptying PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# main
numGenPerModel = 1  # number of generation runs per model

for lang in langs:
  testset = pd.read_csv(f"{data_path_datasets[lang]}/test.csv")

  for model_name in models_name[lang]:
    print(f"Model: {model_name}")
    # Last component of the model id; also works for ids without an "org/" prefix.
    clean_model_name = model_name.split('/')[-1]

    # Load base model
    model, tokenizer = load_model(model_name, None)

    # Text-to-Data
    for i in range(numGenPerModel):
      print(f"Model: {model_name}, Generation {i}")

      # Run the extraction prompts over the test set and save the CSV for this run
      text_to_data(model, tokenizer, testset, f"{clean_model_name}_decoding_few_shot_divided_gen{i}")

    # Free the model only after all generations for it are done. Deleting it inside
    # the generation loop raised NameError as soon as numGenPerModel exceeded 1.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()