<a href="https://colab.research.google.com/github/Ilvecho/Happy-Customers/blob/main/Generation_with_Tuned_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook lets you run the LoRA fine-tuned model by Syllog directly on Google Colab.

The model was tuned on topics relevant to HR professionals.

Please use the **T4 GPU** runtime accelerator.

In [None]:
# @title Import dependencies and get GPU
%%capture

!pip install trl transformers datasets torch peft
!pip install -qU accelerate
!pip install -qU bitsandbytes
!pip install thefuzz

import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import AutoPeftModelForCausalLM, PeftConfig, PeftModel
from thefuzz import fuzz

# Google Drive access is a temporary measure and might be removed in the future:
# the tuned model is currently read from a Drive folder.
from google.colab import drive, files
drive.mount('/content/gdrive')

# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the configuration

We are going to load:
- The Bits and Bytes configuration for the quantization (needed because of resource availability)
- The tuned model
- The associated tokenizer
- The pipeline used for the output generation

In [None]:
#@title Load the configuration

# Quantization settings: 4-bit NF4 weights, bfloat16 compute, no double quantization.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
    "bnb_4bit_use_double_quant": False,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# For the time being the tuned model is loaded from Drive.
# Once a Syllog HuggingFace account exists, it will be loaded from there instead.
PEFT_MODEL = '/content/gdrive/MyDrive/Syllog/full_results/tuned_model'

# PEFT configuration: it records which base model the adapter belongs to
config = PeftConfig.from_pretrained(PEFT_MODEL)
base_model_id = config.base_model_name_or_path

# Tokenizer of the base model; the EOS token is reused as padding token
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

# Quantized base model
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run;
# confirm that the base model comes from a trusted source.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Wrap the base model with the LoRA weights stored in PEFT_MODEL
model = PeftModel.from_pretrained(base_model, PEFT_MODEL)

# Text-generation pipeline used by the generation cell
pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
pipe = pipeline("text-generation", **pipeline_options)

Now that everything is loaded, you only need to enter your prompt and wait for the model to answer!

In [None]:
#@title Input the prompt
# Read the question from the user; the generation cell inserts it into the prompt template.
user_prompt = input("Ask me anything related to HR: ")

In [None]:
#@title Here is where the magic happens
%%capture
##############################################
#############     GENERATION     #############
##############################################

# System instruction (in Italian): be a helpful, concise assistant and answer in at
# most five sentences.
system_message = "Sei un assistente AI utile e conciso. Rispondi in massimo cinque frasi, va bene anche usarne meno."

# Three turns delimited by <|im_start|> / <|im_end|> tags: system, user, and the
# opening of the assistant turn that the model has to complete.
prompt_template = f"""<|im_start|>Sistema: {system_message}<|im_end|>
<|im_start|>Utente: {user_prompt}<|im_end|>
<|im_start|>Assistente: """

# Arguments forwarded by the pipeline to the model's generation step.
generation_kwargs = dict(
    max_new_tokens=200,                    # upper bound on the answer length
    do_sample=False,                       # sampling disabled
    return_full_text=False,                # the prompt is not echoed back in the result
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,   # EOS doubles as padding token
    # NOTE(review): decoder_start_token_id is usually an encoder-decoder setting;
    # confirm it is needed for this model.
    decoder_start_token_id=0,
)

sequences = pipe(prompt_template, **generation_kwargs)

# A single sequence was requested, so the answer is the text of the first entry.
answer = sequences[0]['generated_text']

##############################################
#############     PROCESSING     #############
##############################################

# Tag after which everything the model produced is discarded.
END_TAG = '<|im_end|>'

# Two sentences whose fuzzy score (0-100) exceeds this value count as repetitions.
SIMILARITY_THRESHOLD = 80


def _clean_answer(raw_text):
  """Cut the text at the end tag and turn numbered-list markers into dashes.

  A list marker is a 1-2 digit number followed by a dot that sits at the start of
  a line or right after sentence punctuation, and that is followed by more text.
  This heuristic leaves decimals ("3.5") and numbers that merely close a sentence
  ("nel 2023.") untouched, so their terminator survives for the sentence split.
  """
  text = raw_text.split(END_TAG)[0]
  text = re.sub(
      r'(^\s*|[.?!:;]\s+)\d{1,2}\.(?!\d)\s*(?=\S)',
      r'\1- ',
      text,
      flags=re.MULTILINE,
  )
  return text.strip()


def _split_sentences(text):
  """Return (sentence, terminator) pairs for every terminated sentence in text.

  A sentence ends at one of . ? ! : ; followed by whitespace or by the end of the
  text, so "3.5" or "9:30" are not broken apart. Whatever follows the last
  terminator is dropped: the working assumption is that the model always closes
  a sentence, hence an unterminated tail means the generation hit the
  max-token limit.
  """
  pieces = re.split(r'([.?!:;])(?=\s|$)', text)
  # pieces alternates sentence, terminator, ..., trailing fragment; zip() pairs
  # each sentence with its own terminator and leaves the trailing fragment out.
  pairs = zip(pieces[0::2], pieces[1::2])
  # Empty fragments (e.g. text starting with punctuation) carry no content.
  return [(body.strip(), mark) for body, mark in pairs if body.strip()]


def _drop_repetitions(pairs, threshold=SIMILARITY_THRESHOLD):
  """Remove near-duplicate sentences, keeping the remaining ones in order.

  While the highest pairwise fuzzy score is above threshold, the sentence of
  that pair which is on average more similar to all the others is removed.
  """
  kept = list(pairs)
  count = len(kept)

  # Fuzzy matching matrix; the diagonal stays at zero.
  scores = np.zeros((count, count))
  for i, (left, _) in enumerate(kept):
    for j, (right, _) in enumerate(kept):
      # Compare positions, not object identity: equal strings may share an object.
      if i != j:
        scores[i, j] = fuzz.token_set_ratio(left, right)

  while scores.size and scores.max() > threshold:
    # Row/column of the most similar pair
    i, j = np.unravel_index(np.argmax(scores), scores.shape)

    # Out of the two, drop the one with the highest average score
    drop = int(j if scores[i].mean() < scores[j].mean() else i)

    scores = np.delete(np.delete(scores, drop, axis=0), drop, axis=1)
    kept.pop(drop)

  return kept


def postprocess_answer(raw_text):
  """Turn the raw generated text into the answer shown to the user.

  Returns one sentence per line, each followed by its own terminator, after
  removing repetitions. If the text contains no terminated sentence at all,
  the cleaned text is returned unchanged.
  """
  text = _clean_answer(raw_text)
  pairs = _split_sentences(text)

  if not pairs:
    return text

  return ''.join(f'{body}{mark}\n' for body, mark in _drop_repetitions(pairs))


output = postprocess_answer(answer)

In [None]:
# Show the post-processed answer here: the generation cell runs under %%capture,
# so nothing it prints is displayed.
print(output)