<a href="https://colab.research.google.com/github/Gjeffroy/LLM_finetuning/blob/main/Eval_untrained_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# The cell magic above captures this cell's output, so the pip logs are not displayed.
# Step 1: install Unsloth together with a pinned xformers build.
!pip install unsloth "xformers==0.0.28.post2"
# Step 2: remove that Unsloth install and reinstall it from the GitHub repository
# to get the latest nightly code (--no-cache-dir skips pip's local cache).
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! Unsloth's template notes that RoPE scaling is supported internally.
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the "unsloth/Phi-3.5-mini-instruct" checkpoint; the call returns both the
# model and its tokenizer, which the rest of the notebook uses as globals.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Inference

In [None]:
import pandas as pd
# Read the semicolon-delimited CSV into the working DataFrame `df`.
df = pd.read_csv("df_reponse_orientees_mistral.csv", sep=";")

In [None]:
from unsloth.chat_templates import get_chat_template

# Look up the integer id of the "<|end|>" token; generation later uses it as
# the end-of-sequence id so decoding stops when the model closes its turn.
eos_token_id = tokenizer.convert_tokens_to_ids("<|end|>")

# Rebind `tokenizer` to a version carrying the "phi-3" chat template. The mapping
# lets conversations be written with ShareGPT-style keys ("from"/"value") and
# role names ("human"/"gpt") instead of "role"/"content" and "user"/"assistant".
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

# Put the model into Unsloth's inference mode before any generate() call.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
def extract_assistant_response(generated_text):
    """Extract the assistant's reply from a decoded chat transcript.

    The reply is the text that follows the first ``<|assistant|>`` marker, up to
    the next ``<|end|>`` marker, with surrounding whitespace stripped.

    Args:
        generated_text: Decoded model output, either a single string or a
            list/tuple of strings (a batch). For a batch only the first
            element is used.

    Returns:
        str: The assistant reply. If the closing ``<|end|>`` marker is missing
        (for example because generation stopped at the token limit), everything
        after ``<|assistant|>`` is returned instead of discarding the reply.
        An empty string is returned when the batch is empty or when no
        ``<|assistant|>`` marker is present.
    """
    start_token = "<|assistant|>"
    end_token = "<|end|>"

    # Accept a batch as produced by tokenizer.batch_decode; an empty batch has
    # nothing to extract, so return "" rather than raising IndexError.
    if isinstance(generated_text, (list, tuple)):
        if not generated_text:
            return ""
        generated_text = generated_text[0]

    start_idx = generated_text.find(start_token)
    if start_idx == -1:
        return ""  # No assistant turn in the text at all.

    reply_start = start_idx + len(start_token)
    end_idx = generated_text.find(end_token, reply_start)
    if end_idx == -1:
        # Truncated generation: keep the partial reply instead of losing it.
        return generated_text[reply_start:].strip()

    return generated_text[reply_start:end_idx].strip()

In [None]:
def infer(question, max_new_tokens = 2000):
  """Generate the model's answer to a single question.

  Builds a two-message conversation (a fixed French system prompt plus the
  user's question), renders it with the tokenizer's chat template, runs
  ``model.generate`` and returns the assistant part of the decoded output.

  Relies on the notebook globals ``model``, ``tokenizer``, ``eos_token_id``
  and on ``extract_assistant_response``.

  Args:
    question: The user question, inserted as the "human" message.
    max_new_tokens: Upper bound on the number of generated tokens. Defaults
      to 2000, the value that was previously hard-coded.

  Returns:
    The assistant reply as returned by ``extract_assistant_response``.
  """

  # System prompt (French): answer questions about the urban-planning code and
  # state that other topics are outside the assistant's domain.
  # NOTE(review): the text is kept verbatim, spelling included, because any
  # edit to it changes what the model is asked and therefore the evaluation.
  system_message = """
    Tu es un assistant qui fournit des réponses en rapport avec le code d'urbanisme.
    Si la question n'est pas orienté code de l'ubrannisme, repond que ce n'est pas ton domaine de connaissance.
    Renvoi une reponse si la question concernent le code de l'urbanisme.

  """

  # ShareGPT-style keys ("from"/"value"), matching the template mapping.
  messages = [
      {"from": "system", "value": system_message },
      {"from": "human", "value": question}
  ]
  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True, # Must add for generation
      return_tensors = "pt",
  ).to(model.device) # Follow the model's device instead of hard-coding "cuda"

  outputs = model.generate(
      input_ids = inputs,
      max_new_tokens = max_new_tokens,
      use_cache = True,
      eos_token_id = eos_token_id,
  )
  # The decoded text contains the prompt too; keep only the assistant's turn.
  return extract_assistant_response(tokenizer.batch_decode(outputs))

In [None]:
from datetime import datetime

from tqdm.notebook import tqdm

# Create the output column only once so that re-running this cell does not
# wipe responses that were already generated.
if 'response_untrained' not in df.columns:
  df['response_untrained'] = pd.NA

# Number of leading rows to run through the model.
N_EVAL_ROWS = 5
sub_df = df.iloc[:N_EVAL_ROWS]

tic = datetime.now()
# Iterate over the slice's own index labels: `.loc` is label-based, so a
# positional counter would only be correct for a default 0..n-1 index.
for i in tqdm(sub_df.index):
  df.loc[i, 'response_untrained'] = infer(df.loc[i, 'question_orientée'])
tac = datetime.now()
print(tac - tic)
# Skip the average when nothing was processed to avoid dividing by zero.
if len(sub_df) > 0:
  print('average time: ' + str((tac - tic) / len(sub_df)))