In [None]:
#### The code is from Irene Yi-Ju Su ####
# Install the Hugging Face `datasets` library and Unsloth into the notebook runtime.
# NOTE(review): neither package is version-pinned, so a fresh runtime may resolve
# to different releases than the ones this notebook was written against.
!pip3 install datasets
!pip3 install unsloth

In [None]:
from datasets import load_dataset
# Load the "train" split of the "sentence-transformers/eli5" dataset.
# Later cells read its 'question' and 'answer' columns.
dataset = load_dataset("sentence-transformers/eli5", split = "train")

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from unsloth import FastLanguageModel
import numpy as np
import pandas as pd
import torch

In [None]:
# Mount Google Drive and work from a project folder stored on it.
from google.colab import drive
drive.mount('/content/drive')
import os
# Relative paths used afterwards (model directory, CSV files) resolve against this folder.
os.chdir('/content/drive/MyDrive/TEST')
# Set the Weights & Biases mode environment variable to 'disabled'.
os.environ['WANDB_MODE'] = 'disabled'
# Model location, relative to the working directory set above.
# NOTE(review): presumably a local copy of Llama 3.2 1B — confirm the folder contents.
model_name = "./MODEL/llama3.2-1b"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Do not truncate cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Settings forwarded to FastLanguageModel.from_pretrained when the model is loaded.
max_seq_length, dtype, load_in_4bit = (
    512,   # max_seq_length passed to the loader
    None,  # dtype left unset
    True,  # request 4-bit loading
)

In [None]:
# Reference list of Unsloth model identifiers.
# NOTE(review): no visible cell reads this list — the model actually loaded is
# whatever `model_name` points to.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

In [None]:
# Load the model and its tokenizer through Unsloth using the loader settings
# (max_seq_length, dtype, load_in_4bit) defined earlier in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name, # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Switch the model to inference mode in place. The result is deliberately NOT
# assigned back to `model`: the bare call is the documented usage, and a release
# whose for_inference returns None would otherwise replace `model` with None.
# The trailing semicolon keeps the notebook from echoing any returned object.
FastLanguageModel.for_inference(model);

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [None]:
from unsloth.chat_templates import get_chat_template
# Rebind `tokenizer` to the object returned by get_chat_template for the
# "llama-3.1" template; later apply_chat_template calls use this tokenizer.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs as chat-formatted strings.

    Args:
        examples: Batched mapping with parallel 'question' and 'answer' sequences.

    Returns:
        Dict with a single key, "text", holding one rendered conversation per pair.
    """
    rendered = []
    for user_msg, assistant_msg in zip(examples['question'], examples['answer']):
        conversation = [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg},
        ]
        # tokenize=False yields the templated string; no generation prompt is
        # requested because the assistant turn is already part of the conversation.
        rendered.append(
            tokenizer.apply_chat_template(conversation, tokenize = False, add_generation_prompt = False)
        )
    return {"text": rendered}

In [None]:
from unsloth.chat_templates import standardize_sharegpt
# Add a chat-formatted "text" column to every example (processed in batches).
dataset = dataset.map(formatting_prompts_func, batched = True,)
# Hold out 20% as the test split. The seed is fixed so the split — and any
# seeded sample later drawn from dataset['test'] — is the same on every run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/325475 [00:00<?, ? examples/s]

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
from transformers import TextStreamer
# Streamer built on the tokenizer with skip_prompt=True.
# NOTE(review): it is not passed to any generate() call in the visible cells.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)

In [None]:
def stream_answer(question, model=model, tokenizer=tokenizer, max_new_tokens=512):
    """Generate the model's reply to a single user question.

    Despite the name, nothing is streamed: the complete reply is returned as
    one string.

    Args:
        question: Text of the user turn.
        model: Model used for generation. Defaults to the notebook's `model`
            as bound when this function is defined.
        tokenizer: Tokenizer that applies the chat template and decodes the
            output. Same binding rule as `model`.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 512, the previously hard-coded value).

    Returns:
        The decoded continuation, without the prompt tokens and without
        special tokens.
    """
    messages = [
        {"role": "user", "content": question},
    ]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        # Required for generation: ends the prompt with the assistant header so
        # the model writes an answer instead of continuing after the user turn.
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")

    # One un-padded sequence, so every position is attended to. ones_like
    # already allocates on the same device as `inputs`.
    attention_mask = torch.ones_like(inputs)
    input_length = inputs.shape[1]
    generated_ids = model.generate(
        input_ids = inputs,
        attention_mask = attention_mask,
        max_new_tokens = max_new_tokens,
        use_cache = True,
    )
    # generate() returns prompt + continuation; keep only the continuation.
    answer = tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0]

    return answer

In [None]:
import random

# Number of test questions to evaluate.
N_SAMPLES = 1200
# Generation over all samples is very slow; flip to True to regenerate answers.
RUN_GENERATION = False

# Seeded draw of N_SAMPLES distinct row indices from the test split.
random.seed(42)
selected_indices = random.sample(range(len(dataset['test'])), N_SAMPLES)
selected_dataset = dataset['test'].select(selected_indices)

# Read the question column once; indexing selected_dataset['question'][i]
# inside a loop would rebuild the whole column on every iteration.
questions = list(selected_dataset['question'])
answers = []
if RUN_GENERATION:
  for i, question in enumerate(questions):
    print (i)  # progress counter, only useful while generation is running
    answers.append(stream_answer(question))

In [None]:
#df_QA = pd.DataFrame({'questions': questions,'answers': answers})
#df_QA.to_csv("qa_pairs2.csv", index = False)
# Load the cached generations. Questions are taken from the same rows as the
# answers: the file is the only record of which question produced which answer,
# whereas the freshly sampled `questions` list depends on how the dataset was
# split in this session.
qa_cached = pd.read_csv("qa_pairs2.csv")
questions = qa_cached["questions"].to_list()
# An empty answer is written as an empty CSV field and read back as NaN;
# restore it to "" so the later regex clean-up only ever receives strings.
answers = qa_cached["answers"].fillna("").to_list()

In [None]:
# Install the rouge-score package, then build the scorer used by find_max_similarity.
!pip3 install rouge-score
from rouge_score import rouge_scorer
# Only ROUGE-L is computed, with the stemmer option enabled.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=1690409a85251935bcd48656036e4e6068012c5c4c55a6e1014ddad7e0ac978e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import re
def find_max_similarity(answer):
  """Find the two segments of ``answer`` that resemble each other most.

  The text is cut at ',', '.', '!' and '?'. Pieces with fewer than two
  whitespace-separated words are discarded. Every unordered pair of the
  remaining pieces is scored with the ROUGE-L F-measure of the module-level
  ``scorer``.

  Returns:
    A tuple (best_score, first_segment, second_segment). If no pair scores
    above zero (including when fewer than two segments remain) the result
    is (0, "", "").
  """
  pieces = []
  for chunk in re.split('[,.!?]', answer):
    chunk = chunk.strip()
    if len(chunk.split()) >= 2:
      pieces.append(chunk)

  best = (0, "", "")
  for pos, left in enumerate(pieces):
    for right in pieces[pos + 1:]:
      f_score = scorer.score(left, right)['rougeL'].fmeasure
      # Strict comparison: on ties the earliest pair is kept.
      if f_score > best[0]:
        best = (f_score, left, right)
  return best

In [None]:
# ROUGE-L F-measure above which two segments of one answer are treated as a
# repetition (label 0); answers at or below it get label 1.
REPETITION_THRESHOLD = 0.651

# Pairing is positional, so a length mismatch would silently mislabel rows.
if len(questions) != len(answers):
  raise ValueError(
      f"questions ({len(questions)}) and answers ({len(answers)}) are not aligned"
  )

data = []
# Iterate over the actual pairs rather than a hard-coded range(1200).
for question, temp in zip(questions, answers):
  # An empty answer read back from CSV arrives as NaN (a float); treat it as "".
  if not isinstance(temp, str): temp = ""
  # Remove "(_URL_<n>_)" placeholders from the text.
  answer = re.sub(r'\(_URL_\d+_\)', '', temp)
  # Keep the text up to and including the last '.', dropping whatever follows it.
  last_period = answer.rfind('.')
  if last_period != -1: answer = answer[:last_period + 1].strip()
  else: answer = answer.strip()
  max_score, s1, s2 = find_max_similarity(answer)
  label = 0 if max_score > REPETITION_THRESHOLD else 1
  data.append({
      "questions": question,
      "answers": answer,
      "labels": label,
      "max_rouge_l": round(max_score, 3),
      "s1": s1,
      "s2": s2,
  })
df_QA = pd.DataFrame(data)

In [None]:
# Label balance, displayed as (rows with label 1, rows with label 0).
len(df_QA[df_QA["labels"]==1]), len(df_QA[df_QA["labels"]==0])

(567, 633)

In [None]:
# Write the labelled table to CSV without the index column.
# NOTE(review): this is the same file the answers were read from earlier, so the
# cached answers are replaced by their cleaned versions.
df_QA.to_csv("qa_pairs2.csv", index = False)