### The goal here is to subsample the dataset: we want to keep only the reviews of Italian restaurants

#### To subsample, we run a semantic search over the whole dataset with a query describing Italian restaurants

In [None]:
##### This is the code to create the embeddings for each review using a pretrained transformer for semantic search #####
# Expected run time ~ 4 hours


# !pip install sentence_transformers
# import re
# from datasets import load_dataset
# from sentence_transformers import SentenceTransformer, util
# import torch

# dataset_train=load_dataset("yelp_review_full",split='train')
# # Usual preprocessing for the text
# def clean_sentence(sentence):
#     sentence = re.sub(r'\\n',"",sentence)

#     #removing emoticons
#     sentence = re.sub(r'(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)',"", sentence)

#     #removing websites
#     sentence = re.sub(r"(http)?s?:?\/\/[A-Za-z0-9^,!.\/'+-=_?]+", "", sentence)

#     #numbers
#     sentence = re.sub(r"(\d+)(k)", r"\g<1> thousand", sentence)
#     sentence = re.sub(r"(\d+)([a-zA-Z]+)", r"\g<1> \g<2>", sentence)
#     #convert numbers to words
#     sentence = re.sub(r"1", " one ", sentence)
#     sentence = re.sub(r"2", " two ", sentence)
#     sentence = re.sub(r"3", " three ", sentence)
#     sentence = re.sub(r"4", " four ", sentence)
#     sentence = re.sub(r"5", " five ", sentence)
#     sentence = re.sub(r"6", " six ", sentence)
#     sentence = re.sub(r"7", " seven ", sentence)
#     sentence = re.sub(r"8", " eight ", sentence)
#     sentence = re.sub(r"9", " nine ", sentence)
#     sentence = re.sub(r"0", " zero ", sentence)

#     # removing extraneous symbols
#     sentence = re.sub(r"[^A-Za-z0-9^,!.\/'+-=%]", " ", sentence)

#     # expanding contraction
#     sentence = re.sub(r"\'ve", " have ", sentence)
#     sentence = re.sub(r"n't", " not ", sentence)
#     sentence = re.sub(r"i'm", " i am ", sentence)
#     sentence = re.sub(r"\'re", " are ", sentence)
#     sentence = re.sub(r"\'d", " would ", sentence)
#     sentence = re.sub(r"\'ll", " will ", sentence)

#     #spacing out symbols
#     sentence = re.sub(r",", " ", sentence)
#     sentence = re.sub(r"\.", " . ", sentence)
#     sentence = re.sub(r"!", " ! ", sentence)
#     sentence = re.sub(r"\/", " ", sentence)
#     sentence = re.sub(r"\^", " ^ ", sentence)
#     sentence = re.sub(r"\+", " + ", sentence)
#     sentence = re.sub(r"\-", " - ", sentence)
#     sentence = re.sub(r"\=", " = ", sentence)
#     sentence = re.sub(r"'", " ", sentence)
#     sentence = re.sub(r":", " : ", sentence)
#     sentence = re.sub(r"%", " : ", sentence)

#     return sentence

# def preprocess_text(example):
#     example['text'] = clean_sentence(example['text'])
#     return example

# dataset_train = dataset_train.map(preprocess_text,num_proc = 4)

# # Load the pretrained model
# model = SentenceTransformer('all-mpnet-base-v2')

# # Create embeddings
# reviews_embeddings = model.encode(dataset_train['text'], convert_to_tensor=True, device='cuda')

# # Save the embeddings tensor
# file_path = "embeddings_tensor.pth"
# torch.save(reviews_embeddings, file_path)

In [None]:
# Perform semantic search

from sentence_transformers import CrossEncoder
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np

# `load_dataset` is called below, but the only earlier import of it sits in the
# commented-out embedding cell, so import it here to keep this cell runnable.
from datasets import load_dataset

# Training split of the Yelp full-review dataset; labels 0..4 are printed as 1..5 stars below.
dataset_train = load_dataset("yelp_review_full", split='train')
labels = np.array(dataset_train['label'])
# For each label, the dataset indexes of the reviews carrying that label
label_map = {lab: np.where(labels == lab)[0] for lab in range(5)}

In [None]:
# Two pretrained models are used for the retrieval:
#   * a cross-encoder, which scores (query, review) pairs to re-rank the search results
#   * a bi-encoder, which embeds the query (same checkpoint name as in the
#     commented-out cell that produced the review embeddings)
cross_encoder = CrossEncoder('cross-encoder/stsb-distilroberta-base')
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Load the tensor containing the embeddings of the reviews.
# The (commented-out) embedding cell encodes with device='cuda', so the saved tensor
# presumably lives on a CUDA device; without `map_location`, torch.load would fail on
# a machine without a GPU. Map it to whatever device is available here.
file_path = 'embeddings_tensor.pth'
load_device = 'cuda' if torch.cuda.is_available() else 'cpu'
reviews_embeddings = torch.load(file_path, map_location=load_device)

In [None]:
# Create the embedding for the query

query = 'italian restaurant italia crostino carbonara cucina'
# Normalise the query embedding as well: the dot product used for the search equals the
# cosine similarity only when BOTH the query and the review embeddings have unit length.
query_embedding = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

# Normalize the review embeddings so that a plain dot product yields the cosine similarity
normalised_reviews_embeddings = util.normalize_embeddings(reviews_embeddings)

In [None]:
# Retrieve the 5000 reviews whose embedding scores highest against the query embedding
hits = util.semantic_search(
    query_embedding,
    normalised_reviews_embeddings,
    score_function=util.dot_score,
    top_k=5000,
)[0]

# Map each position in the search result to the index of the review found at that position
doc_mapping = dict(enumerate(hit['corpus_id'] for hit in hits))

# Rerank the search results

# Pair the query with the text of every retrieved review and score the pairs with the cross-encoder
model_inputs = [(query, dataset_train[review_idx]['text']) for review_idx in doc_mapping.values()]
cross_scores = cross_encoder.predict(model_inputs)

# Map each position after reranking (highest cross-encoder score first) to the review index
reranked_positions = np.argsort(-cross_scores)
new_doc_mapping = {
    new_pos: doc_mapping[old_pos] for new_pos, old_pos in enumerate(reranked_positions)
}

In [None]:
# Calculate for each number of stars the most relevant reviews given the query

# All retrieved review indexes, in the order returned by the semantic search.
# This array is what gets plotted and saved further down, so it is left unchanged.
italian_restaurants_idx = np.array(list(doc_mapping.values()), dtype=np.int64)

# The same indexes ordered by the cross-encoder reranking (most relevant first).
ranked_idx = np.array(list(new_doc_mapping.values()), dtype=np.int64)
ranked_labels = labels[ranked_idx]

# For every star rating keep the 300 best-ranked reviews, so that the selection is balanced.
# Boolean masking preserves the relevance order; np.intersect1d returns its result sorted by
# value, so slicing it would keep the 300 smallest dataset indexes instead of the most
# relevant reviews. A list of arrays is used (rather than a 2-D array) so that a rating with
# fewer than 300 hits does not produce a ragged array.
top300_italian_by_label = [ranked_idx[ranked_labels == lab][:300] for lab in range(5)]

In [None]:
# Print 2 reviews from the top 300 reviews for each number of stars

num_reviews_print = 2
for star in range(5):
    print('-------------------------------------------------------------------------------------')
    print('Stars: ' + str(star+1) + '/5' )
    star_reviews = top300_italian_by_label[star]
    # Draw distinct positions (replace=False) so the same review is never printed twice,
    # and bound the draw by the number of reviews actually available for this rating
    # instead of assuming there are always 300.
    sample_size = min(num_reviews_print, len(star_reviews))
    idxs = np.random.choice(len(star_reviews), sample_size, replace=False)
    for idx in idxs:
        print('------------ Position:' + str(idx) + '------------')
        print(dataset_train[int(star_reviews[idx])]['text'])

In [None]:
# Plot the distribution of the labels of the italian restaurants
subset = dataset_train.select(italian_restaurants_idx)
star_labels = ['1 Star', '2 Stars', '3 Stars', '4 Stars', '5 Stars']

# Count the reviews per label with a fixed number of bins: np.unique only reports labels
# that actually occur, so a rating without any review would make `count` shorter than
# `star_labels` and break both the bar plot and the expected-value computation.
stars = np.arange(len(star_labels))
count = np.bincount(np.asarray(subset['label'], dtype=np.int64), minlength=len(star_labels))

plt.figure(figsize=(10, 6))
plt.bar(star_labels, count, color='skyblue', edgecolor='black')

plt.title('Distribution of italian restaurant reviews')
plt.xlabel('Review Rating')
plt.ylabel('Number of Reviews')

plt.show()

In [None]:
# Of course the expected value of the number of stars for italian restaurant is greater than 2.5 stars :)

# Mean rating: each star value (1..5) weighted by the number of reviews with that rating
star_values = np.arange(1, 6)
weighted_total = (star_values * count).sum()
expected_value = weighted_total / count.sum()
expected_value

In [None]:
# Persist the indexes of the selected reviews to disk
np.save(file='idxs.npy', arr=italian_restaurants_idx)

### Now let's generate new Italian restaurant reviews by fine-tuning Llama 3 8B, the newest LLM by Meta, using QLoRA
<img src="llama.jpeg" alt="Example Image" width="800"/>

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl

import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
import os
import torch
from time import time
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from trl import SFTTrainer,setup_chat_format

In [None]:
# Path of the Llama 3 8B chat checkpoint inside the Kaggle input directory
model_id = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"

# For hardware reasons we will use the 4bit quantized version and finetune with q-LORA,
# which is LORA for quantized LLM's
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [None]:
# Load the model and the tokenizer, and report how long the loading took

time_start = time()

model_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True, max_new_tokens=1024)
# The model is loaded with the quantization settings defined in `bnb_config`
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

time_end = time()
elapsed = round(time_end - time_start, 3)
print(f"Prepare model, tokenizer: {elapsed} sec.")

In [None]:
# Order matters: the chat format is applied first, then the resulting model is prepared
# for k-bit training.
# NOTE(review): per the TRL docs, setup_chat_format installs a chat template and extra
# special tokens on the tokenizer and resizes the model embeddings accordingly — confirm
# this is wanted for a checkpoint that already ships its own chat template.
model, tokenizer = setup_chat_format(model, tokenizer)
# Prepares the quantized model for training (PEFT helper) before the LoRA adapters are added
model = prepare_model_for_kbit_training(model)

In [None]:
# Configuration of the parameter-efficient finetuning (LoRA adapters)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapter rank, scaling factor and dropout
    r=4,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    # Modules that receive an adapter (presumably the attention q/k/v/o and the
    # MLP gate/up/down projections of the model — verify against the architecture)
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
# Import the indexes of the reviews talking about italian restaurants and select only those reviews

# The import cells of this fine-tuning part do not import numpy, so import it here to keep
# this part runnable on its own (e.g. in a fresh Kaggle session).
import numpy as np

italian_restaurant_idxs = np.load('/kaggle/input/italian-restaurant-idxs-new/idxs (1).npy')
dataset_train = load_dataset("yelp_review_full", split='train')
# Keep only the rows whose index was selected by the semantic search
dataset_train = dataset_train.select(italian_restaurant_idxs)

In [None]:
# Defining a function to generate the prompt for a review of given stars
def prompt_generation(star):
    """Return the instruction asking for an italian restaurant review with `star` stars.

    `star` is inserted into the text through its string representation.
    """
    return (
        'Write a review of maximum 100 words of an italian restaurant in english. '
        f'The review you need to write needs to be of {star!s} stars out of 5.'
    )
def generate_prompt(example):
    """Add a 'prompt' entry to `example`, built from its label, and return the example."""
    # Labels are shifted by one before being used as the number of stars
    stars = example['label'] + 1
    example['prompt'] = prompt_generation(stars)
    return example


# Add the 'prompt' column to every selected review
dataset_train = dataset_train.map(generate_prompt)

In [None]:
def model_testing():
    """Generate and print one completion for each star rating from 1 to 5.

    Uses the module-level `model` and `tokenizer`. For every rating the prompt built by
    `prompt_generation` is tokenized, passed to `model.generate` (at most 128 tokens in
    total, prompt included) and the decoded text is printed.
    """
    for i in range(5):
        prompt = prompt_generation(i+1)
        # The tokenizer returns CPU tensors while the model is dispatched with
        # device_map='auto'; move the inputs to the model's device to avoid a device mismatch.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        # Pass the whole encoding so that the attention mask reaches generate() as well
        generate_ids = model.generate(**inputs, max_length=128)
        print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])


In [None]:
# Print sample generations for every star rating; this call comes before the training
# cells below, so it shows the behaviour of the model prior to fine-tuning.
model_testing()

In [None]:
# Environment flags for the training run.
# NOTE(review): WANDB_DISABLED is set to "false", which presumably leaves Weights & Biases
# reporting enabled — confirm this is intended.
# NOTE(review): CUDA_LAUNCH_BLOCKING is set after the model has already been loaded with
# device_map='auto'; it may need to be set before CUDA is initialised to take effect — TODO confirm.
os.environ.update({
    "WANDB_DISABLED": "false",
    "CUDA_LAUNCH_BLOCKING": "1",
})

# Release cached GPU memory before training starts
torch.cuda.empty_cache()

In [None]:
# Hyper-parameters of the fine-tuning run
training_arguments = TrainingArguments(
    output_dir="./results_llama3_sft/",
    # Amount of training: one epoch, batches of 8 per device accumulated over 2 steps
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    warmup_steps=3,
    # Logging and checkpointing
    # NOTE(review): save_steps=1 presumably writes a checkpoint at every step under the
    # default save strategy — confirm this is wanted (disk usage).
    log_level="debug",
    logging_steps=1,
    save_steps=1,
    # Options currently disabled:
    #   evaluation_strategy="steps",
    #   do_eval=True,
    #   per_device_eval_batch_size=8,
    #   max_steps=20,
)

In [None]:
# Supervised fine-tuning with the SFTTrainer API of HuggingFace's trl
# (Transformer Reinforcement Learning) library.
# NOTE(review): the trainer reads only the "text" column (dataset_text_field="text"); the
# "prompt" column added to dataset_train earlier is not referenced here — confirm whether
# the star-rating prompt was meant to be part of the training text.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset_train,
    # eval_dataset=dataset_train,  (evaluation is currently disabled)
    dataset_text_field="text",
    max_seq_length=512,
)

# Run the fine-tuning
trainer.train()