In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import warnings
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, OPTForCausalLM
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig
import numpy as np
import evaluate
import random
import os
warnings.filterwarnings("ignore")

def set_random_seed(seed: int):
    """Seed every random number generator the script relies on.

    Prints the seed, exports it as ``PYTHONHASHSEED``, seeds Python's
    ``random``, NumPy and torch (CPU, current CUDA device and all CUDA
    devices), and switches cuDNN off / into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    print("Seed: {}".format(seed))

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every seeding entry point takes the same integer, so apply them in one pass.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: no autotuned kernels, cuDNN disabled, deterministic algorithms.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.enabled = False
    cudnn_backend.benchmark = False


# Fix all RNG seeds before any data splitting or model initialisation.
set_random_seed(1)

print("starting")

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the raw conversation data from an Excel workbook in the working directory.
file_path = 'dataset.xlsx'  
data = pd.read_excel(file_path)

# `Dataset` here is `datasets.Dataset`: the later `from datasets import Dataset`
# shadows the earlier `torch.utils.data.Dataset` import.
hf_dataset = Dataset.from_pandas(data)
# 70% train; the remaining 30% is split 1/3 validation and 2/3 test,
# i.e. roughly 70% / 10% / 20% of the full data.
train_testval = hf_dataset.train_test_split(test_size=0.3, seed=42)
test_val = train_testval['test'].train_test_split(test_size=2/3, seed=42)

dataset = DatasetDict({
    'train': train_testval['train'],
    'validation': test_val['train'],
    'test': test_val['test']
})

model_name = "lmsys/vicuna-7b-v1.5"
# Over-long sequences are truncated from the left (oldest tokens dropped first).
tokenizer = LlamaTokenizer.from_pretrained(model_name, model_max_length = 512, truncation_side = 'left')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Base model weights are loaded in half precision.
# NOTE(review): training directly on fp16 weights can be numerically fragile — confirm this is intended.
model = LlamaForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Wrap the base model with LoRA adapters (rank 8, alpha 16, dropout 0.1) for causal LM training.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

model=model.to(device)

 
# Tokenization length limits used by preprocess_function.
# NOTE(review): both exceed the tokenizer's model_max_length of 512 set above — confirm which limit is intended.
max_input_length = 1024
max_target_length = 1024
# Running per-conversation transcript, keyed by the "ID" column; mutated by preprocess_function.
conversation_history = {}
tokenizer.padding_side = "left"



def preprocess_function(examples):
    """Build history-aware prompts for a batch and tokenize prompts and targets.

    For every row the prompt is
    ``"Context: <BestContextValue> History: <history> User: <User>"`` where
    ``<history>`` is the transcript accumulated so far for that row's ``ID``
    in the module-level ``conversation_history`` dict. After the prompt is
    built, the row's own user/bot exchange is appended to that transcript.

    Args:
        examples: Batched mapping with the columns ``"ID"``, ``"User"``,
            ``"BOT"`` and ``"BestContextValue"`` (each a list of equal length).
            Non-string cells in the three text columns are treated as ``""``.

    Returns:
        The tokenizer output for the prompts (padded/truncated to
        ``max_input_length``) with an added ``"labels"`` entry: the token ids
        of the bot replies (padded/truncated to ``max_target_length``) where
        padding positions are replaced by ``-100``.

    Side effects:
        Mutates the module-level ``conversation_history``; the history persists
        across calls, so results depend on the order in which batches are seen.
    """
    user_ids = examples["ID"]
    user_texts = [text if isinstance(text, str) else "" for text in examples["User"]]
    bot_texts = [text if isinstance(text, str) else "" for text in examples["BOT"]]
    best_contexts = [text if isinstance(text, str) else "" for text in examples["BestContextValue"]]

    updated_inputs = []

    # Walk the rows in lock-step so each turn is paired with its OWN bot reply.
    # (Looking the reply up via user_ids.index(user_id) would always return the
    # first row with that ID and corrupt the history of multi-turn conversations.)
    for user_id, user_text, bot_text, best_context in zip(
        user_ids, user_texts, bot_texts, best_contexts
    ):
        historical_context = conversation_history.setdefault(user_id, "")
        updated_inputs.append(
            f"Context: {best_context} History: {historical_context} User: {user_text}"
        )

        # Record this exchange only after the prompt was built, so a prompt
        # never contains its own answer.
        conversation_history[user_id] = (
            historical_context + f" User: {user_text} Bot: {bot_text}"
        )

    model_inputs = tokenizer(
        updated_inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # NOTE(review): prompts and targets are tokenized independently, so the
    # labels are not a continuation of input_ids; confirm this seq2seq-style
    # layout is what the causal LM loss is expected to consume.
    labels = tokenizer(
        bot_texts,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )["input_ids"]

    # -100 marks positions that the loss should ignore. Because the pad token
    # is the tokenizer's pad_token_id, every such id in the target is masked.
    pad_id = tokenizer.pad_token_id
    labels = [
        [(token_id if token_id != pad_id else -100) for token_id in label_sequence]
        for label_sequence in labels
    ]

    model_inputs["labels"] = labels
    return model_inputs



class FinDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing token ids with their attention masks.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is rebound by ``from datasets import Dataset`` in
    this file's imports.

    Args:
        input_ids: Indexable, sized collection of token-id sequences.
        attention_mask: Indexable collection of attention masks, aligned
            index-for-index with ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask

    def __len__(self):
        """Return the number of examples (the length of ``input_ids``)."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.attention_mask[idx]


# Tokenize every split in batches; preprocess_function also updates the
# shared conversation_history as it goes.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Drop the original raw columns so only the tokenizer outputs and labels remain.
tokenized_datasets = tokenized_datasets.remove_columns(dataset["train"].column_names)

# Sanity check: print the token length of the first example in each split.
for split in ["train", "validation", "test"]:
    print(f"{split} Input IDs Shape: {len(tokenized_datasets[split]['input_ids'][0])}")
    print(f"{split} Labels Shape: {len(tokenized_datasets[split]['labels'][0])}")


# Collator that batches the tokenized examples for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)


# Text-generation metrics used by compute_metrics.
rouge_score = evaluate.load("rouge")
bleu = evaluate.load('bleu')
bertscore = evaluate.load('bertscore')
meteor = evaluate.load('meteor')

def compute_metrics(eval_pred):
    """Score generated token ids against reference labels.

    Decodes predictions and labels with the module-level ``tokenizer`` and
    computes ROUGE, BLEU n-gram precisions, BERTScore and METEOR with the
    module-level metric objects.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Positions
            equal to ``-100`` in either array are treated as padding.

    Returns:
        dict with the keys ``"rouge"`` (a nested dict of rounded ROUGE values
        plus ``"gen_len"``), ``"bleu1"``..``"bleuN"`` (one per BLEU precision),
        ``"meteor"``, ``"bertscore_precision"``, ``"bertscore_recall"``,
        ``"bertscore_f1"`` and ``"gen_len"`` (mean count of non-pad predicted
        tokens). All values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models hand back a tuple; the token ids are its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is used as a padding index when batches of different length are
    # gathered and cannot be decoded; map it to the real pad id first, exactly
    # as is done for the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_result = rouge_score.compute(predictions=decoded_preds, references=decoded_labels)

    # BLEU expects a list of reference lists, one per prediction.
    bleu_result = bleu.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    bleu_scores = {f"bleu{i+1}": round(score, 4) for i, score in enumerate(bleu_result['precisions'])}

    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

    # Generated length = number of non-padding token ids per prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    rouge_result["gen_len"] = np.mean(prediction_lens)

    metrics = {
        "rouge": {k: round(v, 4) for k, v in rouge_result.items()},
        **bleu_scores,
        "meteor": round(meteor_result["meteor"], 4),
        "bertscore_precision": round(np.mean(bertscore_result['precision']), 4),
        "bertscore_recall": round(np.mean(bertscore_result['recall']), 4),
        "bertscore_f1": round(np.mean(bertscore_result['f1']), 4),
        "gen_len": round(np.mean(prediction_lens), 4)
    }

    return metrics


from huggingface_hub import login

# Authenticate with the Hugging Face Hub (needed because push_to_hub=True below).
# The access token is read from the HF_TOKEN environment variable so that no
# secret is stored in source control. When the variable is unset, login()
# receives None and falls back to its own prompt.
login(token=os.environ.get("HF_TOKEN"))

# Training configuration: evaluate and log once per epoch, keep a single
# checkpoint, generate text during evaluation so compute_metrics receives
# token ids, and push the result to the Hub.
args = Seq2SeqTrainingArguments(
    output_dir="vicuna-context",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_strategy='epoch',
    push_to_hub=True,
)


# Train on the train split and validate on the validation split each epoch.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final held-out evaluation on the test split.
metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(metrics)
