In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
import torch
import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from fuzzywuzzy import fuzz
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
torch.cuda.device_count.cache_clear()

In [4]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/amazon_garden/splits/train.jsonl"
df_recommender_train = pd.read_json(input_file, lines=True)
for _, row in df_recommender_train.iterrows():
    row["recommended_product"]["product_name"] = row["recommended_product"]["product_name"].lower()

In [5]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/amazon_garden/splits/val.jsonl"
df_recommender_validation = pd.read_json(input_file, lines=True)
for _, row in df_recommender_validation.iterrows():
    row["recommended_product"]["product_name"] = row["recommended_product"]["product_name"].lower()

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path = "google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", additional_special_tokens=["computer:", "human:"])
model.resize_token_embeddings(len(tokenizer))
IGNORE_INDEX = -100



In [7]:
prompt_validation = []
recommend_validation = []
            
for _, row in tqdm(df_recommender_validation.iterrows(), total=len(df_recommender_validation)):
    prompt = ""
    for turn in row["turns"]:
        
        if "COMPUTER" in turn:
            computer = "Computer: " + turn["COMPUTER"] 
            recommend_validation.append(computer)
            prompt_validation.append(prompt)
            prompt += computer + "\n"
            
            if "HUMAN" in turn:
                prompt += "Human: " + turn["HUMAN"] + "\n"
            
tokenizer.truncation_side = 'left'  
prompt_encodings = tokenizer(prompt_validation, padding='max_length', max_length=1024, truncation=True, return_tensors='pt')
recommend_encodings = tokenizer(recommend_validation, padding='max_length', max_length=128, truncation=True, return_tensors='pt')

labels = recommend_encodings['input_ids']
labels[labels == tokenizer.pad_token_id] = IGNORE_INDEX

dataset = {
    'input_ids': prompt_encodings['input_ids'],
    'attention_mask': prompt_encodings['attention_mask'],
    'labels': labels,
}
dataset_validation = Dataset.from_dict(dataset)

100%|██████████| 1828/1828 [00:00<00:00, 17342.46it/s]


In [10]:
prompt_train = []
recommend_train = []

for _, row in tqdm(df_recommender_train.iterrows(), total=len(df_recommender_train)):
    prompt = ""
    for turn in row["turns"]:
        
        if "COMPUTER" in turn:
            computer = "Computer: " + turn["COMPUTER"] 
            recommend_train.append(computer)
            prompt_train.append(prompt)
            prompt += computer + "\n"
            
            if "HUMAN" in turn:
                prompt += "Human: " + turn["HUMAN"] + "\n"
            

tokenizer.truncation_side = 'left'
prompt_encodings = tokenizer(prompt_train, padding='max_length', max_length=1024, truncation=True, return_tensors='pt')
recommend_encodings = tokenizer(recommend_train, padding='max_length', max_length=128, truncation=True, return_tensors='pt')

labels = recommend_encodings['input_ids']
labels[labels == tokenizer.pad_token_id] = IGNORE_INDEX

dataset = {
    'input_ids': prompt_encodings['input_ids'],
    'attention_mask': prompt_encodings['attention_mask'],
    'labels': labels,
}
dataset_train = Dataset.from_dict(dataset)

  0%|          | 0/8533 [00:00<?, ?it/s]

100%|██████████| 8533/8533 [00:00<00:00, 20223.08it/s]


In [11]:
def data_collator(batch):
    input_ids, attention_mask, labels,  = [], [], []
    for sample in batch:
        input_ids.append(sample['input_ids'])
        attention_mask.append(sample['attention_mask'])
        labels.append(sample['labels'])
    max_encoder_len = max(sum(x) for x in attention_mask)
    max_decoder_len = max(sum([0 if item == IGNORE_INDEX else 1 for item in x]) for x in labels)
    return {
        'input_ids': torch.tensor(input_ids)[:, :max_encoder_len],
        'attention_mask': torch.tensor(attention_mask)[:, :max_encoder_len],
        'labels': torch.tensor(labels)[:, :max_decoder_len]
    }

In [12]:
training_args = TrainingArguments(
    output_dir="/u/spa-d4/grad/mfe261/Projects/MobileConvRec/models/new_models/amazon_garden/T5_responder",
    num_train_epochs=5,
    # logging_steps=500,
    # logging_dir=self.cfg.logging_dir,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=0.3,#self.cfg.save_steps,
    eval_steps=0.3, #self.cfg.eval_steps,
    save_total_limit=3,
    gradient_accumulation_steps=2, #gradient_accumulation_steps,
    per_device_train_batch_size=6, #train_batch_size,
    per_device_eval_batch_size=6, #self.cfg.eval_batch_size,
    warmup_steps=100,
    weight_decay=0.01,
    # dataloader_drop_last=True,
    disable_tqdm=False,
    push_to_hub=False
)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_validation,
        data_collator=data_collator,
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss
11955,1.0998,1.078982
23910,1.0225,0.998205
35865,0.9269,0.963926


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


# Load the model and test it on the test dataset

In [14]:
torch.cuda.empty_cache()

In [15]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/amazon_garden/splits/test.jsonl"
df_recommender_test = pd.read_json(input_file, lines=True)
for _, row in df_recommender_test.iterrows():
    row["recommended_product"]["product_name"] = row["recommended_product"]["product_name"].lower()

In [16]:
prompt_test = []
recommend_test = []
            
for _, row in tqdm(df_recommender_test.iterrows(), total=len(df_recommender_test)):
    prompt = ""
    for turn in row["turns"]:
        
        if "COMPUTER" in turn:
            computer = "Computer: " + turn["COMPUTER"] 
            recommend_test.append(computer)
            prompt_test.append(prompt)
            prompt += computer + "\n"
            
            if "HUMAN" in turn:
                prompt += "Human: " + turn["HUMAN"] + "\n"

  0%|          | 0/1829 [00:00<?, ?it/s]

100%|██████████| 1829/1829 [00:00<00:00, 13814.59it/s]


In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/models/new_models/amazon_garden/T5_responder")
model.eval()
model = model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", additional_special_tokens=["computer:", "human:"])
model.resize_token_embeddings(len(tokenizer))
IGNORE_INDEX = -100
tokenizer.truncation_side = 'left'



In [18]:
bleu = evaluate.load("bleu")

In [19]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]

def evaluate_recommender(prompt_test, recommend_test, model, tokenizer, batch_size=8, bleu=bleu):
  prompt_batches = list(chunk(prompt_test, batch_size))
  generation_batches = list(chunk(recommend_test, batch_size))

  predictions = []
  references = []
  for prompt_batch, generation_batch in tqdm(zip(prompt_batches, generation_batches), total = len(generation_batches)):

    inputs = tokenizer(prompt_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt") 

    generations_predicted = model.generate(input_ids=inputs["input_ids"].to('cuda'), attention_mask=inputs["attention_mask"].to('cuda'),
                            max_new_tokens=128,
                            num_beams=8,
                            eos_token_id=tokenizer.eos_token_id,
                            pad_token_id=tokenizer.pad_token_id,
                            bos_token_id=tokenizer.bos_token_id) # length_penalty=0.8, Set length_penalty to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer sequences.

    decoded_generations = [tokenizer.decode(generation, clean_up_tokenization_spaces=True).replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "") for generation in generations_predicted]
    generation_batch = [generation for generation in generation_batch]
    
    predictions.extend(decoded_generations)
    references.extend([[gen] for gen in generation_batch])
    
    results = bleu.compute(predictions=predictions, references=references)

  return results

In [20]:
prompt_test

['',
 "Computer: Hello! What can I help you with today?\nHuman: I'm interested in soil meters. Any recommendations?\n",
 "Computer: Hello! What can I help you with today?\nHuman: I'm interested in soil meters. Any recommendations?\nComputer: Absolutely! Are there any specific details you're focusing on?\nHuman: I'd like it to be accurate.\n",
 "Computer: Hello! What can I help you with today?\nHuman: I'm interested in soil meters. Any recommendations?\nComputer: Absolutely! Are there any specific details you're focusing on?\nHuman: I'd like it to be accurate.\nComputer: Of course! Is product quality important to you?\nHuman: Product quality doesn't really influence my decision.\n",
 "Computer: Hello! What can I help you with today?\nHuman: I'm interested in soil meters. Any recommendations?\nComputer: Absolutely! Are there any specific details you're focusing on?\nHuman: I'd like it to be accurate.\nComputer: Of course! Is product quality important to you?\nHuman: Product quality doesn

In [21]:
results = evaluate_recommender(prompt_test, recommend_test, model, tokenizer, batch_size=8, bleu=bleu)
print("results: ", results)

100%|██████████| 2601/2601 [2:51:12<00:00,  3.95s/it]  

results:  {'bleu': 0.2099343801669992, 'precisions': [0.45189901161637774, 0.24489212526431328, 0.15081572985934746, 0.11637814135780099], 'brevity_penalty': 1.0, 'length_ratio': 1.069924244454569, 'translation_length': 479065, 'reference_length': 447756}



