In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
import torch
import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from fuzzywuzzy import fuzz
import evaluate
import csv
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import top_k_accuracy_score, ndcg_score
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/yelp/splits/train.jsonl"
df_recommender_train = pd.read_json(input_file, lines=True)
for _, row in df_recommender_train.iterrows():
    row["recommended_place"]["place_name"] = row["recommended_place"]["place_name"].lower()

In [4]:
apps_training_path = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/yelp/yelp_df.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["name"].lower())
        
all_apps = list(set(all_apps))

In [5]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/yelp/splits/val.jsonl"
df_recommender_validation = pd.read_json(input_file, lines=True)
for _, row in df_recommender_validation.iterrows():
    row["recommended_place"]["place_name"] = row["recommended_place"]["place_name"].lower()

In [6]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    selected_values = np.random.choice(np.setdiff1d(all_apps, [row["recommended_place"]["place_name"]]), 24, replace=False).astype(new_dtype)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_place"]["place_name"]) 

df_recommender_train['candidate'] = df_recommender_train.apply(candidate_creator, axis=1)

In [7]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    selected_values = np.random.choice(np.setdiff1d(all_apps, [row["recommended_place"]["place_name"]]), 24, replace=False).astype(new_dtype)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_place"]["place_name"]) 

df_recommender_validation['candidate'] = df_recommender_validation.apply(candidate_creator, axis=1)

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path = "google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", additional_special_tokens=["computer:", "human:", "candidate_apps:", "previous_interactions:"])
model.resize_token_embeddings(len(tokenizer))
IGNORE_INDEX = -100



In [9]:
df_recommender_train

Unnamed: 0,user_id,user_previous_interactions,recommended_place,negative_recommended_place,turns,candidate
0,_BHTC7nyCBoZcfiiD5cOXg,"[{'place_name': 'Amish Country Store', 'busine...","{'place_name': 'laziz', 'business_id': 'wS5eU8...",[],"[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[zia's on the hill, manayunk diner, south jers..."
1,y5qEpAfJQaOjjG8ZJWrp6w,"[{'place_name': 'Trader Joe's', 'business_id':...","{'place_name': 'vincente's restaurant', 'busin...",[],"[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[tavern 17, stonewood grill & tavern, fred's w..."
2,6s-g2vFu12OemhiK3FJuOQ,"[{'place_name': 'Rouge', 'business_id': '4tU9h...","{'place_name': 'cafe saigon', 'business_id': '...","[{'place_name': 'Pho Ha Saigon', 'business_id'...","[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[sap sap lao cafe, festiva, sabrina's café, id..."
3,Zqu-tMR9CHjA44sU7I5bXA,"[{'place_name': 'Caribbean Delight', 'business...","{'place_name': 'morimoto', 'business_id': '6_T...",[{'place_name': 'Zento Contemporary Japanese C...,"[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[tortilla press cantina, a&w restaurant, ocean..."
4,sZUC9GbwvDHWrHl0D_d2dA,"[{'place_name': 'Saturn Club', 'business_id': ...","{'place_name': 'silk city diner & lounge', 'bu...","[{'place_name': 'Parc', 'business_id': 'j-qtdD...","[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[michael's grill, marlene's original breakfast..."
...,...,...,...,...,...,...
9738,WwulXySQN8t2hwqH_yWurA,"[{'place_name': 'McDonald's', 'business_id': '...","{'place_name': '3 in 1 restaurant', 'business_...",[],"[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[taqueria doña maria, 3 in 1 restaurant, taque..."
9739,61yERKmEa_0mkZYy_3PNAg,"[{'place_name': 'Mesa Verde', 'business_id': '...","{'place_name': 'cajun kitchen cafe', 'business...","[{'place_name': 'State & Fig', 'business_id': ...","[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[cajun kitchen cafe, ms. kelli's karaoke bar, ..."
9740,3CwMwbY9UgP-qvTDxNs_0g,"[{'place_name': 'Reed's Dairy - Meridian', 'bu...","{'place_name': 'modern bar and restaurant', 'b...","[{'place_name': 'Sid's Garage', 'business_id':...","[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[modern bar and restaurant, rickshaw lounge, j..."
9741,wXdbkFZsfDR7utJvbWElyA,"[{'place_name': 'Chin Brothers Restaurant', 'b...","{'place_name': 'ale emporium', 'business_id': ...",[],"[{'turn': 1, 'is_rec': False, 'user_accept_rec...","[modo mio taverna, ale emporium, jamaican styl..."


In [10]:
prompt_train = []
recommend_train = []
not_founds = 0

for _, row in df_recommender_train.iterrows():
    if len(row["user_previous_interactions"]) > 0:
        row["user_previous_interactions"] = random.sample(row["user_previous_interactions"], min(3, len(row["user_previous_interactions"])))
        previous_interactions_items = [previous_interactions["place_name"] for previous_interactions in row["user_previous_interactions"]]
        prompt = "previous_interactions: " + ", ".join(previous_interactions_items) + "\n"
    else:
        prompt = "previous_interactions: No previous interactions" + "\n"
    found = False
    recommended = row["recommended_place"]["place_name"]
    
    for index, turn in enumerate(row["turns"]):
        computer = turn["COMPUTER"]
        
        if fuzz.partial_ratio(recommended, computer.lower()) >= 95:
            prompt += "candidate: "
            for app in row["candidate"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            prompt_train.append(prompt)
            recommend_train.append(recommended)
            found = True
            break
        else:
            prompt += "computer: "+ computer + "\n"
        
        if "HUMAN" in turn:
            human = turn["HUMAN"]
            prompt += "human: " + human + "\n"
            
    if not found:
        not_founds += 1
            
print(f"Could not find {not_founds}")
print(f"len(prompt_train): {len(prompt_train)}")
print(f"len(recommend_train): {len(recommend_train)}")
            
tokenizer.truncation_side = 'left'
prompt_encodings = tokenizer(prompt_train, padding='max_length', max_length=1400, truncation=True, return_tensors='pt')
recommend_encodings = tokenizer(recommend_train, padding='max_length', max_length=128, truncation=True, return_tensors='pt')

labels = recommend_encodings['input_ids']
labels[labels == tokenizer.pad_token_id] = IGNORE_INDEX

dataset = {
    'input_ids': prompt_encodings['input_ids'],
    'attention_mask': prompt_encodings['attention_mask'],
    'labels': labels,
}
dataset_train = Dataset.from_dict(dataset)

Could not find 49
len(prompt_train): 9694
len(recommend_train): 9694


In [11]:
prompt_validation = []
recommend_validation = []
not_founds = 0

for _, row in df_recommender_validation.iterrows():
    if len(row["user_previous_interactions"]) > 0:
        row["user_previous_interactions"] = random.sample(row["user_previous_interactions"], min(3, len(row["user_previous_interactions"])))
        previous_interactions_items = [previous_interactions["place_name"] for previous_interactions in row["user_previous_interactions"]]
        prompt = "previous_interactions: " + ", ".join(previous_interactions_items) + "\n"
    else:
        prompt = "previous_interactions: No previous interactions" + "\n"
        
    found = False
    recommended = row["recommended_place"]["place_name"]
    
    for index, turn in enumerate(row["turns"]):
        computer = turn["COMPUTER"]
        
        if fuzz.partial_ratio(recommended, computer.lower()) >= 95:
            prompt += "candidate: "
            for app in row["candidate"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            prompt_validation.append(prompt)
            recommend_validation.append(recommended)
            found = True
            break
        else:
            prompt += "computer: "+ computer + "\n"
        
        if "HUMAN" in turn:
            human = turn["HUMAN"]
            prompt += "human: " + human + "\n"
            
    if not found:
        not_founds += 1
        
print(f"Could not find {not_founds}")
print(f"len(prompt_validation): {len(prompt_validation)}")
print(f"len(recommend_validation): {len(recommend_validation)}")
            
tokenizer.truncation_side = 'left'
prompt_encodings = tokenizer(prompt_validation, padding='max_length', max_length=1400, truncation=True, return_tensors='pt')
recommend_encodings = tokenizer(recommend_validation, padding='max_length', max_length=128, truncation=True, return_tensors='pt')

labels = recommend_encodings['input_ids']
labels[labels == tokenizer.pad_token_id] = IGNORE_INDEX

dataset = {
    'input_ids': prompt_encodings['input_ids'],
    'attention_mask': prompt_encodings['attention_mask'],
    'labels': labels,
}
dataset_validation = Dataset.from_dict(dataset)

Could not find 19
len(prompt_validation): 2068
len(recommend_validation): 2068


In [12]:
def data_collator(batch):
    input_ids, attention_mask, labels,  = [], [], []
    for sample in batch:
        input_ids.append(sample['input_ids'])
        attention_mask.append(sample['attention_mask'])
        labels.append(sample['labels'])
    max_encoder_len = max(sum(x) for x in attention_mask)
    max_decoder_len = max(sum([0 if item == IGNORE_INDEX else 1 for item in x]) for x in labels)
    return {
        'input_ids': torch.tensor(input_ids)[:, :max_encoder_len],
        'attention_mask': torch.tensor(attention_mask)[:, :max_encoder_len],
        'labels': torch.tensor(labels)[:, :max_decoder_len]
    }

In [13]:
training_args = TrainingArguments(
    output_dir="/u/spa-d4/grad/mfe261/Projects/MobileConvRec/models/new_models/yelp/T5_previous_interactions_candidate_apps",
    num_train_epochs=5,
    # logging_steps=500,
    # logging_dir=self.cfg.logging_dir,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=0.3,#self.cfg.save_steps,
    eval_steps=0.3, #self.cfg.eval_steps,
    save_total_limit=3,
    gradient_accumulation_steps=4, #gradient_accumulation_steps,
    per_device_train_batch_size=3, #train_batch_size,
    per_device_eval_batch_size=3, #self.cfg.eval_batch_size,
    warmup_steps=100,
    weight_decay=0.01,
    # dataloader_drop_last=True,
    disable_tqdm=False,
    push_to_hub=False
)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_validation,
        data_collator=data_collator,
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss
1212,0.7166,0.515867
2424,0.5526,0.48194
3636,0.4633,0.475215


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


# Load the model and test it on the test dataset

In [15]:
torch.cuda.empty_cache()

In [16]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/yelp/splits/test.jsonl"
df_recommender_test = pd.read_json(input_file, lines=True)
for _, row in df_recommender_test.iterrows():
    row["recommended_place"]["place_name"] = row["recommended_place"]["place_name"].lower()

In [17]:
apps_training_path = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/yelp/yelp_df.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["name"].lower())
        
all_apps = list(set(all_apps))

In [18]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    np.random.seed(row.name)
    selected_values = np.random.choice(np.setdiff1d(all_apps, [row["recommended_place"]["place_name"]]), 24, replace=False).astype(new_dtype) # filter_candidate_apps(row["recommended_product"]["product_name"]) 
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_place"]["place_name"]) 

df_recommender_test['candidate'] = df_recommender_test.apply(lambda row: candidate_creator(row), axis=1)

In [19]:
prompt_test = []
recommend_test = []
candidate_apps = []
true_candidate_indexes = []
not_founds = 0

for _, row in df_recommender_test.iterrows():
    # creating candidate apps
    candidates = []
    for index, candidate_app in enumerate(row["candidate"].tolist()):
        candidates.append(candidate_app)
        if candidate_app == row["recommended_place"]["place_name"]:
            true_candidate_index = index
            
    if len(row["user_previous_interactions"]) > 0:
        row["user_previous_interactions"] = random.sample(row["user_previous_interactions"], min(3, len(row["user_previous_interactions"])))
        previous_interactions_items = [previous_interactions["place_name"] for previous_interactions in row["user_previous_interactions"]]
        prompt = "previous_interactions: " + ", ".join(previous_interactions_items) + "\n"
    else:
        prompt = "previous_interactions: No previous interactions" + "\n"
    found = False
    recommended = row["recommended_place"]["place_name"]
    
    for index, turn in enumerate(row["turns"]):
        computer = turn["COMPUTER"]
        
        if fuzz.partial_ratio(recommended, computer.lower()) >= 95:
            prompt += "candidate: "
            for app in row["candidate"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            prompt_test.append(prompt)
            recommend_test.append(recommended)
            candidate_apps.append(candidates)
            true_candidate_indexes.append(true_candidate_index)
            found = True
            break
        else:
            prompt += "computer: "+ computer + "\n"
        
        if "HUMAN" in turn:
            human = turn["HUMAN"]
            prompt += "human: " + human + "\n"
    
    if not found:
        not_founds += 1
            
            
print(f"Could not find {not_founds}")
print(f"Number of prompt: {len(prompt_test)}")
print(f"Number of generations: {len(recommend_test)}")
print(f"Number of candidate apps: {len(candidate_apps)}")
print(f"Number of true candidate indexes: {len(true_candidate_indexes)}")

Could not find 11
Number of prompt: 2078
Number of generations: 2078
Number of candidate apps: 2078
Number of true candidate indexes: 2078


In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/models/new_models/yelp/T5_previous_interactions_candidate_apps")
model.eval()
model = model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", additional_special_tokens=["computer:", "human:",  "candidate_apps:", "previous_interactions:"])
model.resize_token_embeddings(len(tokenizer))
IGNORE_INDEX = -100
tokenizer.truncation_side = 'left'



In [21]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]

def evaluate_recommender(prompt_test, recommend_test, model, tokenizer, batch_size=8, threshold=70):
  prompt_batches = list(chunk(prompt_test, batch_size))
  generation_batches = list(chunk(recommend_test, batch_size))

  correctly_predicted = []
  for prompt_batch, generation_batch in tqdm(zip(prompt_batches, generation_batches), total = len(generation_batches)):

    inputs = tokenizer(prompt_batch, max_length=1400, truncation=True, padding="max_length", return_tensors="pt") 

    generations_predicted = model.generate(input_ids=inputs["input_ids"].to('cuda'), attention_mask=inputs["attention_mask"].to('cuda'),
                            max_new_tokens=32,
                            num_beams=8,
                            eos_token_id=tokenizer.eos_token_id,
                            pad_token_id=tokenizer.pad_token_id,
                            bos_token_id=tokenizer.bos_token_id) # length_penalty=0.8, Set length_penalty to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer sequences.

    decoded_generations = [tokenizer.decode(generation, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generation in generations_predicted]
    generation_batch = [generation for generation in generation_batch]
    
    correctly_predicted.extend([1 if fuzz.ratio(predicted, ground_truth) > threshold else 0 for predicted, ground_truth in zip(decoded_generations, generation_batch)])

  return correctly_predicted

In [22]:
correctly_predicted = evaluate_recommender(prompt_test, recommend_test, model, tokenizer, batch_size=8, threshold=70)
success_rate = sum(correctly_predicted) / len(correctly_predicted)
print("success_rate: ", success_rate)

  0%|          | 0/260 [00:00<?, ?it/s]

100%|██████████| 260/260 [05:04<00:00,  1.17s/it]

success_rate:  0.43647738209817133





In [23]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]
    
def convert_to_sublists(numbers, sublist_size):
    return [numbers[i:i+sublist_size] for i in range(0, len(numbers), sublist_size)]

def recommender_rank(prompts, candidate_apps, model, tokenizer, batch_size=8):
  model.eval()
  encoder_max_length = 1400
  decoder_max_length = 32
  tokenizer.truncation_side='left'
  prompts_tokenized = tokenizer(prompts, max_length=encoder_max_length, truncation=True, padding="max_length", return_tensors="pt")
  tokenizer.truncation_side='right'
  input_ids_decoder = []
  attention_mask_decoder = []
  input_ids_encoder = []
  attention_mask_encoder  = []
  for index, candidate_app_elements in enumerate(candidate_apps):
    candidate_app_elements = [tokenizer.pad_token+element for element in candidate_app_elements] # adding pad token to the beginning of each candidate app
    candidate_apps_tokenized = tokenizer(candidate_app_elements, max_length=decoder_max_length, truncation=True, padding="max_length", return_tensors="pt")
    for candidate_app_index in range(len(candidate_app_elements)):
      input_ids_decoder.append(candidate_apps_tokenized["input_ids"][candidate_app_index])
      attention_mask_decoder.append(candidate_apps_tokenized["attention_mask"][candidate_app_index])
      input_ids_encoder.append(prompts_tokenized["input_ids"][index])
      attention_mask_encoder.append(prompts_tokenized["attention_mask"][index])
  
  input_ids_encoder_batches = list(chunk(input_ids_encoder, batch_size))
  attention_mask_encoder_batches = list(chunk(attention_mask_encoder, batch_size))
  input_ids_decoder_batches = list(chunk(input_ids_decoder, batch_size))
  attention_mask_decoder_batches = list(chunk(attention_mask_decoder, batch_size))
  

  scores = []
  for input_ids_encoder_batch, attention_mask_encoder_batch, input_ids_decoder_batch, attention_mask_decoder_batch in tqdm(zip(input_ids_encoder_batches, attention_mask_encoder_batches, input_ids_decoder_batches, attention_mask_decoder_batches), total = len(input_ids_encoder_batches)):
    decoder_input_ids = torch.stack(input_ids_decoder_batch).to("cuda")
    decoder_attention_mask = torch.stack(attention_mask_decoder_batch).to("cuda")
    input_ids = torch.stack(input_ids_encoder_batch).to("cuda")
    attention_mask = torch.stack(attention_mask_encoder_batch).to("cuda")
    with torch.no_grad():
      model_output = model(decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, 
                           input_ids=input_ids, attention_mask=attention_mask)
    
    logprobs = F.log_softmax(model_output["logits"], dim=-1)[:, :-1, :] # remove the eos token
    output_tokens = decoder_input_ids[:, 1:] # remove the bos token
        
    tokens_logprobs = torch.gather(logprobs, 2, output_tokens[:, :, None]).squeeze(-1).to(torch.float32)
        
    mask = torch.ones(tokens_logprobs.shape, dtype=torch.bool, device="cuda")
    for i, _output in enumerate(output_tokens):
      for j, _token in enumerate(_output):
        if _token == tokenizer.pad_token_id:
          mask[i, j] = False
              
    score = (tokens_logprobs * mask).sum(-1) / mask.sum(-1)
    scores.extend(score.to('cpu').tolist())
    
  # batch_input_representations = torch.cat(batch_input_representations)
  
  scores = convert_to_sublists(scores, len(candidate_apps[0]))
  
  return scores

In [24]:
scores = recommender_rank(prompt_test, candidate_apps, model, tokenizer, batch_size=8)

100%|██████████| 6494/6494 [37:43<00:00,  2.87it/s]


Similar

In [25]:
# this is the one
[top_k_accuracy_score(true_candidate_indexes, scores, k=k) for k in range(1, 11)]

[np.float64(0.4292589027911453),
 np.float64(0.5567853705486044),
 np.float64(0.6246390760346487),
 np.float64(0.6626564003849855),
 np.float64(0.6891241578440809),
 np.float64(0.7151106833493744),
 np.float64(0.7430221366698749),
 np.float64(0.7714148219441771),
 np.float64(0.7925890279114534),
 np.float64(0.8055822906641001)]

In [26]:
true_relevance = [[1 if item == index else 0 for item in range(len(candidate_apps[0]))] for index in true_candidate_indexes]

In [None]:
# this is the one
[ndcg_score(true_relevance, scores, k=k) for k in range(1, 11)]

[np.float64(0.4292589027911453),
 np.float64(0.5097191456671973),
 np.float64(0.5436459984102195),
 np.float64(0.5600191688085825),
 np.float64(0.5702582950828363),
 np.float64(0.5795148822357877),
 np.float64(0.588818700009288),
 np.float64(0.5977755949709618),
 np.float64(0.6041496661014792),
 np.float64(0.6079055613423757)]

: 

Sampled

In [None]:
[top_k_accuracy_score(true_candidate_indexes, scores, k=k) for k in range(1, 11)]

In [17]:
true_relevance = [[1 if item == index else 0 for item in range(len(candidate_apps[0]))] for index in true_candidate_indexes]

In [None]:
[ndcg_score(true_relevance, scores, k=k) for k in range(1, 11)]