In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
import torch
import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from fuzzywuzzy import fuzz
import evaluate
import csv
import numpy as np
from sklearn.metrics import top_k_accuracy_score, ndcg_score
import torch.nn.functional as F
import pandas as pd
from fuzzywuzzy import fuzz
import random

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/mobilerec/splits/train.jsonl"
df_recommender_train = pd.read_json(input_file, lines=True)
for _, row in df_recommender_train.iterrows():
    row["recommended_app"]["app_name"] = row["recommended_app"]["app_name"].lower()

In [4]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/mobilerec/splits/val.jsonl"
df_recommender_validation = pd.read_json(input_file, lines=True)
for _, row in df_recommender_validation.iterrows():
    row["recommended_app"]["app_name"] = row["recommended_app"]["app_name"].lower()

In [5]:
device = torch.device("cuda:0")
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path = "google/flan-t5-base").to(device)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", additional_special_tokens=["computer:", "human:", "previous_interactions:"])
model.resize_token_embeddings(len(tokenizer))
IGNORE_INDEX = -100



In [6]:
prompt_train = []
recommend_train = []
not_founds = 0

for _, row in tqdm(df_recommender_train.iterrows(), total=len(df_recommender_train)):
    if len(row["user_previous_interactions"]) > 0:
        sorted_interactions = sorted(
            row["user_previous_interactions"],
            key=lambda x: x['date'],
            reverse=True
        )
        previous_interactions_items = [previous_interactions["app_name"].lower() for previous_interactions in sorted_interactions]
        prompt = "previous_interactions: " + ", ".join(previous_interactions_items) + "\n"
    else:
        prompt = "previous_interactions: No previous interactions" + "\n"
    found = False
    recommended = row["recommended_app"]["app_name"]
    
    for index, turn in enumerate(row["turns"]):
        computer = turn["COMPUTER"]
        
        if fuzz.partial_ratio(recommended, computer.lower()) >= 95:
            prompt += "computer: I would recommend the "
            prompt_train.append(prompt)
            recommend_train.append(recommended)
            found = True
            break
        else:
            prompt += "computer: "+ computer + "\n"
        
        if "HUMAN" in turn:
            human = turn["HUMAN"]
            prompt += "human: " + human + "\n"
            
    if not found:
        not_founds += 1
        
print(f"Could not find {not_founds}")
print(f"len(prompt_train): {len(prompt_train)}")
print(f"len(recommend_train): {len(recommend_train)}")
            
prompt_encodings = tokenizer(prompt_train, padding='max_length', max_length=1024, truncation=True, return_tensors='pt')
recommend_encodings = tokenizer(recommend_train, padding='max_length', max_length=32, truncation=True, return_tensors='pt')

labels = recommend_encodings['input_ids']
labels[labels == tokenizer.pad_token_id] = IGNORE_INDEX

dataset = {
    'input_ids': prompt_encodings['input_ids'],
    'attention_mask': prompt_encodings['attention_mask'],
    'labels': labels,
}
dataset_train = Dataset.from_dict(dataset)

  0%|          | 0/8635 [00:00<?, ?it/s]

100%|██████████| 8635/8635 [00:30<00:00, 278.94it/s]


Could not find 145
len(prompt_train): 8490
len(recommend_train): 8490


In [7]:
prompt_train

["previous_interactions: threema\ncomputer: Hi! How can I help you today?\nhuman: I am interested in children's train games that are compatible with Android.\ncomputer: Alright! Any specific features you are interested in?\nhuman: I am looking for it to be interactive.\ncomputer: Do you want the game to have power-ups?\nhuman: Yes, I am looking for power-ups.\ncomputer: Are you interested in a free app, or would you consider paying for additional features?\nhuman: I prefer free apps.\ncomputer: Do you want the game to be challenging?\nhuman: I need the game to be challenging.\ncomputer: Are app bugs an issue for you?\nhuman: I prefer the app not to have bugs and glitches.\ncomputer: I would recommend the ",
 "previous_interactions: credit sesame: credit score & mobile banking, explorer, star warfare2:payback, philadelphia eagles, papa's freezeria to go!, graalonline era, city maps 2go pro offline maps, sector strike\ncomputer: Hey there! How can I help you today?\nhuman: I'm looking fo

In [8]:
prompt_validation = []
recommend_validation = []
not_founds = 0

for _, row in tqdm(df_recommender_validation.iterrows(), total=len(df_recommender_validation)):
    if len(row["user_previous_interactions"]) > 0:
        sorted_interactions = sorted(
            row["user_previous_interactions"],
            key=lambda x: x['date'],
            reverse=True
        )
        
        previous_interactions_items = [previous_interactions["app_name"].lower() for previous_interactions in sorted_interactions]
        prompt = "previous_interactions: " + ", ".join(previous_interactions_items) + "\n"
    else:
        prompt = "previous_interactions: No previous interactions" + "\n"
    found = False
    recommended = row["recommended_app"]["app_name"]
    
    for index, turn in enumerate(row["turns"]):
        computer = turn["COMPUTER"]
        
        if fuzz.partial_ratio(recommended, computer.lower()) >= 95:
            prompt += "computer: I would recommend the "
            prompt_validation.append(prompt)
            recommend_validation.append(recommended)
            found = True
            break
        else:
            prompt += "computer: "+ computer + "\n"
        
        if "HUMAN" in turn:
            human = turn["HUMAN"]
            prompt += "human: " + human + "\n"
            
    if not found:
        not_founds += 1
        
print(f"Could not find {not_founds}")
print(f"len(prompt_train): {len(prompt_train)}")
print(f"len(recommend_train): {len(recommend_train)}")
            
prompt_encodings = tokenizer(prompt_validation, padding='max_length', max_length=1024, truncation=True, return_tensors='pt')
recommend_encodings = tokenizer(recommend_validation, padding='max_length', max_length=32, truncation=True, return_tensors='pt')

labels = recommend_encodings['input_ids']
labels[labels == tokenizer.pad_token_id] = IGNORE_INDEX

dataset = {
    'input_ids': prompt_encodings['input_ids'],
    'attention_mask': prompt_encodings['attention_mask'],
    'labels': labels,
}
dataset_validation = Dataset.from_dict(dataset)

  1%|          | 21/1847 [00:00<00:08, 208.69it/s]

100%|██████████| 1847/1847 [00:06<00:00, 274.91it/s]


Could not find 24
len(prompt_train): 8490
len(recommend_train): 8490


In [9]:
def data_collator(batch):
    input_ids, attention_mask, labels,  = [], [], []
    for sample in batch:
        input_ids.append(sample['input_ids'])
        attention_mask.append(sample['attention_mask'])
        labels.append(sample['labels'])
    max_encoder_len = max(sum(x) for x in attention_mask)
    max_decoder_len = max(sum([0 if item == IGNORE_INDEX else 1 for item in x]) for x in labels)
    return {
        'input_ids': torch.tensor(input_ids)[:, :max_encoder_len],
        'attention_mask': torch.tensor(attention_mask)[:, :max_encoder_len],
        'labels': torch.tensor(labels)[:, :max_decoder_len]
    }

In [10]:
training_args = TrainingArguments(
    output_dir="/u/spa-d4/grad/mfe261/Projects/MobileConvRec/models/new_models/mobilerec/T5_previous_interations",
    num_train_epochs=10,
    # logging_steps=500,
    # logging_dir=self.cfg.logging_dir,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=0.2,#self.cfg.save_steps,
    eval_steps=0.2, #self.cfg.eval_steps,
    save_total_limit=3,
    gradient_accumulation_steps=3, #gradient_accumulation_steps,
    per_device_train_batch_size=4, #train_batch_size,
    per_device_eval_batch_size=4, #self.cfg.eval_batch_size,
    warmup_steps=100,
    weight_decay=0.01,
    # dataloader_drop_last=True,
    disable_tqdm=False,
    push_to_hub=False
)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_validation,
        data_collator=data_collator
    )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
trainer.train(resume_from_checkpoint=True)
trainer.save_model()

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].
	eval_steps: 0.2 (from args) != 1061 (from trainer_state.json)
	save_steps: 0.2 (from args) != 1061 (from trainer_state.json)


Step,Training Loss,Validation Loss
6366,0.8491,1.075009


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


# Load the model and test it on the test dataset

In [3]:
torch.cuda.empty_cache()

In [4]:
input_file = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/mobilerec/splits/test.jsonl"
df_recommender_test = pd.read_json(input_file, lines=True)
for _, row in df_recommender_test.iterrows():
    row["recommended_app"]["app_name"] = row["recommended_app"]["app_name"].lower()

In [5]:
apps_training_path = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/dataset/mobilerec/mobile_df.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["app_name"].lower())
        
all_apps = list(set(all_apps))

In [6]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    np.random.seed(row.name)
    selected_values = np.random.choice(np.setdiff1d(all_apps , [row["recommended_app"]["app_name"]]), 24, replace=False).astype(new_dtype) # filter_candidate_apps(row["recommended_product"]["product_name"]) 
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app"]["app_name"])

df_recommender_test['candidate'] = df_recommender_test.apply(lambda row: candidate_creator(row), axis=1)

In [7]:
prompt_test = []
recommend_test = []
candidate_apps = []
true_candidate_indexes = []
not_founds = 0
for _, row in df_recommender_test.iterrows():
    candidates = []
    for index, candidate_app in enumerate(row["candidate"].tolist()):
        candidates.append(candidate_app)
        if candidate_app == row["recommended_app"]["app_name"]:
            true_candidate_index = index
    if len(row["user_previous_interactions"]) > 0:
        sorted_interactions = sorted(
            row["user_previous_interactions"],
            key=lambda x: x['date'],
            reverse=True
        )
        
        previous_interactions_items = [previous_interactions["app_name"].lower() for previous_interactions in sorted_interactions]
        prompt = "previous_interactions: " + ", ".join(previous_interactions_items) + "\n"
    else:
        prompt = "previous_interactions: No previous interactions" + "\n"
    
    found = False
    recommended = row["recommended_app"]["app_name"]
    
    for index, turn in enumerate(row["turns"]):
        computer = turn["COMPUTER"]
        
        if fuzz.partial_ratio(recommended, computer.lower()) >= 95:
            prompt += "computer: I would recommend the "
            prompt_test.append(prompt)
            recommend_test.append(recommended)
            candidate_apps.append(candidates)
            true_candidate_indexes.append(true_candidate_index)
            found = True
            break
        else:
            prompt += "computer: "+ computer + "\n"
        
        if "HUMAN" in turn:
            human = turn["HUMAN"]
            prompt += "human: " + human + "\n"
    
    if not found:
        not_founds += 1

print(f"Could not find {not_founds}")
print(f"Number of prompt: {len(prompt_test)}")
print(f"Number of generations: {len(recommend_test)}")
print(f"Number of candidate apps: {len(candidate_apps)}")
print(f"Number of true candidate indexes: {len(true_candidate_indexes)}")

Could not find 16
Number of prompt: 1831
Number of generations: 1831
Number of candidate apps: 1831
Number of true candidate indexes: 1831


In [8]:
prompt_test[10]

'previous_interactions: investment run: invest fast, lords mobile: tower defense, fairy town, speech services by google, route: package tracker, dunkin’, water sort - color puzzle game, stone miner, avg secure browser, baby panda\'s juice shop\ncomputer: Hi! How can I help you today?\nhuman: I\'m searching for social networking apps that support Android devices.\ncomputer: Could you share some more details about what you\'re looking for?\nhuman: I would like it to be easy to use.\ncomputer: Do you have any preferences for the app\'s content rating?\nhuman: I prefer apps with a Teen content rating.\ncomputer: Do you care about app bugs and glitches?\nhuman: I can tolerate some bugs and glitches.\ncomputer: Hmm, are you comfortable with intermittent ads in the app?\nhuman: I\'m open to exploring apps that contain ads and offer in-app purchases.\ncomputer: Based on what you\'ve told me, I recommend giving "Tantan" a try.\nhuman: What are the potential downsides people have experienced wit

In [9]:
candidate_apps[10]

['my time at portia',
 'solitaire card games, klondike',
 'toca life: school',
 'teppen',
 'usa dating',
 'safe surfer: block porn & apps',
 'pdf scanner app - scan to pdf',
 'death worm™ deluxe',
 'learn spanish - español',
 'instagram',
 'nysoh mobile upload',
 'syncup tracker',
 'idle guardians: never die',
 'slots vegas magic casino 777',
 'candy camera - photo editor',
 'showtime viewer from zoho',
 'kink dating life: bdsm dating, fetish & kinky life',
 'farkle online - 10000 dice game',
 'teladoc | telehealth & therapy',
 'creative destruction',
 'geico mobile - car insurance',
 'baby panda care',
 'kawaiinihongo: learn japanese',
 'yokai tamer-new contents',
 'seven - 7 minute workout']

In [10]:

model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path = "/u/spa-d4/grad/mfe261/Projects/MobileConvRec/models/new_models/mobilerec/T5_previous_interations")
model.eval()
model = model.to('cuda')
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", additional_special_tokens=["computer:", "human:", "previous_interactions:"])
model.resize_token_embeddings(len(tokenizer))
IGNORE_INDEX = -100



In [11]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]

def evaluate_recommender(prompt_test, recommend_test, model, tokenizer, batch_size=8, threshold=70):
  prompt_batches = list(chunk(prompt_test, batch_size))
  generation_batches = list(chunk(recommend_test, batch_size))

  correctly_predicted = []
  for prompt_batch, generation_batch in tqdm(zip(prompt_batches, generation_batches), total = len(generation_batches)):

    inputs = tokenizer(prompt_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt") 

    generations_predicted = model.generate(input_ids=inputs["input_ids"].to('cuda'), attention_mask=inputs["attention_mask"].to('cuda'),
                            max_new_tokens=32,
                            num_beams=8,
                            eos_token_id=tokenizer.eos_token_id,
                            pad_token_id=tokenizer.pad_token_id,
                            bos_token_id=tokenizer.bos_token_id) # length_penalty=0.8, Set length_penalty to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer sequences.

    decoded_generations = [tokenizer.decode(generation, skip_special_tokens=True, clean_up_tokenization_spaces=True) for generation in generations_predicted]
    generation_batch = [generation for generation in generation_batch]
    
    correctly_predicted.extend([1 if fuzz.ratio(predicted, ground_truth) > threshold else 0 for predicted, ground_truth in zip(decoded_generations, generation_batch)])

  return correctly_predicted

In [12]:
correctly_predicted = evaluate_recommender(prompt_test, recommend_test, model, tokenizer, batch_size=16, threshold=70)
success_rate = sum(correctly_predicted) / len(correctly_predicted)
print("success_rate: ", success_rate)

  0%|          | 0/115 [00:00<?, ?it/s]

100%|██████████| 115/115 [03:34<00:00,  1.87s/it]

success_rate:  0.375204806116876





In [13]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]
    
def convert_to_sublists(numbers, sublist_size):
    return [numbers[i:i+sublist_size] for i in range(0, len(numbers), sublist_size)]

def recommender_rank(prompts, candidate_apps, model, tokenizer, batch_size=8):
  model.eval()
  encoder_max_length = 1024
  decoder_max_length = 32
  prompts_tokenized = tokenizer(prompts, max_length=encoder_max_length, truncation=True, padding="max_length", return_tensors="pt")
  input_ids_decoder = []
  attention_mask_decoder = []
  input_ids_encoder = []
  attention_mask_encoder  = []
  for index, candidate_app_elements in enumerate(candidate_apps):
    candidate_app_elements = [tokenizer.pad_token+element for element in candidate_app_elements] # adding pad token to the beginning of each candidate app
    candidate_apps_tokenized = tokenizer(candidate_app_elements, max_length=decoder_max_length, truncation=True, padding="max_length", return_tensors="pt")
    for candidate_app_index in range(len(candidate_app_elements)):
      input_ids_decoder.append(candidate_apps_tokenized["input_ids"][candidate_app_index])
      attention_mask_decoder.append(candidate_apps_tokenized["attention_mask"][candidate_app_index])
      input_ids_encoder.append(prompts_tokenized["input_ids"][index])
      attention_mask_encoder.append(prompts_tokenized["attention_mask"][index])
  
  input_ids_encoder_batches = list(chunk(input_ids_encoder, batch_size))
  attention_mask_encoder_batches = list(chunk(attention_mask_encoder, batch_size))
  input_ids_decoder_batches = list(chunk(input_ids_decoder, batch_size))
  attention_mask_decoder_batches = list(chunk(attention_mask_decoder, batch_size))
  

  scores = []
  for input_ids_encoder_batch, attention_mask_encoder_batch, input_ids_decoder_batch, attention_mask_decoder_batch in tqdm(zip(input_ids_encoder_batches, attention_mask_encoder_batches, input_ids_decoder_batches, attention_mask_decoder_batches), total = len(input_ids_encoder_batches)):
    decoder_input_ids = torch.stack(input_ids_decoder_batch).to("cuda")
    decoder_attention_mask = torch.stack(attention_mask_decoder_batch).to("cuda")
    input_ids = torch.stack(input_ids_encoder_batch).to("cuda")
    attention_mask = torch.stack(attention_mask_encoder_batch).to("cuda")
    with torch.no_grad():
      model_output = model(decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, 
                           input_ids=input_ids, attention_mask=attention_mask)
    
    logprobs = F.log_softmax(model_output["logits"], dim=-1)[:, :-1, :] # remove the eos token
    output_tokens = decoder_input_ids[:, 1:] # remove the bos token
        
    tokens_logprobs = torch.gather(logprobs, 2, output_tokens[:, :, None]).squeeze(-1).to(torch.float32)
        
    mask = torch.ones(tokens_logprobs.shape, dtype=torch.bool, device="cuda")
    for i, _output in enumerate(output_tokens):
      for j, _token in enumerate(_output):
        if _token == tokenizer.pad_token_id:
          mask[i, j] = False
              
    score = (tokens_logprobs * mask).sum(-1) / mask.sum(-1)
    scores.extend(score.to('cpu').tolist())
    
  # batch_input_representations = torch.cat(batch_input_representations)
  
  scores = convert_to_sublists(scores, len(candidate_apps[0]))
  
  return scores

In [None]:
scores = recommender_rank(prompt_test, candidate_apps, model, tokenizer, batch_size=8)

100%|██████████| 5722/5722 [21:38<00:00,  4.41it/s]


: 

Similar Candidates

In [None]:
# this one
[top_k_accuracy_score(true_candidate_indexes, scores, k=k) for k in range(1, 11)]

[np.float64(0.6968869470234844),
 np.float64(0.7640633533588204),
 np.float64(0.8017476788640088),
 np.float64(0.8312397596941562),
 np.float64(0.8547241944292736),
 np.float64(0.8738394320043692),
 np.float64(0.888585472419443),
 np.float64(0.9060622610595304),
 np.float64(0.9197160021845986),
 np.float64(0.9355543418896778)]

In [None]:
true_relevance = [[1 if item == index else 0 for item in range(len(candidate_apps[0]))] for index in true_candidate_indexes]

In [None]:
# this one
[ndcg_score(true_relevance, scores, k=k) for k in range(1, 11)]

[np.float64(0.6968869470234844),
 np.float64(0.739270540518454),
 np.float64(0.7581127032710482),
 np.float64(0.7708142511333984),
 np.float64(0.7798992706369949),
 np.float64(0.7867082556445214),
 np.float64(0.7916236024495459),
 np.float64(0.7971369154245013),
 np.float64(0.8012471010561778),
 np.float64(0.8058254079721903)]

Sampled Candidates

In [None]:
[top_k_accuracy_score(true_candidate_indexes, scores, k=k) for k in range(1, 11)]

In [18]:
true_relevance = [[1 if item == index else 0 for item in range(len(candidate_apps[0]))] for index in true_candidate_indexes]

In [None]:
[ndcg_score(true_relevance, scores, k=k) for k in range(1, 11)]