In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from dataclasses import dataclass
from typing import Optional, List
from fuzzywuzzy import fuzz
import csv
import numpy as np
import evaluate
import torch.nn.functional as F
from sklearn.metrics import top_k_accuracy_score, ndcg_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def is_approximate_substring(substring, string, threshold=70):
    for i in range(len(string) - len(substring) + 1):
        window = string[i:i+len(substring)]
        similarity_ratio = fuzz.ratio(substring, window)
        if similarity_ratio >= threshold:
            return True
    return False

In [4]:
conversation_training_path = "/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/dialogs/training"

user_id = []
previous_interactions = []
recommended_app_name = []
turns = []
recommend_indexes = []


# List all files in the folder
files = os.listdir(conversation_training_path)

# Loop through each file in the directory
for filename in tqdm(files):
    # Construct the full file path
    file_path = os.path.join(conversation_training_path, filename)

    # Check if the current item is a file
    if os.path.isfile(file_path):
        # Open and read the file
        with open(file_path, 'r') as file:
            # Read the content of the file
            file_content = file.read().lower()
            
            # finding the user id
            index_1 = file_content.find("user's previous interactions")
            user_id.append(file_content[9:index_1].rstrip('\n'))
            
            # finding the User's Previous Interactions
            index_2 = file_content.find("recommended app name:")
            previous_interactions_arr = file_content[index_1+29:index_2].rstrip('\n').split("app name:")
            previous_interactions_arr_filtered = []
            for previous_interaction in previous_interactions_arr[1:]:
                previous_interactions_arr_filtered.append(previous_interaction[:previous_interaction.find(" | ")])
            if len(previous_interactions_arr_filtered) > 0:
                previous_interactions.append(",".join(previous_interactions_arr_filtered))
            else:
                previous_interactions.append(None)
            
            # finding recommended app name
            index_3 = file_content[index_2:].find("package name")
            recommended = file_content[index_2+22:index_2+index_3-3].rstrip('\n')
            recommended_app_name.append(recommended)
            
            # finding each turns
            dialog_turns = []
            dialog_index = 0
            COMPUTER_index = file_content.find("computer:")
            file_content = file_content[COMPUTER_index:]
            found_recommender = False
            while True:
                HUMAN_index = file_content.find("human:")
                if HUMAN_index == -1:
                    break
                turn = file_content[:HUMAN_index].rstrip('\n') # computer dialog
                if (recommended in turn) and not found_recommender:
                    recommend_indexes.append(dialog_index)
                    found_recommender = True
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[HUMAN_index:]
                
                COMPUTER_index = file_content.find("computer:")
                turn = file_content[:COMPUTER_index].rstrip('\n') # human dialog
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[COMPUTER_index:]
                
            if not found_recommender: # approximately finding the recommender turn
                for i, dialog_turn in enumerate(dialog_turns):
                    if is_approximate_substring(recommended, dialog_turn):
                        recommend_indexes.append(i)
                        found_recommender = True
                        break
                    
            if not found_recommender:
                recommend_indexes.append(-1)
                        
            turns.append(dialog_turns)

print(len(user_id))
print(len(previous_interactions))
print(len(recommended_app_name))
print(len(recommend_indexes))
df_recommender_train = pd.DataFrame({"user_id": user_id, "previous_interactions":previous_interactions, "recommended_app_name":recommended_app_name, "turns": turns, "recommend_indexes":recommend_indexes})
print(f"\nnumber of rows: {len(df_recommender_train)}")

  0%|          | 0/8720 [00:00<?, ?it/s]

100%|██████████| 8720/8720 [01:08<00:00, 126.81it/s]

8720
8720
8720
8720

number of rows: 8720





In [5]:
df_recommender_train = df_recommender_train[(df_recommender_train["recommend_indexes"] != -1) & (df_recommender_train["turns"].apply(lambda x: len(x) > 0))]

In [6]:
apps_training_path = "/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/data/master_app_data_V1_true.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["app_name"].lower())

In [7]:
def fix_recommended_apps_names(row):
    if row["recommended_app_name"] not in all_apps:
        for app in all_apps:
            if fuzz.ratio(row["recommended_app_name"], app) > 80:
                return app
        return "uno!™"
    else:
        return row["recommended_app_name"]

df_recommender_train['recommended_app_name'] = df_recommender_train.apply(fix_recommended_apps_names, axis=1)

In [8]:
counter = 0
counter_not_found = 0
found = False
recommended_apps = df_recommender_train['recommended_app_name'].values
for recommended in recommended_apps:
    if recommended not in all_apps:
        found = False
        for app in all_apps:
            if fuzz.ratio(recommended, app) > 80:
                print(recommended, app)
                found = True
                break
        if not found:
            counter_not_found += 1
            print(recommended, "not found")
        counter += 1
        
        
print(counter)
print(counter_not_found)
print(len(recommended_apps))

0
0
8644


In [9]:
cols = ['app_name','app_type','app_category']
df_app = pd.read_csv(apps_training_path, usecols=cols)
df_apps = df_app.applymap(lambda x: x.lower() if isinstance(x, str) else x)
candidate_apps = []
def filter_candidate_apps(rec_app_name):
    
    df_rec_app=df_apps[df_apps['app_name'] == rec_app_name]
    recommended_app_type = df_rec_app['app_type'].iloc[0]
    recommended_app_category = df_rec_app['app_category'].iloc[0]
    category_filtered = df_apps[df_apps['app_category'] == recommended_app_category]
    df_apps_filter_1 = df_apps[df_apps['app_category'] != recommended_app_category]
    #print(recommended_app_type, recommended_app_category)
    #print(len(category_filtered))
    
    max_ratio = 0
    best_match = None
    for index, row in category_filtered.iterrows():
        if len(candidate_apps) >= 25:
            break
        if fuzz.ratio(row["app_type"], recommended_app_type) > 30:
            app = row['app_name']
            
            candidate_apps.append(app)
    
    for index, row in df_apps_filter_1.iterrows():
        if len(candidate_apps) >= 25:
            break
        ratio = fuzz.ratio(row["app_type"], recommended_app_type)
        if ratio > max_ratio:
            max_ratio = ratio
            app = row['app_name']
            best_match = app
            candidate_apps.append(best_match)
    
    apps_to_remove_2 = set(candidate_apps)
    df_apps_filter_2 = df_apps[~df_apps['app_name'].isin(apps_to_remove_2)]

    for index, row in df_apps_filter_1.iterrows():
        if len(candidate_apps) >= 25:
            break
        app = row['app_name']
        candidate_apps.append(app)

    
     
    return candidate_apps

  df_apps = df_app.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [10]:
def candidate_creator(row):
    selected_values = np.random.choice(np.setdiff1d(filter_candidate_apps(row["recommended_app_name"]), [row["recommended_app_name"]]), 24, replace=False)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app_name"]) 

df_recommender_train['candidate_apps'] = df_recommender_train.apply(candidate_creator, axis=1)

In [11]:
df_recommender_train["recommended_app_name"][874] 

'six pack in 30 days'

In [12]:
df_recommender_train["turns"][874][df_recommender_train["recommend_indexes"][874]]

'computer: alright, then i recommend giving the six pack in 30 days app a try.'

In [13]:
df_recommender_train["candidate_apps"][874]

array(['dop 4: draw one part', 'disney frozen free fall games',
       'sweet fruit candy', 'magic jigsaw puzzles - game hd',
       'six pack in 30 days', 'human: fall flat', 'pokémon café remix',
       'flow free', 'antistress - relaxation toys', 'lily’s garden',
       'sudoku - classic sudoku puzzle',
       'sweet escapes: design a bakery with puzzle games',
       'bubble shooter rainbow', 'empires & puzzles: match-3 rpg',
       'drop the number® : merge game', 'fishdom',
       'my home design - modern city', 'angry birds dream blast',
       'bad piggies hd', 'cut the rope', 'the room: old sins',
       'bubble shooter: panda pop!',
       'math | riddles and puzzles maths games', 'lemmings',
       'candy crush jelly saga'], dtype='<U48')

In [14]:
conversation_validation_path = "/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/dialogs/validation"

user_id = []
previous_interactions = []
recommended_app_name = []
turns = []
recommend_indexes = []


# List all files in the folder
files = os.listdir(conversation_validation_path)

# Loop through each file in the directory
for filename in tqdm(files):
    # Construct the full file path
    file_path = os.path.join(conversation_validation_path, filename)

    # Check if the current item is a file
    if os.path.isfile(file_path):
        # Open and read the file
        with open(file_path, 'r') as file:
            # Read the content of the file
            file_content = file.read().lower()
            
            # finding the user id
            index_1 = file_content.find("user's previous interactions")
            user_id.append(file_content[9:index_1].rstrip('\n'))
            
            # finding the User's Previous Interactions
            index_2 = file_content.find("recommended app name:")
            previous_interactions_arr = file_content[index_1+29:index_2].rstrip('\n').split("app name:")
            previous_interactions_arr_filtered = []
            for previous_interaction in previous_interactions_arr[1:]:
                previous_interactions_arr_filtered.append(previous_interaction[:previous_interaction.find(" | ")])
            if len(previous_interactions_arr_filtered) > 0:
                previous_interactions.append(",".join(previous_interactions_arr_filtered))
            else:
                previous_interactions.append(None)
            
            # finding recommended app name
            index_3 = file_content[index_2:].find("package name")
            recommended = file_content[index_2+22:index_2+index_3-3].rstrip('\n')
            recommended_app_name.append(recommended)
            
            # finding each turns
            dialog_turns = []
            dialog_index = 0
            COMPUTER_index = file_content.find("computer:")
            file_content = file_content[COMPUTER_index:]
            found_recommender = False
            while True:
                HUMAN_index = file_content.find("human:")
                if HUMAN_index == -1:
                    break
                turn = file_content[:HUMAN_index].rstrip('\n') # computer dialog
                if (recommended in turn) and not found_recommender:
                    recommend_indexes.append(dialog_index)
                    found_recommender = True
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[HUMAN_index:]
                
                COMPUTER_index = file_content.find("computer:")
                turn = file_content[:COMPUTER_index].rstrip('\n') # human dialog
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[COMPUTER_index:]
                
            if not found_recommender: # approximately finding the recommender turn
                for i, dialog_turn in enumerate(dialog_turns):
                    if is_approximate_substring(recommended, dialog_turn):
                        recommend_indexes.append(i)
                        found_recommender = True
                        break
                    
            if not found_recommender:
                recommend_indexes.append(-1)
                        
            turns.append(dialog_turns)

print(len(user_id))
print(len(previous_interactions))
print(len(recommended_app_name))
print(len(recommend_indexes))
df_recommender_validation = pd.DataFrame({"user_id": user_id, "previous_interactions":previous_interactions, "recommended_app_name":recommended_app_name, "turns": turns, "recommend_indexes":recommend_indexes})
print(f"\nnumber of rows: {len(df_recommender_validation)}")

  0%|          | 0/1285 [00:00<?, ?it/s]

100%|██████████| 1285/1285 [00:07<00:00, 172.91it/s]

1285
1285
1285
1285

number of rows: 1285





In [15]:
df_recommender_validation = df_recommender_validation[(df_recommender_validation["recommend_indexes"] != -1) & (df_recommender_validation["turns"].apply(lambda x: len(x) > 0))]

In [16]:
count = 0
for turn in df_recommender_validation['turns']:
    if len(turn) == 0:
        count += 1
print(count)

count = 0
for recommend_index in df_recommender_validation['recommend_indexes']:
    if recommend_index == -1:
        count += 1
print(count)

0
0


In [17]:
def fix_recommended_apps_names(row):
    if row["recommended_app_name"] not in all_apps:
        for app in all_apps:
            if fuzz.ratio(row["recommended_app_name"], app) > 80:
                return app
        return "uno!™"
    else:
        return row["recommended_app_name"]

df_recommender_validation['recommended_app_name'] = df_recommender_validation.apply(fix_recommended_apps_names, axis=1)

In [18]:
def candidate_creator(row):
    selected_values = np.random.choice(np.setdiff1d(filter_candidate_apps(row["recommended_app_name"]), [row["recommended_app_name"]]), 24, replace=False)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app_name"]) 

df_recommender_validation['candidate_apps'] = df_recommender_validation.apply(candidate_creator, axis=1)

In [19]:
df_recommender_validation

Unnamed: 0,user_id,previous_interactions,recommended_app_name,turns,recommend_indexes,candidate_apps
0,vtg1e7gwd7akgogt,"drag battle, twitch: live game streaming, mam...",uno!™,"[computer: hello! what brings you here today?,...",22,"[empires & puzzles: match-3 rpg, bubble shoote..."
1,kp38ptk4vwffenzn,"blockstarplanet, lotsa slots - casino games",snapchat,[computer: hi there! what are you looking for ...,20,"[candy crush jelly saga, sweet escapes: design..."
2,4704gqsclz6315op,"golf clash, xender - share music transfer, nf...",video downloader,[computer: hi there! how can i assist you toda...,2,[sweet escapes: design a bakery with puzzle ga...
3,rnvsafiijygjzyvk,"csr classics, tata cliq shopping app india, f...",piano fire: edm music & piano,"[computer: hello! how can i help you today?, h...",22,"[flow free, candy crush jelly saga, bad piggie..."
4,tmxwkzm2cat22pvh,"the house of da vinci, pet rescue saga, facem...",facebook,[computer: hi there! how can i help you today?...,22,[sweet escapes: design a bakery with puzzle ga...
...,...,...,...,...,...,...
1280,rsbsgtmy4ncjvhdx,"waze - gps, maps, traffic alerts & live navig...",bitmoji,[computer: hey there! what are you looking for...,12,"[bad piggies hd, my home design - modern city,..."
1281,nzhuiko02pyj8vro,video editor app - vivacut,music player - mp3 player,[computer: hi there! how can i help you today?...,24,"[drop the number® : merge game, bubble shooter..."
1282,ldzrids0o452udvr,"the frostrune, ark: survival evolved, street ...",colornote notepad notes,[computer: hi there! how can i help you today?...,20,"[candy crush jelly saga, angry birds dream bla..."
1283,lekzvahdmv1bcian,"pandora - music & podcasts, kingdom rush veng...",minecraft,"[computer: hi! how can i help you today?, huma...",20,"[sudoku - classic sudoku puzzle, bubble shoote..."


In [20]:
df_recommender_train

Unnamed: 0,user_id,previous_interactions,recommended_app_name,turns,recommend_indexes,candidate_apps
0,mw3eetfcrkivh692,"calm - meditate, sleep, relax, kahoot! play &...",fishdom,"[computer: hi! how can i help you today?, huma...",20,"[human: fall flat, sudoku - classic sudoku puz..."
1,7ttbcevntpi81xyq,,uno!™,[computer: hi! need help finding a pokãâãâ©...,14,"[flow free, empires & puzzles: match-3 rpg, li..."
2,xj3hkxd3ms7sbgrk,"cat escape, math kids: math games for kids, r...",sonic dash - endless running,[computer: hi there! how can i help you today?...,24,"[pokémon café remix, the room: old sins, my ho..."
3,pdfiesy5vzbpkz58,"animal jam, brave private web browser, starz,...",pixel gun 3d - battle royale,[computer: hello there! how can i assist you t...,20,"[pixel gun 3d - battle royale, sweet escapes: ..."
4,p1putumtxe7yw3vl,"war heroes: strategy card game, meetme: chat ...",stick war: legacy,"[computer: hello! how can i help you today?, h...",20,"[candy crush jelly saga, bubble shooter: panda..."
...,...,...,...,...,...,...
8715,dxkdnbbfqx25oqa9,"google go: a lighter, faster way to search, a...",whatsapp messenger,[computer: hi there! how can i help you today?...,18,"[magic jigsaw puzzles - game hd, angry birds d..."
8716,r0ad7rnsodtnrkea,"speedtest by ookla, kitten match, gamee prize...",pk xd - play with your friends,[computer: hello there! how can i help you tod...,20,"[lily’s garden, math | riddles and puzzles mat..."
8717,1c30afayr0kyblrc,"stickman party: 1 2 3 4 player games free, li...",ludo club - fun dice game,[computer: hi there! how can i help you today?...,18,"[empires & puzzles: match-3 rpg, antistress - ..."
8718,mkh0usitaxezqehu,"warhammer 40,000: lost crusade, hot wheels un...",garena free fire max,[computer: hi there! how can i assist you toda...,18,"[bad piggies, sweet escapes: design a bakery w..."


In [21]:
df_recommender_validation["recommended_app_name"][213]

'gacha life'

In [22]:
df_recommender_validation["candidate_apps"][213]

array(['lemmings', 'angry birds dream blast',
       'math | riddles and puzzles maths games', 'flow free',
       'sweet escapes: design a bakery with puzzle games',
       'my home design - modern city', 'antistress - relaxation toys',
       'disney frozen free fall games', 'sweet fruit candy',
       'sudoku - classic sudoku puzzle', 'bubble shooter rainbow',
       'human: fall flat', 'magic jigsaw puzzles - game hd',
       'lily’s garden', 'pokémon café remix', 'cut the rope',
       'gacha life', 'the room: old sins', 'bubble shooter: panda pop!',
       'bad piggies hd', 'fishdom', 'empires & puzzles: match-3 rpg',
       'dop 4: draw one part', 'bad piggies',
       'drop the number® : merge game'], dtype='<U48')

In [23]:
model_checkpoint = "gpt2"
bos = '<|startoftext|>'
eos = '<|endoftext|>'
pad = '<|pad|>'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint, bos_token=bos, eos_token=eos, pad_token=pad, additional_special_tokens=["computer:", "human:", "candidate_apps:"])

model = GPT2LMHeadModel.from_pretrained(model_checkpoint).to(device)
model.resize_token_embeddings(len(tokenizer))
model_max_length=1024

In [25]:
@dataclass
class RecommenderItem:
    prompt: str
    generation: Optional[str] = None
    
class recommenderDataset(Dataset):
    def __init__(self, data: List[RecommenderItem]):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> RecommenderItem:
        return self.data[idx]

In [26]:
items_validation = []
for _, row in df_recommender_validation.iterrows():
    prompt = bos
    for index, turn in enumerate(row["turns"]):
        if index < row["recommend_indexes"]:
            prompt += turn + "\n"
        elif index == row["recommend_indexes"]:
            prompt += "candidate_apps: "
            for app in row["candidate_apps"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            items_validation.append(RecommenderItem(prompt, row["recommended_app_name"] + " app." + eos))
            break
        else:
            print("error!!")

In [27]:
items_validation[4].prompt

"<|startoftext|>computer: hi there! how can i help you today?\nhuman: i'm looking for social media apps for android phones.\ncomputer: great choice! do you want apps that help you connect with friends and family?\nhuman: yes, that's what i'm looking for.\ncomputer: any specific age rating you prefer for these apps?\nhuman: i usually go for apps with a teen rating.\ncomputer: how many user reviews are you looking for in an app?\nhuman: at least 1 billion user reviews.\ncomputer: what about the app's average rating?\nhuman: i prefer apps with a rating of 3.1 stars or higher.\ncomputer: based on your preferences, i recommend checking out tagged - meet, chat & dating app.\nhuman: can you give me some information about the app's users?\ncomputer: some users have mentioned encountering fake profiles on the app.\nhuman: hmm, that doesn't sound good. i might explore other options.\ncomputer: no problem. how about trying tango-live stream & video chat? \nhuman: is their customer service good?\n

In [28]:
items_validation[4].generation

'facebook app.<|endoftext|>'

In [29]:
items_train = []
for _, row in df_recommender_train.iterrows():
    prompt = bos
    for index, turn in enumerate(row["turns"]):
        if index < row["recommend_indexes"]:
            prompt += turn + "\n"
        elif index == row["recommend_indexes"]:
            prompt += "candidate_apps: "
            for app in row["candidate_apps"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            items_train.append(RecommenderItem(prompt, row["recommended_app_name"] + " app." + eos))
            break
        else:
            print("error!!")

In [30]:
len(items_train)

8644

In [31]:
items_train[70].prompt

"<|startoftext|>computer: hello! how can i assist you today?\nhuman: i'm looking for a video editing app for my android phone. any recommendations?\ncomputer: are you looking for an easy-to-use video editing app?\nhuman: yes, i prefer apps that are easy to use.\ncomputer: would you like a free app or are you willing to pay for extra features?\nhuman: i prefer free apps.\ncomputer: do you want an app with many reviews?\nhuman: yes, i prefer popular apps with at least 10 million reviews.\ncomputer: how about trying video editor, crop video, edit video, magic effect app?\nhuman: can you tell me more about it?\ncomputer: it has cool effects, but some users have mentioned hidden charges. want to try it out?\nhuman: i think i'll pass on that one.\ncomputer: what do you think about magisto - video editor & music slideshow maker?\nhuman: can you provide more information about it?\ncomputer: it has ai-driven features, but some users reported hidden fees. interested in giving it a try?\nhuman: i

In [32]:
items_train[70].generation

'capcut - video editor app.<|endoftext|>'

In [33]:
def training_collator(batch: list[recommenderDataset]): # for training a language model
    input_ids = []
    attention_masks = []
    labels = []
    for item in batch:
        prompt_tokens = tokenizer.encode(item.prompt, return_tensors="pt")[0] 
        generation_tokens = tokenizer.encode(item.generation, return_tensors="pt")[0]
        prompt_len = len(prompt_tokens)
        generation_len = len(generation_tokens)
        unused_len = model_max_length - prompt_len - generation_len
        # handling case when input is greater than tokenizer length.
        if unused_len < 0:
            prompt_start_tokens = prompt_tokens[:1]
            trimmed_prompt = prompt_tokens[unused_len * -1 + 1 :] # TODO: you could delete the prompt to reach the first |beginuser| token
            prompt_tokens = torch.cat(
                [prompt_start_tokens, trimmed_prompt], axis=0
            )
            prompt_len = len(prompt_tokens)
            unused_len = 0
        pad = torch.full([unused_len], tokenizer.pad_token_id)
        input_tokens = torch.cat(
            [prompt_tokens, generation_tokens, pad]
        )
        label = torch.cat(
            [
                torch.full(
                    [prompt_len],
                    -100,
                ),
                generation_tokens,
                torch.full([unused_len], -100),
            ]
        )
        attention_mask = torch.cat(
            [
                torch.full([prompt_len + generation_len], 1),
                torch.full([unused_len], 0),
            ]
        )
        input_ids.append(input_tokens)
        attention_masks.append(attention_mask)
        labels.append(label)

    out = {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_masks),
        "labels": torch.stack(labels),
    }

    return out

In [34]:
training_args = TrainingArguments(
    output_dir="/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/metrics/outputs_GPT2_no_interations_with_candidate_apps_V1",
    num_train_epochs=5,
    # logging_steps=500,
    # logging_dir=self.cfg.logging_dir,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=1000,#self.cfg.save_steps,
    eval_steps=1000, #self.cfg.eval_steps,
    save_total_limit=3,
    gradient_accumulation_steps=3, #gradient_accumulation_steps,
    per_device_train_batch_size=4, #train_batch_size,
    per_device_eval_batch_size=4, #self.cfg.eval_batch_size,
    warmup_steps=100,
    weight_decay=0.01,
    # dataloader_drop_last=True,
    disable_tqdm=False,
    push_to_hub=False
)


trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=recommenderDataset(items_train),
        eval_dataset=recommenderDataset(items_validation), #dm.datasets[DataNames.dev_language_model.value],
        data_collator=training_collator,
    )

In [35]:
trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss
1000,0.0372,0.026167
2000,0.0124,0.008297
3000,0.0055,0.007624


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [3]:
torch.cuda.empty_cache()

Load the model and test it on the test dataset

In [4]:
def is_approximate_substring(substring, string, threshold=70):
    for i in range(len(string) - len(substring) + 1):
        window = string[i:i+len(substring)]
        similarity_ratio = fuzz.ratio(substring, window)
        if similarity_ratio >= threshold:
            return True
    return False

In [5]:
conversation_test_path = "/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/dialogs/testing"

user_id = []
previous_interactions = []
recommended_app_name = []
turns = []
recommend_indexes = []


# List all files in the folder
files = os.listdir(conversation_test_path)

# Loop through each file in the directory
for filename in tqdm(files):
    # Construct the full file path
    file_path = os.path.join(conversation_test_path, filename)

    # Check if the current item is a file
    if os.path.isfile(file_path):
        # Open and read the file
        with open(file_path, 'r') as file:
            # Read the content of the file
            file_content = file.read().lower()
            
            # finding the user id
            index_1 = file_content.find("user's previous interactions")
            user_id.append(file_content[9:index_1].rstrip('\n'))
            
            # finding the User's Previous Interactions
            index_2 = file_content.find("recommended app name:")
            previous_interactions_arr = file_content[index_1+29:index_2].rstrip('\n').split("app name:")
            previous_interactions_arr_filtered = []
            for previous_interaction in previous_interactions_arr[1:]:
                previous_interactions_arr_filtered.append(previous_interaction[:previous_interaction.find(" | ")])
            if len(previous_interactions_arr_filtered) > 0:
                previous_interactions.append(",".join(previous_interactions_arr_filtered))
            else:
                previous_interactions.append(None)
            
            # finding recommended app name
            index_3 = file_content[index_2:].find("package name")
            recommended = file_content[index_2+22:index_2+index_3-3].rstrip('\n')
            recommended_app_name.append(recommended)
            
            # finding each turns
            dialog_turns = []
            dialog_index = 0
            COMPUTER_index = file_content.find("computer:")
            file_content = file_content[COMPUTER_index:]
            found_recommender = False
            while True:
                HUMAN_index = file_content.find("human:")
                if HUMAN_index == -1:
                    break
                turn = file_content[:HUMAN_index].rstrip('\n') # computer dialog
                if (recommended in turn) and not found_recommender:
                    recommend_indexes.append(dialog_index)
                    found_recommender = True
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[HUMAN_index:]
                
                COMPUTER_index = file_content.find("computer:")
                turn = file_content[:COMPUTER_index].rstrip('\n') # human dialog
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[COMPUTER_index:]
                
            if not found_recommender: # approximately finding the recommender turn
                for i, dialog_turn in enumerate(dialog_turns):
                    if is_approximate_substring(recommended, dialog_turn):
                        recommend_indexes.append(i)
                        found_recommender = True
                        break
                    
            if not found_recommender:
                recommend_indexes.append(-1)
                        
            turns.append(dialog_turns)

print(len(user_id))
print(len(previous_interactions))
print(len(recommended_app_name))
print(len(recommend_indexes))
df_recommender_test = pd.DataFrame({"user_id": user_id, "previous_interactions":previous_interactions, "recommended_app_name":recommended_app_name, "turns": turns, "recommend_indexes":recommend_indexes})
print(f"\nnumber of rows: {len(df_recommender_test)}")

100%|██████████| 2557/2557 [00:06<00:00, 406.67it/s]

2557
2557
2557
2557

number of rows: 2557





In [6]:
df_recommender_test = df_recommender_test[(df_recommender_test["recommend_indexes"] != -1) & (df_recommender_test["turns"].apply(lambda x: len(x) > 0))]

In [7]:
apps_training_path = "/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/data/master_app_data_V1_true.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["app_name"].lower())

In [8]:
def fix_recommended_apps_names(row):
    if row["recommended_app_name"] not in all_apps:
        for app in all_apps:
            if fuzz.ratio(row["recommended_app_name"], app) > 80:
                return app
        return "uno!™"
    else:
        return row["recommended_app_name"]

df_recommender_test['recommended_app_name'] = df_recommender_test.apply(fix_recommended_apps_names, axis=1)

In [9]:
cols = ['app_name','app_type','app_category']
df_app = pd.read_csv(apps_training_path, usecols=cols)
df_apps = df_app.applymap(lambda x: x.lower() if isinstance(x, str) else x)
candidate_apps = []
def filter_candidate_apps(rec_app_name):
    
    df_rec_app=df_apps[df_apps['app_name'] == rec_app_name]
    recommended_app_type = df_rec_app['app_type'].iloc[0]
    recommended_app_category = df_rec_app['app_category'].iloc[0]
    category_filtered = df_apps[df_apps['app_category'] == recommended_app_category]
    df_apps_filter_1 = df_apps[df_apps['app_category'] != recommended_app_category]
    #print(recommended_app_type, recommended_app_category)
    #print(len(category_filtered))
    
    max_ratio = 0
    best_match = None
    for index, row in category_filtered.iterrows():
        if len(candidate_apps) >= 25:
            break
        if fuzz.ratio(row["app_type"], recommended_app_type) > 30:
            app = row['app_name']
            
            candidate_apps.append(app)
    
    for index, row in df_apps_filter_1.iterrows():
        if len(candidate_apps) >= 25:
            break
        ratio = fuzz.ratio(row["app_type"], recommended_app_type)
        if ratio > max_ratio:
            max_ratio = ratio
            app = row['app_name']
            best_match = app
            candidate_apps.append(best_match)
    
    apps_to_remove_2 = set(candidate_apps)
    df_apps_filter_2 = df_apps[~df_apps['app_name'].isin(apps_to_remove_2)]

    for index, row in df_apps_filter_1.iterrows():
        if len(candidate_apps) >= 25:
            break
        app = row['app_name']
        candidate_apps.append(app)

    
     
    return candidate_apps

  df_apps = df_app.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [10]:
def candidate_creator(row):
    np.random.seed(row.name)
    selected_values = np.random.choice(np.setdiff1d(filter_candidate_apps(row["recommended_app_name"]), [row["recommended_app_name"]]), 24, replace=False)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app_name"]) 

df_recommender_test['candidate_apps'] = df_recommender_test.apply(lambda row: candidate_creator(row), axis=1)

In [11]:
@dataclass
class RecommenderItem:
    prompt: str
    generation: Optional[str] = None
    
class recommenderDataset(Dataset):
    def __init__(self, data: List[RecommenderItem]):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> RecommenderItem:
        return self.data[idx]

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bos = '<|startoftext|>'
eos = '<|endoftext|>'
pad = '<|pad|>'

tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token=bos, eos_token=eos, pad_token=pad, additional_special_tokens=["computer:", "human:", "candidate_apps:"], padding_side='left')

model = GPT2LMHeadModel.from_pretrained("/u/amo-d1/grad/sma340/project/llmrank/MobileConvRec-Main/metrics/outputs_GPT2_no_interations_with_candidate_apps_V1").to(device)
model.resize_token_embeddings(len(tokenizer))

Embedding(50262, 768)

In [13]:
prompts = []
generations = []
candidate_apps = []
true_candidate_index = []
for _, row in df_recommender_test.iterrows():
    # creating candidate apps
    candidates = []
    for index, candidate_app in enumerate(row["candidate_apps"].tolist()):
        candidates.append(candidate_app + " app." + eos)
        if candidate_app == row["recommended_app_name"]:
            true_candidate_index.append(index)
    candidate_apps.append(candidates)
    # creating prompts and generations
    prompt = bos
    for index, turn in enumerate(row["turns"]):
        if index < row["recommend_indexes"]:
            prompt += turn + "\n"
        elif index == row["recommend_indexes"]:
            prompt += "candidate_apps: "
            for app in row["candidate_apps"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            # prompt += "computer: I would recommend the "
            prompt += "computer: I would recommend the "
            prompts.append(prompt)
            generations.append(row["recommended_app_name"] + " app." + eos)
            break
        else:
            print("error!!")

In [14]:
prompts[40]

"<|startoftext|>computer: hello there! how can i help you today?\nhuman: i'm looking for an action adventure game app for my android phone. any recommendations?\ncomputer: are you looking for an action adventure game app or something specific with high-resolution graphics?\nhuman: i want an app with high-resolution graphics.\ncomputer: how many user reviews are you looking for?\nhuman: i prefer apps with at least 1 million user reviews.\ncomputer: what average rating do you prefer?\nhuman: i usually look for apps with an average rating of 4.2 stars or higher.\ncomputer: are you okay with some ads in the app?\nhuman: a few ads are fine.\ncomputer: i suggest trying out the rider app based on your interests.\nhuman: can you tell me about the ads in this app?\ncomputer: users have mentioned frequent ads in the rider app.\nhuman: i'd rather not deal with frequent ads. do you have any other suggestions?\ncomputer: how about checking out ab transformers?\nhuman: can you provide information on

In [15]:
generations[40]

'grand theft auto: vice city app.<|endoftext|>'

In [16]:
true_candidate_index[40]

21

In [17]:
candidate_apps[40]

['bloons td battles app.<|endoftext|>',
 'plants vs. zombies free app.<|endoftext|>',
 'last shelter: survival app.<|endoftext|>',
 "evony: the king's return app.<|endoftext|>",
 'castle clash: guild royale app.<|endoftext|>',
 'clash of clans app.<|endoftext|>',
 'mobile royale mmorpg - build a strategy for battle app.<|endoftext|>',
 "five nights at freddy's 2 app.<|endoftext|>",
 'lords mobile: tower defense app.<|endoftext|>',
 'dota underlords app.<|endoftext|>',
 'onmyoji arena app.<|endoftext|>',
 'bloons td 6 app.<|endoftext|>',
 'top war: battle game app.<|endoftext|>',
 'call me emperor app.<|endoftext|>',
 'stick war: legacy app.<|endoftext|>',
 'puzzles & survival app.<|endoftext|>',
 'the grand mafia app.<|endoftext|>',
 'the ants: underground kingdom app.<|endoftext|>',
 'king of avalon: dominion app.<|endoftext|>',
 'army men strike: toy wars app.<|endoftext|>',
 'mafia city app.<|endoftext|>',
 'grand theft auto: vice city app.<|endoftext|>',
 'art of war 3:rts strategy

In [18]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]

def evaluate_recommender(prompts, generations, model, tokenizer, batch_size=8, device=device, threshold=70):
  prompt_batches = list(chunk(prompts, batch_size))
  generation_batches = list(chunk(generations, batch_size))
  max_length = 992
  generation_length = 32
  correctly_predicted = []
  for prompt_batch, generation_batch in tqdm(zip(prompt_batches, generation_batches), total = len(generation_batches)):

    inputs = tokenizer(prompt_batch, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt") 

    generations_predicted = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device),
                            max_new_tokens=generation_length,
                            num_beams=8,
                            eos_token_id=tokenizer.eos_token_id,
                            pad_token_id=tokenizer.pad_token_id,
                            bos_token_id=tokenizer.bos_token_id) # length_penalty=0.8, Set length_penalty to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer sequences.

    generations_predicted = generations_predicted[:, max_length:] # we only need the generation part, not the prompt part.
    decoded_generations = [tokenizer.decode(generation, skip_special_tokens=True, clean_up_tokenization_spaces=True).replace(" app.", "")  for generation in generations_predicted]
    generation_batch = [generation.replace(" app.", "").replace("<|endoftext|>", "") for generation in generation_batch]
    
    correctly_predicted.extend([1 if fuzz.ratio(predicted, ground_truth) >= threshold else 0 for predicted, ground_truth in zip(decoded_generations, generation_batch)])
  
  return correctly_predicted

In [20]:
correctly_predicted = evaluate_recommender(prompts, generations, model, tokenizer, batch_size=2, device=device)
success_rate = sum(correctly_predicted) / len(correctly_predicted)
print("success_rate: ", success_rate)

100%|██████████| 1268/1268 [12:05<00:00,  1.75it/s]

success_rate:  0.062302839116719244





In [21]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]
    
def convert_to_sublists(numbers, sublist_size):
    return [numbers[i:i+sublist_size] for i in range(0, len(numbers), sublist_size)]

def recommender_rank(prompts, candidate_apps, model, tokenizer, batch_size=8, device=device):
  model.eval()
  tokenizer.padding_side='left'
  tokenizer.truncation_side='left'
  max_length = 992
  prompts_ids = tokenizer(prompts, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
  
  tokenizer.padding_side='right'
  tokenizer.truncation_side='right'
  input_ids = []
  attention_mask = []
  for index, candidate_app_elements in enumerate(candidate_apps):
    candidate_apps_ids = tokenizer(candidate_app_elements, max_length=32, truncation=True, padding="max_length", return_tensors="pt")
    for candidate_app_index in range(len(candidate_app_elements)):
      input_ids.append(torch.cat([prompts_ids["input_ids"][index], candidate_apps_ids["input_ids"][candidate_app_index]]))
      attention_mask.append(torch.cat([prompts_ids["attention_mask"][index], candidate_apps_ids["attention_mask"][candidate_app_index]]))
      
  input_ids_batches = list(chunk(input_ids, batch_size))
  attention_mask_batches = list(chunk(attention_mask, batch_size))

  scores = []
  for input_ids_batch, attention_mask_batch in tqdm(zip(input_ids_batches, attention_mask_batches), total = len(attention_mask_batches)):

    input_ids = torch.stack(input_ids_batch).to(device)
    attention_mask = torch.stack(attention_mask_batch).to(device)
    with torch.no_grad():
      model_output = model(input_ids=input_ids) # attention_mask=attention_mask

    logprobs = F.log_softmax(model_output["logits"], dim=-1)[:, max_length -1:-1, :]
    output_tokens = input_ids[:, max_length:]
    
    tokens_logprobs = torch.gather(logprobs, 2, output_tokens[:, :, None]).squeeze(-1).to(torch.float32)
    
    mask = torch.ones(tokens_logprobs.shape, dtype=torch.bool, device=device)
    for i, _output in enumerate(output_tokens):
      for j, _token in enumerate(_output):
        if _token == tokenizer.pad_token_id:
          mask[i, j] = False
          
    score = (tokens_logprobs * mask).sum(-1) / mask.sum(-1)
    scores.extend(score.to('cpu').tolist())
  
  scores = convert_to_sublists(scores, len(candidate_apps[0]))
  
  return scores

In [22]:
scores = recommender_rank(prompts, candidate_apps, model, tokenizer, batch_size=16, device=device)

100%|██████████| 3963/3963 [26:14<00:00,  2.52it/s]


In [23]:
np.argmax(scores[1])

2

In [33]:
top_k_accuracy_score(true_candidate_index, scores, k=1)

0.04258675078864353

In [34]:
true_relevance = [[1 if item == index else 0 for item in range(len(candidate_apps[0]))] for index in true_candidate_index]

In [44]:
ndcg_score(true_relevance, scores, k=10)

0.21199435049249388

In [None]:
print(true_candidate_index[0])
print(prompts[0])
print(candidate_apps[0][true_candidate_index[0]])

In [None]:
tokenizer.padding_side='left'
inputs = tokenizer(prompts[0], max_length=992, truncation=True, padding="max_length", return_tensors="pt") 

generations_predicted = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device),
                max_new_tokens=32,
                num_beams=8,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id, 
                output_scores=True,
                return_dict_in_generate=True)

generations_predicted_sequences = generations_predicted.sequences[0, 992:] # we only need the generation part, not the prompt part.
print(tokenizer.decode(generations_predicted_sequences))

In [None]:
generations_predicted_sequences

In [None]:
generations_predicted.keys()

In [None]:
generations_predicted["sequences_scores"]

In [None]:
len(generations_predicted['scores'])

In [None]:
generations_predicted['scores'][0][0]

In [None]:
torch.argmax(generations_predicted['scores'][0][0])

In [None]:
generations_predicted['scores'][0][0][8625]

In [None]:
torch.argmax(generations_predicted['scores'][1][0])

In [None]:
generations_predicted['scores'][1][0][6713]

In [None]:
torch.argmax(generations_predicted['scores'][2][0])

In [None]:
generations_predicted['scores'][2][0][598]

In [None]:
torch.argmax(generations_predicted['scores'][3][0])

In [None]:
generations_predicted['scores'][3][0][13]

In [None]:
tokenizer.decode(598)

In [None]:
generations_predicted['scores'][0].size()

In [None]:
import pickle

# Open a file in binary write mode
with open('scores_GPT2.pkl', 'wb') as file:
    # Use pickle to dump the array into the file
    pickle.dump(scores, file)

In [None]:
# Open the file in binary read mode
import pickle
with open('scores_GPT2.pkl', 'rb') as file:
    # Load the array from the file
    scores = pickle.load(file)

In [None]:
top_k_accuracy_score(true_candidate_index, scores, k=1)