In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
import torch
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset
from dataclasses import dataclass
from typing import Optional, List
from fuzzywuzzy import fuzz
import csv
import numpy as np
import evaluate
import torch.nn.functional as F
from sklearn.metrics import top_k_accuracy_score, ndcg_score

In [3]:
def is_approximate_substring(substring, string, threshold=70):
    for i in range(len(string) - len(substring) + 1):
        window = string[i:i+len(substring)]
        similarity_ratio = fuzz.ratio(substring, window)
        if similarity_ratio >= threshold:
            return True
    return False

In [4]:
conversation_training_path = "data/dataset_v2/training"

user_id = []
previous_interactions = []
recommended_app_name = []
turns = []
recommend_indexes = []


# List all files in the folder
files = os.listdir(conversation_training_path)

# Loop through each file in the directory
for filename in tqdm(files):
    # Construct the full file path
    file_path = os.path.join(conversation_training_path, filename)

    # Check if the current item is a file
    if os.path.isfile(file_path):
        # Open and read the file
        with open(file_path, 'r') as file:
            # Read the content of the file
            file_content = file.read().lower()
            
            # finding the user id
            index_1 = file_content.find("user's previous interactions")
            user_id.append(file_content[9:index_1].rstrip('\n'))
            
            # finding the User's Previous Interactions
            index_2 = file_content.find("recommended app name:")
            previous_interactions_arr = file_content[index_1+29:index_2].rstrip('\n').split("app name:")
            previous_interactions_arr_filtered = []
            for previous_interaction in previous_interactions_arr[1:]:
                previous_interactions_arr_filtered.append(previous_interaction[:previous_interaction.find(" | ")])
            if len(previous_interactions_arr_filtered) > 0:
                previous_interactions.append(",".join(previous_interactions_arr_filtered))
            else:
                previous_interactions.append(None)
            
            # finding recommended app name
            index_3 = file_content[index_2:].find("package name")
            recommended = file_content[index_2+22:index_2+index_3-3].rstrip('\n')
            recommended_app_name.append(recommended)
            
            # finding each turns
            dialog_turns = []
            dialog_index = 0
            COMPUTER_index = file_content.find("computer:")
            file_content = file_content[COMPUTER_index:]
            found_recommender = False
            while True:
                HUMAN_index = file_content.find("human:")
                if HUMAN_index == -1:
                    break
                turn = file_content[:HUMAN_index].rstrip('\n') # computer dialog
                if (recommended in turn) and not found_recommender:
                    recommend_indexes.append(dialog_index)
                    found_recommender = True
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[HUMAN_index:]
                
                COMPUTER_index = file_content.find("computer:")
                turn = file_content[:COMPUTER_index].rstrip('\n') # human dialog
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[COMPUTER_index:]
                
            if not found_recommender: # approximately finding the recommender turn
                for i, dialog_turn in enumerate(dialog_turns):
                    if is_approximate_substring(recommended, dialog_turn):
                        recommend_indexes.append(i)
                        found_recommender = True
                        break
                    
            if not found_recommender:
                recommend_indexes.append(-1)
                        
            turns.append(dialog_turns)

print(len(user_id))
print(len(previous_interactions))
print(len(recommended_app_name))
print(len(recommend_indexes))
df_recommender_train = pd.DataFrame({"user_id": user_id, "previous_interactions":previous_interactions, "recommended_app_name":recommended_app_name, "turns": turns, "recommend_indexes":recommend_indexes})
print(f"\nnumber of rows: {len(df_recommender_train)}")

  0%|          | 0/8720 [00:00<?, ?it/s]

100%|██████████| 8720/8720 [00:25<00:00, 344.27it/s]

8720
8720
8720
8720

number of rows: 8720





In [5]:
df_recommender_train = df_recommender_train[(df_recommender_train["recommend_indexes"] != -1) & (df_recommender_train["turns"].apply(lambda x: len(x) > 0))]

In [6]:
apps_training_path = "data/master_app_data_V1_true.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["app_name"].lower())

In [7]:
def fix_recommended_apps_names(row):
    if row["recommended_app_name"] not in all_apps:
        for app in all_apps:
            if fuzz.ratio(row["recommended_app_name"], app) > 80:
                return app
        return "uno!™"
    else:
        return row["recommended_app_name"]

df_recommender_train['recommended_app_name'] = df_recommender_train.apply(fix_recommended_apps_names, axis=1)

In [8]:
counter = 0
counter_not_found = 0
found = False
recommended_apps = df_recommender_train['recommended_app_name'].values
for recommended in recommended_apps:
    if recommended not in all_apps:
        found = False
        for app in all_apps:
            if fuzz.ratio(recommended, app) > 80:
                print(recommended, app)
                found = True
                break
        if not found:
            counter_not_found += 1
            print(recommended, "not found")
        counter += 1
        
        
print(counter)
print(counter_not_found)
print(len(recommended_apps))

0
0
8644


In [9]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    selected_values = np.random.choice(np.setdiff1d(all_apps, [row["recommended_app_name"]]), 24, replace=False).astype(new_dtype)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app_name"]) 

df_recommender_train['candidate_apps'] = df_recommender_train.apply(candidate_creator, axis=1)

In [10]:
df_recommender_train["recommended_app_name"][874] 

'audiomack-stream music offline'

In [11]:
df_recommender_train["turns"][874][df_recommender_train["recommend_indexes"][874]]

'computer: in that case, i recommend trying audiomack-stream music offline app.'

In [12]:
df_recommender_train["candidate_apps"][874]

array(['club vegas slots: casino games', 'cardboard design lab',
       'temple run 2', 'puzzles & survival',
       'modern combat versus: fps game', 'audiomack-stream music offline',
       'world at arms', 'self - build credit & savings',
       'idle supermarket tycoon－shop', 'qr & barcode scanner',
       'wikipedia', 'manta: unlimited comics',
       'who - live video chat & match & meet me',
       'truck simulator - truck games', 'paper fold', 'my talking hank',
       'whatsapp messenger', 'postknight', 'youtube music',
       'gameloft classics: 20 years', 'stumble guys: multiplayer royale',
       'noblemen: 1896', 'solitaire ocean', 'blockstarplanet',
       'nova launcher'], dtype='<U50')

In [13]:
conversation_validation_path = "data/dataset_v2/validation"

user_id = []
previous_interactions = []
recommended_app_name = []
turns = []
recommend_indexes = []


# List all files in the folder
files = os.listdir(conversation_validation_path)

# Loop through each file in the directory
for filename in tqdm(files):
    # Construct the full file path
    file_path = os.path.join(conversation_validation_path, filename)

    # Check if the current item is a file
    if os.path.isfile(file_path):
        # Open and read the file
        with open(file_path, 'r') as file:
            # Read the content of the file
            file_content = file.read().lower()
            
            # finding the user id
            index_1 = file_content.find("user's previous interactions")
            user_id.append(file_content[9:index_1].rstrip('\n'))
            
            # finding the User's Previous Interactions
            index_2 = file_content.find("recommended app name:")
            previous_interactions_arr = file_content[index_1+29:index_2].rstrip('\n').split("app name:")
            previous_interactions_arr_filtered = []
            for previous_interaction in previous_interactions_arr[1:]:
                previous_interactions_arr_filtered.append(previous_interaction[:previous_interaction.find(" | ")])
            if len(previous_interactions_arr_filtered) > 0:
                previous_interactions.append(",".join(previous_interactions_arr_filtered))
            else:
                previous_interactions.append(None)
            
            # finding recommended app name
            index_3 = file_content[index_2:].find("package name")
            recommended = file_content[index_2+22:index_2+index_3-3].rstrip('\n')
            recommended_app_name.append(recommended)
            
            # finding each turns
            dialog_turns = []
            dialog_index = 0
            COMPUTER_index = file_content.find("computer:")
            file_content = file_content[COMPUTER_index:]
            found_recommender = False
            while True:
                HUMAN_index = file_content.find("human:")
                if HUMAN_index == -1:
                    break
                turn = file_content[:HUMAN_index].rstrip('\n') # computer dialog
                if (recommended in turn) and not found_recommender:
                    recommend_indexes.append(dialog_index)
                    found_recommender = True
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[HUMAN_index:]
                
                COMPUTER_index = file_content.find("computer:")
                turn = file_content[:COMPUTER_index].rstrip('\n') # human dialog
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[COMPUTER_index:]
                
            if not found_recommender: # approximately finding the recommender turn
                for i, dialog_turn in enumerate(dialog_turns):
                    if is_approximate_substring(recommended, dialog_turn):
                        recommend_indexes.append(i)
                        found_recommender = True
                        break
                    
            if not found_recommender:
                recommend_indexes.append(-1)
                        
            turns.append(dialog_turns)

print(len(user_id))
print(len(previous_interactions))
print(len(recommended_app_name))
print(len(recommend_indexes))
df_recommender_validation = pd.DataFrame({"user_id": user_id, "previous_interactions":previous_interactions, "recommended_app_name":recommended_app_name, "turns": turns, "recommend_indexes":recommend_indexes})
print(f"\nnumber of rows: {len(df_recommender_validation)}")

  2%|▏         | 29/1285 [00:00<00:04, 289.78it/s]

100%|██████████| 1285/1285 [00:04<00:00, 306.28it/s]

1285
1285
1285
1285

number of rows: 1285





In [14]:
df_recommender_validation = df_recommender_validation[(df_recommender_validation["recommend_indexes"] != -1) & (df_recommender_validation["turns"].apply(lambda x: len(x) > 0))]

In [15]:
count = 0
for turn in df_recommender_validation['turns']:
    if len(turn) == 0:
        count += 1
print(count)

count = 0
for recommend_index in df_recommender_validation['recommend_indexes']:
    if recommend_index == -1:
        count += 1
print(count)

0
0


In [16]:
def fix_recommended_apps_names(row):
    if row["recommended_app_name"] not in all_apps:
        for app in all_apps:
            if fuzz.ratio(row["recommended_app_name"], app) > 80:
                return app
        return "uno!™"
    else:
        return row["recommended_app_name"]

df_recommender_validation['recommended_app_name'] = df_recommender_validation.apply(fix_recommended_apps_names, axis=1)

In [18]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    selected_values = np.random.choice(np.setdiff1d(all_apps, [row["recommended_app_name"]]), 24, replace=False).astype(new_dtype)
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app_name"]) 

df_recommender_validation['candidate_apps'] = df_recommender_validation.apply(candidate_creator, axis=1)

In [19]:
df_recommender_validation

Unnamed: 0,user_id,previous_interactions,recommended_app_name,turns,recommend_indexes,candidate_apps
0,qwttxhjcryqhatgb,,wattpad - read & write stories,[computer: hi there! how can i help you today?...,18,"[rider, novelcat-read & write stories, keepcle..."
1,tugk24mlf1qvjzba,flight sim 2018,"paytm -upi, money transfer, recharge, bill pay...",[computer: hi there! what brings you here toda...,24,"[paytm -upi, money transfer, recharge, bill pa..."
2,hjbzfrhcuq2pxoma,"dealdash - bid & save auctions, heart's medic...",wood block puzzle - block game,[computer: hey there! how can i help you today...,20,"[myheritage: family tree & dna, amazon kindle,..."
3,0zm4n7djyrppfrgx,"dangerous fellows: otome game, painnt - pro a...",dynamons 2,[computer: hey there! what are you looking for...,24,"[animation throwdown: epic ccg, walk band - mu..."
4,2c7ofjyraazmwgr3,"bumble - dating. friends. bizz, music player ...",2 player games : the challenge,"[computer: hello! how can i help you today?, h...",20,"[toca life world: build stories, 2 player game..."
...,...,...,...,...,...,...
1280,chyzv6ycokrxisoo,"vector, command & conquer: rivalsãâ¢ãâãâ¢...",my talking hank,"[computer: hello! how can i help you today?, h...",22,"[rider, cookie swirl world, foxnovel - read st..."
1281,pfin90inkk8rauwn,"wattpad beta, soccer star 22 top leagues, rai...",pinterest,[computer: hi there! how can i help you today?...,20,"[foxnovel - read stories & books, pinterest, ..."
1282,rog58g9ofnggpobe,"journey: diary, journal, my ooredoo myanmar, ...",lego® friends: heartlake rush,[computer: hey there! what can i help you find...,24,"[keepclean - cleaner & booster, sweet escapes:..."
1283,5s1a0sasi88h3mcq,"age of war 2, noblemen: 1896, groupon ãâ¢ãâ...",human anatomy atlas 2021: complete 3d human body,[computer: hi there! what are you looking for ...,12,"[google go: a lighter, faster way to search, p..."


In [20]:
df_recommender_train

Unnamed: 0,user_id,previous_interactions,recommended_app_name,turns,recommend_indexes,candidate_apps
0,d1o00gcfxa7tj9fc,"snipers vs thieves, retro bowl, be the king: ...",weather forecast - accurate local weather & wi...,"[computer: hello! how can i assist you today?,...",24,"[harry potter: hogwarts mystery, marvel future..."
1,tjfy73vbqa5acgac,"daily shopping stories, truck simulator 2018 ...",bridge race,[computer: hi there! what are you looking for ...,18,"[toca life: stable, chess, retro bowl, cyber h..."
2,okqwykkowhwv8ppo,"shop titans: craft & build, dragon ball z dok...",video editor & maker videoshow,[computer: are you looking for apps that are e...,16,[tile master - classic triple match & puzzle g...
3,och0j9udd8rwbezp,"pepi wonder world: magic isle!, the simsãâ¯ã...",ludo star,"[computer: hi! how can i help you today?, huma...",24,"[plants vs. zombies free, flow free, raid: sha..."
4,86ckfhrgr3my1asp,microsoft onenote: save ideas and organize no...,xender - share music transfer,[computer: hey there! what can i help you with...,22,"[monstercrafter, hackers, roku - official remo..."
...,...,...,...,...,...,...
8715,dkfqslrdxlhbegdr,,true fear: forsaken souls 1,[computer: hi there! how can i help you today?...,24,"[drop the number® : merge game, tennis clash: ..."
8716,8lw0qo1crrzsoef9,"macy's, play to win: win real money, opera ne...",hill climb racing 2,[computer: hey there! how can i help you today...,16,"[bbc learning english, twitch: live game strea..."
8717,xqbiugex5rwctlsp,"jurassic world alive, google classroom",zombie catchers – love to hunt,[computer: sure thing! what kinds of zombie hu...,22,"[animal jam, hp smart, streamer rush, sky danc..."
8718,xpo2xk6chiiygdp3,"playmobil luxury mansion, into the dead 2, sc...",township,[computer: hi there! how can i help you today?...,20,"[bingo showdown - bingo games, streamkar - liv..."


In [21]:
df_recommender_validation["recommended_app_name"][213]

'sonic dash - endless running'

In [22]:
df_recommender_validation["candidate_apps"][213]

array(['novelcat-read & write stories', 'audible: audiobooks & podcasts',
       'pokémon go', 'readict -one click away novels',
       'toca life world: build stories', 'solitaire grand harvest',
       'dhgate-online wholesale stores', 'standoff 2', 'archero',
       'walk band - multitracks music', 'rider',
       'google go: a lighter, faster way to search',
       'wattpad - read & write stories', 'norton secure vpn: wi-fi proxy',
       'asphalt 8 - car racing game', 'animation throwdown: epic ccg',
       'pixellab - text on pictures', 'sonic dash - endless running',
       'stickman vs zombies', 'foxnovel - read stories &  books',
       'dreame', 'keepclean - cleaner & booster',
       'sweet escapes: design a bakery with puzzle games',
       'amazon kindle', 'cookie swirl world'], dtype='<U50')

In [None]:
model_checkpoint = "gpt2"
bos = '<|startoftext|>'
eos = '<|endoftext|>'
pad = '<|pad|>'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint, bos_token=bos, eos_token=eos, pad_token=pad, additional_special_tokens=["computer:", "human:", "candidate_apps:", "previous_interactions:"])

model = GPT2LMHeadModel.from_pretrained(model_checkpoint).to(device)
model.resize_token_embeddings(len(tokenizer))
model_max_length=1024

In [24]:
@dataclass
class RecommenderItem:
    prompt: str
    generation: Optional[str] = None
    
class recommenderDataset(Dataset):
    def __init__(self, data: List[RecommenderItem]):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> RecommenderItem:
        return self.data[idx]

In [25]:
items_validation = []
for _, row in df_recommender_validation.iterrows():
    if row["previous_interactions"] is not None:
        prompt = bos + "previous_interactions:" + row["previous_interactions"] + "\n"
    else:
        prompt = bos + "previous_interactions: No previous interactions" + "\n"
    for index, turn in enumerate(row["turns"]):
        if index < row["recommend_indexes"]:
            prompt += turn + "\n"
        elif index == row["recommend_indexes"]:
            prompt += "candidate_apps: "
            for app in row["candidate_apps"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            items_validation.append(RecommenderItem(prompt, row["recommended_app_name"] + " app." + eos))
            break
        else:
            print("error!!")

In [26]:
items_validation[4].prompt

"<|startoftext|>previous_interactions: bumble - dating. friends. bizz, music player & mp3 player - lark player, cooking craze: restaurant game, critical ops: multiplayer fps, imgur: funny memes & gif maker, bingo abradoodle: mobile bingo, alfredcamera home security app, good pizza, great pizza, art of war 3:rts strategy game, snake rivals - fun snake game\ncomputer: hello! how can i help you today?\nhuman: i'm looking for multiplayer minigames apps for android. any suggestions?\ncomputer: are you interested in apps that focus on gaming for bonding?\nhuman: yes, that's what i'm looking for.\ncomputer: do you mind if the apps have ads or in-app purchases?\nhuman: i'm okay with apps that have ads and offers in-app purchases.\ncomputer: do you have a preference for the content rating?\nhuman: i prefer apps with an 'everyone' rating.\ncomputer: how many reviews are you looking for in an app?\nhuman: i prefer apps with a lot of reviews, like 1m.\ncomputer: are you looking for a free app or w

In [27]:
items_validation[4].generation

'2 player games : the challenge app.<|endoftext|>'

In [28]:
items_train = []
for _, row in df_recommender_train.iterrows():
    if row["previous_interactions"] is not None:
        prompt = bos + "previous_interactions:" + row["previous_interactions"] + "\n"
    else:
        prompt = bos + "previous_interactions: No previous interactions" + "\n"
    for index, turn in enumerate(row["turns"]):
        if index < row["recommend_indexes"]:
            prompt += turn + "\n"
        elif index == row["recommend_indexes"]:
            prompt += "candidate_apps: "
            for app in row["candidate_apps"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            prompt += "computer: I would recommend the "
            items_train.append(RecommenderItem(prompt, row["recommended_app_name"] + " app." + eos))
            break
        else:
            print("error!!")

In [29]:
items_train[70].prompt

"<|startoftext|>previous_interactions: home streetã\x83â¯ã\x82â¿ã\x82â½dream house design, picsart color - painting, drawing & sketch, bleach: brave souls anime game, learn coding/programming: mimo, knives out, mr bean - special delivery, background eraser, viber - safe chats and calls, crackle, warfriends: pvp shooter game\ncomputer: hi there! how can i help you today?\nhuman: looking for apps for android that allow free audio/video calls.\ncomputer: are you looking for communication apps or something else?\nhuman: apps with everyone content rating.\ncomputer: do you mind if the apps have ads?\nhuman: i'm okay with apps that have ads and offer in-app purchases.\ncomputer: would you prefer free apps or are you willing to pay for additional features?\nhuman: i prefer using free apps.\ncomputer: do you have a preference for the app's average rating?\nhuman: i prefer apps with high ratings, around 4.5 stars or above.\ncomputer: i recommend checking out the messenger app. is that something

In [30]:
items_train[70].generation

'viber - safe chats and calls app.<|endoftext|>'

In [31]:
def training_collator(batch: list[recommenderDataset]): # for training a language model
    input_ids = []
    attention_masks = []
    labels = []
    for item in batch:
        prompt_tokens = tokenizer.encode(item.prompt, return_tensors="pt")[0] 
        generation_tokens = tokenizer.encode(item.generation, return_tensors="pt")[0]
        prompt_len = len(prompt_tokens)
        generation_len = len(generation_tokens)
        unused_len = model_max_length - prompt_len - generation_len
        # handling case when input is greater than tokenizer length.
        if unused_len < 0:
            prompt_start_tokens = prompt_tokens[:1]
            trimmed_prompt = prompt_tokens[unused_len * -1 + 1 :] # TODO: you could delete the prompt to reach the first |beginuser| token
            prompt_tokens = torch.cat(
                [prompt_start_tokens, trimmed_prompt], axis=0
            )
            prompt_len = len(prompt_tokens)
            unused_len = 0
        pad = torch.full([unused_len], tokenizer.pad_token_id)
        input_tokens = torch.cat(
            [prompt_tokens, generation_tokens, pad]
        )
        label = torch.cat(
            [
                torch.full(
                    [prompt_len],
                    -100,
                ),
                generation_tokens,
                torch.full([unused_len], -100),
            ]
        )
        attention_mask = torch.cat(
            [
                torch.full([prompt_len + generation_len], 1),
                torch.full([unused_len], 0),
            ]
        )
        input_ids.append(input_tokens)
        attention_masks.append(attention_mask)
        labels.append(label)

    out = {
        "input_ids": torch.stack(input_ids),
        "attention_mask": torch.stack(attention_masks),
        "labels": torch.stack(labels),
    }

    return out

In [32]:
training_args = TrainingArguments(
    output_dir="models/GPT2_previous_interactions_candidate_apps",
    num_train_epochs=5,
    # logging_steps=500,
    # logging_dir=self.cfg.logging_dir,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    evaluation_strategy="steps",
    save_steps=1000,#self.cfg.save_steps,
    eval_steps=1000, #self.cfg.eval_steps,
    save_total_limit=3,
    gradient_accumulation_steps=3, #gradient_accumulation_steps,
    per_device_train_batch_size=4, #train_batch_size,
    per_device_eval_batch_size=4, #self.cfg.eval_batch_size,
    warmup_steps=100,
    weight_decay=0.01,
    # dataloader_drop_last=True,
    disable_tqdm=False,
    push_to_hub=False
)


trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=recommenderDataset(items_train),
        eval_dataset=recommenderDataset(items_validation), #dm.datasets[DataNames.dev_language_model.value],
        data_collator=training_collator,
    )

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [33]:
trainer.train()
trainer.save_model()

Step,Training Loss,Validation Loss
1000,0.2125,0.140672
2000,0.0738,0.085166
3000,0.0374,0.086379


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [208]:
torch.cuda.empty_cache()

Load the model and test it on the test dataset

In [3]:
def is_approximate_substring(substring, string, threshold=70):
    for i in range(len(string) - len(substring) + 1):
        window = string[i:i+len(substring)]
        similarity_ratio = fuzz.ratio(substring, window)
        if similarity_ratio >= threshold:
            return True
    return False

In [4]:
conversation_test_path = "data/dataset_v2/testing"

user_id = []
previous_interactions = []
recommended_app_name = []
turns = []
recommend_indexes = []


# List all files in the folder
files = os.listdir(conversation_test_path)

# Loop through each file in the directory
for filename in tqdm(files):
    # Construct the full file path
    file_path = os.path.join(conversation_test_path, filename)

    # Check if the current item is a file
    if os.path.isfile(file_path):
        # Open and read the file
        with open(file_path, 'r') as file:
            # Read the content of the file
            file_content = file.read().lower()
            
            # finding the user id
            index_1 = file_content.find("user's previous interactions")
            user_id.append(file_content[9:index_1].rstrip('\n'))
            
            # finding the User's Previous Interactions
            index_2 = file_content.find("recommended app name:")
            previous_interactions_arr = file_content[index_1+29:index_2].rstrip('\n').split("app name:")
            previous_interactions_arr_filtered = []
            for previous_interaction in previous_interactions_arr[1:]:
                previous_interactions_arr_filtered.append(previous_interaction[:previous_interaction.find(" | ")])
            if len(previous_interactions_arr_filtered) > 0:
                previous_interactions.append(",".join(previous_interactions_arr_filtered))
            else:
                previous_interactions.append(None)
            
            # finding recommended app name
            index_3 = file_content[index_2:].find("package name")
            recommended = file_content[index_2+22:index_2+index_3-3].rstrip('\n')
            recommended_app_name.append(recommended)
            
            # finding each turns
            dialog_turns = []
            dialog_index = 0
            COMPUTER_index = file_content.find("computer:")
            file_content = file_content[COMPUTER_index:]
            found_recommender = False
            while True:
                HUMAN_index = file_content.find("human:")
                if HUMAN_index == -1:
                    break
                turn = file_content[:HUMAN_index].rstrip('\n') # computer dialog
                if (recommended in turn) and not found_recommender:
                    recommend_indexes.append(dialog_index)
                    found_recommender = True
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[HUMAN_index:]
                
                COMPUTER_index = file_content.find("computer:")
                turn = file_content[:COMPUTER_index].rstrip('\n') # human dialog
                dialog_turns.append(turn)
                dialog_index +=1
                file_content = file_content[COMPUTER_index:]
                
            if not found_recommender: # approximately finding the recommender turn
                for i, dialog_turn in enumerate(dialog_turns):
                    if is_approximate_substring(recommended, dialog_turn):
                        recommend_indexes.append(i)
                        found_recommender = True
                        break
                    
            if not found_recommender:
                recommend_indexes.append(-1)
                        
            turns.append(dialog_turns)

print(len(user_id))
print(len(previous_interactions))
print(len(recommended_app_name))
print(len(recommend_indexes))
df_recommender_test = pd.DataFrame({"user_id": user_id, "previous_interactions":previous_interactions, "recommended_app_name":recommended_app_name, "turns": turns, "recommend_indexes":recommend_indexes})
print(f"\nnumber of rows: {len(df_recommender_test)}")

100%|██████████| 2557/2557 [00:06<00:00, 369.11it/s]

2557
2557
2557
2557

number of rows: 2557





In [5]:
df_recommender_test = df_recommender_test[(df_recommender_test["recommend_indexes"] != -1) & (df_recommender_test["turns"].apply(lambda x: len(x) > 0))]

In [6]:
apps_training_path = "data/master_app_data_V1_true.csv"

all_apps = []
with open(apps_training_path, 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    for row in csv_reader:
        all_apps.append(row["app_name"].lower())

In [7]:
def fix_recommended_apps_names(row):
    if row["recommended_app_name"] not in all_apps:
        for app in all_apps:
            if fuzz.ratio(row["recommended_app_name"], app) > 80:
                return app
        return "uno!™"
    else:
        return row["recommended_app_name"]

df_recommender_test['recommended_app_name'] = df_recommender_test.apply(fix_recommended_apps_names, axis=1)

In [23]:
import pandas as pd
from fuzzywuzzy import fuzz

# Ensure that you've already defined 'apps_training_path' to point to your data file
cols = ['app_name', 'app_type', 'app_category']
df_app = pd.read_csv(apps_training_path, usecols=cols)
df_apps = df_app.applymap(lambda x: x.lower() if isinstance(x, str) else x)

def filter_candidate_apps(rec_app_name):
    candidate_apps = set()  # Using a set to avoid duplicates
    df_rec_app = df_apps[df_apps['app_name'] == rec_app_name.lower()]
    
    if df_rec_app.empty:
        print("No matching app found.")
        return []
    
    recommended_app_type = df_rec_app['app_type'].iloc[0]
    recommended_app_category = df_rec_app['app_category'].iloc[0]
    
    df_same_category = df_apps[df_apps['app_category'] == recommended_app_category]
    df_different_category = df_apps[df_apps['app_category'] != recommended_app_category]

    # same category, same type
    for _, row in df_same_category.iterrows():
        if fuzz.ratio(row["app_type"], recommended_app_type) > 70:
            candidate_apps.add(row['app_name'])

    # only same category
    if len(candidate_apps) < 25:
        for _, row in df_same_category.iterrows():
            if len(candidate_apps) >= 25:
                break
            candidate_apps.add(row['app_name'])

    # different category, same type
    if len(candidate_apps) < 25:
        for _, row in df_different_category.iterrows():
            if len(candidate_apps) >= 25:
                break
            if fuzz.ratio(row["app_type"], recommended_app_type) > 70:
                candidate_apps.add(row['app_name'])

    # all
    if len(candidate_apps) < 25:
        for _, row in df_apps.iterrows():
            if len(candidate_apps) >= 25:
                break
            candidate_apps.add(row['app_name'])

    return list(candidate_apps)  # Converting back to list if needed for downstream processes

  df_apps = df_app.applymap(lambda x: x.lower() if isinstance(x, str) else x)


In [24]:
max_existing_length = max(len(item) for item in all_apps)  # Max length in current array
new_dtype = f'<U{max_existing_length}'

def candidate_creator(row):
    np.random.seed(row.name)
    
    selected_values = np.random.choice(np.setdiff1d(all_apps, [row["recommended_app_name"]]), 24, replace=False).astype(new_dtype) # filter_candidate_apps(row["recommended_app_name"])
    random_position = np.random.randint(0, len(selected_values) + 1)
    
    return np.insert(selected_values, random_position, row["recommended_app_name"]) 

df_recommender_test['candidate_apps'] = df_recommender_test.apply(lambda row: candidate_creator(row), axis=1)

In [25]:
df_recommender_test

Unnamed: 0,user_id,previous_interactions,recommended_app_name,turns,recommend_indexes,candidate_apps
0,wstfhmhy9czodllb,toca life: hospital,toca life: hospital,[computer: hey there! how can i help you today...,20,"[dumb ways to draw, baby panda's juice shop, t..."
1,6nwqmzcxfrkgrtup,"chic me - chic in command, scribblenauts unli...",sketchbook,"[computer: hello! how can i help you today?, h...",22,"[hulu: stream tv series & films, godzilla defe..."
2,pq6f1srzatvtawfx,"viva slots vegas: casino slots, kitty city, s...",intro maker -video intro outro,[computer: hi there! how can i help you today?...,20,"[big fish casino - social slots, but you seem ..."
3,pjsymtgz7tht9m0m,speedtest by ookla,uno!™,[computer: hi! what can i help you with today?...,18,"[world at war: ww2 strategy mmo, life360: find..."
4,cinuyvftdrhv0bb5,"gta: chinatown wars, toca kitchen, touchgrind...",microsoft excel: spreadsheets,"[computer: hello! how can i help you today?, h...",20,"[my friend pedro: ripe for revenge, qr & barco..."
...,...,...,...,...,...,...
2552,udb5lh4yq0qtymrw,"zulily: fresh finds, daily deals, basketball ...",google docs,[computer: hi there! how can i help you today?...,20,"[wheel of fortune: tv game, jurassic world™: t..."
2553,imov9268r3fjzryt,,where's my water?,[computer: hi there! how can i assist you toda...,24,"[where's my water?, five nights at freddy's 4,..."
2554,ofvpfkqsyzgqub3e,radarbot: speed camera detector & speedometer...,minecraft,"[computer: hey! how can i help you today?, hum...",24,"[fonts keyboard, world war heroes: ww2 fps, so..."
2555,j9wginotn1rgxaz2,musou glory,my talking tom friends,[computer: hi there! how can i help you today?...,20,"[kid-e-cats: kitty cat games!, helix stack jum..."


In [26]:
@dataclass
class RecommenderItem:
    prompt: str
    generation: Optional[str] = None
    
class recommenderDataset(Dataset):
    def __init__(self, data: List[RecommenderItem]):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> RecommenderItem:
        return self.data[idx]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bos = '<|startoftext|>'
eos = '<|endoftext|>'
pad = '<|pad|>'

tokenizer = GPT2Tokenizer.from_pretrained("gpt2", bos_token=bos, eos_token=eos, pad_token=pad, additional_special_tokens=["computer:", "human:", "candidate_apps:", "previous_interactions:"], padding_side='left')

model = GPT2LMHeadModel.from_pretrained("models/GPT2_previous_interactions_candidate_apps").to(device)
model.resize_token_embeddings(len(tokenizer))

In [28]:
df_recommender_test

Unnamed: 0,user_id,previous_interactions,recommended_app_name,turns,recommend_indexes,candidate_apps
0,wstfhmhy9czodllb,toca life: hospital,toca life: hospital,[computer: hey there! how can i help you today...,20,"[dumb ways to draw, baby panda's juice shop, t..."
1,6nwqmzcxfrkgrtup,"chic me - chic in command, scribblenauts unli...",sketchbook,"[computer: hello! how can i help you today?, h...",22,"[hulu: stream tv series & films, godzilla defe..."
2,pq6f1srzatvtawfx,"viva slots vegas: casino slots, kitty city, s...",intro maker -video intro outro,[computer: hi there! how can i help you today?...,20,"[big fish casino - social slots, but you seem ..."
3,pjsymtgz7tht9m0m,speedtest by ookla,uno!™,[computer: hi! what can i help you with today?...,18,"[world at war: ww2 strategy mmo, life360: find..."
4,cinuyvftdrhv0bb5,"gta: chinatown wars, toca kitchen, touchgrind...",microsoft excel: spreadsheets,"[computer: hello! how can i help you today?, h...",20,"[my friend pedro: ripe for revenge, qr & barco..."
...,...,...,...,...,...,...
2552,udb5lh4yq0qtymrw,"zulily: fresh finds, daily deals, basketball ...",google docs,[computer: hi there! how can i help you today?...,20,"[wheel of fortune: tv game, jurassic world™: t..."
2553,imov9268r3fjzryt,,where's my water?,[computer: hi there! how can i assist you toda...,24,"[where's my water?, five nights at freddy's 4,..."
2554,ofvpfkqsyzgqub3e,radarbot: speed camera detector & speedometer...,minecraft,"[computer: hey! how can i help you today?, hum...",24,"[fonts keyboard, world war heroes: ww2 fps, so..."
2555,j9wginotn1rgxaz2,musou glory,my talking tom friends,[computer: hi there! how can i help you today?...,20,"[kid-e-cats: kitty cat games!, helix stack jum..."


In [29]:
prompts = []
generations = []
candidate_apps = []
true_candidate_index = []
for _, row in df_recommender_test.iterrows():
    # creating candidate apps
    candidates = []
    for index, candidate_app in enumerate(row["candidate_apps"].tolist()):
        candidates.append(candidate_app + " app." + eos)
        if candidate_app == row["recommended_app_name"]:
            true_candidate_index.append(index)
    candidate_apps.append(candidates)
    # creating prompts and generations
    if row["previous_interactions"] is not None:
        prompt = bos + "previous_interactions:" + row["previous_interactions"] + "\n"
    else:
        prompt = bos + "previous_interactions: No previous interactions" + "\n"
    for index, turn in enumerate(row["turns"]):
        if index < row["recommend_indexes"]:
            prompt += turn + "\n"
        elif index == row["recommend_indexes"]:
            prompt += "candidate_apps: "
            for app in row["candidate_apps"]:
                prompt += "'" + app + "', "
            prompt += "\n"
            # prompt += "computer: I would recommend the "
            prompt += "computer: I would recommend the "
            prompts.append(prompt)
            generations.append(row["recommended_app_name"] + " app." + eos)
            break
        else:
            print("error!!")

In [30]:
len(prompts)

2536

In [31]:
len(true_candidate_index)

2536

In [32]:
prompts[40]

"<|startoftext|>previous_interactions: microsoft edge: web browser, darkness rises, avg antivirus & security, cheapflights ã\x83â¢ã\x82â\x80ã\x82â\x93 flight search, chat & date: dating made simple to meet new people, hooked inc: fishing games, vpn proxy master - safer vpn, karaoke - sing songs, real steel boxing champions, moneylion: mobile banking app\ncomputer: hi there! how can i help you today?\nhuman: i want to find strategy game apps for my android phone.\ncomputer: great! do you like games that are really fun?\nhuman: yes, i want apps that are exciting.\ncomputer: would it be okay if the apps have ads sometimes?\nhuman: sure, i don't mind ads as long as the app is good.\ncomputer: are you looking for a free app or one that you have to pay for?\nhuman: i prefer free apps.\ncomputer: how many reviews do you want the app to have?\nhuman: i'd like an app that's popular and has lots of reviews.\ncomputer: i recommend trying out grand theft auto: vice city app.\nhuman: can you tell m

In [33]:
generations[40]

'hole.io app.<|endoftext|>'

In [34]:
true_candidate_index[40]

7

In [35]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]

def evaluate_recommender(prompts, generations, model, tokenizer, batch_size=8, device=device, threshold=70):
  prompt_batches = list(chunk(prompts, batch_size))
  generation_batches = list(chunk(generations, batch_size))
  max_length = 992
  generation_length = 32
  correctly_predicted = []
  for prompt_batch, generation_batch in tqdm(zip(prompt_batches, generation_batches), total = len(generation_batches)):

    inputs = tokenizer(prompt_batch, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt") 

    generations_predicted = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device),
                            max_new_tokens=generation_length,
                            num_beams=8,
                            eos_token_id=tokenizer.eos_token_id,
                            pad_token_id=tokenizer.pad_token_id,
                            bos_token_id=tokenizer.bos_token_id) # length_penalty=0.8, Set length_penalty to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer sequences.

    generations_predicted = generations_predicted[:, max_length:] # we only need the generation part, not the prompt part.
    decoded_generations = [tokenizer.decode(generation, skip_special_tokens=True, clean_up_tokenization_spaces=True).replace(" app.", "")  for generation in generations_predicted]
    generation_batch = [generation.replace(" app.", "").replace("<|endoftext|>", "") for generation in generation_batch]
    
    correctly_predicted.extend([1 if fuzz.ratio(predicted, ground_truth) >= threshold else 0 for predicted, ground_truth in zip(decoded_generations, generation_batch)])
  
  return correctly_predicted

In [36]:
correctly_predicted = evaluate_recommender(prompts, generations, model, tokenizer, batch_size=4, device=device, threshold=95)
success_rate = sum(correctly_predicted) / len(correctly_predicted)
print("success_rate: ", success_rate)

100%|██████████| 634/634 [13:34<00:00,  1.28s/it]

success_rate:  0.8261041009463722





In [21]:
correctly_predicted = evaluate_recommender(prompts, generations, model, tokenizer, batch_size=4, device=device, threshold=95)
success_rate = sum(correctly_predicted) / len(correctly_predicted)
print("success_rate: ", success_rate)

  0%|          | 0/634 [00:00<?, ?it/s]

100%|██████████| 634/634 [12:28<00:00,  1.18s/it]

success_rate:  0.4720031545741325





In [22]:
def chunk(list_of_elements, batch_size): # using this chunk function, we can split our data to multiple batches
  for i in range(0, len(list_of_elements), batch_size):
    yield list_of_elements[i:i+batch_size]
    
def convert_to_sublists(numbers, sublist_size):
    return [numbers[i:i+sublist_size] for i in range(0, len(numbers), sublist_size)]

def recommender_rank(prompts, candidate_apps, model, tokenizer, batch_size=8, device=device):
  model.eval()
  tokenizer.padding_side='left'
  tokenizer.truncation_side='left'
  max_length = 992 
  prompts_ids = tokenizer(prompts, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
  
  tokenizer.padding_side='right'
  tokenizer.truncation_side='right'
  input_ids = []
  attention_mask = []
  for index, candidate_app_elements in enumerate(candidate_apps):
    candidate_apps_ids = tokenizer(candidate_app_elements, max_length=32, truncation=True, padding="max_length", return_tensors="pt")
    for candidate_app_index in range(len(candidate_app_elements)):
      input_ids.append(torch.cat([prompts_ids["input_ids"][index], candidate_apps_ids["input_ids"][candidate_app_index]]))
      attention_mask.append(torch.cat([prompts_ids["attention_mask"][index], candidate_apps_ids["attention_mask"][candidate_app_index]]))
      
  input_ids_batches = list(chunk(input_ids, batch_size))
  attention_mask_batches = list(chunk(attention_mask, batch_size))

  scores = []
  for input_ids_batch, attention_mask_batch in tqdm(zip(input_ids_batches, attention_mask_batches), total = len(attention_mask_batches)):

    input_ids = torch.stack(input_ids_batch).to(device)
    attention_mask = torch.stack(attention_mask_batch).to(device)
    with torch.no_grad():
      model_output = model(input_ids=input_ids) # attention_mask=attention_mask

    logprobs = F.log_softmax(model_output["logits"], dim=-1)[:, max_length -1:-1, :]
    output_tokens = input_ids[:, max_length:]
    
    tokens_logprobs = torch.gather(logprobs, 2, output_tokens[:, :, None]).squeeze(-1).to(torch.float32)
    
    mask = torch.ones(tokens_logprobs.shape, dtype=torch.bool, device=device)
    for i, _output in enumerate(output_tokens):
      for j, _token in enumerate(_output):
        if _token == tokenizer.pad_token_id:
          mask[i, j] = False
          
    score = (tokens_logprobs * mask).sum(-1) / mask.sum(-1)
    scores.extend(score.to('cpu').tolist())
  
  scores = convert_to_sublists(scores, len(candidate_apps[0]))
  
  return scores

In [23]:
scores = recommender_rank(prompts, candidate_apps, model, tokenizer, batch_size=16, device=device)

100%|██████████| 3963/3963 [26:53<00:00,  2.46it/s]


In [24]:
[top_k_accuracy_score(true_candidate_index, scores, k=k) for k in range(1, 11)]

[0.3430599369085173,
 0.4605678233438486,
 0.526813880126183,
 0.5772870662460567,
 0.6222397476340694,
 0.6533911671924291,
 0.6865141955835962,
 0.7121451104100947,
 0.7440851735015773,
 0.7661671924290221]

In [25]:
top_k_accuracy_score(true_candidate_index, scores, k=10)

0.7661671924290221

In [26]:
true_relevance = [[1 if item == index else 0 for item in range(len(candidate_apps[0]))] for index in true_candidate_index]

In [27]:
[ndcg_score(true_relevance, scores, k=k) for k in range(1, 11)]

[0.3430599369085173,
 0.4171991587398637,
 0.4503221871310309,
 0.4720598052041358,
 0.48944987619180846,
 0.5005462357271135,
 0.511587245190836,
 0.5196728985784828,
 0.5292878156324191,
 0.5356709505984294]