## Multi-aspect PEPLER (MA-pepler) Result Visualization

In [1]:
import logging
import os
import random
from pathlib import Path
from pprint import pprint

import torch
from easydict import EasyDict as edict
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer

from pepler.modeling_pepler import RecReg
from pepler.utils import ReviewHistory, bos, eos, generate, pad, ids2tokens, load_model
from pepler.dataset import (
    AspectDataBatch,
    AspectDataInitializer,
    UserItemDataset,
    FfidfStore,
)

In [2]:
def parse_args(
    auto_arg_by_dataset: str,
    output_len: int,
    temperature: float,
    device: str,
    model_name: str,
    ffidf_topk: int,
) -> edict:
    """Bundle the notebook settings with the paths of the chosen dataset.

    Args:
        auto_arg_by_dataset: Dataset name; one of ``"yelp23"``, ``"gest"``
            or ``"yelp"``. It selects the dataset-specific paths.
        output_len: Stored unchanged as ``output_len``.
        temperature: Stored unchanged as ``temperature``.
        device: Stored unchanged as ``device``.
        model_name: Stored unchanged as ``model_name``.
        ffidf_topk: Stored unchanged as ``ffidf_topk``.

    Returns:
        An ``edict`` holding the six arguments above plus ``data_path``,
        ``index_dir``, ``aspect_path``, ``checkpoint``, ``ffidf_cache_dir``,
        ``item_meta_path`` and ``user_meta_path`` for the chosen dataset.
        Both meta paths are ``None`` for ``"gest"``.

    Raises:
        AssertionError: If ``auto_arg_by_dataset`` is not a supported name.
    """
    assert auto_arg_by_dataset in ["yelp23", "gest", "yelp"]

    # NOTE: both roots are absolute paths hard-coded in this notebook.
    data_root = Path("/home/P76114511/projects")
    ckpt_root = Path(
        "/home/P76114511/projects/my_retriever/checkpoints/dset_ver=2_ptr=False_arreg=1.0"
    )

    # ========== Dataset-specific args ==========
    paths_by_dataset = {
        "yelp23": {
            "data_path": data_root / "nete_format_data/yelp23/reviews.pickle",
            "index_dir": data_root / "nete_format_data/yelp23/1",
            "aspect_path": data_root
            / "nete_format_data/yelp23/aspect_category_index.csv",
            "checkpoint": ckpt_root / "yelp23/run_2/model.pt",
            "ffidf_cache_dir": data_root / "nete_format_data/ffidf_cache/yelp23/1",
            "item_meta_path": data_root
            / "nete_format_data/yelp23/yelp_academic_dataset_business.json",
            "user_meta_path": data_root
            / "nete_format_data/yelp23/yelp_academic_dataset_user.json",
        },
        "gest": {
            "data_path": data_root / "nete_format_data/gest/reviews.pickle",
            "index_dir": data_root / "nete_format_data/gest/1",
            "aspect_path": data_root
            / "nete_format_data/gest/aspect_category_index.csv",
            "checkpoint": ckpt_root / "gest/run_3/model.pt",
            "ffidf_cache_dir": data_root / "nete_format_data/ffidf_cache/gest/1",
            "item_meta_path": None,
            "user_meta_path": None,
        },
        "yelp": {
            "data_path": data_root / "nete_format_data/yelp/reviews.pickle",
            "index_dir": data_root / "nete_format_data/yelp/1",
            "aspect_path": data_root
            / "nete_format_data/yelp/aspect_category_index.csv",
            "checkpoint": ckpt_root / "yelp/run_1/model.pt",
            "ffidf_cache_dir": data_root / "nete_format_data/ffidf_cache/yelp/1",
            "item_meta_path": data_root / "nete_format_data/yelp/item.json",
            "user_meta_path": data_root / "nete_format_data/yelp/user.json",
        },
    }

    settings = edict(
        auto_arg_by_dataset=auto_arg_by_dataset,
        output_len=output_len,
        temperature=temperature,
        device=device,
        model_name=model_name,
        ffidf_topk=ffidf_topk,
    )
    # Merge the generic settings with the selected dataset's paths and wrap
    # the result so every entry is reachable as an attribute.
    merged = vars(settings)
    merged.update(paths_by_dataset[auto_arg_by_dataset])
    return edict(merged)

## Custom Arguments
- `auto_arg_by_dataset`: the name of the dataset to use; the path arguments are derived from this name.
Currently supported: `yelp`, `yelp23`, `gest`.
- `output_len`: the length of the generated output sequence, in tokens.
- `temperature`: the sampling temperature for text generation, with 0 < temperature <= 1 (the higher, the more creative; 0.5 works best in most cases).
- `device`: the computational device (e.g. `cuda`).
- `model_name`: used to load the tokenizer. Currently only `gpt2` is supported.
- `ffidf_topk`: the number of important user or item features to return and display. The features are ranked by their tf-idf score (called ffidf here; the first "f" stands for `feature`).
               

In [3]:
# Build the notebook configuration for the `yelp23` dataset; `parse_args`
# adds the dataset-specific paths that match this name.
args = parse_args(
    auto_arg_by_dataset="yelp23",
    output_len=20,
    temperature=0.5,
    device="cuda",
    model_name="gpt2",
    ffidf_topk=10,
)

In [4]:
# Configure root logging at INFO level with a timestamped format.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", level=logging.INFO
)
logger = logging.getLogger(__name__)


# Load the corpus from the dataset paths in `args`; no tokenizer is passed
# at this point (tokenizer=None).
corpus = AspectDataInitializer(
    data_path=args.data_path,
    index_dir=args.index_dir,
    aspect_path=args.aspect_path,
    tokenizer=None,
)
#  ========== prepare tokenizer and review history ==========
# The review history is built from the train split, with the valid and test
# splits handed over as well.
review_history = ReviewHistory(
    corpus.train,
    valid_data=corpus.valid,
    test_data=corpus.test,
    logger=logger,
)
# Pretrained GPT-2 tokenizer with the project's bos/eos/pad strings
# registered as its special tokens.
tokenizer = GPT2Tokenizer.from_pretrained(
    args.model_name, bos_token=bos, eos_token=eos, pad_token=pad
)
# Token ids of the three special tokens; handed to `inference` further down.
SPECIAL_TOKEN_IDS = [
    tokenizer.bos_token_id,
    tokenizer.eos_token_id,
    tokenizer.pad_token_id,
]
# Dataset over the validation split; one entry is sampled from it for display.
valid_dataset = UserItemDataset(corpus.valid, tokenizer, bos=bos, eos=eos)

feature set size: 372576


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def inference(
    model: RecReg,
    batch: AspectDataBatch,
    device: str,
    special_token_ids: list[int],
    output_len: int,
    temperature: float,
) -> tuple:
    """Predict aspect ratings and sample a text continuation for each row.

    Generation starts from the first token of ``batch.seq`` and appends one
    sampled token per step. Aspect ratings are read from the model output of
    the first step only.

    Args:
        model: Callable as ``model(user, item, aspect, text, mask=None,
            aspect_score=...)``; its result must expose ``logits`` and
            ``aspect_rating``.
        batch: Provides the ``user``, ``item``, ``aspect``, ``aspect_score``
            and ``seq`` tensors.
        device: Device the batch tensors are moved to.
        special_token_ids: Accepted for interface compatibility; not used by
            the current implementation.
        output_len: Number of tokens to sample per row.
        temperature: Divisor applied to the logits before the softmax.

    Returns:
        ``(aspect_rating_predict, idss_predict)``: the predicted ratings as
        a list, and one list of sampled token ids per row (the leading start
        token is dropped, nothing else is filtered).
    """
    rating_rows = []
    with torch.no_grad():
        user_ids = batch.user.to(device)
        item_ids = batch.item.to(device)
        aspect_ids = batch.aspect.to(device)
        aspect_scores = batch.aspect_score.to(device)
        # Seed the generation with the first token of every sequence.
        generated = batch.seq[:, :1].to(device)

        single_row = user_ids.size(0) == 1
        for step in range(output_len):
            result = model(
                user_ids,
                item_ids,
                aspect_ids,
                generated,
                mask=None,
                aspect_score=aspect_scores,
            )
            if step == 0:
                first_rating = result.aspect_rating
                if single_row:
                    # Restore the batch dimension for a single-row batch.
                    first_rating = first_rating.unsqueeze(0)
                rating_rows.extend(first_rating.tolist())

            # Sample the next token from the temperature-scaled distribution
            # at the last position; the epsilon guards against division by 0.
            scaled_logits = result.logits[:, -1, :] / (temperature + 1e-10)
            next_token = torch.multinomial(
                torch.softmax(scaled_logits, dim=-1), num_samples=1
            )
            generated = torch.cat([generated, next_token], dim=-1)

        # Drop the leading start token; everything sampled is kept.
        token_rows = list(generated[:, 1:].tolist())
    return rating_rows, token_rows

## Retrieval Results 
- For every user-item pair, we create a batch.
- Every user-item pair in the raw data is a review. For training purposes, we split each review into multiple segments
based on the aspect category (`category`). Hence a batch contains a dynamic number of segments ($1 \le \text{number of segments} \le 28$).
- Each display shows the user, the item, and the gold segments, along with the texts generated from the `user, item, category` prompt tokens.

In [6]:
# Load the precomputed ffidf values from the cache directory chosen in `args`.
ffidf_store = FfidfStore(args.ffidf_cache_dir)

Loading precomputed ffidf values from /home/P76114511/projects/nete_format_data/ffidf_cache/yelp23/1
Loaded 35152 users and 24199 items.
Loading completed.



#### Coloring Scheme

In [7]:
# !! `retrieve.py` has not fixed the batch size= 1 issue
# !! load item name if available


from termcolor import colored

# termcolor color names. The `on_*` values are background highlights used for
# the id lines; the matching foreground color is used for the info lines.
ID_COLOR = "on_green"
ID_COLOR2 = "on_cyan"
GOLD_COLOR = "light_yellow"
PRED_COLOR = "light_blue"


def foreground_of(on_color: str) -> str:
    """Return the foreground color name for an ``on_<color>`` highlight.

    ``str.removeprefix`` drops only the literal ``"on_"`` prefix.
    ``str.lstrip("on_")`` instead strips any leading run of the characters
    ``o``, ``n`` and ``_``, which mangles names such as ``"on_onyx"``.
    """
    return on_color.removeprefix("on_")


def print_gold(x) -> None:
    """Print ``x`` in the color used for gold (reference) values."""
    print(colored(x, GOLD_COLOR))


def print_predict(x) -> None:
    """Print ``x`` in the color used for model predictions."""
    print(colored(x, PRED_COLOR))


def print_user_id(x) -> None:
    """Print ``x`` as black text on the user highlight color."""
    print(colored(x, "black", ID_COLOR))


def print_user_info(x) -> None:
    """Print ``x`` in the foreground color matching the user highlight."""
    print(colored(x, foreground_of(ID_COLOR)))


def print_item_id(x) -> None:
    """Print ``x`` as black text on the item highlight color."""
    print(colored(x, "black", ID_COLOR2))


def print_item_info(x) -> None:
    """Print ``x`` in the foreground color matching the item highlight."""
    print(colored(x, foreground_of(ID_COLOR2)))

## Display
- `RAND_INDEX` selects a random user-item pair from the validation dataset.
- We first load the fine-tuned Multi-aspect PEPLER model to predict the `user-item-aspect` rating and generate texts.
- `user_id`: the user's id string in the raw data.
- `item_id`: the item's id string in the raw data; only some raw datasets provide the item's name, and when they do, we also display it.
- `user(item)_ffidf_features` are pre-computed features ranked by ffidf score (the features are extracted using a sentiment UIE model).
   These features may help identify the user's preferences and the item's (restaurant's) characteristics.
- `user(item)_review_history` are the user's or item's reviews in the train set.

In [8]:
import json


def load_jsonl(path: os.PathLike) -> list:
    """Load a JSON Lines file into a list with one parsed value per line.

    The file is read as UTF-8 regardless of the platform's default encoding,
    and blank or whitespace-only lines (for example a trailing empty line)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Location of the ``.jsonl`` / line-delimited JSON file.

    Returns:
        The parsed values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [9]:
# Load whichever metadata files are configured. Each side is handled on its
# own and falls back to an empty list, so `user_meta` and `item_meta` are
# always defined -- including for datasets configured without metadata
# (both paths are None for `gest`).
user_meta = (
    load_jsonl(args.user_meta_path) if args.user_meta_path is not None else []
)
item_meta = (
    load_jsonl(args.item_meta_path) if args.item_meta_path is not None else []
)

In [10]:
# Index the metadata records by id so a record can be looked up from the raw
# id string.
# NOTE(review): relies on `user_meta` / `item_meta` being defined by the
# previous cell, and on every record carrying a "user_id" / "business_id"
# key -- confirm for datasets other than yelp23.
user_id2meta = {x["user_id"]: x for x in user_meta}
item_id2meta = {x["business_id"]: x for x in item_meta}

In [11]:
# ========== pick one validation entry at random for the visualization ==========
# Uniform over the valid indices 0 .. len(valid_dataset) - 1.
RAND_INDEX = random.randrange(len(valid_dataset))

In [12]:
print(f"random index: {RAND_INDEX}/{len(valid_dataset)}")
data = valid_dataset[RAND_INDEX]
user, item = data.user, data.item
# Translate the dataset's user/item entries into the raw id strings.
user_id, item_id = corpus.user_dict.get_entity(user), corpus.item_dict.get_entity(item)
batch = valid_dataset.collate_fn(data)
model = load_model(args.checkpoint)
# Predict the aspect ratings and sample a text for every entry of the batch.
aspect_rating_predict, idss_predict = inference(
    model,
    batch,
    args.device,
    SPECIAL_TOKEN_IDS,
    args.output_len,
    args.temperature,
)
# ========== ffidf features ==========
user_ffidf_features = ffidf_store.get_user_ffidf(
    user_id, topk=args.ffidf_topk, return_score=True
)
item_ffidf_features = ffidf_store.get_item_ffidf(
    item_id, topk=args.ffidf_topk, return_score=True
)

# ========== review history ==========
# The current item (resp. user) is passed as `hide_item` / `hide_user`.
user_review_history_idx = review_history.get_user_history(
    user, hide_item=item, return_embedding=False
)
item_review_history_idx = review_history.get_item_history(
    item, hide_user=user, return_embedding=False
)

# BELOW demonstrates how to get review content: the history entries are
# indices into the train split.
user_review_history = [corpus.train[i] for i in user_review_history_idx]
item_review_history = [corpus.train[i] for i in item_review_history_idx]

user_review_history_text = [d["text"] for d in user_review_history]
item_review_history_text = [d["text"] for d in item_review_history]

batch_size = len(batch)

# ========== user block ==========
print_user_id(f"user_id: {user_id}")
if args.user_meta_path is not None:
    print_user_id(f"user name: {user_id2meta[user_id]['name']}")

# The loop variable is deliberately not called `tuple`: in a notebook that
# would replace the builtin for every cell executed afterwards.
for feature_entry in user_ffidf_features:
    print_user_info(f"{feature_entry}")


# ========== item block ==========
print_item_id(f"item_id: {item_id}")
if args.item_meta_path is not None:
    print_item_id(f"item name: {item_id2meta[item_id]['name']}")
for feature_entry in item_ffidf_features:
    print_item_info(f"{feature_entry}")


print(f"# aspect categories mentioned in gold review: {batch_size}")

# ========== gold vs. predicted, one section per batch entry ==========
print_gold(f"gold overall rating: {data.rating}")
for i in range(batch_size):
    d = data[i]
    predict_text_ids = idss_predict[i]
    predict_aspect_rating = aspect_rating_predict[i]
    predict_text = ids2tokens(predict_text_ids, tokenizer, eos)

    print_gold(f"\tcategory: {d['category_name']}")
    print_gold(f"\tgold text: {d['text']}")
    print_gold(f"\tgold feature(s): {d['feature']}")
    print_predict(f"\tpredict text: {' '.join(predict_text)}")
    print_gold(f"\tgold aspect-rating: {d['rating']}")
    print_predict(f"\tpredict aspect-rating: {predict_aspect_rating}")
    if i < batch_size - 1:
        print("\t=========== ")

# Print the label separately and hand the list itself to pprint; passing an
# f-string would pretty-print a single long string instead of the reviews.
print("user's review history:")
pprint(user_review_history_text)
print("items's review history:")
pprint(item_review_history_text)

random index: 94449/170879
[42m[30muser_id: 8a00hdf5M5cr8D1aNi5hUA[0m
[42m[30muser name: Karl[0m
[32m('individual salsa bowls', 0.35888773927808937)[0m
[32m('pad kee mow beef', 0.35888773927808937)[0m
[32m('pad kee mow', 0.29653884185304)[0m
[32m('peach butter', 0.2838982114544177)[0m
[32m('pad see ewe', 0.27901491497825076)[0m
[32m('vegetable', 0.24918017038108284)[0m
[32m('pork taco', 0.2309972915799755)[0m
[32m('country fried steak', 0.22450897915051574)[0m
[32m('queso dip', 0.2105154133470587)[0m
[32m('beef pho', 0.19524939152146756)[0m
[46m[30mitem_id: vgaDkuxjt8-w-0PtE8tTCA[0m
[46m[30mitem name: Yolk - City Way[0m
[36m('food', 0.3024494100962948)[0m
[36m('breakfast', 0.28144553613252876)[0m
[36m('cinnamon roll french toast', 0.26492258461430296)[0m
[36m('coffee', 0.2533889246050116)[0m
[36m('pancake', 0.23425780275669422)[0m
[36m('service', 0.23262131228525731)[0m
[36m('brunch', 0.2120751972504243)[0m
[36m('strawberry orange juice', 