In [1]:
import pickle, os
from collections import defaultdict
import argparse
import logging

parser = argparse.ArgumentParser()

# path
parser.add_argument("--root_data_dir",type=str,default="/home/featurize/CB4Rec/data/")
parser.add_argument("--root_proj_dir",type=str,default="/home/featurize/CB4Rec/")
# parser.add_argument("--root_proj_dir",type=str,default="./")
# parser.add_argument("--model_path", type=str, default="/home/v-zhenyuhe/CB4Rec/model/large/large.pkl")
parser.add_argument("--sim_path", type=str, default="pretrained_models/sim_nrms_bce_r14_ep6_thres038414")
parser.add_argument("--sim_threshold", type=float, default=0.38414)

# Preprocessing 
parser.add_argument("--dataset",type=str,default='large')
parser.add_argument("--num_selected_users", type=int, default=1000, help='number of randomly selected users from val set')
parser.add_argument("--cb_train_ratio", type=float, default=0.2)
parser.add_argument("--sim_npratio", type=int, default=4)
parser.add_argument("--sim_val_batch_size", type=int, default=1024)

# Simulation
parser.add_argument("--algo",type=str,default="2_ts_neuralucb")
parser.add_argument("--algo_prefix", type=str, default="algo",
    help='the name of save files')
parser.add_argument("--n_trials", type=int, default=4, help = 'number of experiment runs')
parser.add_argument("--T", type=int, default=1000, help = 'number of rounds (interactions)')
parser.add_argument("--topic_update_period", type=int, default=1, help = 'Update period for CB topic model')
parser.add_argument("--update_period", type=int, default=100, help = 'Update period for CB model')
parser.add_argument("--n_inference", type=int, default=5, help='number of Monte Carlo samples of prediction. ')
parser.add_argument("--rec_batch_size", type=int, default=5, help='recommendation size for each round.')
parser.add_argument("--per_rec_score_budget", type=int, default=1000, help='buget for calcuating scores, e.g. ucb, for each rec')
parser.add_argument("--max_batch_size", type=int, default=256, help = 'Maximum batch size your GPU can fit in.')
parser.add_argument("--pretrained_mode",type=bool,default=True, 
    help="Indicates whether to load a pretrained model. True: load from a pretrained model, False: no pretrained model ")
parser.add_argument("--preinference_mode",type=bool,default=True, 
    help="Indicates whether to preinference news before each model update.")

parser.add_argument("--uniform_init",type=bool,default=True, 
    help="For Thompson Sampling: Indicates whether to init ts parameters uniformly")
parser.add_argument("--gamma", type=float, default=1.0, help='ucb parameter: mean + gamma * std.')


# nrms 
parser.add_argument("--npratio", type=int, default=4) 
parser.add_argument("--max_his_len", type=int, default=50)
parser.add_argument("--min_word_cnt", type=int, default=1) # 5
parser.add_argument("--max_title_len", type=int, default=30)
# nrms topic
parser.add_argument("--dynamic_aggregate_topic", type=bool, default=True) # whether to dynamicly aggregate small topic during simulation
parser.add_argument("--min_item_size", type=int, default=1000)

# model training
parser.add_argument("--batch_size", type=int, default=64) 
parser.add_argument("--epochs", type=int, default=5)
parser.add_argument("--lr", type=float, default=0.0001)
parser.add_argument("--num_workers", type=int, default=4)
args = parser.parse_args([])


def load_cb_topic_news(root_data_dir):
    fname = os.path.join(root_data_dir, "large/utils/cb_news.pkl") 
    with open(fname, 'rb') as f: 
        cb_news = pickle.load(f)
    return cb_news 
topic_news = load_cb_topic_news("/home/featurize/CB4Rec/data")
cb_news = defaultdict(list)
for k,v in topic_news.items():
    cb_news[k] = [l.strip('\n').split("\t")[0] for l in v] # get nIDs 

In [62]:
import json
large_topic = []
for topic, nids in cb_news.items():
    if len(nids) > 2000:
        large_topic.append(topic)
        
json.dump(large_topic, open("../data/large/utils/large_topic.json", 'w'))

In [34]:
import numpy as np
root_data_dir = "/home/featurize/CB4Rec/data"
dataset = "large"
with open(os.path.join("/home/featurize/CB4Rec/data", "large",  'utils', 'nid2index.pkl'), 'rb') as f:
    nid2index = pickle.load(f)
news_index = np.load(os.path.join(root_data_dir, dataset,  'utils', 'news_index.npy'))

In [52]:
from tqdm import tqdm
word2vec = np.load(os.path.join(args.root_data_dir, args.dataset,  'utils', 'embedding.npy'))
cb_news_embedding = {}
for topic in tqdm(large_topic):
    cb_news_embedding[topic] = []
    for nid in cb_news[topic]:
        total = len(np.where(news_index[nid2index[nid]]!=0))
        news_embedding = np.sum(word2vec[np.array(news_index[nid2index[nid]], dtype=int)], axis=0)
        news_mebedding = news_embedding / total
        cb_news_embedding[topic].append(news_embedding)
    

100%|██████████| 14/14 [00:04<00:00,  3.33it/s]


In [66]:
from sklearn.cluster import KMeans
large_topic_cluster = {}
for topic in large_topic:
    num_cluster = len(cb_news[topic]) // 2000 + 1
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(cb_news_embedding[topic])
    large_topic_cluster[topic] = kmeans.labels_.tolist()

In [68]:
json.dump(large_topic_cluster, open("../data/large/utils/large_topic_cluster.json", 'r'))

In [75]:
subcategory_by_order = json.load(open(os.path.join(root_data_dir, dataset,  'utils', 'subcategory_byorder.json')))

In [78]:
nid2topic = {}
for topic in large_topic:
    subcategory_by_order.remove(topic)
    for nid, sub in zip(cb_news[topic], large_topic_cluster[topic]):
        new_topic = topic + "_" + str(sub)
        nid2topic[nid] = new_topic
        if new_topic not in subcategory_by_order:
            subcategory_by_order.append(new_topic)

In [81]:
json.dump(subcategory_by_order,open(os.path.join(root_data_dir, dataset,  'utils', 'subcategory_byorder_large_topic_splited.json'), 'r'))
json.dump(nid2topic,open(os.path.join(root_data_dir, dataset,  'utils', 'nid2topic_large_topic_splited.json'), 'r'))

In [69]:
from collections import Counter
for topic in large_topic:
    c = Counter(large_topic_cluster[topic])
    print(dict(c))
    print("--------------")

{1: 1941, 0: 915}
--------------
{0: 713, 1: 1867}
--------------
{0: 1575, 3: 716, 4: 1587, 5: 1466, 2: 2580, 6: 2378, 1: 1806}
--------------
{2: 710, 0: 1731, 1: 1894}
--------------
{1: 1790, 0: 911}
--------------
{2: 2239, 0: 837, 1: 2225}
--------------
{0: 1155, 1: 1723}
--------------
{0: 1050, 1: 1734}
--------------
{1: 2111, 0: 1422}
--------------
{7: 569, 0: 1485, 2: 793, 5: 2788, 1: 2152, 4: 2002, 6: 2027, 3: 3057}
--------------
{0: 1215, 1: 2073}
--------------
{0: 2367, 1: 1141}
--------------
{1: 2319, 0: 1363}
--------------
{1: 1394, 0: 2367}
--------------


In [82]:
len(subcategory_by_order)

312