In [1]:
import pandas as pd

def read_jsonl(file_path, nrows=None):
    """Load a JSON-lines file into a DataFrame.

    Args:
        file_path: Location of a file holding one JSON object per line; passed
            straight through to ``pd.read_json``.
        nrows: Optional cap on the number of lines to read; ``None`` reads all.

    Returns:
        The ``pandas.DataFrame`` built by ``pd.read_json``.
    """
    read_options = {"lines": True, "nrows": nrows}
    return pd.read_json(file_path, **read_options)

# Features and labels live in separate JSON-lines files, both read via read_jsonl.
train_data = read_jsonl("phase_2_input_data/training_data/train.features")
train_solution = read_jsonl("phase_2_input_data/training_data/train.labels")

In [2]:
# Attach the label columns to each feature row by indoml_id; the left join keeps
# every feature row. This rebinds train_data, so running the cell a second time
# merges the labels again onto the already-merged frame.
train_data = train_data.merge(train_solution, on="indoml_id", how="left")

In [3]:
# Preview the first rows of the merged frame.
train_data.head()

Unnamed: 0,indoml_id,description,retailer,price,supergroup,group,module,brand
0,0,1 adblue,organicorner,25.35,automotive,automotive detail unknown total,automotive,receipt all
1,1,1 car mat set,greenharbor,4.99,automotive,automotive detail unknown total,automotive,receipt all
2,2,1 cp rmx scrnwash,naturify,3.85,automotive,automotive detail unknown total,automotive,receipt all
3,3,1 diesel,ecogro,4.41,automotive,automotive detail unknown total,automotive,receipt all
4,4,1 unstoppable refrsher,greenharbor,3.0,automotive,automotive detail unknown total,automotive,receipt all


In [4]:
# Whole-unit price as text: cast to float, treat missing prices as 0, drop the
# fractional part with the int cast, then stringify for concatenation.
train_data["int_price"] = (
    train_data["price"].map(float).fillna(0).astype(int).astype(str)
)

# Model input text of the form "retailer: <r> price: <p> description: <d>".
query_parts = [
    "retailer: " + train_data["retailer"],
    "price: " + train_data["int_price"],
    "description: " + train_data["description"],
]
train_data["query"] = query_parts[0] + " " + query_parts[1] + " " + query_parts[2]

In [None]:
# Show three randomly chosen query strings (no random_state, so they vary per run).
train_data['query'].sample(3).values

array(['retailer: orchidora price: 1 description: family chicken fie',
       'retailer: savormart price: 0 description: x 175 g haribo tangfastics',
       'retailer: orchidora price: 2 description: j 20 applemango'],
      dtype=object)

In [None]:
# Brand is handled separately; module, group and supergroup are fused into one
# "<|>"-delimited target string.

train_data["mod_grp_sg"] = (
    train_data["module"] + "<|>" + train_data["group"] + "<|>" + train_data["supergroup"]
)
mod_grp_sg = train_data["mod_grp_sg"].unique()

# Lookup keyed by module name. If a module ever appeared under more than one
# group/supergroup pair, the last combination seen would silently win.
split_targets = [fused.split("<|>") for fused in mod_grp_sg]
mod_out_dct = {
    parts[0]: {"module": parts[0], "group": parts[1], "supergroup": parts[2]}
    for parts in split_targets
}

import pickle
with open("mod_out_dct.pkl", "wb") as out_file:
    pickle.dump(mod_out_dct, out_file)

# Number of distinct modules; each module maps to one group/supergroup pair.
len(mod_out_dct)

449

In [15]:
# One entry per distinct brand, keyed by the brand string itself.
brand = train_data["brand"].unique()
br_out_dct = {brand_name: {"brand": brand_name} for brand_name in brand}

with open("br_out_dct.pkl", "wb") as out_file:
    pickle.dump(br_out_dct, out_file)

In [16]:
# Display the whole brand lookup (one entry per distinct brand).
br_out_dct

{'receipt all': {'brand': 'receipt all'},
 'huggies': {'brand': 'huggies'},
 'nourify': {'brand': 'nourify'},
 'pure baby': {'brand': 'pure baby'},
 'verdemart': {'brand': 'verdemart'},
 'avent': {'brand': 'avent'},
 'crispcorner': {'brand': 'crispcorner'},
 'crispcorner fred & flo': {'brand': 'crispcorner fred & flo'},
 'dr browns': {'brand': 'dr browns'},
 'dr johnsons': {'brand': 'dr johnsons'},
 'griptight': {'brand': 'griptight'},
 'koko': {'brand': 'koko'},
 'mam': {'brand': 'mam'},
 'milton': {'brand': 'milton'},
 'mothercare': {'brand': 'mothercare'},
 'munchkin': {'brand': 'munchkin'},
 'nimble babies': {'brand': 'nimble babies'},
 'nourify baby': {'brand': 'nourify baby'},
 'nuby': {'brand': 'nuby'},
 'nuk': {'brand': 'nuk'},
 'orchidora': {'brand': 'orchidora'},
 'orchidora nutmeg': {'brand': 'orchidora nutmeg'},
 'peppa pig': {'brand': 'peppa pig'},
 'tommee tippee': {'brand': 'tommee tippee'},
 'vital baby': {'brand': 'vital baby'},
 'vitalveg': {'brand': 'vitalveg'},
 'vi

In [17]:
# Drop the helper/raw columns no longer needed. Not re-runnable as-is: a second
# run fails because the listed columns are already gone.
train_data = train_data.drop(columns=["price", "retailer", "group", "supergroup", "int_price", "mod_grp_sg"])

In [18]:
# Display the trimmed frame.
train_data

Unnamed: 0,indoml_id,description,module,brand,query
0,0,1 adblue,automotive,receipt all,retailer: organicorner price: 25 description: ...
1,1,1 car mat set,automotive,receipt all,retailer: greenharbor price: 4 description: 1 ...
2,2,1 cp rmx scrnwash,automotive,receipt all,retailer: naturify price: 3 description: 1 cp ...
3,3,1 diesel,automotive,receipt all,retailer: ecogro price: 4 description: 1 diesel
4,4,1 unstoppable refrsher,automotive,receipt all,retailer: greenharbor price: 3 description: 1 ...
...,...,...,...,...,...
561833,561833,zuru xshot excelxcess,toys,receipt all,retailer: noshify price: 16 description: zuru ...
561834,561834,zuru xshot micro,toys,receipt all,retailer: vitalveg price: 3 description: zuru ...
561835,561835,zuru xshot typhoon thunder,toys,receipt all,retailer: crispcorner price: 8 description: zu...
561836,561836,zzand,toys,receipt all,retailer: snackify price: 4 description: zzand


In [None]:

# Shuffle all rows with a fixed seed, then renumber the index from 0.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Per-row "document" label: the module and brand joined by "<|>".
train_data["sup_doc"] = train_data["module"] + "<|>" + train_data["brand"]

In [None]:
# Map every module<|>brand document to the list of queries labelled with it,
# keeping row order inside each list and first-seen order of the keys.
# The two columns are iterated directly instead of DataFrame.iterrows(), which
# builds a Series per row and is very slow on a frame with this many rows.

doc_query_lst = {}
for doc_query, query in zip(train_data["sup_doc"], train_data["query"]):
    doc_query_lst.setdefault(doc_query, []).append(query)

In [14]:
import random

In [15]:
# Per-document split: about 97% of each document's queries go to train (always
# at least one), the remainder to validation.
train_final = {"query": [], "brand": [], "module": []}
val_final = {"query": [], "brand": [], "module": []}

# Dedicated seeded generator so the split is reproducible; the module-level
# random functions are never seeded in this notebook.
split_rng = random.Random(42)

for doc, query_lst in doc_query_lst.items():
    train_split = max(round(0.97 * len(query_lst)), 1)
    # sample() returns a shuffled copy and leaves doc_query_lst untouched, so
    # re-running this cell yields the same split.
    shuffled = split_rng.sample(query_lst, len(query_lst))
    train_q = shuffled[:train_split]
    val_q = shuffled[train_split:]

    # doc is "<module><|><brand>"; local names avoid clobbering the notebook's
    # earlier `brand` array.
    doc_parts = doc.split("<|>")
    doc_module, doc_brand = doc_parts[0], doc_parts[1]

    train_final["query"].extend(train_q)
    train_final["brand"].extend([doc_brand] * len(train_q))
    train_final["module"].extend([doc_module] * len(train_q))

    val_final["query"].extend(val_q)
    val_final["brand"].extend([doc_brand] * len(val_q))
    val_final["module"].extend([doc_module] * len(val_q))

In [16]:
# Turn the column-wise dicts (query / brand / module lists) into DataFrames.
train_final_df = pd.DataFrame(train_final)
val_final_df = pd.DataFrame(val_final)

In [17]:
# Display the training split.
train_final_df

Unnamed: 0,query,brand,module
0,retailer: vibrantmart price: 2 description: ba...,receipt all,stationery & printed material & services
1,retailer: wilko price: 3 description: scratchc...,receipt all,stationery & printed material & services
2,retailer: plenify price: 1 description: easter...,receipt all,stationery & printed material & services
3,retailer: orchidora price: 1 description: plan...,receipt all,stationery & printed material & services
4,retailer: greenzen price: 2 description: whs a...,receipt all,stationery & printed material & services
...,...,...,...
546159,retailer: groveify price: 1 description: prime...,primeo,cheese fresh fw
546160,retailer: tastify price: 0 description: qatcake,north staffs,sweet pastry dough fresh
546161,retailer: vivify price: 0 description: koko df...,koko,cheese fresh fw
546162,retailer: snackify price: 5 description: sosft,so soft,toilet tissue


In [20]:
# Rows were appended document by document, so shuffle them (fixed seed) and
# renumber the index from 0.
train_final_df = train_final_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [21]:
# Persist both splits as CSV files without the index column.
train_final_df.to_csv("_train.csv", index=False)
val_final_df.to_csv("_val.csv", index=False)

# Encode Docs

In [24]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [25]:
# Load the 'all-mpnet-base-v2' SentenceTransformer used below to embed the
# module and brand names.
model = SentenceTransformer('all-mpnet-base-v2')

In [27]:
import pickle
# Distinct module and brand strings present in the training split; these are
# the "documents" that get embedded.
doc_mod = train_final_df["module"].unique()
doc_br = train_final_df["brand"].unique()

In [28]:
# Integer id for each module / brand = its position in doc_mod / doc_br.
mod_id = {doc: idx for idx, doc in enumerate(doc_mod)}
br_id = {doc: idx for idx, doc in enumerate(doc_br)}

# Persist both lookups for later use.
for pkl_name, id_lookup in (("mod_id.pkl", mod_id), ("br_id.pkl", br_id)):
    with open(pkl_name, "wb") as f:
        pickle.dump(id_lookup, f)

In [29]:
# Two-column frames pairing each module / brand string with its positional id.
dct_id_mod = dict(doc=doc_mod, id=list(range(len(doc_mod))))
dct_id_br = dict(doc=doc_br, id=list(range(len(doc_br))))

dct_df_mod = pd.DataFrame(dct_id_mod)
dct_df_br = pd.DataFrame(dct_id_br)

In [30]:
# Slice the module and brand arrays into consecutive chunks of 128 for encoding.
BATCH_SIZE = 128
batched_mod = [
    doc_mod[start:start + BATCH_SIZE] for start in range(0, len(doc_mod), BATCH_SIZE)
]
batched_br = [
    doc_br[start:start + BATCH_SIZE] for start in range(0, len(doc_br), BATCH_SIZE)
]

In [32]:
from tqdm import tqdm
# Encode batch by batch and flatten into one embedding vector per module / brand,
# in the same order as doc_mod / doc_br.
embeddings_mod = [vec for batch in tqdm(batched_mod) for vec in model.encode(batch)]
embeddings_br = [vec for batch in tqdm(batched_br) for vec in model.encode(batch)]

100%|██████████| 4/4 [00:01<00:00,  3.63it/s]
100%|██████████| 45/45 [00:03<00:00, 13.44it/s]


In [33]:
import numpy as np
# Stack the per-item vectors into 2-D arrays and save them to disk.
embeddings_mod = np.array(embeddings_mod)
embeddings_br = np.array(embeddings_br)

np.save("embeddings_mod.npy", embeddings_mod)
np.save("embeddings_br.npy", embeddings_br)

In [35]:
# Rename id -> docid, then attach each module's embedding, using docid as the
# row index into embeddings_mod.
dct_df_mod = dct_df_mod.rename(columns={"id": "docid"})
dct_df_mod["vector"] = dct_df_mod["docid"].apply(lambda x: np.array(embeddings_mod[x]))

In [54]:
# Serialise each vector as a "|"-joined string of numbers. Not re-runnable:
# after the first run the column holds str values, which have no .tolist().
dct_df_mod["vector"] = dct_df_mod["vector"].apply(lambda x: "|".join([str(g) for g in x.tolist()]))

In [55]:
# Write doc, docid, vector as headerless tab-separated rows; the k-means cell
# reads this file back with those three column names.
dct_df_mod.to_csv("mod_kmeans.tsv", sep="\t", index=False, header=False)

# Product Quantisation

In [38]:
import nanopq
# Product quantiser with M=12 sub-spaces and Ks=48 codewords per sub-space.
pq = nanopq.PQ(M=12, Ks=48, verbose=True)

M: 12, Ks: 48, metric : <class 'numpy.uint8'>, code_dtype: l2


In [40]:
# Train the quantiser's codebooks on the brand embeddings (fixed seed).
pq.fit(vecs=embeddings_br, iter=30, seed=1422)

iter: 30, seed: 1422
Training the subspace: 0 / 12
Training the subspace: 1 / 12
Training the subspace: 2 / 12
Training the subspace: 3 / 12
Training the subspace: 4 / 12
Training the subspace: 5 / 12
Training the subspace: 6 / 12
Training the subspace: 7 / 12
Training the subspace: 8 / 12
Training the subspace: 9 / 12
Training the subspace: 10 / 12
Training the subspace: 11 / 12


<nanopq.pq.PQ at 0x7f5dae63b160>

In [42]:
# Encode every brand embedding into its PQ code (one row of M codes per brand).
X_code = pq.encode(vecs=embeddings_br)

Encoding the subspace: 0 / 12
Encoding the subspace: 1 / 12
Encoding the subspace: 2 / 12
Encoding the subspace: 3 / 12
Encoding the subspace: 4 / 12
Encoding the subspace: 5 / 12
Encoding the subspace: 6 / 12
Encoding the subspace: 7 / 12
Encoding the subspace: 8 / 12
Encoding the subspace: 9 / 12
Encoding the subspace: 10 / 12
Encoding the subspace: 11 / 12


In [43]:
# (number of brands, number of sub-space codes per brand)
X_code.shape

(5679, 12)

In [44]:
# Count distinct code rows. The recorded run shows 5548 unique rows for 5679
# brands, i.e. some different brands received an identical code.
print(np.unique(X_code, axis=0).shape)

(5548, 12)


In [45]:
# Brand string -> its PQ code row (row i of X_code belongs to doc_br[i]).
# NOTE(review): the recorded uniqueness check shows fewer distinct code rows
# than brands, so this mapping is not invertible for every brand.
br_code = {doc: X_code[row] for row, doc in enumerate(doc_br)}

In [46]:
# Persist the brand -> PQ code lookup.
with open("br_code.pkl", "wb") as f:
    pickle.dump(br_code, f)

# Kmeans

Reused from the Neural Corpus Indexer Repository

In [47]:
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
import numpy as np
import pandas as pd
import pickle

# Clustering settings as used further down in this notebook:
#   v_dim     - expected length of every embedding vector read from the TSV
#   bert_size - only appears in the output pickle's file name
#   seed      - random_state of the KMeans estimator (MiniBatchKMeans uses its own)
#   k         - number of clusters per level
#   c         - groups of at most this many items are numbered directly
args = {"v_dim": 768, "bert_size": 768, "seed": 7, "k": 6, "c": 6}

In [52]:
#  convert args to object
class Struct:
    """Bare attribute container: every keyword argument becomes an attribute."""

    def __init__(self, **entries):
        # Copy the keyword arguments straight into the instance namespace.
        vars(self).update(entries)

# Swap the args dict for an attribute-style object. Guarded so that re-running
# this cell after the conversion is a no-op instead of a TypeError from
# unpacking a non-mapping with **.
if isinstance(args, dict):
    args = Struct(**args)

In [56]:
# Reload the module embeddings written to mod_kmeans.tsv; only the id and the
# "|"-joined vector string are needed for clustering.
df = pd.read_csv(
    'mod_kmeans.tsv',
    names=['doc', 'docid', 'vector'],
    header=None,
    sep='\t',
).loc[:, ['docid', 'vector']]
# Rebind instead of inplace=True: df is a .loc selection of another frame.
df = df.drop_duplicates('docid')
old_id = df['docid'].tolist()

# Parse every vector string into floats, checking the expected dimensionality.
X = []
for vec_text in df['vector'].tolist():
    vec_str = vec_text.split('|')
    if len(vec_str) != args.v_dim:
        # Raise rather than call exit(1): an exception gives a normal traceback
        # in the notebook instead of ending the interpreter session.
        raise ValueError(
            f'vec dim error! expected {args.v_dim} values, got {len(vec_str)}'
        )
    X.append([float(value) for value in vec_str])
X = np.array(X)
print(X.shape)
# One hierarchical id (a list of cluster indices) per row of X; filled in by the
# clustering code that follows and read by classify_recursion.
new_id_list = []

# Estimator for groups below 1000 rows; seeded from args.seed.
kmeans = KMeans(n_clusters=args.k, max_iter=300, n_init=100, init='k-means++', random_state=args.seed, tol=1e-7)

# Estimator for the first pass and for groups of 1000+ rows.
# NOTE(review): random_state is hard-coded to 3 here and does not follow args.seed.
mini_kmeans = MiniBatchKMeans(n_clusters=args.k, max_iter=300, n_init=100, init='k-means++', random_state=3,
                              batch_size=1000, reassignment_ratio=0.01, max_no_improvement=20, tol=1e-7)


def classify_recursion(x_data_pos):
    """Extend the hierarchical ids of the rows listed in ``x_data_pos``.

    Reads the notebook-level ``args``, ``X``, ``kmeans`` and ``mini_kmeans`` and
    appends to the per-row lists in the notebook-level ``new_id_list``.

    Args:
        x_data_pos: 1-D array of row positions into ``X`` that currently share
            the same id prefix.

    Returns:
        None. Groups of at most ``args.c`` rows get a running index appended
        (a single-row group gets nothing); larger groups are split into
        ``args.k`` clusters, each row gets its cluster label appended, and each
        cluster is processed recursively.
    """
    group_size = x_data_pos.shape[0]

    # Leaf: few enough rows to number directly.
    if group_size <= args.c:
        # A lone row is already unique and gets no extra digit.
        if group_size != 1:
            for leaf_rank, row_pos in enumerate(x_data_pos):
                new_id_list[row_pos].append(leaf_rank)
        return

    # Gather this group's vectors into a dense (group_size, v_dim) matrix.
    temp_data = np.zeros((group_size, args.v_dim))
    for slot, row_pos in enumerate(x_data_pos):
        temp_data[slot, :] = X[row_pos]

    # Mini-batch k-means for groups of 1000+ rows, full k-means otherwise.
    clusterer = mini_kmeans if group_size >= 1e3 else kmeans
    pred = clusterer.fit_predict(temp_data)

    for cluster in range(args.k):
        members = [x_data_pos[slot] for slot, label in enumerate(pred) if label == cluster]
        # Record this level's label before descending so digits stay in order.
        for row_pos in members:
            new_id_list[row_pos].append(cluster)
        classify_recursion(np.array(members))

    return

print('Start First Clustering')
pred = mini_kmeans.fit_predict(X)
print(pred.shape)   # one cluster label in 0..args.k-1 for each vector
print(mini_kmeans.n_iter_)

# Start every row's id with its top-level cluster label.
new_id_list.extend([class_] for class_ in pred)

print('Start Recursively Clustering...')
for i in range(args.k):
    print(i, "th cluster")
    # Row positions that fell into top-level cluster i.
    pos_lists = [id_ for id_, class_ in enumerate(pred) if class_ == i]
    classify_recursion(np.array(pos_lists))

# docid from the TSV -> hierarchical id built above (both follow the row order of X).
mapping = dict(zip(old_id, new_id_list))

with open(f'IDMapping_NQ_bert_{args.bert_size}_k{args.k}_c{args.c}_seed_{args.seed}.pkl', 'wb') as f:
    pickle.dump(mapping, f)

(449, 768)
Start First Clustering
(449,)
36
Start Recursively Clustering...
0 th cluster
1 th cluster
2 th cluster
3 th cluster
4 th cluster
5 th cluster


In [57]:
# Reload the docid -> hierarchical id mapping written by the clustering cell.
with open(f'IDMapping_NQ_bert_{args.bert_size}_k{args.k}_c{args.c}_seed_{args.seed}.pkl', "rb") as f:
    docid = pickle.load(f)

In [58]:
# Display the whole docid -> hierarchical id mapping.
docid

{0: [1, 1, 0, 5, 0],
 1: [0, 5, 0],
 2: [1, 2, 1, 0],
 3: [1, 2, 3, 1],
 4: [1, 4, 5, 1, 0],
 5: [3, 2, 0],
 6: [3, 0, 0, 0],
 7: [5, 1, 1, 0],
 8: [1, 4, 5, 4],
 9: [5, 4, 0],
 10: [5, 5, 0],
 11: [5, 4, 3],
 12: [5, 0, 2, 0],
 13: [5, 2, 2, 0],
 14: [1, 0, 3, 0],
 15: [5, 3, 0, 0],
 16: [5, 0, 0, 0],
 17: [0, 5, 1],
 18: [2, 5, 0, 0],
 19: [1, 2, 3, 2, 0],
 20: [1, 4, 5, 1, 1],
 21: [2, 0, 4, 0],
 22: [3, 4, 0, 0],
 23: [5, 5, 1],
 24: [2, 1, 3, 0],
 25: [1, 2, 1, 1],
 26: [4, 1, 0],
 27: [5, 2, 3],
 28: [1, 4, 1, 0],
 29: [3, 4, 0, 1],
 30: [1, 0, 3, 1],
 31: [1, 1, 5, 0],
 32: [5, 1, 2],
 33: [1, 3, 0, 1, 0],
 34: [1, 4, 5, 0],
 35: [1, 2, 2, 0],
 36: [3, 3, 2],
 37: [2, 0, 0, 0],
 38: [1, 1, 2, 0],
 39: [2, 3, 1, 0],
 40: [1, 1, 0, 3],
 41: [1, 5, 2, 0],
 42: [1, 2, 4, 0],
 43: [5, 1, 1, 1],
 44: [0, 2, 2],
 45: [5, 3, 5, 0],
 46: [1, 5, 4, 0],
 47: [5, 0, 2, 1],
 48: [1, 1, 0, 4, 0],
 49: [1, 1, 0, 4, 1],
 50: [5, 0, 3, 0],
 51: [3, 3, 3, 0],
 52: [1, 2, 2, 4, 0],
 53: [5, 0, 2, 

In [59]:
# Length of every hierarchical id, for the summary statistics below.
dv = list(docid.values())
lns = [len(code) for code in dv]

In [60]:
# Longest, shortest and mean hierarchical-id length.
max(lns), min(lns), sum(lns) / len(lns)

(5, 3, 3.8730512249443207)

In [61]:
# Reload the module -> integer id lookup saved earlier in this notebook.
with open("mod_id.pkl", "rb") as f:
    mod_id = pickle.load(f)

In [62]:
# Display the whole module -> integer id lookup.
mod_id

{'bleach ammonia': 0,
 'garden & flora': 1,
 'stationery & printed material & services': 2,
 'homecare merchandise': 3,
 'skin conditioning moisturising': 4,
 'wine still light table styles': 5,
 'sugar candy': 6,
 'snacks chips crisps reconstituted extruded': 7,
 'skin cleansing & toning': 8,
 'meat products fresh': 9,
 'dog food dry': 10,
 'meat cuts joints whole fresh fw': 11,
 'eggs egg products fresh': 12,
 'chocolate single variety': 13,
 'cough cold & other respiratory remedies & accessories': 14,
 'fruit orange fresh fw': 15,
 'cheese fresh fw': 16,
 'vegetables salad vegetables remaining varieties ambient': 17,
 'milk substitutes non flavoured ambient': 18,
 'clothing & personal accessories': 19,
 'skin treatments dermatologicals': 20,
 'biscuits sweet ambient': 21,
 'flavoured drinks carbonated non cola': 22,
 'cat food wet': 23,
 'vinegar ambient': 24,
 'home furnishings & decor': 25,
 'ice cream novelties sorbet yogurt frozen': 26,
 'yogurt spoonable fresh': 27,
 'hair cond

In [63]:
# Module name -> hierarchical id as an array, joining mod_id (name -> integer id)
# with docid (integer id -> list of cluster indices).
mod_code = {k: np.array(docid[v]) for k, v in mod_id.items()}

In [64]:
# Display the whole module -> hierarchical id mapping.
mod_code

{'bleach ammonia': array([1, 1, 0, 5, 0]),
 'garden & flora': array([0, 5, 0]),
 'stationery & printed material & services': array([1, 2, 1, 0]),
 'homecare merchandise': array([1, 2, 3, 1]),
 'skin conditioning moisturising': array([1, 4, 5, 1, 0]),
 'wine still light table styles': array([3, 2, 0]),
 'sugar candy': array([3, 0, 0, 0]),
 'snacks chips crisps reconstituted extruded': array([5, 1, 1, 0]),
 'skin cleansing & toning': array([1, 4, 5, 4]),
 'meat products fresh': array([5, 4, 0]),
 'dog food dry': array([5, 5, 0]),
 'meat cuts joints whole fresh fw': array([5, 4, 3]),
 'eggs egg products fresh': array([5, 0, 2, 0]),
 'chocolate single variety': array([5, 2, 2, 0]),
 'cough cold & other respiratory remedies & accessories': array([1, 0, 3, 0]),
 'fruit orange fresh fw': array([5, 3, 0, 0]),
 'cheese fresh fw': array([5, 0, 0, 0]),
 'vegetables salad vegetables remaining varieties ambient': array([0, 5, 1]),
 'milk substitutes non flavoured ambient': array([2, 5, 0, 0]),
 'cl

In [71]:
# Group module names by the first three entries of their hierarchical id; the
# key is the string form of that 3-element array slice.
lfs_dct = {}
for mod, did in mod_code.items():
    lfs_dct.setdefault(str(did[:3]), []).append(mod)

In [72]:
# Display which modules share each three-entry id prefix.
lfs_dct

{'[1 1 0]': ['bleach ammonia',
  'bath additives',
  'toilet cleaners fresheners',
  'carpet fresheners',
  'cleansing soap',
  'laundry detergents',
  'textile fresheners',
  'household cleaners',
  'hand sanitizers',
  'cleansing body wash',
  'household disinfectants',
  'fabric softeners',
  'household stain removers',
  'antiseptic products'],
 '[0 5 0]': ['garden & flora'],
 '[1 2 1]': ['stationery & printed material & services',
  'home furnishings & decor',
  'kitchen & tableware',
  'home do it yourself',
  'sport & leisure'],
 '[1 2 3]': ['homecare merchandise',
  'clothing & personal accessories',
  'toys',
  'personal care combination & travel packs',
  'cosmetic combination packs & gift sets',
  'first aid & medical surgical supplies',
  'shoe accessories',
  'confectionery & gift confectionery & novelty',
  'cosmetic accessories'],
 '[1 4 5]': ['skin conditioning moisturising',
  'skin cleansing & toning',
  'skin treatments dermatologicals',
  'fragrances cologne',
  'ma

In [73]:
# Persist the module -> hierarchical id mapping.
with open("mod_code.pkl", "wb") as f:
    pickle.dump(mod_code, f)

In [1]:
import pickle
# Reload the module -> hierarchical id mapping written by this notebook.
with open("mod_code.pkl", "rb") as f:
    mod_code = pickle.load(f)

In [2]:
# Length of every reloaded module code, for the summary statistics below.
dv = list(mod_code.values())
lns = [len(code) for code in dv]

In [3]:
# Longest, shortest and mean code length of the reloaded mapping.
max(lns), min(lns), sum(lns) / len(lns)

(5, 3, 3.8730512249443207)

In [None]:
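# NOTE: this notebook writes `_train.csv` and `_val.csv`; rename those files or adjust --train_data / --val_data so the paths below match.
# CUDA_VISIBLE_DEVICES=4,3 accelerate launch --num_processes=2 train_final.py --train_data train.csv --val_data val.csv --model_name "t5-small" --bs 64 --eval_every 1000 --pred_type br --num_epochs 16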