In [1]:
import os
from tqdm import tqdm
import pandas as pd
from contextlib import contextmanager  # for context management

In [2]:
# Relative path of the directory that holds the raw contest files.
DATA_DIR = "../../../data/raw/preliminary_contest_data/"
# File name of the user-feature dump inside DATA_DIR.
USER_DATA_FILE = "userFeature.data"

In [3]:
def load(filename, **kw):
    """Read a CSV file located in DATA_DIR into a DataFrame.

    Parameters
    ----------
    filename : str
        File name, joined onto DATA_DIR.
    **kw
        Forwarded to ``pd.read_csv``. ``engine`` defaults to ``"python"``
        when the caller does not supply it.

    Returns
    -------
    Whatever ``pd.read_csv`` returns for the given arguments.
    """
    # kw is a fresh dict for every call, so filling in the default here is safe.
    kw.setdefault("engine", "python")
    csv_path = os.path.join(DATA_DIR, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
@contextmanager
def open_files(f_dict):
    """Open several files for writing and close all of them on exit.

    Parameters
    ----------
    f_dict : dict
        Maps an arbitrary key to the path of the file to open. Each file is
        opened in text write mode (``"w"``), so existing content is truncated.

    Yields
    ------
    dict
        The same keys mapped to the opened file objects.

    The files are closed when the ``with`` block ends, including when the
    block raises or when opening one of the later files fails.
    """
    fs = {}
    try:
        # Open one at a time so that a failure half-way through still leaves
        # the already-opened handles registered for cleanup below.
        for k, v in f_dict.items():
            fs[k] = open(v, "w")
        yield fs
    finally:
        for f in fs.values():
            f.close()

In [5]:
class FileWritter:
    """Accumulate text in memory and write it to a file object on demand.

    Attributes
    ----------
    f
        Object with a ``write(str)`` method that receives the text on flush.
    buffer : str
        Text collected since the last flush or clear.
    """

    def __init__(self, f):
        self.f = f
        self.buffer = ""

    def write_buffer(self, chars):
        """Append ``chars`` to the pending text without touching the file."""
        self.buffer = self.buffer + chars

    def clear_buffer(self):
        """Discard the pending text."""
        self.buffer = ""

    def flush(self):
        """Write the pending text to ``f`` and then empty the buffer."""
        pending = self.buffer
        self.f.write(pending)
        # Reset only after the write returned, so a failed write keeps the text.
        self.buffer = ""

In [6]:
class FileWritterGroup:
    """A named collection of FileWritter objects, one per output file.

    Attributes
    ----------
    writters : dict
        Maps each key of ``f_dict`` to a FileWritter wrapping its file object.
    n_writters : int
        Number of writers in the group.
    """

    def __init__(self, f_dict):
        writters = {}
        for name, handle in f_dict.items():
            writters[name] = FileWritter(handle)
        self.writters = writters
        self.n_writters = len(writters)

    def write_buffer(self, name, chars):
        """Append ``chars`` to the buffer of the writer registered as ``name``."""
        target = self.writters[name]
        target.write_buffer(chars)

    def clear_buffer(self, name):
        """Discard the pending text of the writer registered as ``name``."""
        target = self.writters[name]
        target.clear_buffer()

    def clear_buffers(self):
        """Discard the pending text of every writer."""
        for writter in self.writters.values():
            writter.clear_buffer()

    def flush(self):
        """Flush every writer's pending text to its file."""
        for writter in self.writters.values():
            writter.flush()

In [7]:
# Read the three contest tables from DATA_DIR through load(), which uses the
# python CSV engine unless told otherwise. Later cells read the "aid" and "uid"
# columns of train/test and the "aid" and "productId" columns of adFeature.
df_train = load("train.csv")
df_test = load("test1.csv")
df_ad = load("adFeature.csv")

In [8]:
# Lookup table: ad id -> product id, built from adFeature.csv.
# The two columns are zipped directly instead of going through iterrows(),
# which builds a Series per row and coerces every column to one common dtype.
# If an aid appears more than once the last row wins, as with the row loop.
aid_to_product = dict(zip(df_ad["aid"], df_ad["productId"]))
print(len(aid_to_product))

173


In [9]:
# Build annotated copies of train/test that carry the productId of each row's ad.
# assign() returns a new frame, so df_train and df_test themselves stay unchanged.
df_train_copy = df_train.assign(productId=df_train["aid"].map(aid_to_product))
df_test_copy = df_test.assign(productId=df_test["aid"].map(aid_to_product))

In [10]:
# Distinct product ids present in adFeature.csv, as a plain Python list.
prod_ids = df_ad.loc[:, "productId"].unique().tolist()

In [11]:
# For each split, map every productId to the Index of the row labels that
# belong to it (the `.groups` mapping of a groupby on "productId").
product_to_trainidx, product_to_testidx = (
    annotated.groupby("productId").groups
    for annotated in (df_train_copy, df_test_copy)
)

In [12]:
# ===============
# split train.csv
# ===============
# Write one CSV per productId containing the train rows of that product.
out_folder = '../../../data/split/preliminary_contest_data/byproductId/'
os.makedirs(out_folder, exist_ok=True)  # create the directory if it does not exist yet
row_counts = 0  # running total of rows written, checked below
for product, row_labels in tqdm(product_to_trainidx.items()):
    subset = df_train.loc[row_labels]
    target_path = os.path.join(out_folder, "train.[productId='{}'].csv".format(product))
    subset.to_csv(target_path, index=False)
    row_counts += len(subset)
# Sanity check: every train row ended up in exactly one per-product file.
assert row_counts == df_train.shape[0]

100%|██████████| 33/33 [00:27<00:00,  1.20it/s]


In [13]:
# ===============
# split test1.csv
# ===============
# Write one CSV per productId containing the test rows of that product.
# Files go to out_folder, which the train-split cell defines and creates.
row_counts = 0  # running total of rows written, checked below
for product, row_labels in tqdm(product_to_testidx.items()):
    subset = df_test.loc[row_labels]
    target_path = os.path.join(out_folder, "test1.[productId='{}'].csv".format(product))
    subset.to_csv(target_path, index=False)
    row_counts += len(subset)
# Sanity check: every test row ended up in exactly one per-product file.
assert row_counts == df_test.shape[0]

100%|██████████| 33/33 [00:06<00:00,  5.21it/s]


In [None]:
# ============================
# get productId to userIds map
# ============================
# For every product, collect the uids that occur in the train or test rows
# belonging to that product.
product_to_user = {}  # key: productId, value: set of uids
for prod_id in tqdm(prod_ids):
    uids = set()
    for frame, groups in ((df_train, product_to_trainidx), (df_test, product_to_testidx)):
        # prod_ids comes from adFeature.csv, so a product may have no rows in
        # one of the splits; contribute nothing instead of raising KeyError.
        if prod_id in groups:
            # Select the column inside .loc so only "uid" is materialised,
            # rather than copying every column of the matching rows first.
            uids.update(frame.loc[groups[prod_id], "uid"].values)
    product_to_user[prod_id] = uids

100%|██████████| 33/33 [00:04<00:00,  7.04it/s]


In [None]:
# ======================
# split userFeature.data
# ======================
# Stream the user-feature file once and append each line to the output file of
# every product whose train/test rows reference that line's uid. Lines are
# buffered per product and written out every `batch_size` input lines.
line_counts = 11420039  # expected number of input lines; only sizes the progress bar

batch_size = 1000  # input lines processed between two flushes
batch_counts = (line_counts - 1) // batch_size + 1  # ceil(line_counts / batch_size)
product_to_filenames = {prod_id: os.path.join(out_folder, "userFeature.[productId='{}'].data".format(prod_id))
                        for prod_id in prod_ids}
skipped_lines = 0  # lines whose uid could not be parsed
with open(os.path.join(DATA_DIR, USER_DATA_FILE), "r") as user_f:
    with open_files(product_to_filenames) as prod_fs:
        fwg = FileWritterGroup(prod_fs)
        with tqdm(total=batch_counts) as progress:
            pending = 0  # lines read since the last flush
            # Iterate the file itself: the loop ends at the real end of file
            # instead of relying on the hard-coded line count.
            for ln in user_f:
                try:
                    # The uid is read from a fixed window after a 4-character
                    # prefix, up to the first "|".
                    # NOTE(review): presumably lines look like "uid <digits>|..." - confirm.
                    uid = int(ln[4:16].split("|")[0])
                except ValueError:
                    # Count and skip unparsable lines rather than hiding them
                    # (and any other error) behind a blanket except.
                    skipped_lines += 1
                else:
                    for prod_id, required_uid_set in product_to_user.items():
                        if uid in required_uid_set:
                            fwg.write_buffer(prod_id, ln)
                pending += 1
                if pending == batch_size:
                    fwg.flush()
                    progress.update(1)
                    pending = 0
            if pending:
                # Write whatever the last, partial batch left in the buffers.
                fwg.flush()
                progress.update(1)
if skipped_lines:
    print("skipped {} line(s) without a parsable uid".format(skipped_lines))
print("done.")

 97%|█████████▋| 11089/11421 [02:13<00:03, 83.26it/s]