In [1]:
from contextlib import contextmanager  # for context management
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import io_utils as iu
import config

In [2]:
# ================
# set up constants
# ================
# Directory holding the raw contest data. It can be overridden without editing the
# notebook by setting the PRELIMINARY_CONTEST_DATA_DIR environment variable, e.g. to
# the relative location '../data/raw/preliminary_contest_data/' that this cell used
# to keep as a commented-out alternative. When the variable is unset or empty, the
# path the notebook was originally run with is used, so existing runs are unchanged.
DATA_DIR = (os.environ.get('PRELIMINARY_CONTEST_DATA_DIR')
            or '/mnt/d/DataShortcut/raw/preliminary_contest_data/')
USER_DATA_FILE = 'userFeature.data'
USER_DATA_PATH = os.path.join(DATA_DIR, USER_DATA_FILE)
# Number of lines in the user feature file; the recorded run reported 11420039.
line_counts = iu.count_file_lines(USER_DATA_PATH)
print("Line Counts: {}".format(line_counts))

# ==========
# load lines
# ==========
# Read at most `line_counts` lines from the user feature file into memory, stripped
# of surrounding whitespace. Reading stops early at end-of-file so that a file shorter
# than `line_counts` is detected by the check below instead of being silently padded
# with empty strings.
lines = []
with open(USER_DATA_PATH) as f:
    for _ in tqdm(range(line_counts), desc="loading lines"):
        raw_line = f.readline()
        if not raw_line:
            # readline() returns '' only at EOF; a blank line would still be '\n'.
            break
        lines.append(raw_line.strip())
# Compare against what was actually read; this also works when line_counts is 0,
# where no loop variable would have been bound.
assert len(lines) == line_counts, \
    "expected {} lines in {} but read {}".format(line_counts, USER_DATA_PATH, len(lines))

# ============
# set up paths
# ============
# Output directory for the per-feature split files; created if it is missing.
out_folder = '../data/split/preliminary_contest_data/byUserFeatureName/'
os.makedirs(out_folder, exist_ok=True)
feat_names = config.USER_FEAT_NAMES
# Map each user feature name to the path of the file that will hold that feature.
feat_to_filenames = dict(
    (name, os.path.join(out_folder, "userFeature.[featureName='{}'].data".format(name)))
    for name in feat_names
)

loading lines:   0%|          | 49019/11420039 [00:00<00:23, 490125.03it/s]

Line Counts: 11420039


loading lines: 100%|██████████| 11420039/11420039 [00:20<00:00, 544711.03it/s]


In [3]:
# Split the user feature lines into one output file per feature. Each output file
# starts with a "uid|<feature name>" header and then receives one "uid|..." row per
# successfully parsed input line. Lines are buffered and flushed in batches.
batch_size = 100  # number of input lines buffered between flushes
batch_counts = int(np.ceil(line_counts / batch_size))
bad_lines = []  # (1-based line number, repr of the parse error) for each skipped line

with iu.open_files(feat_to_filenames) as feat_fs:
    fwg = iu.FileWriterGroup(feat_fs)
    for name, writer in fwg.writers.items():
        writer.write_buffer("uid|{}\n".format(name))  # write header
        writer.flush()
    for batch_i in tqdm(range(batch_counts)):
        batch_start = batch_i * batch_size
        line_batch = lines[batch_start:batch_start + batch_size]
        fwg.clear_buffers()
        for offset, line in enumerate(line_batch):
            # Parse the whole record BEFORE touching any buffer, so a malformed line
            # can never leave a half-written row (a "uid|" without its newline) behind.
            try:
                feats = line.split("|")
                uid = feats[0].split(" ")[1]  # IndexError if the first field has no second token
                pairs = []
                for feat in feats[1:]:
                    # only do split for once; e.g. key: "interest1", val: "xxx yyy ..."
                    key, val = feat.split(" ", maxsplit=1)  # ValueError if there is no value part
                    pairs.append((key, val))
            except (IndexError, ValueError) as e:
                # Best effort: skip only the offending line, not the rest of its batch,
                # and remember where it was so the loss is visible afterwards.
                bad_lines.append((batch_start + offset + 1, repr(e)))
                continue
            fwg.write_buffers("{}|".format(uid))
            for key, val in pairs:
                fwg.write_buffer(key, val)
            fwg.write_buffers("\n")
        # Errors raised by the writers themselves are deliberately not caught here:
        # they propagate rather than being hidden behind a printed message.
        fwg.flush()
if bad_lines:
    print("skipped {} malformed line(s); first few (line number, error): {}".format(
        len(bad_lines), bad_lines[:5]))
print("done.")

100%|██████████| 114201/114201 [08:36<00:00, 221.12it/s]


done.


In [4]:
# Sanity check on the split output: count the lines of every per-feature file and
# compare them with the number of input lines.
line_count_dict = {feat_name: iu.count_file_lines(filepath) for feat_name, filepath in feat_to_filenames.items()}
# A count of line_counts + 1 is tolerated; the original note on this check reads
# "there is one empty line at the end".
# NOTE(review): each output file also begins with a header row — confirm how
# iu.count_file_lines counts before tightening this tolerance.
accepted_counts = (line_counts, line_counts + 1)
# Collect every offending file first so that a failure names all of them at once.
mismatched = {feat_name: lc for feat_name, lc in line_count_dict.items() if lc not in accepted_counts}
assert not mismatched, \
    "unexpected line counts (expected {} or {}): {}".format(line_counts, line_counts + 1, mismatched)