In [1]:
from tqdm import tqdm
from collections import Counter
import os
import sys
import numpy as np
import multiprocessing as mp
from time import gmtime, strftime
import gc
import pickle
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code')
import perf_utils as pu
import data_utils as du
import io_utils as iu
import config

In [2]:
# Location of the raw user-feature dump.
# NOTE(review): the active DATA_DIR is an absolute, machine-specific path; the
# commented-out line is the repository-relative alternative.
# DATA_DIR = '../../../data/raw/preliminary_contest_data/'
DATA_DIR = '/mnt/d/DataShortcut/raw/preliminary_contest_data/'
USER_DATA_FILE = 'userFeature.data'
USER_DATA_PATH = os.path.join(DATA_DIR, USER_DATA_FILE)
# Fail with an explicit error instead of an `assert`, which is skipped under -O.
if not os.path.exists(USER_DATA_PATH):
    raise FileNotFoundError(USER_DATA_PATH)

# The count is only used for the message below and as the progress-bar total.
with pu.profiler("counting lines"):
    line_counts = iu.count_file_lines(USER_DATA_PATH)
print("{} lines in userFeature.data".format(line_counts))

# Iterate over the file handle itself rather than calling readline() exactly
# `line_counts` times: the loaded data then no longer depends on how the helper
# counts lines (a miscount would drop trailing lines or append empty strings).
with open(USER_DATA_PATH) as f:
    lines = [line.strip() for line in tqdm(f, total=line_counts, desc="loading lines")]
print("Memory Usage at this moment: {}".format(pu.get_memory_str()))

loading lines:   0%|          | 47297/11420039 [00:00<00:24, 472857.71it/s]

[09:21:12] Finish counting lines. △M: +773.91MB. △T: 24.5 seconds.
11420039 lines in userFeature.data


loading lines: 100%|██████████| 11420039/11420039 [00:34<00:00, 328740.96it/s]

Memory Usage at this moment: 4.98GB





In [3]:
# Feature names taken from the project config; the helpers below use them as
# the keys of every per-feature vocabulary dict.
feat_names = config.USER_FEAT_NAMES


def split_list(lst, n_split=4):
    """Cut ``lst`` into consecutive slices of (nearly) equal length.

    The slice length is ``ceil(len(lst) / n_split)``, so the result holds at
    most ``n_split`` slices and may hold fewer (e.g. 9 items with
    ``n_split=4`` yields 3 slices of 3). The last slice may be shorter than
    the others.

    Args:
        lst: Any sliceable sequence supporting ``len()``.
        n_split: Requested number of slices.

    Returns:
        A list of slices of ``lst`` in original order; ``[]`` when ``lst`` is
        empty.
    """
    n_total = len(lst)
    if n_total == 0:
        # A zero step would make range() raise ValueError; nothing to split.
        return []
    # Ceiling division without going through floating point.
    step = int(-(-n_total // n_split))
    return [lst[offset:offset + step] for offset in range(0, n_total, step)]


def fast_get_word_from_lines(lines):
    """Collect the distinct values seen for each feature across ``lines``.

    Every line is split on ``|``; the first field is skipped and each
    remaining field is read as ``"<feature name> <value> <value> ..."``
    separated by single spaces.

    Args:
        lines: Iterable of strings. It is only iterated, so it does not need
            to support ``len()``.

    Returns:
        Dict mapping every name in the module-level ``feat_names`` to the set
        of value strings found for it (an empty set if it never appears).

    Raises:
        KeyError: If a field's name is not in ``feat_names``.
    """
    vocabs = {feat_name: set() for feat_name in feat_names}
    for line in lines:
        for feat in line.split("|")[1:]:
            # First token is the feature name, the rest are its values.
            key, *vals = feat.split(" ")
            vocabs[key].update(vals)
    return vocabs


def merge_set_dicts(set_dict_list):
    """Union several per-feature set dicts into a single dict.

    Args:
        set_dict_list: Iterable of dicts, each holding an entry for every name
            in the module-level ``feat_names``.

    Returns:
        A new dict keyed by ``feat_names`` whose value for a name is the union
        of that name's entries across all input dicts.
    """
    return {
        name: set().union(*(partial[name] for partial in set_dict_list))
        for name in feat_names
    }

In [4]:
def batch_get_words(lines, n_procs=None, n_batches=None):
    """Build the per-feature vocabulary of ``lines`` using a process pool.

    ``lines`` is cut into batches with ``split_list``, each batch is handed to
    ``fast_get_word_from_lines`` in a worker process, and the partial results
    are combined with ``merge_set_dicts``.

    Args:
        lines: Sequence of input lines.
        n_procs: Number of worker processes; defaults to ``mp.cpu_count()``.
        n_batches: Number of batches to cut ``lines`` into; defaults to
            ``n_procs`` (one batch per worker).

    Returns:
        Dict mapping each feature name to the set of values seen in ``lines``.
    """
    n_procs = mp.cpu_count() if n_procs is None else n_procs
    n_batches = n_procs if n_batches is None else n_batches
    batches = split_list(lines, n_batches)
    # The context manager tears the pool down even if a worker raises;
    # map() has already collected every result by the time the block exits.
    with mp.Pool(processes=n_procs) as pool:
        sets_list = pool.map(fast_get_word_from_lines, batches, chunksize=1)
    return merge_set_dicts(sets_list)

In [5]:
# Process the lines in outer batches; each outer batch is itself fanned out
# over N_PROCS worker processes by batch_get_words.
n_batches = 100
N_PROCS = 8
line_batches = split_list(lines, n_batches)
# split_list may return fewer slices than requested, so report progress
# against the number of batches that actually exist.
n_line_batches = len(line_batches)
vocab_dicts = []
for i, line_batch in enumerate(line_batches):
    vocab_dicts.append(batch_get_words(line_batch, N_PROCS))
    print("[{}] Batch {}/{} Done.".format(pu.get_time_str(), i+1, n_line_batches))

with pu.profiler("merging results"):
    vocab_dict = merge_set_dicts(vocab_dicts)
print("Memory Usage at this moment: {}".format(pu.get_memory_str()))

[09:22:21] Batch 1/100 Done.
[09:22:29] Batch 2/100 Done.
[09:22:36] Batch 3/100 Done.
[09:22:44] Batch 4/100 Done.
[09:22:50] Batch 5/100 Done.
[09:22:58] Batch 6/100 Done.
[09:23:06] Batch 7/100 Done.
[09:23:14] Batch 8/100 Done.
[09:23:21] Batch 9/100 Done.
[09:23:30] Batch 10/100 Done.
[09:23:38] Batch 11/100 Done.
[09:23:45] Batch 12/100 Done.
[09:23:53] Batch 13/100 Done.
[09:24:01] Batch 14/100 Done.
[09:24:08] Batch 15/100 Done.
[09:24:17] Batch 16/100 Done.
[09:24:25] Batch 17/100 Done.
[09:24:32] Batch 18/100 Done.
[09:24:40] Batch 19/100 Done.
[09:24:48] Batch 20/100 Done.
[09:24:56] Batch 21/100 Done.
[09:25:04] Batch 22/100 Done.
[09:25:11] Batch 23/100 Done.
[09:25:19] Batch 24/100 Done.
[09:25:26] Batch 25/100 Done.
[09:25:35] Batch 26/100 Done.
[09:25:43] Batch 27/100 Done.
[09:25:51] Batch 28/100 Done.
[09:26:00] Batch 29/100 Done.
[09:26:08] Batch 30/100 Done.
[09:26:16] Batch 31/100 Done.
[09:26:23] Batch 32/100 Done.
[09:26:31] Batch 33/100 Done.
[09:26:40] Batch 34

NameError: name 'get_memory_str' is not defined

In [7]:
# Write one pickle per feature, each holding that feature's vocabulary as a list.
vocab_folder = '../../../data/vocabulary/preliminary_contest_data/'
os.makedirs(vocab_folder, exist_ok=True)
for feat_name, vocab in vocab_dict.items():
    vocab_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    vocab_path = os.path.join(vocab_folder, vocab_file)
    # Sort before saving: iteration order of a set of strings is not stable
    # across interpreter runs, so list(vocab) would give a different ordering
    # (and thus different positions for each value) every time this is re-run.
    du.save_pickle(sorted(vocab), vocab_path)