In [1]:
from tqdm import tqdm
from collections import Counter
import os
import sys
import numpy as np
import multiprocessing as mp
from time import gmtime, strftime
import gc
import pickle
import sys
sys.path.append('../code/utils')
from perf_utils import get_memory_str

In [2]:
# Folder holding the raw contest data, given relative to the notebook's working directory.
DATA_DIR = '../data/raw/preliminary_contest_data/'
# File name of the user-feature dump that the cells below read line by line.
USER_DATA_FILE = 'userFeature.preliminary.data'
# Full path to the user-feature file, assembled from the two constants above.
USER_DATA_PATH = os.path.join(DATA_DIR, USER_DATA_FILE)

In [3]:
def count_file_lines(filepath):
    """Count the lines in a text file, reporting progress through tqdm.

    The file is streamed line by line, so memory use does not grow with the
    file size, and the handle is always closed on exit.

    Args:
        filepath: Path of the text file to scan.

    Returns:
        The number of lines in the file; 0 for an empty file.
    """
    n_lines = 0
    # Iterating the handle reads one line at a time; readlines() would load
    # the entire file into memory just to count it.
    with open(filepath) as f:
        for _ in tqdm(f):
            n_lines += 1
    return n_lines

In [4]:
# Counting the lines needs a full pass over the file, so the count is hard-coded below.
# To recount, uncomment the next line and drop the hard-coded assignment.
# line_counts = count_file_lines(os.path.join(DATA_DIR, USER_DATA_FILE))
line_counts = 9686953  # hard-coded line count; presumably recorded from an earlier run -- verify if the file changes
print("{} lines in userFeature.data".format(line_counts))
print("Memory Usage at this moment: {}".format(get_memory_str()))

9686953 lines in userFeature.data
Memory Usage at this moment: 51.11MB


In [5]:
# Load the whole user-feature file into memory as a list of stripped strings.
lines = []
with open(USER_DATA_PATH) as f:
    # Iterate the handle itself so that exactly the lines present in the file
    # are read. `line_counts` only sizes the progress bar; if it is stale the
    # bar is off, but no line is dropped and no empty string is appended.
    for line in tqdm(f, total=line_counts):
        lines.append(line.strip())
print("Memory Usage at this moment: {}".format(get_memory_str()))

100%|██████████| 9686953/9686953 [00:17<00:00, 556975.53it/s]

Memory Usage at this moment: 4.21GB





In [6]:
# Every feature name that may appear in a user-feature line; the counting and
# merging helpers create exactly one Counter per entry.
feat_names = ["age", "gender", "marriageStatus", "education", "consumptionAbility", "LBS",
              "interest1", "interest2", "interest3", "interest4", "interest5",
              "kw1", "kw2", "kw3", "topic1", "topic2", "topic3", "appIdInstall",
              "appIdAction", "ct", "os", "carrier", "house"]


def fast_count_feature_from_lines(lines):
    """Count how often each value occurs for every feature in `lines`.

    Each line is split on "|". The first field is skipped (presumably the uid
    field -- verify against the data file). Every remaining field is split on
    single spaces into a feature name followed by its values. Values are
    counted as strings; no validation is performed, which keeps the loop fast.

    Args:
        lines: Iterable of already-stripped text lines.

    Returns:
        Dict mapping every name in `feat_names` to a Counter of its values.
        Features that never occur map to an empty Counter.

    Raises:
        KeyError: If a line contains a feature name not listed in `feat_names`.
    """
    counters = {feat_name: Counter() for feat_name in feat_names}
    for line in lines:
        for feat in line.split("|")[1:]:
            key, *vals = feat.split(" ")
            # update() adds the values in place. The previous
            # `counters[key] += Counter(vals)` built a temporary Counter and
            # then rescanned the whole accumulated counter on every field.
            counters[key].update(vals)
    return counters

In [7]:
def split_list(lst, n_split=4):
    """Cut `lst` into at most `n_split` consecutive chunks of equal size.

    The chunk size is ceil(len(lst) / n_split), so the last chunk may be
    shorter and fewer than `n_split` chunks are returned when the list does
    not divide that way (e.g. 3 items with n_split=4 give 3 chunks).

    Args:
        lst: Sequence supporting len() and slicing.
        n_split: Desired maximum number of chunks; must be at least 1.

    Returns:
        List of slices of `lst`, in order. An empty `lst` yields [].

    Raises:
        ValueError: If `n_split` is smaller than 1.
    """
    if n_split < 1:
        raise ValueError("n_split must be at least 1, got {}".format(n_split))
    n_total = len(lst)
    if n_total == 0:
        # A zero step would make range() raise; nothing to split anyway.
        return []
    step = int(np.ceil(n_total / n_split))
    return [lst[offset:offset + step] for offset in range(0, n_total, step)]

In [8]:
def merge_counter_dicts(counter_dict_list, feature_names=None):
    """Sum several per-feature counter dicts into one.

    Args:
        counter_dict_list: Iterable of dicts, each mapping a feature name to a
            Counter (or any mapping of value -> count). Counts are assumed to
            be non-negative, as produced by counting occurrences.
        feature_names: Names to merge. Defaults to the module-level
            `feat_names`, which is the original behaviour.

    Returns:
        A new dict with one freshly built Counter per requested feature name.
        The input dicts are not modified. An empty `counter_dict_list` yields
        an empty Counter for every name.

    Raises:
        KeyError: If one of the input dicts lacks a requested feature name.
    """
    names = feat_names if feature_names is None else feature_names
    final_counter_dict = {}
    for feat_name in names:
        final_counter = Counter()
        for counter_dict in counter_dict_list:
            # update() only touches the incoming keys, whereas `+=` rescans
            # the whole accumulated counter after every single addition.
            final_counter.update(counter_dict[feat_name])
        final_counter_dict[feat_name] = final_counter
    return final_counter_dict

In [9]:
def batch_count_features(lines, n_procs=None, n_batches=None):
    """Count feature values in `lines` using a pool of worker processes.

    The lines are cut into batches with `split_list`, each batch is counted by
    `fast_count_feature_from_lines` in a worker, and the partial results are
    combined with `merge_counter_dicts`. Start and end are logged with
    timestamps taken from gmtime(), i.e. in UTC.

    Args:
        lines: List of text lines to count.
        n_procs: Number of worker processes; defaults to the CPU count.
        n_batches: Number of batches to cut `lines` into; defaults to
            `n_procs` (one batch per worker). This argument used to be
            accepted but ignored.

    Returns:
        Dict mapping each feature name to a Counter of its values.
    """
    print("[{}] Starting counting features.".format(strftime("%H:%M:%S", gmtime())))
    n_procs = mp.cpu_count() if n_procs is None else n_procs
    n_batches = n_procs if n_batches is None else n_batches
    batches = split_list(lines, n_batches)
    # The context manager tears the pool down even if a worker raises, so no
    # orphaned processes are left behind. Results are collected inside the
    # block because leaving it terminates the workers.
    with mp.Pool(processes=n_procs) as pool:
        pending = [pool.apply_async(fast_count_feature_from_lines, (batch, )) for batch in batches]
        counters_list = [job.get() for job in pending]
    final_counter_dict = merge_counter_dicts(counters_list)
    print("[{}] Combining Finished. Memory Usage: {}".format(strftime("%H:%M:%S", gmtime()), get_memory_str()))
    return final_counter_dict

In [10]:
# Work through the loaded lines in sequential chunks; each chunk is counted in
# parallel and its result is kept so that all chunks can be merged afterwards.
n_batches = 500
line_batches = split_list(lines, n_batches)
counter_dicts = []  # one per-feature counter dict per chunk
for i, line_batch in enumerate(line_batches):
    print("Batch {}/{}".format(i, n_batches))  # i is zero-based
    # Second argument is n_procs: every chunk is counted by 4 worker processes.
    counter_dicts.append(batch_count_features(line_batch, 4))
print("Memory Usage at this moment: {}".format(get_memory_str()))

Batch 0/500
[02:58:11] Starting counting features.
[02:58:19] Combining Finished. Memory Usage: 4.31GB
Batch 1/500
[02:58:19] Starting counting features.
[02:58:26] Combining Finished. Memory Usage: 4.32GB
Batch 2/500
[02:58:26] Starting counting features.
[02:58:34] Combining Finished. Memory Usage: 4.32GB
Batch 3/500
[02:58:34] Starting counting features.
[02:58:41] Combining Finished. Memory Usage: 4.33GB
Batch 4/500
[02:58:41] Starting counting features.
[02:58:57] Combining Finished. Memory Usage: 4.33GB
Batch 5/500
[02:58:57] Starting counting features.
[02:59:06] Combining Finished. Memory Usage: 4.34GB
Batch 6/500
[02:59:06] Starting counting features.
[02:59:15] Combining Finished. Memory Usage: 4.34GB
Batch 7/500
[02:59:15] Starting counting features.
[02:59:22] Combining Finished. Memory Usage: 4.35GB
Batch 8/500
[02:59:22] Starting counting features.
[02:59:31] Combining Finished. Memory Usage: 4.35GB
Batch 9/500
[02:59:31] Starting counting features.
[02:59:40] Combining F

[03:09:08] Combining Finished. Memory Usage: 4.74GB
Batch 80/500
[03:09:08] Starting counting features.
[03:09:17] Combining Finished. Memory Usage: 4.74GB
Batch 81/500
[03:09:17] Starting counting features.
[03:09:26] Combining Finished. Memory Usage: 4.75GB
Batch 82/500
[03:09:26] Starting counting features.
[03:09:34] Combining Finished. Memory Usage: 4.75GB
Batch 83/500
[03:09:34] Starting counting features.
[03:09:41] Combining Finished. Memory Usage: 4.76GB
Batch 84/500
[03:09:41] Starting counting features.
[03:09:49] Combining Finished. Memory Usage: 4.77GB
Batch 85/500
[03:09:49] Starting counting features.
[03:09:56] Combining Finished. Memory Usage: 4.77GB
Batch 86/500
[03:09:56] Starting counting features.
[03:10:04] Combining Finished. Memory Usage: 4.77GB
Batch 87/500
[03:10:04] Starting counting features.
[03:10:11] Combining Finished. Memory Usage: 4.78GB
Batch 88/500
[03:10:11] Starting counting features.
[03:10:19] Combining Finished. Memory Usage: 4.79GB
Batch 89/500

[03:20:11] Combining Finished. Memory Usage: 5.17GB
Batch 159/500
[03:20:11] Starting counting features.
[03:20:18] Combining Finished. Memory Usage: 5.17GB
Batch 160/500
[03:20:18] Starting counting features.
[03:20:26] Combining Finished. Memory Usage: 5.17GB
Batch 161/500
[03:20:26] Starting counting features.
[03:20:34] Combining Finished. Memory Usage: 5.18GB
Batch 162/500
[03:20:34] Starting counting features.
[03:20:43] Combining Finished. Memory Usage: 5.19GB
Batch 163/500
[03:20:43] Starting counting features.
[03:20:51] Combining Finished. Memory Usage: 5.19GB
Batch 164/500
[03:20:51] Starting counting features.
[03:21:00] Combining Finished. Memory Usage: 5.19GB
Batch 165/500
[03:21:00] Starting counting features.
[03:21:08] Combining Finished. Memory Usage: 5.2GB
Batch 166/500
[03:21:08] Starting counting features.
[03:21:16] Combining Finished. Memory Usage: 5.21GB
Batch 167/500
[03:21:16] Starting counting features.
[03:21:24] Combining Finished. Memory Usage: 5.22GB
Batc

[03:31:34] Combining Finished. Memory Usage: 5.59GB
Batch 238/500
[03:31:34] Starting counting features.
[03:31:42] Combining Finished. Memory Usage: 5.6GB
Batch 239/500
[03:31:42] Starting counting features.
[03:31:51] Combining Finished. Memory Usage: 5.61GB
Batch 240/500
[03:31:51] Starting counting features.
[03:32:00] Combining Finished. Memory Usage: 5.61GB
Batch 241/500
[03:32:00] Starting counting features.
[03:32:09] Combining Finished. Memory Usage: 5.62GB
Batch 242/500
[03:32:09] Starting counting features.
[03:32:18] Combining Finished. Memory Usage: 5.62GB
Batch 243/500
[03:32:18] Starting counting features.
[03:32:27] Combining Finished. Memory Usage: 5.63GB
Batch 244/500
[03:32:27] Starting counting features.
[03:32:36] Combining Finished. Memory Usage: 5.63GB
Batch 245/500
[03:32:36] Starting counting features.
[03:32:44] Combining Finished. Memory Usage: 5.64GB
Batch 246/500
[03:32:44] Starting counting features.
[03:32:53] Combining Finished. Memory Usage: 5.64GB
Batc

[03:43:19] Combining Finished. Memory Usage: 6.02GB
Batch 317/500
[03:43:19] Starting counting features.
[03:43:27] Combining Finished. Memory Usage: 6.03GB
Batch 318/500
[03:43:27] Starting counting features.
[03:43:37] Combining Finished. Memory Usage: 6.03GB
Batch 319/500
[03:43:37] Starting counting features.
[03:43:46] Combining Finished. Memory Usage: 6.04GB
Batch 320/500
[03:43:46] Starting counting features.
[03:43:55] Combining Finished. Memory Usage: 6.04GB
Batch 321/500
[03:43:55] Starting counting features.
[03:44:05] Combining Finished. Memory Usage: 6.05GB
Batch 322/500
[03:44:05] Starting counting features.
[03:44:15] Combining Finished. Memory Usage: 6.05GB
Batch 323/500
[03:44:15] Starting counting features.
[03:44:23] Combining Finished. Memory Usage: 6.06GB
Batch 324/500
[03:44:23] Starting counting features.
[03:44:32] Combining Finished. Memory Usage: 6.07GB
Batch 325/500
[03:44:32] Starting counting features.
[03:44:41] Combining Finished. Memory Usage: 6.07GB
Bat

[03:55:16] Combining Finished. Memory Usage: 6.45GB
Batch 396/500
[03:55:16] Starting counting features.
[03:55:25] Combining Finished. Memory Usage: 6.45GB
Batch 397/500
[03:55:25] Starting counting features.
[03:55:35] Combining Finished. Memory Usage: 6.46GB
Batch 398/500
[03:55:35] Starting counting features.
[03:55:44] Combining Finished. Memory Usage: 6.47GB
Batch 399/500
[03:55:44] Starting counting features.
[03:55:53] Combining Finished. Memory Usage: 6.47GB
Batch 400/500
[03:55:53] Starting counting features.
[03:56:03] Combining Finished. Memory Usage: 6.48GB
Batch 401/500
[03:56:03] Starting counting features.
[03:56:11] Combining Finished. Memory Usage: 6.48GB
Batch 402/500
[03:56:12] Starting counting features.
[03:56:21] Combining Finished. Memory Usage: 6.49GB
Batch 403/500
[03:56:21] Starting counting features.
[03:56:30] Combining Finished. Memory Usage: 6.49GB
Batch 404/500
[03:56:30] Starting counting features.
[03:56:38] Combining Finished. Memory Usage: 6.5GB
Batc

[04:07:27] Combining Finished. Memory Usage: 6.88GB
Batch 475/500
[04:07:27] Starting counting features.
[04:07:37] Combining Finished. Memory Usage: 6.88GB
Batch 476/500
[04:07:37] Starting counting features.
[04:07:47] Combining Finished. Memory Usage: 6.89GB
Batch 477/500
[04:07:47] Starting counting features.
[04:07:56] Combining Finished. Memory Usage: 6.89GB
Batch 478/500
[04:07:56] Starting counting features.
[04:08:05] Combining Finished. Memory Usage: 6.9GB
Batch 479/500
[04:08:05] Starting counting features.
[04:08:14] Combining Finished. Memory Usage: 6.91GB
Batch 480/500
[04:08:14] Starting counting features.
[04:08:24] Combining Finished. Memory Usage: 6.91GB
Batch 481/500
[04:08:24] Starting counting features.
[04:08:33] Combining Finished. Memory Usage: 6.92GB
Batch 482/500
[04:08:33] Starting counting features.
[04:08:42] Combining Finished. Memory Usage: 6.92GB
Batch 483/500
[04:08:42] Starting counting features.
[04:08:52] Combining Finished. Memory Usage: 6.93GB
Batc

In [11]:
# Fold the per-chunk counter dicts into one dict of per-feature Counters.
# The timestamps printed around the merge come from gmtime(), i.e. they are UTC.
print("[{}] Starting merging results.".format(strftime("%H:%M:%S", gmtime())))
counter_dict = merge_counter_dicts(counter_dicts)
print("[{}] Merging done.".format(strftime("%H:%M:%S", gmtime())))
print("Memory Usage at this moment: {}".format(get_memory_str()))

[04:11:23] Starting merging results.
[04:11:55] Merging done.
Memory Usage at this moment: 7.03GB


In [15]:
# The per-chunk counters are no longer needed once merged, so release them.
# clear() empties the list in one step; deleting index 0 in a loop shifts the
# remaining elements on every iteration.
counter_dicts.clear()
del counter_dicts
gc.collect()
print("Memory Usage at this moment: {}".format(get_memory_str()))

Memory Usage at this moment: 6.91GB


In [19]:
def counter_to_csv(counter, filepath):
    """Write a Counter to a two-column CSV file, most frequent value first.

    The file starts with the header row "value,counts", followed by one row
    per entry in `counter.most_common()` order. Fields containing a comma,
    quote or line break are quoted by the csv module, so such values no longer
    corrupt the file. Plain values produce the same text as before.

    Args:
        counter: collections.Counter to export.
        filepath: Destination path; an existing file is overwritten.
    """
    import csv  # local import keeps this cell runnable without touching the import cell

    # newline="" is what the csv module expects of its file object, and
    # lineterminator="\n" keeps the "\n" row endings of the original output.
    with open(filepath, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["value", "counts"])
        writer.writerows(counter.most_common())

In [21]:
# Export one CSV of value counts per feature.
out_folder = '../data/counter/preliminary_contest_data/'
os.makedirs(out_folder, exist_ok=True)  # create the folder on first run, reuse it afterwards
for feat_name, counter in counter_dict.items():
    out_file = "userFeature.[featureName='{}'].csv".format(feat_name)
    out_path = os.path.join(out_folder, out_file)
    counter_to_csv(counter, out_path)

In [23]:
def save_as_pickle(obj, filepath):
    """Pickle `obj` into the file at `filepath`, replacing any existing file.

    Args:
        obj: Any picklable object.
        filepath: Destination path, opened in binary write mode.
    """
    with open(filepath, "wb") as sink:
        pickle.Pickler(sink).dump(obj)

In [25]:
# Save, for every feature, the list of distinct values seen as a pickle file.
vocab_folder = '../data/vocabulary/preliminary_contest_data/'
os.makedirs(vocab_folder, exist_ok=True)  # create the folder on first run, reuse it afterwards
for feat_name, counter in counter_dict.items():
    vocab_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    vocab_path = os.path.join(vocab_folder, vocab_file)
    # Only the keys are stored; the counts are not part of the vocabulary file.
    save_as_pickle(list(counter.keys()), vocab_path)

In [37]:
def merge_counter_dict(counter_dict):
    """Flatten a dict of per-feature Counters into a single Counter.

    Every value is re-keyed as "<feature name>_<value>" so that identical
    values belonging to different features stay distinct.

    Args:
        counter_dict: Dict mapping a feature name to a Counter of its values.

    Returns:
        One Counter holding the prefixed keys of all features.
    """
    merged = Counter()
    for feature, value_counts in counter_dict.items():
        prefixed = Counter()
        for value, count in value_counts.items():
            prefixed["{}_{}".format(feature, value)] = count
        merged += prefixed
    return merged

In [38]:
# Flatten the per-feature counters into one Counter keyed by "<feature>_<value>".
large_counter = merge_counter_dict(counter_dict)
print("Memory Usage at this moment: {}".format(get_memory_str()))
print("Cleaning memory...")
# The per-feature dict is now redundant. clear() drops every entry at once and
# cannot raise KeyError if a feature name happens to be missing.
counter_dict.clear()
del counter_dict
gc.collect()
print("Memory Usage at this moment: {}".format(get_memory_str()))

Memory Usage at this moment: 6.91GB
Cleaning memory...
Memory Usage at this moment: 5.77GB


In [44]:
# Write the combined counter (all features in one table) as a single CSV file.
out_folder = '../data/counter/preliminary_contest_data/'
out_file = "userFeature.csv"
out_path = os.path.join(out_folder, out_file)
counter_to_csv(large_counter, out_path)

In [42]:
# Save the combined vocabulary: the list of all "<feature>_<value>" keys, without counts.
vocab_folder = '../data/vocabulary/preliminary_contest_data/'
vocab_file = "userFeature.pkl"
vocab_path = os.path.join(vocab_folder, vocab_file)
save_as_pickle(list(large_counter.keys()), vocab_path)

In [45]:
# len() of the Counter is the number of distinct "<feature>_<value>" keys, not the sum of their counts.
print("Total feature value(word) counts: {}".format(len(large_counter)))

Total feature value(word) counts: 419204
