In [1]:
from tqdm import tqdm
from collections import Counter
import os
import sys
import numpy as np
import multiprocessing as mp
from time import gmtime, strftime
import gc
import pickle
import sys
sys.path.append('../../../code/utils')
from perf_utils import get_memory_str

In [2]:
DATA_DIR = '../../../data/raw/preliminary_contest_data/'
USER_DATA_FILE = 'userFeature.preliminary.data'
USER_DATA_PATH = os.path.join(DATA_DIR, USER_DATA_FILE)

In [3]:
def count_file_lines(filepath):
    f = open(filepath)
    for i, l in tqdm(enumerate(f.readlines())):
        pass
    return i + 1

In [4]:
# line_counts = count_file_lines(os.path.join(DATA_DIR, USER_DATA_FILE))  # comment this if you don't want to run it again
line_counts = 9686953  # uncomment this to save time
print("{} lines in userFeature.data".format(line_counts))
print("Memory Usage at this moment: {}".format(get_memory_str()))

9686953 lines in userFeature.data
Memory Usage at this moment: 61.0MB


In [5]:
lines = []
with open(USER_DATA_PATH) as f:
    for i in tqdm(range(line_counts)):
        line = f.readline().strip()
        lines.append(line)
print("Memory Usage at this moment: {}".format(get_memory_str()))

100%|████████████████████████████████████████████████████████████████████| 9686953/9686953 [00:31<00:00, 306340.97it/s]


Memory Usage at this moment: 4.2GB


In [6]:
feat_names = ["age", "gender", "marriageStatus", "education", "consumptionAbility", "LBS",
              "interest1", "interest2", "interest3", "interest4", "interest5",
              "kw1", "kw2", "kw3", "topic1", "topic2", "topic3", "appIdInstall",
              "appIdAction", "ct", "os", "carrier", "house"]


def fast_count_feature_from_lines(lines):  
    # no checking, no uid, so faster
    counters = {feat_name: Counter() for feat_name in feat_names}
    n_lines = len(lines)
    # for line in tqdm(lines):
    for line in lines:
        for feat in line.split("|")[1:]:
            arr = feat.split(" ")
            key = arr[0]
            vals = arr[1:]
            counters[key] += Counter(vals)
    # print("[{}] worker's task done.".format(strftime("%H:%M:%S", gmtime())))
    return counters

In [7]:
def split_list(lst, n_split=4):
    n_total = len(lst)
    step = int(np.ceil(n_total / n_split))
    splits = []
    for offset in range(0, n_total, step):
        splits.append(lst[offset:offset + step])
    return splits

In [8]:
def merge_counter_dicts(counter_dict_list):
    final_counter_dict = {}
    for feat_name in feat_names:
        final_counter = Counter()
        for counter_dict in counter_dict_list:
            final_counter += counter_dict[feat_name]
        final_counter_dict[feat_name] = final_counter
    return final_counter_dict

In [9]:
def batch_count_features(lines, n_procs=None, n_batches=None):
    print("[{}] Starting counting features.".format(strftime("%H:%M:%S", gmtime())))
    n_procs = mp.cpu_count() if n_procs is None else n_procs
    print(1)
    n_batches = mp.cpu_count() if n_batches is None else n_batches
    print(2)
    pool = mp.Pool(processes=n_procs)
    print(3)
    results = [pool.apply_async(fast_count_feature_from_lines, (batch, )) for batch in split_list(lines, n_procs)]
    print(4)
    pool.close()
    print(5)
    pool.join()
    print(6)
    counters_list = [result.get() for result in results]
    print("[{}] All workers' tasks done. Combining results...".format(strftime("%H:%M:%S", gmtime())))
    final_counter_dict = merge_counter_dicts(counters_list)
    print("[{}] Combining Finished. Memory Usage: {}".format(strftime("%H:%M:%S", gmtime()), get_memory_str()))
    return final_counter_dict

In [None]:
n_batches = 500
line_batches = split_list(lines, n_batches)
counter_dicts = []
for i, line_batch in enumerate(line_batches):
    print("Batch {}/{}".format(i, n_batches))
    counter_dicts.append(batch_count_features(line_batch, 4))
print("Memory Usage at this moment: {}".format(get_memory_str()))

Batch 0/500
[16:22:14] Starting counting features.
1
2
3
4
5


In [None]:
print("[{}] Starting merging results.".format(strftime("%H:%M:%S", gmtime())))
counter_dict = merge_counter_dicts(counter_dicts)
print("[{}] Merging done.".format(strftime("%H:%M:%S", gmtime())))
print("Memory Usage at this moment: {}".format(get_memory_str()))                                   

In [None]:
while len(counter_dicts) > 0:
    del counter_dicts[0]
del counter_dicts
gc.collect()
print("Memory Usage at this moment: {}".format(get_memory_str())) 

In [None]:
def counter_to_csv(counter, filepath):
    with open(filepath, "w") as f:
        f.write("value,counts\n")
        for k, v in counter.most_common():
            f.write("{},{}\n".format(k, v))

In [None]:
out_folder = out_folder = '../../../data/counter/preliminary_contest_data/'
os.makedirs(out_folder, exist_ok=True)
for feat_name, counter in counter_dict.items():
    out_file = "userFeature.[featureName='{}'].csv".format(feat_name)
    out_path = os.path.join(out_folder, out_file)
    counter_to_csv(counter, out_path)

In [None]:
def save_as_pickle(obj, filepath):
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)

In [None]:
vocab_folder = '../../../data/vocabulary/preliminary_contest_data/'
os.makedirs(vocab_folder, exist_ok=True)
for feat_name, counter in counter_dict.items():
    vocab_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    vocab_path = os.path.join(vocab_folder, vocab_file)
    save_as_pickle(list(counter.keys()), vocab_path)

In [None]:
def merge_counter_dict(counter_dict):
    final_counter = Counter()
    for name, counter in counter_dict.items():
        new_counter = Counter({"{}_{}".format(name, k): v for k, v in counter.items()})
        final_counter += new_counter
    return final_counter

In [None]:
large_counter = merge_counter_dict(counter_dict)
print("Memory Usage at this moment: {}".format(get_memory_str())) 
print("Cleaning memory...")
for feat_name in feat_names:
    del counter_dict[feat_name]
del counter_dict
gc.collect()
print("Memory Usage at this moment: {}".format(get_memory_str())) 

In [None]:
out_folder = '../../../data/counter/preliminary_contest_data/'
out_file = "userFeature.csv"
out_path = os.path.join(out_folder, out_file)
counter_to_csv(large_counter, out_path)

In [None]:
vocab_folder = '../../../data/vocabulary/preliminary_contest_data/'
vocab_file = "userFeature.pkl"
vocab_path = os.path.join(vocab_folder, vocab_file)
save_as_pickle(list(large_counter.keys()), vocab_path)

In [None]:
print("Total feature value(word) counts: {}".format(len(large_counter)))