In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from contextlib import contextmanager  # for context management

In [2]:
def count_file_lines(filepath):
    """Count the lines of a text file while showing a tqdm progress counter.

    Args:
        filepath: Path of the file to read (opened in text mode with the
            default encoding).

    Returns:
        int: Number of lines in the file; 0 for an empty file.
    """
    n_lines = 0  # stays 0 when the file is empty (the loop body never runs)
    # `with` guarantees the handle is closed; iterating the file object streams
    # it line by line instead of materialising every line via readlines().
    with open(filepath) as f:
        for n_lines, _ in enumerate(tqdm(f), start=1):
            pass
    return n_lines

In [3]:
@contextmanager
def open_files(f_dict):
    """Open several files for writing and close all of them on exit.

    Args:
        f_dict: Mapping of name -> file path. Each path is opened with mode "w".

    Yields:
        dict: Mapping of the same names -> open file objects.

    Every file that was successfully opened is closed when the ``with`` block
    ends, including when the block raises or when opening a later file fails.
    """
    fs = {}
    try:
        # Open one by one (not in a comprehension) so that handles opened
        # before a failing open() are still tracked and get closed below.
        for k, v in f_dict.items():
            fs[k] = open(v, "w")
        yield fs
    finally:
        for f in fs.values():
            f.close()

In [4]:
class FileWritter:
    """Buffered writer around an already-open, writable file-like object.

    Text handed to write_buffer() is held in memory until flush() writes it to
    the underlying file in a single ``write`` call.
    """

    def __init__(self, f):
        """Wrap ``f`` (any object exposing ``write(str)``); the buffer starts empty."""
        self.f = f
        # Pending text is kept as a list of chunks and joined once on flush.
        # Repeated ``self.buffer += chars`` on an attribute can re-copy the
        # whole accumulated string on every append.
        self._chunks = []

    @property
    def buffer(self):
        """The pending (not yet flushed) text as one string."""
        return "".join(self._chunks)

    @buffer.setter
    def buffer(self, chars):
        # Kept so code that assigns ``writer.buffer = "..."`` continues to work.
        self._chunks = [chars] if chars else []

    def write_buffer(self, chars):
        """Append ``chars`` to the pending text without touching the file."""
        self._chunks.append(chars)

    def clear_buffer(self):
        """Discard the pending text without writing it."""
        self._chunks = []

    def flush(self):
        """Write the pending text to the wrapped file, then empty the buffer."""
        self.f.write(self.buffer)
        self.clear_buffer()

In [5]:
class FileWritterGroup:
    """A named set of FileWritter objects that can be addressed one by one or all at once."""

    def __init__(self, f_dict):
        """Build one FileWritter per entry of ``f_dict`` (name -> file object)."""
        wrapped = {}
        for name, f in f_dict.items():
            wrapped[name] = FileWritter(f)
        self.writters = wrapped
        self.n_writters = len(f_dict)

    def write_buffer(self, name, chars):
        """Append ``chars`` to the buffer of the writer registered under ``name``."""
        target = self.writters[name]
        target.write_buffer(chars)

    def write_buffers(self, chars):
        """Append the same ``chars`` to the buffer of every writer."""
        for writter in self.writters.values():
            writter.write_buffer(chars)

    def clear_buffer(self, name):
        """Empty the buffer of the writer registered under ``name``."""
        target = self.writters[name]
        target.clear_buffer()

    def clear_buffers(self):
        """Empty the buffer of every writer."""
        for writter in self.writters.values():
            writter.clear_buffer()

    def flush(self):
        """Flush every writer, in the order they were registered."""
        for writter in self.writters.values():
            writter.flush()

In [6]:
# Directory holding the raw input data.
DATA_DIR = '../data/raw/preliminary_contest_data/'

# Active dataset: input file name, its expected number of lines, and the
# folder that receives the per-feature split files.
# Alternative dataset, kept for reference (swap these three values in to use it):
#   USER_DATA_FILE = 'userFeature.data'
#   line_counts = 11420039
#   out_folder = '../data/split/preliminary_contest_data/byUserFeatureName[all]/'
USER_DATA_FILE = 'userFeature.preliminary.data'
line_counts = 9686953
out_folder = '../data/split/preliminary_contest_data/byUserFeatureName/'

# Full path of the input file.
USER_DATA_PATH = os.path.join(DATA_DIR, USER_DATA_FILE)

In [7]:
# Load the input file into memory, one stripped string per line.
# Comment this cell out if you are afraid your memory will explode; it requires ~5GB.
lines = []
with open(USER_DATA_PATH) as f:
    for _ in tqdm(range(line_counts)):
        line = f.readline()
        if not line:
            # readline() returns "" only at end of file (a blank line is "\n"),
            # so stop here instead of silently appending empty strings.
            break
        lines.append(line.strip())
print("{} lines read".format(len(lines)))
# Compare against what was actually read; the loop index alone would always
# equal line_counts and hide a file that is shorter than expected.
assert len(lines) == line_counts, \
    "expected {} lines in {}, but only {} could be read".format(line_counts, USER_DATA_PATH, len(lines))

100%|██████████| 9686953/9686953 [00:19<00:00, 506158.43it/s]

9686953 lines read





In [8]:
# Make sure the output directory exists; an already existing one is fine.
os.makedirs(out_folder, exist_ok=True)

# Feature names; one output file is created per name.
feat_names = [
    "age", "gender", "marriageStatus", "education", "consumptionAbility", "LBS",
    "interest1", "interest2", "interest3", "interest4", "interest5",
    "kw1", "kw2", "kw3",
    "topic1", "topic2", "topic3",
    "appIdInstall", "appIdAction",
    "ct", "os", "carrier", "house",
]

# feature name -> path of its output file inside out_folder
feat_to_filenames = dict(
    (feat_name, os.path.join(out_folder, "userFeature.[featureName='{}'].data".format(feat_name)))
    for feat_name in feat_names
)

In [9]:
# Number of input lines processed between two flushes to disk.
batch_size = 1000
# Ceiling division: the last batch may hold fewer than batch_size lines.
batch_counts = (line_counts + batch_size - 1) // batch_size

In [10]:
# Split the in-memory `lines` into one "uid|value" file per feature.
# This cell consumes the `lines` list loaded earlier (the loading cell notes ~5GB of memory);
# comment it out together with that cell if memory is a concern.
with open_files(feat_to_filenames) as feat_fs:
    fwg = FileWritterGroup(feat_fs)
    for name, writter in fwg.writters.items():
        writter.write_buffer("uid|{}\n".format(name))  # write header
        writter.flush()
    n_skipped = 0
    for batch_i in tqdm(range(batch_counts)):
        batch_start = batch_i * batch_size
        fwg.clear_buffers()
        for offset, line in enumerate(lines[batch_start:batch_start + batch_size]):
            # Parse the whole line BEFORE touching any buffer: a malformed line is
            # then skipped in every file alike, instead of leaving half-written
            # rows behind and dropping the remainder of the batch.
            try:
                feats = line.split("|")
                uid = feats[0].split(" ")[1]
                pairs = []
                for feat in feats[1:]:
                    # split only once; e.g. key: "interest1", val: "xxx yyy ..."
                    key, val = feat.split(" ", maxsplit=1)
                    if key not in fwg.writters:
                        raise KeyError(key)
                    pairs.append((key, val))
            except (IndexError, ValueError, KeyError) as e:
                n_skipped += 1
                print("skipping malformed line {}: {!r}".format(batch_start + offset + 1, e))
                continue
            # Every file gets a "uid|" row for every user, even when the user lacks
            # that feature, so that all output files stay row-aligned.
            fwg.write_buffers("{}|".format(uid))
            for key, val in pairs:
                fwg.write_buffer(key, val)
            fwg.write_buffers("\n")
        fwg.flush()
if n_skipped:
    print("{} malformed lines skipped".format(n_skipped))
print("done.")

100%|██████████| 9687/9687 [10:59<00:00, 14.68it/s]

done.





In [11]:
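# # Uncomment this (instead of the in-memory cells above) if you think your memory can NOT hold all lines: it streams the file line by line.
# # NOTE: this variant is broken as written: `str.split` takes `maxsplit`, not `num`, and the header is buffered but discarded by the first clear_buffers().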
# with open(os.path.join(DATA_DIR, USER_DATA_FILE), "r") as user_f:
#     with open_files(feat_to_filenames) as feat_fs:
#         fwg = FileWritterGroup(feat_fs)
#         for name, writter in fwg.writters.items():
#             writter.write_buffer("uid|{}".format(name))
#         for i in tqdm(range(batch_counts)):
#             try:
#                 fwg.clear_buffers()
#                 for i in range(batch_size):
#                     ln = user_f.readline().strip()
#                     feats = ln.split("|")
#                     uid = feats[0].split(" ")[1]  # "xxxxxxx"
#                     fwg.write_buffers("{}|".format(uid))
#                     for feat in feats[1:]:
#                         key, val = feat.split(" ", num=1)  # only do split for once; e.g. key: "interest1", val: "xxx yyy ..."
#                         # fwg.write_buffer(key, "{}|{}\n".format(uid, val))  # when there is only uid in feats, fwg will write nothing, this is bad for consistency
#                         fwg.write_buffer(key, val)
#                     fwg.write_buffers("\n")
#             except Exception as e:
#                 pass
#             fwg.flush()
# print("done.")

In [14]:
# Sanity check: count the lines of every per-feature output file.
line_count_dict = {feat_name: count_file_lines(filepath) for feat_name, filepath in feat_to_filenames.items()}
print(line_count_dict)
for feat_name, lc in line_count_dict.items():
    # Each output file holds one header line plus one row per input line.
    # Accepting lc == line_counts as well would let a file with a missing row pass.
    assert lc == line_counts + 1, \
        "{}: expected {} lines (header + {} rows), found {}".format(feat_name, line_counts + 1, line_counts, lc)

9686954it [00:03, 3093083.47it/s]
9686954it [00:03, 3046647.71it/s]
9686954it [00:03, 3092382.16it/s]
9686954it [00:03, 3071914.80it/s]
9686954it [00:03, 3008569.04it/s]
9686954it [00:03, 3084307.50it/s]
9686954it [00:03, 3083362.11it/s]
9686954it [00:03, 3008979.68it/s]
9686954it [00:03, 2988031.48it/s]
9686954it [00:03, 3083134.22it/s]
9686954it [00:03, 3113648.46it/s]
9686954it [00:03, 3088680.09it/s]
9686954it [00:03, 2852687.34it/s]
9686954it [00:03, 3104943.49it/s]
9686954it [00:03, 2935023.97it/s]
9686954it [00:03, 3020814.00it/s]
9686954it [00:03, 3050800.48it/s]
9686954it [00:03, 3116912.27it/s]
9686954it [00:03, 3058709.17it/s]
9686954it [00:03, 3049256.14it/s]
9686954it [00:03, 2901525.52it/s]
9686954it [00:03, 3129620.33it/s]
9686954it [00:03, 3116251.98it/s]

{'consumptionAbility': 9686954, 'topic1': 9686954, 'os': 9686954, 'age': 9686954, 'LBS': 9686954, 'interest4': 9686954, 'ct': 9686954, 'interest5': 9686954, 'kw2': 9686954, 'appIdInstall': 9686954, 'marriageStatus': 9686954, 'appIdAction': 9686954, 'kw1': 9686954, 'kw3': 9686954, 'interest3': 9686954, 'gender': 9686954, 'house': 9686954, 'education': 9686954, 'interest2': 9686954, 'topic2': 9686954, 'topic3': 9686954, 'carrier': 9686954, 'interest1': 9686954}



