In [1]:
import os
import sys
from tqdm import tqdm
import pandas as pd
import pickle

In [2]:
# Directory holding the raw contest files; the paths used in this notebook are
# built by joining file names onto it (relative to the notebook's location).
DATA_DIR = '../../../data/raw/preliminary_contest_data/'
# File name (inside DATA_DIR) of the raw user-feature file that gets filtered below.
USER_DATA_FILE = 'userFeature.data'

In [3]:
def count_file_lines(filepath):
    """Count the lines of a text file while showing a tqdm progress counter.

    The file is streamed line by line instead of being read into a list, and
    the handle is closed as soon as counting finishes.

    Args:
        filepath: Path of the text file to count.

    Returns:
        int: Number of lines in the file; 0 for an empty file.
    """
    total = 0  # stays 0 when the file has no lines at all
    with open(filepath) as f:
        # Enumerate from 1 so the last index seen is already the line count.
        for total, _ in tqdm(enumerate(f, start=1)):
            pass
    return total

In [4]:
def line_to_dict(line, to_int=True):
    """Parse one "|"-separated record into a dict of feature name -> values.

    Surrounding whitespace is stripped from the line first. Each "|"-separated
    field is split on single spaces: the first token becomes the key and the
    remaining tokens become its list of values. If a key occurs more than
    once, the last occurrence wins.

    Args:
        line: Raw text line, e.g. "uid 1|age 2 3".
        to_int: When true, every value token is converted with ``int``;
            otherwise the tokens are kept as strings.

    Returns:
        dict: Mapping from key to a list of values.
    """
    parsed = {}
    for field in line.strip().split("|"):
        name, *tokens = field.split(" ")
        parsed[name] = [int(tok) for tok in tokens] if to_int else tokens
    return parsed

In [5]:
def load(filename, **kw):
    """Read a CSV file located under DATA_DIR into a pandas DataFrame.

    Args:
        filename: File name relative to DATA_DIR.
        **kw: Extra keyword arguments forwarded to ``pd.read_csv``. The
            parser ``engine`` defaults to "python" unless given explicitly.

    Returns:
        pandas.DataFrame: The parsed file.
    """
    kw.setdefault("engine", "python")
    csv_path = os.path.join(DATA_DIR, filename)
    return pd.read_csv(csv_path, **kw)

In [6]:
def save_list(lst, filepath):
    """Write every item of an iterable to a text file, one item per line.

    Each item is rendered with ``str.format`` and followed by a newline. An
    existing file at ``filepath`` is overwritten.

    Args:
        lst: Iterable of items to write.
        filepath: Destination path of the text file.
    """
    with open(filepath, "w") as sink:
        sink.writelines("{}\n".format(item) for item in lst)

In [7]:
def save_as_pickle(obj, filepath):
    """Serialize an object to disk with pickle (default protocol).

    An existing file at ``filepath`` is overwritten.

    Args:
        obj: Any picklable object.
        filepath: Destination path of the pickle file.
    """
    with open(filepath, "wb") as sink:
        pickle.Pickler(sink).dump(obj)

In [8]:
def load_int_list(filepath, dtype=None):
    """Read a text file holding one integer per line into a list of ints.

    Args:
        filepath: Path of the text file to read.
        dtype: Accepted for interface compatibility; it is not used and every
            line is always converted with ``int``.

    Returns:
        list: The parsed integers, in file order.

    Raises:
        ValueError: If a line cannot be converted to ``int``.
    """
    with open(filepath, "r") as src:
        return list(map(int, src))

In [9]:
def load_pickle(filepath):
    """Load and return the object stored in a pickle file.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you produced yourself.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(filepath, "rb") as src:
        return pickle.Unpickler(src).load()

In [10]:
# Number of lines in the raw user-feature file. Counting the file is slow, so
# the result of an earlier count is hard-coded; swap the two assignments below
# to recount (e.g. after the raw file changed).
# line_counts = count_file_lines(os.path.join(DATA_DIR, USER_DATA_FILE))  # comment this if you don't want to run it again
line_counts = 11420039  # cached value from a previous count_file_lines run
print("{} lines in userFeature.data".format(line_counts))

11420039 lines in userFeature.data


In [11]:
# comment this block to save time if you have done this before
def _sorted_unique_uids(csv_name):
    """Return the distinct values of the `uid` column of a CSV, ascending."""
    uid_column = load(csv_name, usecols=["uid"])["uid"]
    distinct = uid_column.unique().tolist()
    distinct.sort()  # ascending order
    return distinct


train_uid = _sorted_unique_uids("train.csv")
test_uid = _sorted_unique_uids("test1.csv")

# Plain-text dumps are larger than the pickles, so they stay disabled.
# save_list(train_uid, os.path.join(DATA_DIR, "train.uid.list"))
# save_list(test_uid, os.path.join(DATA_DIR, "test1.uid.list"))
train_uid_pkl = os.path.join(DATA_DIR, "train.uid.pkl")
test_uid_pkl = os.path.join(DATA_DIR, "test1.uid.pkl")
save_as_pickle(train_uid, train_uid_pkl)
save_as_pickle(test_uid, test_uid_pkl)

In [12]:
# # uncomment this block if you have saved this before so that you can load them from disk quickly
# train_uid = load_pickle(os.path.join(DATA_DIR, "train.uid.pkl"))
# test_uid = load_pickle(os.path.join(DATA_DIR, "test1.uid.pkl"))

In [13]:
# Report the length of the train and test uid lists.
print("Train User Counts: {}".format(len(train_uid)))
print("Test User Counts: {}".format(len(test_uid)))

Train User Counts: 7883466
Test User Counts: 2195951


In [14]:
# comment this block to save time if you have done this before
# Every uid that appears in the train list or the test list, in ascending order.
required_uid = sorted(set(train_uid) | set(test_uid))
# save_list(required_uid, os.path.join(DATA_DIR, "required.uid.list"))
required_uid_pkl = os.path.join(DATA_DIR, "required.uid.pkl")
save_as_pickle(required_uid, required_uid_pkl)

In [15]:
# # uncomment this block if you have saved this before so that you can load them from disk quickly
# required_uid = load_pickle(os.path.join(DATA_DIR, "required.uid.pkl"))

In [16]:
# Report the size of the combined (train + test) uid list.
print("Train+Test User Counts: {}".format(len(required_uid)))

Train+Test User Counts: 9686953


In [17]:
# Copy every line of the raw user-feature file whose uid is in `required_uid`
# to a smaller file, streaming the input so it never has to fit in memory.
out_file = "userFeature.preliminary.data"
out_path = os.path.join(DATA_DIR, out_file)
# Membership tests against a set are O(1); testing against the list was the bottleneck.
required_uid_set = set(required_uid)

batch_size = 1000  # input lines per progress-bar tick
batch_counts = (line_counts - 1) // batch_size + 1  # expected ticks: ceil(line_counts / batch_size)
lines_seen = 0  # stays 0 if the input file is empty
skipped_lines = 0  # lines whose uid field could not be parsed as an integer
with open(os.path.join(DATA_DIR, USER_DATA_FILE), "r") as user_f:
    with open(out_path, "w") as out_f:
        with tqdm(total=batch_counts) as progress:
            # Iterate until the real end of file rather than trusting
            # `line_counts`, so a stale count or a malformed line can never
            # cause the tail of the file to be left unread.
            for lines_seen, ln in enumerate(user_f, start=1):
                # Lines are expected to look like "uid <digits>|feat ...":
                # skip the 4-character "uid " prefix and stop at the first "|"
                # instead of splitting the whole (long) line.
                sep = ln.find("|", 4)
                uid_text = ln[4:sep] if sep != -1 else ln[4:]
                try:
                    uid = int(uid_text)
                except ValueError:
                    # Best effort: drop only the offending line and keep going.
                    skipped_lines += 1
                else:
                    if uid in required_uid_set:
                        out_f.write(ln)
                if lines_seen % batch_size == 0:
                    progress.update(1)
            if lines_seen % batch_size:
                progress.update(1)  # account for the final partial batch
print("done.")
if skipped_lines:
    print("{} lines skipped because their uid could not be parsed".format(skipped_lines))
assert count_file_lines(out_path) == len(required_uid), \
    "filtered file does not contain exactly one line per required uid"

100%|██████████| 11421/11421 [00:21<00:00, 522.86it/s]


done.


9686953it [00:03, 2459053.92it/s]


In [18]:
# data = []
# for i, l in tqdm(enumerate(f.readlines())):
# #     if i % 1000 == 0:
# #         print("{} lines read.".format(i))
#     data.append(line_to_dict(l))