In [1]:
from functools import lru_cache
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
@lru_cache(1024)
def count_values(string):
    """Count the space-separated values held in one multi-value feature cell.

    Parameters
    ----------
    string : str, float or None
        Raw cell content. A ``str`` is interpreted as values separated by
        single spaces. Any ``float`` is treated as a missing cell (the
        expected case is NaN), and so is ``None``.

    Returns
    -------
    int
        Number of non-empty values in the cell; 0 for a missing or empty cell.
    """
    if string is None or isinstance(string, float):
        return 0  # missing cell (None or NaN)
    # Skip empty tokens so that "" counts as 0 values and repeated, leading
    # or trailing spaces do not inflate the count.
    return sum(1 for token in string.split(" ") if token)

In [3]:
# Names of the user features to process, as listed in the project config.
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES

In [4]:
# Jointer object used for the joins below; built with "uid"
# (presumably the join key - TODO confirm in data_jointer).
uj = dj.PandasPandasJointer("uid")

In [5]:
# Accumulates one "valueCount@<feature>" column per processed feature.
df_user = None
for feat_name in user_multi_feat_names:
    with pu.profiler("counting '{}' for each user".format(feat_name)):
        count_name = "valueCount@{}".format(feat_name)

        # Load the raw feature, derive the per-row value count, then discard
        # the raw column so it is not carried into the join.
        df_feat = du.load_user_feature(feat_name)
        df_feat[count_name] = df_feat[feat_name].apply(count_values)
        df_feat = df_feat.drop(feat_name, axis=1)

        # The first feature seeds the result; later ones are joined onto it.
        df_user = df_feat if df_user is None else uj.join(df_user, df_feat)

        # Drop the per-feature frame before the next iteration to limit peak memory.
        del df_feat
        gc.collect()

[18:07:18] Finish counting 'marriageStatus' for each user. △M: +149.86MB. △T: 5.3 seconds.
[18:07:41] Finish counting 'interest1' for each user. △M: +176.5MB. △T: 23.7 seconds.
[18:07:56] Finish counting 'interest2' for each user. △M: +69.53MB. △T: 14.6 seconds.
[18:08:06] Finish counting 'interest3' for each user. △M: +73.66MB. △T: 9.7 seconds.
[18:08:15] Finish counting 'interest4' for each user. △M: +73.66MB. △T: 9.6 seconds.
[18:08:41] Finish counting 'interest5' for each user. △M: +78.55MB. △T: 26.1 seconds.
[18:09:03] Finish counting 'kw1' for each user. △M: +68.28MB. △T: 21.6 seconds.
[18:09:25] Finish counting 'kw2' for each user. △M: +80.88MB. △T: 21.9 seconds.
[18:09:35] Finish counting 'kw3' for each user. △M: +64.8MB. △T: 10.6 seconds.
[18:09:56] Finish counting 'topic1' for each user. △M: +75.81MB. △T: 21.0 seconds.
[18:10:17] Finish counting 'topic2' for each user. △M: +82.42MB. △T: 20.4 seconds.
[18:10:28] Finish counting 'topic3' for each user. △M: +63.64MB. △T: 11.0 seconds.

In [6]:
with pu.profiler("saving user count data to .csv file"):
    # Target: <DATA_DIR>/stats/<contest subdir minus its first character>/row_value_counts/raw.csv
    count_file = "raw.csv"
    count_folder = os.path.join(
        config.DATA_DIR,
        "stats",
        config.PRELIMINARY_CONTEST_DATA_SUBDIR[1:],
        "row_value_counts",
    )
    os.makedirs(count_folder, exist_ok=True)  # create missing parent folders

    count_path = os.path.join(count_folder, count_file)
    df_user.to_csv(count_path, index=False)

[18:13:12] Finish saving user count data to .csv file. △M: +356.0KB. △T: 2.0 minutes.


In [7]:
# Load both splits and remember their row counts; the combined frame is
# sliced back into train/test by position further down.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
train_size, test_size = len(df_train), len(df_test)

# Train rows come first, test rows after; the index is renumbered from 0.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
with pu.profiler("joining user count data"):
    # Attach the count columns, then strip the id/label columns so that only
    # the count columns remain for saving.
    df_all = uj.join(df_all, df_user).drop(["aid", "uid", "label"], axis=1)
    gc.collect()

with pu.profiler("preparing"):
    out_folder = config.INPUT_DIR
    os.makedirs(out_folder, exist_ok=True)
    col_names = df_all.columns.tolist()
    X_all = sparse.csr_matrix(df_all.values)
    # Exactly one count column is expected per processed feature.
    assert X_all.shape[1] == len(user_multi_feat_names) == len(col_names)

with pu.profiler("saving count data for the training set"):
    out_file = "train.raw.rowCount.pkl"
    out_path = os.path.join(out_folder, out_file)
    # The first `train_size` rows of the stacked matrix are the training rows.
    X_train = X_all[:train_size, :]
    assert X_train.shape == (df_train.shape[0], len(user_multi_feat_names))

    du.save_pickle((col_names, X_train), out_path)
    del X_train
    gc.collect()

with pu.profiler("saving count data for the testing set"):
    out_file = "test1.raw.rowCount.pkl"
    out_path = os.path.join(out_folder, out_file)
    # Everything after the training rows belongs to the test split.
    X_test = X_all[train_size:, :]
    assert X_test.shape == (df_test.shape[0], len(user_multi_feat_names))

    du.save_pickle((col_names, X_test), out_path)
    del X_test
    gc.collect()

[18:13:32] Finish joining user count data. △M: +1.16GB. △T: 17.2 seconds.
[18:13:39] Finish preparing. △M: +1.18GB. △T: 6.2 seconds.
[18:13:43] Finish saving count data for the training set. △M: +68.0KB. △T: 4.7 seconds.
[18:13:45] Finish saving count data for the testing set. △M: +4.14MB. △T: 1.2 seconds.
