In [1]:
from functools import lru_cache
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
@lru_cache(1024)
def count_values(string):
    """Count the space-separated values held in one raw feature cell.

    Parameters
    ----------
    string : str, float or None
        Raw cell content. A ``str`` is treated as a list of tokens separated by
        single spaces. Any ``float`` (in practice the NaN that pandas uses for a
        missing cell) and ``None`` are treated as "no values".

    Returns
    -------
    int
        Number of non-empty tokens; 0 for missing cells and for empty strings.

    Notes
    -----
    Results are memoised with ``lru_cache`` because identical raw strings can
    repeat across rows.
    """
    if string is None or isinstance(string, float):
        return 0  # missing cell (NaN / None)

    # Splitting on " " produces empty tokens for "", for doubled spaces and for
    # leading/trailing spaces; those are not real values and must not be counted.
    return sum(1 for token in string.split(" ") if token)

In [3]:
# Joiner object constructed with the key "uid"; its `join(left, right)` method is
# what the later cells call to combine DataFrames.
# NOTE(review): presumably this joins two pandas DataFrames on the "uid" column —
# confirm the join type and row-order guarantees in data_jointer.
uj = dj.PandasPandasJointer("uid")

In [4]:
# User features for which a per-user value count is computed.
feats_to_count = [
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
]

# Accumulator for the per-feature count columns; remains None until the first
# feature has been processed.
df_user = None
for feat_name in feats_to_count:
    with pu.profiler("counting '{}' for each user".format(feat_name)):
        count_name = "valueCount@{}".format(feat_name)

        # Load the raw feature, derive its count column, then discard the raw
        # column so that it is not carried into the join.
        df_feat = du.load_user_feature(feat_name)
        df_feat[count_name] = df_feat[feat_name].apply(count_values)
        df_feat.drop([feat_name], axis=1, inplace=True)

        # The first frame seeds the accumulator; each later one is joined onto it.
        df_user = df_feat if df_user is None else uj.join(df_user, df_feat)

        # Drop the per-iteration name and run a collection before the next
        # feature is loaded.
        del df_feat
        gc.collect()

[15:07:16] Finish counting 'interest1' for each user. △M: +300.49MB. △T: 32.2 seconds.
[15:07:39] Finish counting 'interest2' for each user. △M: +181.85MB. △T: 23.7 seconds.
[15:07:55] Finish counting 'interest3' for each user. △M: +86.13MB. △T: 15.4 seconds.
[15:08:09] Finish counting 'interest4' for each user. △M: +87.13MB. △T: 14.5 seconds.
[15:08:51] Finish counting 'interest5' for each user. △M: +90.48MB. △T: 42.1 seconds.
[15:09:27] Finish counting 'kw1' for each user. △M: +83.34MB. △T: 35.1 seconds.
[15:10:02] Finish counting 'kw2' for each user. △M: +91.5MB. △T: 35.7 seconds.
[15:10:19] Finish counting 'kw3' for each user. △M: +81.49MB. △T: 16.2 seconds.
[15:10:52] Finish counting 'topic1' for each user. △M: +88.34MB. △T: 33.9 seconds.
[15:11:31] Finish counting 'topic2' for each user. △M: +93.06MB. △T: 38.6 seconds.
[15:11:50] Finish counting 'topic3' for each user. △M: +79.32MB. △T: 19.0 seconds.
[15:12:12] Finish counting 'appIdInstall' for each user. △M: +314.17MB. △T: 22.1

In [5]:
# Feature families: the "valueCount@<feature>" columns of each family's members
# are summed into a single "allCount@<family>" column.
feat_groups = {
    'interest': ['interest1', 'interest2', 'interest3', 'interest4', 'interest5'],
    'kw': ['kw1', 'kw2', 'kw3'],
    'topic': ['topic1', 'topic2', 'topic3'],
    'app': ['appIdInstall', 'appIdAction'],
}

for name, feats in feat_groups.items():
    all_count_name = 'allCount@{}'.format(name)

    # Start the family total at zero, then fold in each member column.
    df_user[all_count_name] = 0
    for feat in feats:
        count_name = "valueCount@{}".format(feat)
        df_user[all_count_name] += df_user[count_name]
        # The member column has been absorbed into the total; remove it so only
        # the family-level columns remain.
        del df_user[count_name]

In [6]:
with pu.profiler("saving user count data to .csv file"):
    # The first character of the sub-directory setting is skipped before joining.
    # NOTE(review): presumably a leading path separator that would otherwise make
    # os.path.join discard DATA_DIR — confirm against config.
    contest_subdir = config.PRELIMINARY_CONTEST_DATA_SUBDIR[1:]
    count_folder = os.path.join(config.DATA_DIR, "stats", contest_subdir, "row_value_counts")
    os.makedirs(count_folder, exist_ok=True)

    # Write the aggregated per-user counts without the DataFrame index.
    count_file = "high_level.csv"
    count_path = os.path.join(count_folder, count_file)
    df_user.to_csv(count_path, index=False)

[15:14:11] Finish saving user count data to .csv file. △M: +48.0KB. △T: 1.2 minutes.


In [None]:
# Load both raw splits and remember their row counts; the counts are used later
# to cut the stacked matrix back into its train and test parts.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
train_size, test_size = df_train.shape[0], df_test.shape[0]

# Stack train on top of test with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
with pu.profiler("joining user count data"):
    # Attach the per-user family counts to every train/test row, then remove the
    # id and label columns so that only the count columns are saved.
    df_all = uj.join(df_all, df_user)
    df_all.drop(["aid", "uid", "label"], axis="columns", inplace=True)
    gc.collect()

with pu.profiler("preparing"):
    out_folder = config.INPUT_DIR
    os.makedirs(out_folder, exist_ok=True)

    col_names = list(df_all.columns)
    X_all = sparse.csr_matrix(df_all.values)
    # Exactly one column per feature family is expected at this point.
    assert len(col_names) == X_all.shape[1] == len(feat_groups)

with pu.profiler("saving count data for the training set"):
    out_file = "train.high_level.rowCount.pkl"
    out_path = os.path.join(out_folder, out_file)

    # The first `train_size` rows of the stacked matrix are the training rows.
    X_train = X_all[:train_size, :]
    assert X_train.shape == (df_train.shape[0], len(feat_groups))

    du.save_pickle((col_names, X_train), out_path)
    del X_train
    gc.collect()

with pu.profiler("saving count data for the testing set"):
    out_file = "test2.high_level.rowCount.pkl"
    out_path = os.path.join(out_folder, out_file)

    # Everything after the training rows belongs to the test split.
    X_test = X_all[train_size:, :]
    assert X_test.shape == (df_test.shape[0], len(feat_groups))

    du.save_pickle((col_names, X_test), out_path)
    del X_test
    gc.collect()