In [1]:
from functools import lru_cache
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
@lru_cache(1024)
def count_values(string):
    """Count the space-separated values stored in one raw feature cell.

    Parameters
    ----------
    string : str, float or None
        Raw cell content. A float is treated as a missing value (pandas
        represents missing cells as NaN, which is a float); None is treated
        the same way.

    Returns
    -------
    int
        Number of non-empty tokens separated by single spaces; 0 for a
        missing or empty cell.
    """
    if string is None or isinstance(string, float):
        return 0  # missing value (NaN / None)
    # Skip empty tokens so that "" counts as 0 values and doubled, leading or
    # trailing spaces do not inflate the count.
    return sum(1 for token in string.split(" ") if token)

In [3]:
# Joiner constructed with the key "uid"; its `join(left, right)` method is used
# by later cells to merge per-user frames.
# NOTE(review): presumably joins two pandas frames on the "uid" column —
# confirm the join type (left/inner) in data_jointer.
uj = dj.PandasPandasJointer("uid")

In [4]:
# Multi-valued user features whose per-row value counts are computed below.
feats_to_count = [
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
]

# Accumulates one "valueCount@<feature>" column per feature.
df_user = None
for feat_name in feats_to_count:
    with pu.profiler("counting '{}' for each user".format(feat_name)):
        count_name = "valueCount@{}".format(feat_name)

        # Load the raw feature and replace it with its per-row value count,
        # so only the count column (plus whatever else was loaded) is joined.
        df_feat = du.load_user_feature(feat_name)
        df_feat[count_name] = df_feat[feat_name].apply(count_values)
        df_feat.drop(columns=feat_name, inplace=True)

        # The first feature seeds the result; later ones are joined onto it.
        df_user = df_feat if df_user is None else uj.join(df_user, df_feat)

        # Drop the loop-local reference and reclaim memory before the next load.
        del df_feat
        gc.collect()

[15:10:22] Finish counting 'interest1' for each user. △M: +274.13MB. △T: 30.2 seconds.
[15:10:43] Finish counting 'interest2' for each user. △M: +144.23MB. △T: 21.1 seconds.
[15:10:57] Finish counting 'interest3' for each user. △M: +74.91MB. △T: 13.9 seconds.
[15:11:11] Finish counting 'interest4' for each user. △M: +73.91MB. △T: 14.1 seconds.
[15:11:49] Finish counting 'interest5' for each user. △M: +78.31MB. △T: 37.6 seconds.
[15:12:24] Finish counting 'kw1' for each user. △M: +70.55MB. △T: 34.9 seconds.
[15:12:59] Finish counting 'kw2' for each user. △M: +79.63MB. △T: 35.2 seconds.
[15:13:15] Finish counting 'kw3' for each user. △M: +62.29MB. △T: 16.6 seconds.
[15:13:51] Finish counting 'topic1' for each user. △M: +77.56MB. △T: 35.7 seconds.
[15:14:22] Finish counting 'topic2' for each user. △M: +79.67MB. △T: 31.3 seconds.
[15:14:40] Finish counting 'topic3' for each user. △M: +63.8MB. △T: 17.9 seconds.
[15:15:01] Finish counting 'appIdInstall' for each user. △M: +264.33MB. △T: 20.3

In [5]:
# Feature families; each family's per-feature counts are collapsed into a
# single "allCount@<family>" column.
feat_groups = {
    'interest': ['interest1', 'interest2', 'interest3', 'interest4', 'interest5'],
    'kw': ['kw1', 'kw2', 'kw3'],
    'topic': ['topic1', 'topic2', 'topic3'],
    'app': ['appIdInstall', 'appIdAction'],
}

for name, feats in feat_groups.items():
    all_count_name = 'allCount@{}'.format(name)
    member_count_names = ["valueCount@{}".format(feat) for feat in feats]

    # Start from zero and add every member column in turn.
    df_user[all_count_name] = 0
    for count_name in member_count_names:
        df_user[all_count_name] = df_user[all_count_name] + df_user[count_name]

    # The per-feature columns are no longer needed once they are summed.
    df_user.drop(columns=member_count_names, inplace=True)

In [6]:
with pu.profiler("saving user count data to .csv file"):
    # The first character of the configured sub-directory is dropped before
    # joining (presumably a leading path separator — confirm in config).
    count_folder = os.path.join(
        config.DATA_DIR,
        "stats",
        config.PRELIMINARY_CONTEST_DATA_SUBDIR[1:],
        "row_value_counts",
    )
    os.makedirs(count_folder, exist_ok=True)

    count_file = "high_level.csv"
    count_path = os.path.join(count_folder, count_file)

    # Persist the aggregated per-user counts without the frame index.
    df_user.to_csv(count_path, index=False)

[15:16:43] Finish saving user count data to .csv file. △M: +36.0KB. △T: 1.0 minutes.


In [7]:
# Load both raw splits and remember their row counts, so the stacked frame
# can later be cut back into train/test by row position.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")

train_size, test_size = len(df_train), len(df_test)

# Train rows come first, test rows second; the index is rebuilt from 0.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
with pu.profiler("joining user count data"):
    # Attach the per-user counts to every train/test row, then keep only the
    # count columns for saving.
    df_all = uj.join(df_all, df_user)
    df_all.drop(columns=["aid", "uid", "label"], inplace=True)
    gc.collect()

with pu.profiler("preparing"):
    out_folder = config.INPUT_DIR
    os.makedirs(out_folder, exist_ok=True)
    col_names = df_all.columns.tolist()
    X_all = sparse.csr_matrix(df_all.values)

    # Exactly one aggregated column per feature group is expected.
    n_groups = len(feat_groups)
    assert len(col_names) == n_groups
    assert X_all.shape[1] == n_groups

# (profiler message, output file, row slice of X_all, expected row count).
# Train rows occupy the first `train_size` rows, test rows the remainder.
save_plan = [
    ("saving count data for the training set",
     "train.high_level.rowCount.pkl",
     slice(None, train_size),
     df_train.shape[0]),
    ("saving count data for the testing set",
     "test1.high_level.rowCount.pkl",
     slice(train_size, None),
     df_test.shape[0]),
]

for message, out_file, row_slice, expected_rows in save_plan:
    with pu.profiler(message):
        out_path = os.path.join(out_folder, out_file)
        X_split = X_all[row_slice, :]
        assert X_split.shape[0] == expected_rows
        assert X_split.shape[1] == len(feat_groups)

        # Column names are stored with the matrix so columns stay identifiable.
        du.save_pickle((col_names, X_split), out_path)
        del X_split
        gc.collect()

[15:17:00] Finish joining user count data. △M: +168.84MB. △T: 13.0 seconds.
[15:17:04] Finish preparing. △M: +416.34MB. △T: 4.0 seconds.
[15:17:07] Finish saving count data for the training set. △M: +64.0KB. △T: 2.3 seconds.
[15:17:07] Finish saving count data for the testing set. △M: +25.42MB. △T: 0.6 seconds.
