In [1]:
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
def inverse_dict(dic):
    """Return a new dict mapping every value of ``dic`` back to its key.

    When several keys share the same value, the key that comes last in
    iteration order is the one kept. Values must be hashable because they
    become keys of the result.
    """
    flipped = {}
    for key, value in dic.items():
        flipped[value] = key
    return flipped

In [3]:
def indices_to_matrix(indices, max_cols):
    """Build a one-hot CSR matrix that has exactly one non-zero per row.

    Row ``i`` of the result holds a single ``1`` in column ``indices[i]``.

    Parameters
    ----------
    indices : sequence of int
        Column position of the non-zero entry for each row; anything
        ``np.array`` can turn into a 1-D array (list, ndarray, Series).
    max_cols : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        ``int8`` matrix of shape ``(len(indices), max_cols)``.

    Raises
    ------
    ValueError
        If any index lies outside ``[0, max_cols)``. This also covers NaN
        entries, e.g. produced by a failed ``Series.map`` lookup.
    """
    n_rows = len(indices)
    col_indices = np.array(indices)

    # The (data, indices, indptr) constructor does not bounds-check column
    # indices, so validate here rather than return a silently corrupt matrix.
    if n_rows:
        lowest, highest = col_indices.min(), col_indices.max()
        # Written as a negated conjunction so that NaN (for which every
        # comparison is False) is rejected as well.
        if not (0 <= lowest and highest < max_cols):
            raise ValueError(
                "column indices must lie in [0, {}), got range [{}, {}]".format(
                    max_cols, lowest, highest))

    # One entry per row: row i owns the slice [i, i + 1) of data/indices.
    indptr = np.arange(n_rows + 1)
    data = np.ones(n_rows, dtype=np.int8)

    # Locals are released when the function returns, so no explicit
    # del / gc.collect() is needed here.
    return sparse.csr_matrix((data, col_indices, indptr),
                             shape=(n_rows, max_cols), dtype=np.int8)


def cross_vectorize(df, feat_names, add_prefix=True):
    """One-hot encode the pairwise cross of two columns of ``df``.

    Every row gets a single ``1`` in the column that stands for its
    (feature-1 value, feature-2 value) combination. The matrix has one column
    for every possible combination of the distinct values of both features,
    whether or not that combination occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both feature columns.
    feat_names : sequence of str
        Exactly two column names ``[feat1, feat2]``.
    add_prefix : bool, default True
        If true, column names look like ``"<feat1>_<v1>_x_<feat2>_<v2>"``;
        otherwise ``"<v1>x<v2>"``.

    Returns
    -------
    (scipy.sparse.csr_matrix, list of str)
        The matrix built by ``indices_to_matrix`` with shape
        ``(len(df), n_unique(feat1) * n_unique(feat2))`` and the column names
        in column order.

    Notes
    -----
    Distinct values are counted with ``Series.unique()``, which keeps NaN as a
    value of its own. The final shape check therefore compares against that
    same count instead of ``Series.nunique()``, which drops NaN by default and
    would both disagree with the matrix width and cost two extra passes over
    the data.
    """
    assert len(feat_names) == 2  # only 2 degree crossing is supported now

    feat1_name = feat_names[0]
    feat2_name = feat_names[1]

    # distinct values of each feature and the size of the crossed space
    feat1_vals = df[feat1_name].unique()
    feat2_vals = df[feat2_name].unique()
    feat1_nunique = len(feat1_vals)
    feat2_nunique = len(feat2_vals)
    num_combinations = feat1_nunique * feat2_nunique

    # value -> index lookups. The naming code below expects feature-1 indices
    # to be the multiples of feat2_nunique and feature-2 indices to be
    # 0..feat2_nunique-1, so that base + offset addresses one column.
    # NOTE(review): presumably dj.list_to_dict(vals, step) assigns i * step to
    # the i-th value (step defaulting to 1) -- confirm against data_jointer.
    feat1_to_index = dj.list_to_dict(feat1_vals, feat2_nunique)
    feat2_to_index = dj.list_to_dict(feat2_vals)

    # column index per row = base of the feature-1 value + offset of the
    # feature-2 value; the two mapped Series are temporaries and are freed
    # as soon as the sum exists
    indices = df[feat1_name].map(feat1_to_index) + df[feat2_name].map(feat2_to_index)

    # sanity check: the number of observed combinations is bounded by the
    # per-feature cardinalities (computed once; it scans every row)
    n_observed = indices.nunique()
    assert n_observed >= max(feat1_nunique, feat2_nunique)
    assert n_observed <= num_combinations

    # column names, emitted in column order: feature-1 blocks, feature-2 inside
    index_to_feat1 = inverse_dict(feat1_to_index)
    index_to_feat2 = inverse_dict(feat2_to_index)

    if add_prefix:
        def make_name(feat1_val, feat2_val):
            return "{}_{}_x_{}_{}".format(feat1_name, feat1_val, feat2_name, feat2_val)
    else:
        def make_name(feat1_val, feat2_val):
            return "{}x{}".format(feat1_val, feat2_val)

    col_names = []
    for base in range(0, num_combinations, feat2_nunique):
        feat1_val = index_to_feat1[base]
        col_names += [make_name(feat1_val, index_to_feat2[offset])
                      for offset in range(feat2_nunique)]

    # release the lookup tables before the matrix is allocated
    del feat1_to_index
    del feat2_to_index
    del index_to_feat1
    del index_to_feat2
    gc.collect()

    # construct sparse matrix
    matrix = indices_to_matrix(indices, num_combinations)
    assert matrix.shape == (df.shape[0], num_combinations)
    return matrix, col_names

In [4]:
def cross_binary_path(ad_feat_name, user_feat_name, prefix="train", create=True):
    """Return the pickle path for one (ad feature, user feature) cross matrix.

    The path has the form
    ``<out_folder>/[featureName='<user_feat_name>']/<prefix>.[adFeatureName='<ad_feat_name>'].binary.pkl``
    where ``out_folder`` is the notebook-level variable, read at call time.

    When ``create`` is true the containing directory is created if it does
    not exist yet; the file itself is never created here.
    """
    target_dir = os.path.join(out_folder, "[featureName='{}']".format(user_feat_name))
    file_name = "{}.[adFeatureName='{}'].binary.pkl".format(prefix, ad_feat_name)
    full_path = os.path.join(target_dir, file_name)

    if create:
        os.makedirs(target_dir, exist_ok=True)

    return full_path

In [5]:
# Root output folder for the crossed binary matrices written by this notebook;
# cross_binary_path() places one sub-folder per user feature underneath it.
out_folder = os.path.join(config.PRELIM_NLP_COUNT_DATA_DIR, "simple_cross/byUserFeatureName")

In [6]:
# Load the raw train, test and ad tables through the project's data_utils helper.
# train and test are stacked later; the ad table is joined on "aid".
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
df_ad = du.load_raw_data("ad")

In [7]:
# Stack train on top of test with a fresh 0..n-1 index, so the first
# `train_size` rows of anything derived from df_all are the train rows.
df_all = pd.concat([df_train, df_test], ignore_index=True)
train_size, test_size = len(df_train), len(df_test)

print("Train Size: {}".format(train_size))
print("Test Size: {}".format(test_size))
print("Concatenated Data Shape: {}".format(df_all.shape))

Train Size: 8798814
Test Size: 2265989
Concatenated Data Shape: (11064803, 3)


In [8]:
# User-side features to cross (presumably the single-valued ones, going by the
# config constant's name -- TODO confirm in config).
user_one_feat_names = config.USER_SINGLE_FEAT_NAMES
# Ad-side features to cross. Work on a copy so that removing an entry does not
# mutate config.AD_FEAT_NAMES itself.
ad_feat_names = config.AD_FEAT_NAMES.copy()
# "creativeSize" is excluded from crossing; remove() raises ValueError if the
# name is not present in the list.
ad_feat_names.remove("creativeSize")

In [9]:
# Jointers keyed on the ad id ("aid") and the user id ("uid").
# NOTE(review): join semantics (left/inner, row order) live in data_jointer;
# later cells slice by train_size, so row order is assumed to follow df_all -- confirm.
aj = dj.PandasPandasJointer("aid")
uj = dj.PandasPandasJointer("uid")

# Build one wide frame: start from the stacked train+test rows with ad features,
# then attach each user feature in turn (ad_user is rebound on every iteration).
ad_user = aj.join(df_all, df_ad)  # join ad features
for user_feat_name in user_one_feat_names:
    with pu.profiler("loading and joining '{}'".format(user_feat_name)):
        # missing values become the literal category "[nan]" before joining
        df_feat = du.load_user_feature(user_feat_name).fillna("[nan]")  # load user feature
        ad_user = uj.join(ad_user, df_feat)  # join user feature
        
        # drop the per-feature frame inside the profiled block so it is not
        # counted in the reported memory delta
        del df_feat
        gc.collect()

[14:13:54] Finish loading and joining 'age'. △M: +92.65MB. △T: 10.8 seconds.
[14:14:05] Finish loading and joining 'gender'. △M: +84.42MB. △T: 11.0 seconds.
[14:14:16] Finish loading and joining 'education'. △M: +84.42MB. △T: 10.9 seconds.
[14:14:28] Finish loading and joining 'consumptionAbility'. △M: +84.42MB. △T: 11.3 seconds.
[14:14:40] Finish loading and joining 'LBS'. △M: +85.45MB. △T: 12.0 seconds.
[14:14:52] Finish loading and joining 'carrier'. △M: +84.42MB. △T: 11.8 seconds.
[14:15:03] Finish loading and joining 'house'. △M: +84.42MB. △T: 11.9 seconds.


In [10]:
# Output files per feature pair: file prefix and the rows of the stacked
# train+test matrix that belong to it (train rows first, test rows after).
row_splits = (
    ("train", slice(None, train_size)),
    ("test1", slice(train_size, None)),
)

for user_feat_name in user_one_feat_names:
    for ad_feat_name in ad_feat_names:
        with pu.profiler("vectorizing and saving '{}'x'{}' binary".format(user_feat_name, ad_feat_name)):
            # one-hot matrix of the crossed pair plus the names of its columns
            matrix, col_names = cross_vectorize(ad_user, [ad_feat_name, user_feat_name])

            # write each row range to its own pickle; the sliced matrix is a
            # temporary that is dropped right after saving
            for split_prefix, split_rows in row_splits:
                out_path = cross_binary_path(ad_feat_name, user_feat_name, prefix=split_prefix)
                du.save_pickle((col_names, matrix[split_rows, :]), out_path)
                gc.collect()

            # release memory and clean garbage before the next pair
            del matrix
            del col_names
            gc.collect()

[14:15:08] Finish vectorizing and saving 'age'x'aid' binary. △M: -1.34MB. △T: 4.1 seconds.
[14:15:11] Finish vectorizing and saving 'age'x'advertiserId' binary. △M: +200.0KB. △T: 3.7 seconds.
[14:15:15] Finish vectorizing and saving 'age'x'campaignId' binary. △M: +4.0KB. △T: 3.6 seconds.
[14:15:19] Finish vectorizing and saving 'age'x'creativeId' binary. △M: +0B. △T: 3.6 seconds.
[14:15:21] Finish vectorizing and saving 'age'x'adCategoryId' binary. △M: +12.0KB. △T: 2.9 seconds.
[14:15:25] Finish vectorizing and saving 'age'x'productId' binary. △M: +4.0KB. △T: 3.4 seconds.
[14:15:28] Finish vectorizing and saving 'age'x'productType' binary. △M: +0B. △T: 2.9 seconds.
[14:15:32] Finish vectorizing and saving 'gender'x'aid' binary. △M: +4.0KB. △T: 3.8 seconds.
[14:15:35] Finish vectorizing and saving 'gender'x'advertiserId' binary. △M: -4.0KB. △T: 3.6 seconds.
[14:15:39] Finish vectorizing and saving 'gender'x'campaignId' binary. △M: +8.0KB. △T: 3.6 seconds.
[14:15:42] Finish vectorizing a