In [1]:
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../../../code/utils')
sys.path.append('../../../code/pipeline')
sys.path.append('../../../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
def dict_to_list(dic):
    """Return the keys of `dic` ordered by ascending mapped value.

    Keys whose values compare equal keep their dictionary iteration order,
    because the sort is stable.
    """
    return sorted(dic, key=dic.__getitem__)

In [3]:
def cross_vectorize(matrix, selectors, col_names, prefixes=None):
    """Cross every column of a sparse matrix with a per-row selector value.

    The result has one block of ``matrix.shape[1]`` columns per unique selector
    value (blocks ordered as ``np.unique`` orders the selectors). Row ``i`` of
    the input is copied into the block belonging to ``selectors[i]``; all other
    blocks of that row stay empty.

    Parameters
    ----------
    matrix : scipy sparse matrix (or anything ``scipy.sparse.csr_matrix`` accepts)
        Input of shape (num_rows, num_cols). Non-CSR input is converted to CSR.
    selectors : pandas.Series or array-like of length num_rows
        Selector value of each row (e.g. an ad feature column).
    col_names : sequence of length num_cols
        Names of the columns of ``matrix``.
    prefixes : pair (prefix1, prefix2), optional
        When given, column names are ``"{prefix1}_{selector}_x_{prefix2}_{col}"``;
        otherwise they are ``"{selector}x{col}"``.

    Returns
    -------
    (cross_matrix, cross_col_names)
        ``cross_matrix`` is a CSR matrix of shape
        (num_rows, num_cols * n_unique_selectors) whose stored values are the
        input values cast to ``np.int8``; ``cross_col_names`` is the matching
        list of column names.

    Raises
    ------
    AssertionError
        If the number of selectors differs from the number of rows, or if
        ``col_names`` does not have exactly num_cols entries.
    """
    # The index arithmetic below relies on CSR layout: `indices` holds column
    # ids and `indptr` delimits the rows.
    if not (sparse.issparse(matrix) and matrix.format == "csr"):
        matrix = sparse.csr_matrix(matrix)
    # Convert first so that plain lists/arrays pass the length check as well.
    selectors = selectors if isinstance(selectors, pd.Series) else pd.Series(selectors)

    num_rows, num_cols = matrix.shape
    assert num_rows == selectors.shape[0]

    # Rank of each row's selector among the sorted unique selectors; the rank
    # times num_cols is the column offset of that selector's block.
    unique_selectors, selector_ranks = np.unique(selectors.values, return_inverse=True)
    row_offsets = np.reshape(selector_ranks, -1) * num_cols

    # Number of *stored* entries per row. Taken from indptr rather than from the
    # row sums, which only agree for strictly binary data without explicit zeros.
    row_lengths = np.diff(matrix.indptr)
    offsets = np.repeat(row_offsets, row_lengths)  # one offset per stored entry

    # Shift every stored column index into its selector's block.
    cross_indices = offsets + matrix.indices
    cross_matrix = sparse.csr_matrix((matrix.data, cross_indices, matrix.indptr),
                                     shape=(num_rows, num_cols * len(unique_selectors)),
                                     dtype=np.int8)

    # Column names follow the same block layout: selector-major, column-minor.
    if prefixes is None:
        cross_col_names = ["{}x{}".format(selector, col_name)
                           for selector in unique_selectors
                           for col_name in col_names]
    else:
        prefix1, prefix2 = prefixes
        cross_col_names = ["{}_{}_x_{}_{}".format(prefix1, selector, prefix2, col_name)
                           for selector in unique_selectors
                           for col_name in col_names]

    assert cross_matrix.shape[1] == len(cross_col_names)
    return cross_matrix, cross_col_names

In [4]:
# Root folder for the pickled cross matrices; cross_binary_path() places one
# sub-folder per user feature name underneath it.
crossbin_folder = os.path.join(config.PRELIM_NLP_COUNT_DATA_DIR, "simple_cross/byUserFeatureName")

def cross_binary_path(ad_feat_name, user_feat_name, prefix="train", create=True):
    """Return the pickle path for the cross matrix of one (ad feature, user feature) pair.

    The file lives in a per-user-feature sub-folder of `crossbin_folder` and is
    named after `prefix` (the dataset split) and the ad feature. When `create`
    is truthy the sub-folder is created if it does not exist yet.
    """
    user_feat_dir = os.path.join(crossbin_folder, "[featureName='{}']".format(user_feat_name))
    if create:
        os.makedirs(user_feat_dir, exist_ok=True)
    pickle_name = "{}.[adFeatureName='{}'].binary.pkl".format(prefix, ad_feat_name)
    return os.path.join(user_feat_dir, pickle_name)

In [5]:
# (ad feature, user feature) pairs whose cross features are generated below.
pairs = [('advertiserId', 'interest1'),
         ('aid', 'interest2'),
         ('creativeSize', 'interest2'), 
         ('campaignId', 'interest4'),  # TODO: decide whether to keep this pair
         ('aid', 'interest5'),  
         ('productType', 'kw1'),  # 'kw1' looks very prone to overfitting; TODO: decide whether to keep it
         ('productType', 'kw2'),
         ('productType', 'kw3'),
         ('productType', 'topic1'),
         ('aid', 'topic2'),
         ('productType', 'topic2'),
         ('aid', 'ct'),
         ('aid', 'os')]

In [6]:
# Group the ad features by the user feature they are crossed with:
# {user feature name: [ad feature names, in the order they appear in `pairs`]}.
ufeat_to_afeats = {}
for afeat, ufeat in pairs:
    ufeat_to_afeats.setdefault(ufeat, []).append(afeat)

In [7]:
# Read the raw train / test / ad tables through the project's data utilities.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test2")
df_ad = du.load_raw_data("ad")

# Remember the split sizes: after concatenation the first `train_size` rows are
# the train rows and the remaining `test_size` rows are the test rows.
train_size, test_size = len(df_train), len(df_test)
print("Train Size: {}".format(train_size))
print("Test Size: {}".format(test_size))

# Process train and test as one frame and split again when saving (faster than
# running the whole pipeline twice).
df_all = pd.concat([df_train, df_test], ignore_index=True)
print("Concatenated Data Shape: {}".format(df_all.shape))

Train Size: 8798814
Test Size: 2265879
Concatenated Data Shape: (11064693, 3)


In [8]:
# Define the jointers used in this notebook.
# ad_jointer: presumably joins two DataFrames on the "aid" column — confirm in data_jointer.
# user_jointer: presumably aligns the rows of a user matrix to a DataFrame via "uid" — confirm in data_jointer.
ad_jointer = dj.PandasPandasJointer("aid")
user_jointer = dj.PandasMatrixJointer("uid")

# NOTE(review): this overwrites df_all with the joined result, so re-running this
# cell on its own feeds the already-joined frame back into the join.
with pu.profiler("joining train+test DataFrame and ad DataFrame"):
    df_all = ad_jointer.join(df1=df_all, df2=df_ad) 

[12:23:52] Finish joining train+test DataFrame and ad DataFrame. △M: +675.6MB. △T: 2.2 seconds.


In [9]:
for ufeat, afeats in ufeat_to_afeats.items():
    # --- per user feature: load its matrix once and reuse it for every ad feature ---
    row_uids, (word_to_index, user_matrix) = du.load_user_cnt(ufeat)
    col_names = dict_to_list(word_to_index)  # column names ordered by column index
    # presumably reorders/expands the user matrix to one row per row of df_all — confirm in data_jointer
    matrix = user_jointer.join(df=df_all, matrix=user_matrix, row_names=row_uids)

    for afeat in afeats:
        # --- per ad feature: build the cross matrix and persist both splits ---
        with pu.profiler("vectorizing and saving '{}'x'{}' binary".format(ufeat, afeat)):
            cross_matrix, cross_col_names = cross_vectorize(matrix, df_all[afeat], col_names, [afeat, ufeat])

            # Rows [0, train_size) are the train rows, the rest are the test rows.
            for split_prefix, split_rows in (("train", slice(None, train_size)),
                                             ("test2", slice(train_size, None))):
                out_path = cross_binary_path(afeat, ufeat, prefix=split_prefix)
                split_matrix = cross_matrix[split_rows, :]
                du.save_pickle((cross_col_names, split_matrix), out_path)
                # drop the slice right away to keep peak memory low
                del split_matrix
                gc.collect()

            # release the full cross matrix before moving on to the next ad feature
            del cross_matrix, cross_col_names
            gc.collect()

    # release everything tied to this user feature before loading the next one
    del user_matrix, matrix, col_names, word_to_index
    gc.collect()

[12:24:26] Finish vectorizing and saving 'topic1'x'productType' binary. △M: -328.39MB. △T: 4.3 seconds.
[12:25:03] Finish vectorizing and saving 'interest5'x'aid' binary. △M: -310.04MB. △T: 9.2 seconds.
[12:25:31] Finish vectorizing and saving 'kw3'x'productType' binary. △M: -306.42MB. △T: 2.5 seconds.
[12:25:59] Finish vectorizing and saving 'os'x'aid' binary. △M: -307.52MB. △T: 3.0 seconds.
[12:26:34] Finish vectorizing and saving 'interest1'x'advertiserId' binary. △M: -312.22MB. △T: 8.4 seconds.
[12:27:21] Finish vectorizing and saving 'topic2'x'aid' binary. △M: -382.83MB. △T: 20.1 seconds.
[12:27:25] Finish vectorizing and saving 'topic2'x'productType' binary. △M: +22.96MB. △T: 4.0 seconds.
[12:27:54] Finish vectorizing and saving 'interest4'x'campaignId' binary. △M: -308.17MB. △T: 3.2 seconds.
[12:28:37] Finish vectorizing and saving 'kw1'x'productType' binary. △M: -402.97MB. △T: 13.8 seconds.
[12:29:12] Finish vectorizing and saving 'kw2'x'productType' binary. △M: -334.13MB. △T: 