In [1]:
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import itertools
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_pipeline as dp
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [2]:
cross_folder = os.path.join(config.PRELIM_NLP_COUNT_DATA_DIR, "simple_cross/byUserFeatureName")


def cross_binary_path(ad_feat_name, user_feat_name, prefix="train", create=True):
    folder = os.path.join(cross_folder, "[featureName='{}']".format(user_feat_name))
    file = "{}.[adFeatureName='{}'].binary.pkl".format(prefix, ad_feat_name)
    path = os.path.join(folder, file)
    if create:
        os.makedirs(folder, exist_ok=True)
    return path

In [3]:
pairs = [("aid", "age"), ('aid', 'interest2'), ('productType', 'kw2')]
out_folder = os.path.join(config.DATA_DIR, "input_final")

col_names = []
matrix = None
for ad_feat_name, user_feat_name in pairs:
    with pu.profiler("processing '{}' x '{}'".format(ad_feat_name, user_feat_name)):
        # =============
        # process train
        # =============
        path = cross_binary_path(ad_feat_name, user_feat_name, "train")
        new_col_names, new_matrix = du.load_pickle(path, use_joblib=True)
        
        min_df = 10
        col_nnz = new_matrix.getnnz(axis=0)
        mask = (col_nnz >= min_df)
        new_col_names = list(itertools.compress(new_col_names, mask))
        new_matrix = new_matrix[:, mask]
        assert len(new_col_names) == new_matrix.shape[1]

        col_names += new_col_names
        if matrix is None:
            matrix = new_matrix
        else:
            matrix = sparse.hstack((matrix, new_matrix))
        
        out_file = "train.cross.wordCount_v2.pkl"
        out_path = os.path.join(out_folder, out_file)
        du.save_pickle((col_names, matrix), out_path, use_joblib=False)

[01:39:55] Finish processing 'aid' x 'age'. △M: +84.33MB. △T: 1.0 seconds.
[01:40:00] Finish processing 'aid' x 'interest2'. △M: +524.55MB. △T: 5.8 seconds.
[01:40:11] Finish processing 'productType' x 'kw2'. △M: +409.42MB. △T: 10.8 seconds.


In [4]:
col_names = []
matrix = None
for ad_feat_name, user_feat_name in pairs:
    with pu.profiler("processing '{}' x '{}'".format(ad_feat_name, user_feat_name)):
        # =============
        # process train
        # =============
        path = cross_binary_path(ad_feat_name, user_feat_name, "test2")
        new_col_names, new_matrix = du.load_pickle(path, use_joblib=True)
        
        min_df = 10
        col_nnz = new_matrix.getnnz(axis=0)
        mask = (col_nnz >= min_df)
        new_col_names = list(itertools.compress(new_col_names, mask))
        new_matrix = new_matrix[:, mask]
        assert len(new_col_names) == new_matrix.shape[1]

        col_names += new_col_names
        if matrix is None:
            matrix = new_matrix
        else:
            matrix = sparse.hstack((matrix, new_matrix))
        
        out_file = "test2.cross.wordCount_v2.pkl"
        out_path = os.path.join(out_folder, out_file)
        du.save_pickle((col_names, matrix), out_path, use_joblib=False)

[01:40:12] Finish processing 'aid' x 'age'. △M: -201.21MB. △T: 0.2 seconds.
[01:40:13] Finish processing 'aid' x 'interest2'. △M: +155.15MB. △T: 1.6 seconds.
[01:40:16] Finish processing 'productType' x 'kw2'. △M: +97.95MB. △T: 3.0 seconds.
