In [5]:
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import data_jointer as dj
import config

In [6]:
# Base folder used by cross_binary_path (one sub-folder per user feature name).
out_folder = os.path.join(
    config.PRELIM_NLP_COUNT_DATA_DIR,
    "simple_cross/byUserFeatureName",
)

# Flat folder used by cross_binary_path_v3.
out_folder_v3 = os.path.join(
    config.PRELIM_NLP_COUNT_DATA_DIR,
    "user_cross/",
)

def cross_binary_path(ad_feat_name, user_feat_name, prefix="train", create=True):
    """Return the pickle path of an (ad feature, user feature) binary cross file.

    The file lives in ``out_folder`` under a sub-folder named after the user
    feature; the file name carries the dataset prefix and the ad feature name.

    Args:
        ad_feat_name: Ad feature name embedded in the file name.
        user_feat_name: User feature name embedded in the sub-folder name.
        prefix: Leading part of the file name (e.g. "train").
        create: When true, make sure the sub-folder exists before returning.

    Returns:
        The full path as a string. The file itself is never created here.
    """
    target_dir = os.path.join(out_folder, "[featureName='{}']".format(user_feat_name))
    if create:
        # exist_ok=True makes repeated calls for the same feature harmless.
        os.makedirs(target_dir, exist_ok=True)
    file_name = "{}.[adFeatureName='{}'].binary.pkl".format(prefix, ad_feat_name)
    return os.path.join(target_dir, file_name)



def cross_binary_path_v3(feat_name1, feat_name2, prefix="train", create=True):
    """Return the pickle path of a (feature 1, feature 2) binary cross file.

    All files share the single folder ``out_folder_v3``; the file name carries
    the dataset prefix and both feature names.

    Args:
        feat_name1: First feature name embedded in the file name.
        feat_name2: Second feature name embedded in the file name.
        prefix: Leading part of the file name (e.g. "train").
        create: When true, make sure ``out_folder_v3`` exists before returning.

    Returns:
        The full path as a string. The file itself is never created here.
    """
    if create:
        # exist_ok=True makes repeated calls harmless.
        os.makedirs(out_folder_v3, exist_ok=True)
    file_name = "{}.['{}'x'{}'].binary.pkl".format(prefix, feat_name1, feat_name2)
    return os.path.join(out_folder_v3, file_name)

In [7]:
# (ad feature, user feature) pairs resolved through cross_binary_path.
pairs_v1 = [
    ("aid", "age"),
    ("creativeSize", "age"),
    ("aid", "education"),
    ("creativeSize", "education"),
    ("aid", "consumptionAbility"),
    ("creativeSize", "consumptionAbility"),
    ("aid", "LBS"),
    ("productId", "LBS"),
]

# Second batch of (ad feature, user feature) pairs, also resolved through
# cross_binary_path.
pairs_v2 = [
    ("advertiserId", "interest1"),
    ("aid", "interest2"),
    ("creativeSize", "interest2"),
    ("campaignId", "interest4"),  # TODO: decide whether to keep this pair
    ("aid", "interest5"),
    # 'kw1' looks very prone to overfitting; TODO: decide whether to keep it
    ("productType", "kw1"),
    ("productType", "kw2"),
    ("productType", "kw3"),
    ("productType", "topic1"),
    ("aid", "topic2"),
    ("productType", "topic2"),
    ("aid", "ct"),
    ("aid", "os"),
]

# Feature pairs resolved through cross_binary_path_v3.
pairs_v3 = [
    ("LBS", "carrier"),
    ("LBS", "house"),
    ("LBS", "gender"),
]

In [8]:
# Re-save every pairs_v1 cross file in place: each file is loaded with
# use_joblib=True and written back to the same path with use_joblib=False.
# NOTE(review): presumably this converts joblib dumps to plain pickles; the
# exact semantics live in du.load_pickle / du.save_pickle - confirm there.
for ad_feat_name, user_feat_name in pairs_v1:
    with pu.profiler("rewriting '{}' x '{}'".format(ad_feat_name, user_feat_name)):
        # "train" first, then "test2", both inside the same profiler scope.
        for prefix in ("train", "test2"):
            out_path = cross_binary_path(ad_feat_name, user_feat_name, prefix=prefix)
            col_names, matrix = du.load_pickle(out_path, use_joblib=True)
            du.save_pickle((col_names, matrix), out_path, use_joblib=False)

[16:11:51] Finish rewriting 'aid' x 'age'. △M: +19.33MB. △T: 0.5 seconds.
[16:11:51] Finish rewriting 'creativeSize' x 'age'. △M: +36.49MB. △T: 0.4 seconds.
[16:11:52] Finish rewriting 'aid' x 'education'. △M: +64.0KB. △T: 0.4 seconds.
[16:11:52] Finish rewriting 'creativeSize' x 'education'. △M: -52.0KB. △T: 0.4 seconds.
[16:11:52] Finish rewriting 'aid' x 'consumptionAbility'. △M: +24.0KB. △T: 0.4 seconds.
[16:11:53] Finish rewriting 'creativeSize' x 'consumptionAbility'. △M: -24.0KB. △T: 0.4 seconds.
[16:11:54] Finish rewriting 'aid' x 'LBS'. △M: +31.3MB. △T: 1.3 seconds.
[16:11:55] Finish rewriting 'productId' x 'LBS'. △M: -31.19MB. △T: 0.7 seconds.


In [9]:
# Re-save every pairs_v2 cross file in place: each file is loaded with
# use_joblib=True and written back to the same path with use_joblib=False.
# NOTE(review): presumably this converts joblib dumps to plain pickles; the
# exact semantics live in du.load_pickle / du.save_pickle - confirm there.
for ad_feat_name, user_feat_name in pairs_v2:
    with pu.profiler("rewriting '{}' x '{}'".format(ad_feat_name, user_feat_name)):
        # "train" first, then "test2", both inside the same profiler scope.
        for prefix in ("train", "test2"):
            out_path = cross_binary_path(ad_feat_name, user_feat_name, prefix=prefix)
            col_names, matrix = du.load_pickle(out_path, use_joblib=True)
            du.save_pickle((col_names, matrix), out_path, use_joblib=False)

[16:11:58] Finish rewriting 'advertiserId' x 'interest1'. △M: +138.46MB. △T: 3.2 seconds.
[16:11:59] Finish rewriting 'aid' x 'interest2'. △M: -103.87MB. △T: 1.2 seconds.
[16:12:00] Finish rewriting 'creativeSize' x 'interest2'. △M: +4.0KB. △T: 1.1 seconds.
[16:12:01] Finish rewriting 'campaignId' x 'interest4'. △M: -20.79MB. △T: 0.4 seconds.
[16:12:04] Finish rewriting 'aid' x 'interest5'. △M: +127.94MB. △T: 3.6 seconds.
[16:12:12] Finish rewriting 'productType' x 'kw1'. △M: +125.51MB. △T: 7.8 seconds.
[16:12:15] Finish rewriting 'productType' x 'kw2'. △M: -186.55MB. △T: 2.5 seconds.
[16:12:15] Finish rewriting 'productType' x 'kw3'. △M: -59.43MB. △T: 0.7 seconds.
[16:12:17] Finish rewriting 'productType' x 'topic1'. △M: +39.26MB. △T: 1.9 seconds.
[16:12:29] Finish rewriting 'aid' x 'topic2'. △M: +318.83MB. △T: 12.1 seconds.
[16:12:31] Finish rewriting 'productType' x 'topic2'. △M: -328.65MB. △T: 1.5 seconds.
[16:12:31] Finish rewriting 'aid' x 'ct'. △M: -35.89MB. △T: 0.6 seconds.
[16

In [10]:
# Re-save every pairs_v3 cross file in place: each file is loaded with
# use_joblib=True and written back to the same path with use_joblib=False.
# Paths come from cross_binary_path_v3, so the loop variables are simply the
# first and second feature of each pair.
# NOTE(review): presumably this converts joblib dumps to plain pickles; the
# exact semantics live in du.load_pickle / du.save_pickle - confirm there.
for ad_feat_name, user_feat_name in pairs_v3:
    with pu.profiler("rewriting '{}' x '{}'".format(ad_feat_name, user_feat_name)):
        # "train" first, then "test2", both inside the same profiler scope.
        for prefix in ("train", "test2"):
            out_path = cross_binary_path_v3(ad_feat_name, user_feat_name, prefix=prefix)
            col_names, matrix = du.load_pickle(out_path, use_joblib=True)
            du.save_pickle((col_names, matrix), out_path, use_joblib=False)

[16:12:32] Finish rewriting 'LBS' x 'carrier'. △M: +1.76MB. △T: 0.5 seconds.
[16:12:33] Finish rewriting 'LBS' x 'house'. △M: +4.0KB. △T: 0.5 seconds.
[16:12:34] Finish rewriting 'LBS' x 'gender'. △M: +4.0KB. △T: 0.6 seconds.
