In [1]:
import pandas as pd
import numpy as np
import operator
import pickle
import time
import os
import matplotlib.pyplot as plt
import sys
sys.path.append("../code/utils")
import data_utils as du
import perf_utils as pu

In [2]:
# Root folder under which the click-history CSV files of this notebook are written.
# Create it (including missing parents) now; an already existing folder is not an error.
clickhist_folder = "../data/click_history/simple_cross/byUserFeatureName/"
os.makedirs(name=clickhist_folder, exist_ok=True)

In [3]:
# Load the raw "train" and "ad" tables through the project's data helper.
df_train = du.load_raw_data("train")
df_ad = du.load_raw_data("ad")

# Number of training rows; used later as a sanity check on the per-value row counts.
train_size = len(df_train)

# Attach the ad feature columns to every training row (left join on the ad id 'aid').
ad_user = pd.merge(left=df_train, right=df_ad, how='left', on='aid')

In [4]:
# Ad-side columns that are crossed with the user features below.
ad_feat_names = [
    'aid',
    'advertiserId',
    'campaignId',
    'creativeId',
    'creativeSize',
    'adCategoryId',
    'productId',
    'productType',
]

# User-side feature names; one count matrix is loaded per name in the main loop.
user_feat_names = [
    "age",
    "gender",
    "marriageStatus",
    "education",
    "consumptionAbility",
    "LBS",
    "interest1",
    "interest2",
    "interest3",
    "interest4",
    "interest5",
    "kw1",
    "kw2",
    "kw3",
    "topic1",
    "topic2",
    "topic3",
    "appIdInstall",
    "appIdAction",
    "ct",
    "os",
    "carrier",
    "house",
]

In [5]:
# For every ad feature: the distinct values that occur in the ad table.
avals = dict((name, df_ad[name].unique()) for name in ad_feat_names)

# For every ad feature: mapping from each feature value to the row labels of `ad_user`
# that carry this value (the `.groups` mapping of a groupby). Wrapped in the project's
# profiler context so the cost of this step is reported.
with pu.profiler("calculating user index mapping for ad feature values"):
    aval_to_indices = dict(
        (name, ad_user.groupby(name).groups) for name in ad_feat_names
    )

[04:58:36] Finish calculating user index mapping for ad feature values. △M: +535.48MB. △T: 8.3 seconds.


In [6]:
# Cross every user feature with every ad feature value and write, per combination,
# a CSV with the positive (label == 1) and negative (label == -1) counts of each
# user feature value, followed by an "all" row holding the total number of
# positive / negative rows for that ad feature value.
for user_feat_name in user_feat_names:
    # Given a user feature, load its dictionary and the sparse count matrix.
    # presumably `uvec` has one row per entry of `uids` and one column per entry
    # of `ufeat_index` -- verify against du.load_user_cnt
    uids, (ufeat_index, uvec) = du.load_user_cnt(user_feat_name)
    uid_to_index = dict(zip(uids, range(len(uids))))  # mapping from uids to distinct row indices
    ad_user["user_index"] = ad_user['uid'].map(uid_to_index)

    # Feature values ordered by their index so that they line up with the column sums
    # computed below. This depends only on the user feature, so it is computed once
    # here instead of once per ad feature value.
    values = [k for k, _ in sorted(ufeat_index.items(), key=operator.itemgetter(1))]
    value_column = values + ["all"]  # the trailing "all" row holds the aggregate counts

    # The output folder also depends only on the user feature.
    out_folder = os.path.join(clickhist_folder, "[featureName='{}']".format(user_feat_name))
    os.makedirs(out_folder, exist_ok=True)

    for ad_feat_name in ad_feat_names:
        print("[{}] counting ad['{}'] x user['{}']...".format(pu.get_time_str(), ad_feat_name, user_feat_name))
        total_count = 0
        for aval in avals[ad_feat_name]:
            # Given an ad feature value, select the rows with [ad_feat_name=aval].
            indices = aval_to_indices[ad_feat_name][aval]
            df_selected = ad_user.loc[indices]
            total_count += df_selected.shape[0]

            # Row indices into the sparse matrix for positive (clicked) and
            # negative (unclicked) samples.
            pos_uindex = df_selected.loc[df_selected['label'] == 1, 'user_index'].values
            neg_uindex = df_selected.loc[df_selected['label'] == -1, 'user_index'].values

            # Column-wise sums over the picked rows. ravel() (rather than squeeze())
            # keeps the result 1-D even when the matrix has a single column.
            pos_count = np.asarray(uvec[pos_uindex, :].sum(axis=0)).ravel()
            neg_count = np.asarray(uvec[neg_uindex, :].sum(axis=0)).ravel()

            # Build the whole table, aggregate row included, in one go.
            # DataFrame.append was removed in pandas 2.0, so it must not be used here.
            counter = pd.DataFrame(
                {
                    "value": value_column,
                    "positive": np.append(pos_count, len(pos_uindex)),
                    "negative": np.append(neg_count, len(neg_uindex)),
                },
                columns=["value", "positive", "negative"],
            )

            out_filename = "[{}='{}'].csv".format(ad_feat_name, aval)
            out_filepath = os.path.join(out_folder, out_filename)
            counter.to_csv(out_filepath, index=False)

        # Every training row must fall under exactly one value of the ad feature.
        assert total_count == train_size, (
            "ad['{}']: visited {} rows, expected {}".format(ad_feat_name, total_count, train_size)
        )

[04:59:02] counting ad['aid'] x user['age']...
[04:59:16] counting ad['advertiserId'] x user['age']...
[04:59:25] counting ad['campaignId'] x user['age']...
[04:59:35] counting ad['creativeId'] x user['age']...
[04:59:46] counting ad['creativeSize'] x user['age']...
[04:59:53] counting ad['adCategoryId'] x user['age']...
[05:00:02] counting ad['productId'] x user['age']...
[05:00:10] counting ad['productType'] x user['age']...
[05:00:43] counting ad['aid'] x user['gender']...
[05:00:52] counting ad['advertiserId'] x user['gender']...
[05:01:01] counting ad['campaignId'] x user['gender']...
[05:01:11] counting ad['creativeId'] x user['gender']...
[05:01:22] counting ad['creativeSize'] x user['gender']...
[05:01:31] counting ad['adCategoryId'] x user['gender']...
[05:01:39] counting ad['productId'] x user['gender']...
[05:01:49] counting ad['productType'] x user['gender']...
[05:02:26] counting ad['aid'] x user['marriageStatus']...
[05:02:37] counting ad['advertiserId'] x user['marriageS

[06:30:11] counting ad['adCategoryId'] x user['appIdInstall']...
[06:30:35] counting ad['productId'] x user['appIdInstall']...
[06:30:56] counting ad['productType'] x user['appIdInstall']...
[06:31:25] counting ad['aid'] x user['appIdAction']...
[06:31:37] counting ad['advertiserId'] x user['appIdAction']...
[06:31:46] counting ad['campaignId'] x user['appIdAction']...
[06:31:57] counting ad['creativeId'] x user['appIdAction']...
[06:32:08] counting ad['creativeSize'] x user['appIdAction']...
[06:32:16] counting ad['adCategoryId'] x user['appIdAction']...
[06:32:24] counting ad['productId'] x user['appIdAction']...
[06:32:32] counting ad['productType'] x user['appIdAction']...
[06:32:57] counting ad['aid'] x user['ct']...
[06:33:05] counting ad['advertiserId'] x user['ct']...
[06:33:12] counting ad['campaignId'] x user['ct']...
[06:33:19] counting ad['creativeId'] x user['ct']...
[06:33:26] counting ad['creativeSize'] x user['ct']...
[06:33:33] counting ad['adCategoryId'] x user['ct'].