In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import os
import gc
import tqdm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/feature/')
import data_utils as du
import perf_utils as pu
from clickrate import NaiveSmoothedClickrate

In [2]:
# Root folder of the click-history CSVs; load_user_history() joins a
# "[featureName='<user feature>']" sub-folder and a per-ad-value file name onto it.
click_history_folder = "../../../data/click_history/simple_cross/byUserFeatureName/"

In [3]:
def load_user_history(user_feat_name, ad_feat_name, ad_feat_val):
    """Read the click-history CSV for one (user feature, ad feature value) pair.

    The file is looked up at
    ``<click_history_folder>/[featureName='<user_feat_name>']/[<ad_feat_name>='<ad_feat_val>'].csv``.

    Parameters
    ----------
    user_feat_name : str
        Name of the user feature; selects the sub-folder.
    ad_feat_name : str
        Name of the ad feature; first part of the file name.
    ad_feat_val
        Value of the ad feature; formatted into the file name with ``str.format``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from that file.
        NOTE(review): callers index columns 'value', 'positive' and 'negative' --
        confirm that schema against the files on disk.
    """
    csv_path = os.path.join(
        click_history_folder,
        "[featureName='{}']".format(user_feat_name),
        "[{}='{}'].csv".format(ad_feat_name, ad_feat_val),
    )
    return pd.read_csv(csv_path)

In [4]:
# Ad-side feature names, split into two groups (presumably numeric vs. categorical,
# judging by the variable names) and then combined, numeric group first.
ad_num_feat_names = ['creativeSize']
ad_cat_feat_names = [
    'aid',
    'advertiserId',
    'campaignId',
    'creativeId',
    'adCategoryId',
    'productId',
    'productType',
]
ad_feat_names = [*ad_num_feat_names, *ad_cat_feat_names]

In [5]:
# User features for which one user has only one value.
user_one_feat_names = [
    'age', 'gender', 'education', 'consumptionAbility',
    'LBS', 'carrier', 'house',
]
# Remaining user features (presumably a user may carry several values for each
# of these -- inferred from the variable name, confirm against the data).
user_multi_feat_names = [
    'marriageStatus',
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
    'ct', 'os',
]
# Full list: single-valued features first, then the multi-valued ones.
user_feat_names = [*user_one_feat_names, *user_multi_feat_names]

In [6]:
# Load the raw training and ad tables through the project helper.
df_train = du.load_raw_data("train")
df_ad = du.load_raw_data("ad")

# Rescale labels with (label + 1) / 2, which sends -1 to 0 and 1 to 1.
# NOTE(review): presumably the raw labels are encoded as -1/+1 -- confirm.
y = (df_train['label'].values + 1) / 2

In [7]:
# Map each ad feature name to the array of distinct values it takes in df_ad.
avals = dict(
    (feat_name, df_ad[feat_name].unique())
    for feat_name in ad_feat_names
)

In [8]:
# Root output folder; the fitting loop writes one CSV per (user feature, ad feature)
# pair into a "[featureName='<user feature>']" sub-folder below it.
clickrate_folder = "../../../data/clickrate_ns/simple_cross/byUserFeatureName/"  # ns for naive smoothing
# Create the folder up front; exist_ok=True means an existing folder is not an error.
os.makedirs(clickrate_folder, exist_ok=True)

In [9]:
# Column order shared by the in-memory table and the CSV written to disk.
clickrate_columns = ["ad_val", "user_val", "ns_clickrate", "click", "impression"]


def _ns_clickrate_for_ad_value(user_feat_name, ad_feat_name, aval):
    """Build the smoothed click-rate rows for a single ad feature value.

    Loads the click history for (user_feat_name, ad_feat_name == aval), drops the
    aggregate row whose 'value' is 'all', derives impressions as
    positive + negative, fits a fresh NaiveSmoothedClickrate on those counts and
    transforms the same counts with it.

    Parameters
    ----------
    user_feat_name : str
        User feature whose values form the rows.
    ad_feat_name : str
        Ad feature being crossed with the user feature.
    aval
        The ad feature value to process; stored in the 'ad_val' column.

    Returns
    -------
    pandas.DataFrame
        Columns ``clickrate_columns``, one row per user feature value.
    """
    feat_stats = load_user_history(user_feat_name, ad_feat_name, aval)
    # Explicit copy: the boolean filter yields a slice of the loaded frame, and
    # assigning a new column to it would trigger SettingWithCopyWarning.
    feat_stats = feat_stats[feat_stats['value'] != 'all'].copy()
    feat_stats["impression"] = feat_stats["positive"] + feat_stats["negative"]
    feat_stats = feat_stats.rename(columns={"positive": "click", "value": "user_val"})

    # fit naive smoothing
    imps = feat_stats["impression"].values
    clks = feat_stats["click"].values
    ns = NaiveSmoothedClickrate()
    ns.fit(imps, clks)

    # transform raw counts to naive-smoothed click rate
    feat_stats["ns_clickrate"] = ns.transform(imps, clks)
    feat_stats["ad_val"] = aval
    return feat_stats[clickrate_columns]


for user_feat_name in user_feat_names:
    # The output folder depends only on the user feature, so prepare it once.
    out_folder = os.path.join(clickrate_folder, "[featureName='{}']".format(user_feat_name))
    os.makedirs(out_folder, exist_ok=True)

    for ad_feat_name in ad_feat_names:
        out_path = os.path.join(out_folder, "[adFeatureName='{}'].csv".format(ad_feat_name))
        desc = "Fitting ad['{}'] x user['{}']...".format(ad_feat_name, user_feat_name)

        # Collect the per-value frames and concatenate once: DataFrame.append in a
        # loop re-copies the accumulated table every iteration and no longer
        # exists in pandas >= 2.0.
        frames = [_ns_clickrate_for_ad_value(user_feat_name, ad_feat_name, aval)
                  for aval in tqdm.tqdm(avals[ad_feat_name], desc=desc)]
        if frames:
            df_clickrate = pd.concat(frames)
        else:
            # pd.concat rejects an empty list; still emit a header-only CSV.
            df_clickrate = pd.DataFrame(columns=clickrate_columns)

        df_clickrate["click"] = df_clickrate["click"].astype(int)
        df_clickrate["impression"] = df_clickrate["impression"].astype(int)
        # Highest smoothed click rate first; ties broken by raw click count.
        df_clickrate = df_clickrate.sort_values(["ns_clickrate", "click"], ascending=False)
        df_clickrate.to_csv(out_path, columns=clickrate_columns, index=False)

        # Release the per-pair tables before moving on to the next pair.
        del frames
        del df_clickrate
        gc.collect()

Fitting ad['creativeSize'] x user['age']...: 100%|██████████| 15/15 [00:00<00:00, 21.39it/s]
Fitting ad['aid'] x user['age']...: 100%|██████████| 173/173 [00:06<00:00, 24.82it/s]
Fitting ad['advertiserId'] x user['age']...: 100%|██████████| 79/79 [00:03<00:00, 23.68it/s]
Fitting ad['campaignId'] x user['age']...: 100%|██████████| 138/138 [00:06<00:00, 22.61it/s]
Fitting ad['creativeId'] x user['age']...: 100%|██████████| 173/173 [00:07<00:00, 22.19it/s]
Fitting ad['adCategoryId'] x user['age']...: 100%|██████████| 40/40 [00:01<00:00, 23.74it/s]
Fitting ad['productId'] x user['age']...: 100%|██████████| 33/33 [00:01<00:00, 23.77it/s]
Fitting ad['productType'] x user['age']...: 100%|██████████| 4/4 [00:00<00:00, 24.26it/s]
Fitting ad['creativeSize'] x user['gender']...: 100%|██████████| 15/15 [00:00<00:00, 23.40it/s]
Fitting ad['aid'] x user['gender']...: 100%|██████████| 173/173 [00:07<00:00, 23.43it/s]
Fitting ad['advertiserId'] x user['gender']...: 100%|██████████| 79/79 [00:03<00:00,

Fitting ad['creativeId'] x user['interest3']...: 100%|██████████| 173/173 [00:06<00:00, 25.74it/s]
Fitting ad['adCategoryId'] x user['interest3']...: 100%|██████████| 40/40 [00:01<00:00, 25.22it/s]
Fitting ad['productId'] x user['interest3']...: 100%|██████████| 33/33 [00:01<00:00, 25.83it/s]
Fitting ad['productType'] x user['interest3']...: 100%|██████████| 4/4 [00:00<00:00, 26.01it/s]
Fitting ad['creativeSize'] x user['interest4']...: 100%|██████████| 15/15 [00:00<00:00, 26.45it/s]
Fitting ad['aid'] x user['interest4']...: 100%|██████████| 173/173 [00:06<00:00, 25.53it/s]
Fitting ad['advertiserId'] x user['interest4']...: 100%|██████████| 79/79 [00:03<00:00, 26.12it/s]
Fitting ad['campaignId'] x user['interest4']...: 100%|██████████| 138/138 [00:05<00:00, 25.41it/s]
Fitting ad['creativeId'] x user['interest4']...: 100%|██████████| 173/173 [00:07<00:00, 23.45it/s]
Fitting ad['adCategoryId'] x user['interest4']...: 100%|██████████| 40/40 [00:01<00:00, 23.90it/s]
Fitting ad['productId']

Fitting ad['advertiserId'] x user['ct']...: 100%|██████████| 79/79 [00:03<00:00, 22.96it/s]
Fitting ad['campaignId'] x user['ct']...: 100%|██████████| 138/138 [00:06<00:00, 22.17it/s]
Fitting ad['creativeId'] x user['ct']...: 100%|██████████| 173/173 [00:08<00:00, 20.37it/s]
Fitting ad['adCategoryId'] x user['ct']...: 100%|██████████| 40/40 [00:01<00:00, 22.21it/s]
Fitting ad['productId'] x user['ct']...: 100%|██████████| 33/33 [00:01<00:00, 22.07it/s]
Fitting ad['productType'] x user['ct']...: 100%|██████████| 4/4 [00:00<00:00, 22.34it/s]
Fitting ad['creativeSize'] x user['os']...: 100%|██████████| 15/15 [00:00<00:00, 23.02it/s]
Fitting ad['aid'] x user['os']...: 100%|██████████| 173/173 [00:07<00:00, 23.46it/s]
Fitting ad['advertiserId'] x user['os']...: 100%|██████████| 79/79 [00:03<00:00, 24.28it/s]
Fitting ad['campaignId'] x user['os']...: 100%|██████████| 138/138 [00:06<00:00, 21.08it/s]
Fitting ad['creativeId'] x user['os']...: 100%|██████████| 173/173 [00:07<00:00, 24.00it/s]
F