In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import os
import gc
import tqdm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/feature/')
import data_utils as du
import perf_utils as pu
from clickrate import BayesianSmoothedClickrate

In [2]:
# Root folder of the per-user-feature click-history CSVs read by this notebook.
click_history_folder = (
    "../../../data/click_history/"
    "simple_cross/byUserFeatureName/"
)

In [3]:
def load_user_history(user_feat_name, ad_feat_name, ad_feat_val):
    """Read the click-history CSV for one user feature and one ad feature value.

    The file is located under ``click_history_folder`` at
    ``[featureName='<user_feat_name>']/[<ad_feat_name>='<ad_feat_val>'].csv``.

    Returns the parsed file as a pandas DataFrame.
    """
    csv_path = os.path.join(
        click_history_folder,
        "[featureName='{}']".format(user_feat_name),
        "[{}='{}'].csv".format(ad_feat_name, ad_feat_val),
    )
    return pd.read_csv(csv_path)

In [4]:
# Ad-side feature names, split into numeric and categorical groups.
ad_num_feat_names = ['creativeSize']
ad_cat_feat_names = [
    'aid', 'advertiserId', 'campaignId', 'creativeId',
    'adCategoryId', 'productId', 'productType',
]

# Full list: numeric features first, then the categorical ones.
ad_feat_names = list(ad_num_feat_names)
ad_feat_names.extend(ad_cat_feat_names)

In [5]:
# User features for which one user has only one value.
user_one_feat_names = [
    'age', 'gender', 'education', 'consumptionAbility',
    'LBS', 'carrier', 'house',
]

# Remaining user features (presumably a user may carry several values for these — confirm).
user_multi_feat_names = [
    'marriageStatus',
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
    'ct', 'os',
]

# Full list: single-valued features first, then the others.
user_feat_names = list(user_one_feat_names)
user_feat_names.extend(user_multi_feat_names)

In [6]:
# Raw training table and ad attribute table, loaded through the project's data utilities.
df_train = du.load_raw_data("train")
df_ad = du.load_raw_data("ad")

# Rescale the labels as (label + 1) / 2
# (presumably maps -1/+1 labels onto 0/1 — confirm against the raw data).
y = (df_train['label'].values + 1) / 2

In [7]:
# For every ad feature, the distinct values that occur in the ad table.
avals = dict(
    (feat_name, df_ad[feat_name].unique())
    for feat_name in ad_feat_names
)

In [8]:
# Output root for the Bayesian-smoothed ("bs") click rates; created up front if missing.
clickrate_folder = (
    "../../../data/clickrate_bs/"
    "simple_cross/byUserFeatureName/"
)
os.makedirs(clickrate_folder, exist_ok=True)

In [9]:
# Solve the Bayesian-smoothed click rate for single-valued user features
# (fixed-point iteration per the original note; the estimator is built with
# its defaults apart from max_iter=10000).
#
# For every (user feature, ad feature) pair two CSVs are written:
#   params[adFeatureName='<ad feature>'].csv  - fitted alpha/beta per ad value
#   [adFeatureName='<ad feature>'].csv        - smoothed click rate per (ad value, user value)
meta_columns = ["ad_val", "alpha", "beta", "clickrate_expectation"]
clickrate_columns = ["ad_val", "user_val", "bs_clickrate", "click", "impression"]

for user_feat_name in user_one_feat_names:
    # the output folder depends only on the user feature, so prepare it once here
    out_folder = os.path.join(clickrate_folder, "[featureName='{}']".format(user_feat_name))
    os.makedirs(out_folder, exist_ok=True)

    for ad_feat_name in ad_feat_names:
        meta_out_path = os.path.join(out_folder, "params[adFeatureName='{}'].csv".format(ad_feat_name))
        clickrate_out_path = os.path.join(out_folder, "[adFeatureName='{}'].csv".format(ad_feat_name))

        # Collect per-ad-value results in plain lists and build the frames once
        # after the loop: growing a DataFrame with .append()/.loc[] per iteration
        # is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
        meta_records = []
        clickrate_frames = []

        progress_desc = "Fitting ad['{}'] x user['{}']...".format(ad_feat_name, user_feat_name)
        for aval in tqdm.tqdm(avals[ad_feat_name], desc=progress_desc):
            # prepare data: drop the 'all' row; .copy() so the column
            # assignments below act on an independent frame, not a slice
            feat_stats = load_user_history(user_feat_name, ad_feat_name, aval)
            feat_stats = feat_stats[feat_stats['value'] != 'all'].copy()
            feat_stats["impression"] = feat_stats["positive"] + feat_stats["negative"]
            feat_stats = feat_stats.rename(columns={"positive": "click", "value": "user_val"})

            # fit Bayesian smoothing on this ad value's impression/click counts
            imps = feat_stats["impression"].values
            clks = feat_stats["click"].values
            bs = BayesianSmoothedClickrate(max_iter=10000)
            bs.fit(imps, clks, verbose=False)

            # transform raw counts to Bayesian-smoothed click rates
            feat_stats["bs_clickrate"] = bs.transform(imps, clks)
            feat_stats["ad_val"] = aval

            # keep the fitted parameters and the per-user-value rates
            meta_records.append({"ad_val": aval,
                                 "alpha": bs.alpha,
                                 "beta": bs.beta,
                                 "clickrate_expectation": bs.clickrate_expectation})
            clickrate_frames.append(feat_stats[clickrate_columns])

        df_meta = pd.DataFrame(meta_records, columns=meta_columns)
        df_meta["ad_val"] = df_meta["ad_val"].astype(int)
        df_meta = df_meta.sort_values(["clickrate_expectation", "alpha"], ascending=False)
        df_meta.to_csv(meta_out_path, columns=meta_columns, index=False)

        # pd.concat rejects an empty list, so fall back to an empty frame
        if clickrate_frames:
            df_clickrate = pd.concat(clickrate_frames)
        else:
            df_clickrate = pd.DataFrame(columns=clickrate_columns)
        df_clickrate["click"] = df_clickrate["click"].astype(int)
        df_clickrate["impression"] = df_clickrate["impression"].astype(int)
        df_clickrate = df_clickrate.sort_values(["bs_clickrate", "click"], ascending=False)
        df_clickrate.to_csv(clickrate_out_path, columns=clickrate_columns, index=False)

        # Release the accumulated results before the next feature pair. Only names
        # that are always bound are deleted, so an ad feature without values
        # cannot trigger a NameError here.
        del df_clickrate, df_meta, meta_records, clickrate_frames
        gc.collect()

Fitting ad['creativeSize'] x user['age']...: 100%|██████████| 15/15 [00:05<00:00,  2.75it/s]
Fitting ad['aid'] x user['age']...: 100%|██████████| 173/173 [01:07<00:00,  2.56it/s]
Fitting ad['advertiserId'] x user['age']...: 100%|██████████| 79/79 [00:29<00:00,  2.70it/s]
Fitting ad['campaignId'] x user['age']...: 100%|██████████| 138/138 [00:52<00:00,  2.65it/s]
Fitting ad['creativeId'] x user['age']...: 100%|██████████| 173/173 [01:05<00:00,  2.65it/s]
Fitting ad['adCategoryId'] x user['age']...: 100%|██████████| 40/40 [00:15<00:00,  2.52it/s]
Fitting ad['productId'] x user['age']...: 100%|██████████| 33/33 [00:12<00:00,  2.71it/s]
Fitting ad['productType'] x user['age']...: 100%|██████████| 4/4 [00:01<00:00,  2.83it/s]
Fitting ad['creativeSize'] x user['gender']...: 100%|██████████| 15/15 [00:05<00:00,  2.92it/s]
Fitting ad['aid'] x user['gender']...: 100%|██████████| 173/173 [01:00<00:00,  2.85it/s]
Fitting ad['advertiserId'] x user['gender']...: 100%|██████████| 79/79 [00:27<00:00,

In [10]:
# Solve the Bayesian-smoothed click rate for the multi-valued user features.
# Unlike the single-valued pass, the estimator is built with
# use_moment=True, use_fixed_point=False (i.e. fixed-point iteration is switched off).
#
# For every (user feature, ad feature) pair two CSVs are written:
#   params[adFeatureName='<ad feature>'].csv  - fitted alpha/beta per ad value
#   [adFeatureName='<ad feature>'].csv        - smoothed click rate per (ad value, user value)
meta_columns = ["ad_val", "alpha", "beta", "clickrate_expectation"]
clickrate_columns = ["ad_val", "user_val", "bs_clickrate", "click", "impression"]

for user_feat_name in user_multi_feat_names:
    # the output folder depends only on the user feature, so prepare it once here
    out_folder = os.path.join(clickrate_folder, "[featureName='{}']".format(user_feat_name))
    os.makedirs(out_folder, exist_ok=True)

    for ad_feat_name in ad_feat_names:
        meta_out_path = os.path.join(out_folder, "params[adFeatureName='{}'].csv".format(ad_feat_name))
        clickrate_out_path = os.path.join(out_folder, "[adFeatureName='{}'].csv".format(ad_feat_name))

        # Collect per-ad-value results in plain lists and build the frames once
        # after the loop: growing a DataFrame with .append()/.loc[] per iteration
        # is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
        meta_records = []
        clickrate_frames = []

        progress_desc = "Fitting ad['{}'] x user['{}']...".format(ad_feat_name, user_feat_name)
        for aval in tqdm.tqdm(avals[ad_feat_name], desc=progress_desc):
            # prepare data: drop the 'all' row; .copy() so the column
            # assignments below act on an independent frame, not a slice
            feat_stats = load_user_history(user_feat_name, ad_feat_name, aval)
            feat_stats = feat_stats[feat_stats['value'] != 'all'].copy()
            feat_stats["impression"] = feat_stats["positive"] + feat_stats["negative"]
            feat_stats = feat_stats.rename(columns={"positive": "click", "value": "user_val"})

            # fit Bayesian smoothing on this ad value's impression/click counts
            imps = feat_stats["impression"].values
            clks = feat_stats["click"].values
            bs = BayesianSmoothedClickrate(use_moment=True, use_fixed_point=False)
            bs.fit(imps, clks, verbose=False)

            # transform raw counts to Bayesian-smoothed click rates
            feat_stats["bs_clickrate"] = bs.transform(imps, clks)
            feat_stats["ad_val"] = aval

            # keep the fitted parameters and the per-user-value rates
            meta_records.append({"ad_val": aval,
                                 "alpha": bs.alpha,
                                 "beta": bs.beta,
                                 "clickrate_expectation": bs.clickrate_expectation})
            clickrate_frames.append(feat_stats[clickrate_columns])

        df_meta = pd.DataFrame(meta_records, columns=meta_columns)
        df_meta["ad_val"] = df_meta["ad_val"].astype(int)
        df_meta = df_meta.sort_values(["clickrate_expectation", "alpha"], ascending=False)
        df_meta.to_csv(meta_out_path, columns=meta_columns, index=False)

        # pd.concat rejects an empty list, so fall back to an empty frame
        if clickrate_frames:
            df_clickrate = pd.concat(clickrate_frames)
        else:
            df_clickrate = pd.DataFrame(columns=clickrate_columns)
        df_clickrate["click"] = df_clickrate["click"].astype(int)
        df_clickrate["impression"] = df_clickrate["impression"].astype(int)
        df_clickrate = df_clickrate.sort_values(["bs_clickrate", "click"], ascending=False)
        df_clickrate.to_csv(clickrate_out_path, columns=clickrate_columns, index=False)

        # Release the accumulated results before the next feature pair. Only names
        # that are always bound are deleted, so an ad feature without values
        # cannot trigger a NameError here.
        del df_clickrate, df_meta, meta_records, clickrate_frames
        gc.collect()

Fitting ad['creativeSize'] x user['marriageStatus']...: 100%|██████████| 15/15 [00:00<00:00, 25.20it/s]
Fitting ad['aid'] x user['marriageStatus']...: 100%|██████████| 173/173 [00:06<00:00, 25.92it/s]
Fitting ad['advertiserId'] x user['marriageStatus']...: 100%|██████████| 79/79 [00:03<00:00, 25.92it/s]
Fitting ad['campaignId'] x user['marriageStatus']...: 100%|██████████| 138/138 [00:05<00:00, 25.76it/s]
Fitting ad['creativeId'] x user['marriageStatus']...: 100%|██████████| 173/173 [00:06<00:00, 25.97it/s]
Fitting ad['adCategoryId'] x user['marriageStatus']...: 100%|██████████| 40/40 [00:01<00:00, 25.94it/s]
Fitting ad['productId'] x user['marriageStatus']...: 100%|██████████| 33/33 [00:01<00:00, 25.56it/s]
Fitting ad['productType'] x user['marriageStatus']...: 100%|██████████| 4/4 [00:00<00:00, 25.94it/s]
Fitting ad['creativeSize'] x user['interest1']...: 100%|██████████| 15/15 [00:00<00:00, 25.53it/s]
Fitting ad['aid'] x user['interest1']...: 100%|██████████| 173/173 [00:06<00:00, 2

Fitting ad['adCategoryId'] x user['topic2']...: 100%|██████████| 40/40 [00:04<00:00,  8.31it/s]
Fitting ad['productId'] x user['topic2']...: 100%|██████████| 33/33 [00:03<00:00,  8.58it/s]
Fitting ad['productType'] x user['topic2']...: 100%|██████████| 4/4 [00:00<00:00,  8.61it/s]
Fitting ad['creativeSize'] x user['topic3']...: 100%|██████████| 15/15 [00:01<00:00, 14.75it/s]
Fitting ad['aid'] x user['topic3']...: 100%|██████████| 173/173 [00:22<00:00,  7.72it/s]
Fitting ad['advertiserId'] x user['topic3']...: 100%|██████████| 79/79 [00:07<00:00, 10.22it/s]
Fitting ad['campaignId'] x user['topic3']...: 100%|██████████| 138/138 [00:16<00:00,  8.30it/s]
Fitting ad['creativeId'] x user['topic3']...: 100%|██████████| 173/173 [00:23<00:00,  7.30it/s]
Fitting ad['adCategoryId'] x user['topic3']...: 100%|██████████| 40/40 [00:02<00:00, 14.04it/s]
Fitting ad['productId'] x user['topic3']...: 100%|██████████| 33/33 [00:02<00:00, 12.39it/s]
Fitting ad['productType'] x user['topic3']...: 100%|████