In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import os
import gc
import tqdm
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/feature/')
import data_utils as du
import perf_utils as pu
from clickrate import NaiveSmoothedClickrate

In [2]:
# Root folder of the per-feature click-count CSVs; load_user_history() looks
# up "[featureName='<feature>']/[aid='<aid>'].csv" underneath it.
clickcount_user_in_folder = "../../../data/clickrate/byUserFeatureName/"

In [3]:
def load_user_history(feat_name, aid):
    """Read the click-count CSV stored for one (user feature, ad) pair.

    The file is resolved relative to ``clickcount_user_in_folder`` as
    ``[featureName='<feat_name>']/[aid='<aid>'].csv``.

    Parameters
    ----------
    feat_name : str
        Name of the user feature; formatted into the sub-folder name.
    aid : int or str
        Ad identifier; formatted into the file name.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from that file.
    """
    feature_dir = "[featureName='{}']".format(feat_name)
    csv_name = "[aid='{}'].csv".format(aid)
    csv_path = os.path.join(clickcount_user_in_folder, feature_dir, csv_name)
    return pd.read_csv(csv_path)

In [4]:
# Features for which a user carries exactly one value.
one_feat_names = [
    'age',
    'gender',
    'education',
    'consumptionAbility',
    'LBS',
    'carrier',
    'house',
]

# The remaining user features (presumably those that can hold several values
# per user, going by the variable name).
multi_feat_names = [
    'marriageStatus',
    'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
    'ct',
    'os',
]

# Every user feature: single-valued ones first, then the rest.
feat_names = one_feat_names + multi_feat_names

In [5]:
# Load the raw training table and rescale its labels with (label + 1) / 2,
# which sends -1 to 0 and +1 to 1 (assumes labels are encoded as -1/+1 — TODO confirm).
df_train = du.load_raw_data("train")
y = (df_train['label'].values + 1) / 2

In [6]:
# Load the raw ad table and pull out the "aid" column; the fitting loop
# further down iterates over these ids.
df_ad = du.load_raw_data("ad")
aids = df_ad["aid"].values

In [7]:
# Destination folder for the naive-smoothed click-rate CSVs written below.
nsclickrate_user_folder = "../../../data/ns_clickrate_v1/byUserFeatureName/"
# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(nsclickrate_user_folder, exist_ok=True)

In [8]:
# Build one naive-smoothed click-rate table per single-valued user feature and
# write it to nsclickrate_user_folder as "ns_clickrate[featureName='<name>'].csv".
ns_output_columns = ["aid", "value", "ns_clickrate", "click", "impression"]

for feat_name in one_feat_names:
    clickrate_file = "ns_clickrate[featureName='{}'].csv".format(feat_name)
    clickrate_path = os.path.join(nsclickrate_user_folder, clickrate_file)

    # Gather the per-ad frames and concatenate once after the loop: growing a
    # DataFrame with .append() copies it on every iteration, and the method was
    # removed in pandas 2.0.
    per_ad_frames = []

    for aid in tqdm.tqdm(aids, desc="Fitting '{}'...".format(feat_name)):
        # prepare data
        feat_stats = load_user_history(feat_name, aid)
        # Drop the 'all' row (presumably the aggregate over every value — TODO confirm).
        # .copy() makes the filtered frame independent, so the column assignments
        # below do not act on a slice (avoids SettingWithCopyWarning).
        feat_stats = feat_stats[feat_stats['value'] != 'all'].copy()
        feat_stats["impression"] = feat_stats["positive"] + feat_stats["negative"]
        feat_stats = feat_stats.rename(columns={"positive": "click"})

        # fit naive smoothing on this ad's impression / click counts
        imps = feat_stats["impression"].values
        clks = feat_stats["click"].values
        ns = NaiveSmoothedClickrate()
        ns.fit(imps, clks)

        # transform the raw counts into naive-smoothed click rates
        ns_crs = ns.transform(imps, clks)
        feat_stats["ns_clickrate"] = ns_crs

        # tag the rows with the ad id and keep only the output columns
        feat_stats["aid"] = aid
        per_ad_frames.append(feat_stats[ns_output_columns])

    if per_ad_frames:
        # Row labels are kept as-is (no ignore_index); the index is not written out.
        df_clickrate = pd.concat(per_ad_frames)
    else:
        # pd.concat raises ValueError on an empty list; fall back to an empty table.
        df_clickrate = pd.DataFrame(columns=ns_output_columns)

    df_clickrate["click"] = df_clickrate["click"].astype(int)
    df_clickrate["impression"] = df_clickrate["impression"].astype(int)
    # Highest smoothed click rate first; ties broken by larger click count.
    df_clickrate = df_clickrate.sort_values(["ns_clickrate", "click"], ascending=False)
    df_clickrate.to_csv(clickrate_path, columns=ns_output_columns, index=False)

Fitting 'age'...: 100%|██████████| 173/173 [00:06<00:00, 26.57it/s]
Fitting 'gender'...: 100%|██████████| 173/173 [00:06<00:00, 26.44it/s]
Fitting 'education'...: 100%|██████████| 173/173 [00:07<00:00, 24.38it/s]
Fitting 'consumptionAbility'...: 100%|██████████| 173/173 [00:06<00:00, 25.97it/s]
Fitting 'LBS'...: 100%|██████████| 173/173 [00:08<00:00, 19.74it/s]
Fitting 'carrier'...: 100%|██████████| 173/173 [00:07<00:00, 24.04it/s]
Fitting 'house'...: 100%|██████████| 173/173 [00:06<00:00, 27.01it/s]
