In [1]:
import pandas as pd
import os
from sklearn import linear_model, datasets, preprocessing, metrics
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb

In [2]:
train = pd.read_csv(os.path.join('train.csv'))
val = pd.read_csv(os.path.join('validation.csv'))
test = pd.read_csv(os.path.join('test.csv'))

In [3]:
def downsampling(data):
    data_no_click = data.query('click == 0')
    data_one_click = data.query('click == 1')
    sample_nums = len(data_one_click) * 30
    new_data_no_click = data_no_click.sample(n=sample_nums, random_state=42)
    return pd.concat([new_data_no_click, data_one_click])

In [4]:
def preprocess_data(data, enforce_cols=None):
    data = data.sort_index(axis=0)
    to_drop = ['userid', 'bidid', 'url', 'urlid', 'IP', 'keypage', 'slotid', 'creative', 'domain']
    data = data.drop(to_drop, axis=1)
  
    colums_split = data['useragent'].str.split('_', expand=True)
    data = data.join(colums_split.rename(columns={0:'os', 1:'browser'}))
#     data = data.apply(lambda row: hash_feature(row), axis=1)
    
    colums_split = data['usertag'].str.split(',')
    colums_split = colums_split.str.join('|').str.get_dummies()
    colums_split = colums_split.add_prefix('usertag_')
    data = data.join(colums_split)
    
    data.drop(['useragent', 'usertag'], axis=1, inplace=True)
    data.fillna("unknown", inplace=True)
    data = pd.get_dummies(data) 

    # match test set and training set columns
    if enforce_cols is not None:
    # enforce_cols is the columns of train set, to_drop and to_add finds the difference
        to_drop = np.setdiff1d(data.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, data.columns)
        data.drop(to_drop, axis=1, inplace=True)
        data = data.assign(**{c: 0 for c in to_add})
        
    data = data.reindex(sorted(data.columns), axis=1)
    return data

In [5]:
train_processed = downsampling(train.copy())
train_processed = preprocess_data(train_processed)
val_processed = preprocess_data(val.copy(), enforce_cols=train_processed.columns)
test_processed = preprocess_data(test.copy(), enforce_cols=train_processed.columns)

In [6]:
# transformer = preprocessing.OneHotEncoder()
# transformer.fit(train_processed)
# train_processed = transformer.transform(train_processed)
# val_processed = transformer.transform(val_processed)

In [7]:
train_x = train_processed.drop(['bidprice', 'payprice', 'click'], axis=1)
train_y = train_processed['click']
val_x = val_processed.drop(['bidprice', 'payprice', 'click'], axis=1)
val_y = val_processed['click']
test_x = test_processed.drop(['bidprice', 'payprice', 'click'], axis=1)

In [8]:
val_x.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,browser_chrome,browser_firefox,browser_ie,browser_maxthon,...,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,weekday
0,1,0,0,0,0,1458,0,0,1,0,...,0,0,0,0,0,0,0,0,0,4
1,1,0,0,0,0,3476,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3358,0,0,1,0,...,0,0,0,0,0,0,0,0,0,4
3,0,1,0,0,0,3358,1,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,0,1,0,0,0,3476,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Caculate the root mean square error
def rmse(preds, labels):
    diff = preds - labels                       
    mean_diff_squared = (diff ** 2).mean()  
    error = np.sqrt(mean_diff_squared)

    return error 

In [10]:
CTR = sum(val_y) / len(val_x)
error = float('inf')
for i in range(5):
    model = linear_model.LogisticRegression(C=1)
    model.fit(train_x, train_y)
    val_pred = model.predict_proba(val_x)[:,1]
    pCTR = sum(val_pred) / len(val_x)
    curr_error = rmse(pCTR, CTR)
    if curr_error < error:
        error = curr_error
        best_pCTR = pCTR
        best_val_pred = val_pred
print('rmse between pCTR and CTR: ' + str(error))
val_pred = best_val_pred
print('rmse between pred clicks and clicks: ' + str(rmse(val_pred, val_y)))

rmse between pCTR and CTR: 0.0255591878479
rmse between pred clicks and clicks: 0.0588816086523


In [11]:
accuracy_score([round(x) for x in val_pred], val_y)

0.99727235337665543

In [12]:
sum([round(x) for x in val_pred]) / len(val_x)

0.0023591346549313155

In [13]:
CTR

0.0006646376573167722

In [14]:
avgCTR = sum(train['click']) / len(train)
print(avgCTR)

0.0007375623256619447


In [15]:
def compare_performance(metrics_list, best_metrics):
    return metrics_list[0] >= best_metrics[0] and metrics_list[1] >= best_metrics[1]

In [16]:
"""
Computes some eCPC statistics based on provided training dataset.
"""
cols = train[['click', 'advertiser', 'payprice']]
grouped_cols = cols.groupby('advertiser')
new_features = grouped_cols.sum()

new_features.rename(columns={'click':'Clicks', 'payprice':'Cost'}, inplace = True)

new_features['Cost'] = new_features['Cost'] / 1000
new_features['eCPC'] = new_features['Cost'] / new_features['Clicks']

new_features = new_features.reset_index()
new_features = new_features.set_index('advertiser')
new_features.drop(['Cost','Clicks'], inplace=True, axis=1)
max_eCPCs = train.join(new_features, on='advertiser')['eCPC']

In [17]:
max_eCPC = max(max_eCPCs)

In [18]:
new_val = val.copy()

bidprices = [max_eCPC * val_pred[i] for i in range(len(val_pred))]
new_val['bidprice'] = bidprices
budget = 6250
suc_bids = new_val.query('bidprice >= payprice and bidprice >= slotprice')
cost = 0
clicks = 0
imps = 0
results = []

for index, row in suc_bids.iterrows():
    if cost <= budget:
        cost += row['payprice'] / 1000
        clicks += row['click']
        imps += 1

eCPC = cost / clicks if clicks > 0 else float('inf')
metrics_list = [clicks, clicks / imps * 100, cost, cost / imps * 1000, eCPC]
results.append([clicks, CTR, cost, cost / imps * 1000, eCPC])
    
results = pd.DataFrame(results)
results = results.rename(columns={0: "clicks",
    1: "CTR", 2: "cost", 3: "avg CPM", 4: "eCPC"})
results.to_csv(os.path.join('bid_eCPC_lr.csv'), index=False)

In [19]:
results.loc[results.clicks == max(results['clicks'])]

Unnamed: 0,clicks,CTR,cost,avg CPM,eCPC
0,60,0.000665,210.814,20.943175,3.513567
