In [1]:
import pandas as pd
import os
from sklearn import linear_model, datasets, preprocessing, metrics
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from sklearn.model_selection import KFold
import xgboost as xgb
import math

In [2]:
train = pd.read_csv(os.path.join('train.csv'))
val = pd.read_csv(os.path.join('validation.csv'))
test = pd.read_csv(os.path.join('test.csv'))

In [3]:
def downsampling(data):
    data_no_click = data.query('click == 0')
    data_one_click = data.query('click == 1')
    sample_nums = len(data_one_click) * 50
    new_data_no_click = data_no_click.sample(n=sample_nums, random_state=42)
    return pd.concat([new_data_no_click, data_one_click])

In [4]:
def create_freq_CTR_by_feature(data, column):
    new_features = data[[column, 'click']]
    new_features[column + '_freq'] = 1.0
    data_grouped = new_features.groupby(column)
    new_features = data_grouped.sum()
    new_features['click'] /= new_features[column + '_freq']
    new_features.rename(columns={'click' : column + '_CTR'}, inplace = True)
    return data.join(new_features, on=column)

In [5]:
def preprocess_data(data, enforce_cols=None):
    data = data.sort_index(axis=0)
    to_drop = ['userid', 'bidid', 'url', 'urlid', 'IP', 'keypage', 'slotid', 'creative', 'domain']
    data = data.drop(to_drop, axis=1)
  
    colums_split = data['useragent'].str.split('_', expand=True)
    data = data.join(colums_split.rename(columns={0:'os', 1:'browser'}))
#     data = data.apply(lambda row: hash_feature(row), axis=1)
    
    colums_split = data['usertag'].str.split(',')
    colums_split = colums_split.str.join('|').str.get_dummies()
    colums_split = colums_split.add_prefix('usertag_')
    data = data.join(colums_split)
    
    data.drop(['useragent', 'usertag'], axis=1, inplace=True)
    data.fillna("unknown", inplace=True)
    data = pd.get_dummies(data) 

    # match test set and training set columns
    if enforce_cols is not None:
    # enforce_cols is the columns of train set, to_drop and to_add finds the difference
        to_drop = np.setdiff1d(data.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, data.columns)
        data.drop(to_drop, axis=1, inplace=True)
        data = data.assign(**{c: 0 for c in to_add})
        
    data = data.reindex(sorted(data.columns), axis=1)
    return data

In [6]:
train_processed = downsampling(train.copy())
train_processed = preprocess_data(train_processed)
val_processed = preprocess_data(val.copy(), enforce_cols=train_processed.columns)
test_processed = preprocess_data(test.copy(), enforce_cols=train_processed.columns)

In [7]:
train_x = train_processed.drop(['bidprice', 'payprice', 'click'], axis=1)
train_y = train_processed['click']
val_x = val_processed.drop(['bidprice', 'payprice', 'click'], axis=1)
val_y = val_processed['click']
test_x = test_processed.drop(['bidprice', 'payprice', 'click'], axis=1)

In [8]:
# Caculate the root mean square error
def rmse(preds, dtrain):
    labels = dtrain.get_label()
    diff = preds - labels                       
    mean_diff_squared = (diff ** 2).mean()  
    error = np.sqrt(mean_diff_squared)
    return error 

In [9]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

def KFold_model(trainX, trainY, test, **kwargs):
    trainX = np.array(trainX)
    trainY = np.array(trainY)
    test = np.array(test)
    
    K = 5
    kfold = KFold(n_splits=K, random_state=7,shuffle = True)

    xgb_preds = []
    val_scores = []
    
    for train_index, val_index in kfold.split(trainX):
        
        train_X, valid_X = trainX[train_index], trainX[val_index]
        train_y, valid_y = trainY[train_index], trainY[val_index]
        xgb_params = {'eta': 0.3, 'max_depth': 3, 'subsample': 1.0
                      , 'colsample_bytree': 1.0, 'objective': 'reg:logistic'
                      , 'eval_metric': 'rmse', 'seed': 99, 'silent': True}

        d_train = xgb.DMatrix(train_X, train_y)
        d_valid = xgb.DMatrix(valid_X, valid_y)
        d_test = xgb.DMatrix(test)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(xgb_params, d_train, 500,  watchlist, maximize=False, 
                              verbose_eval=50, early_stopping_rounds=50)
        
        val_score = rmse(model.predict(d_valid, ntree_limit=model.best_ntree_limit), d_valid)
        print("The validation set score is :\t{}\n".format(val_score))
        val_scores.append(val_score)
        
        xgb_pred = model.predict(d_test)
        xgb_preds.append(list(xgb_pred))

    print(val_scores)
    
    preds = []
    for i in range(len(xgb_preds[0])):
        sumres = 0
        for j in range(K):
            sumres += xgb_preds[j][i]
        preds.append(sumres / K)
    
    return preds

In [10]:
val_pred = model = KFold_model(train_x, train_y, val_x)

[0]	train-rmse:0.365102	valid-rmse:0.365082
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.113773	valid-rmse:0.113173
[100]	train-rmse:0.111488	valid-rmse:0.112403
[150]	train-rmse:0.10977	valid-rmse:0.112407
[200]	train-rmse:0.108408	valid-rmse:0.112359
Stopping. Best iteration:
[173]	train-rmse:0.109178	valid-rmse:0.112149

The validation set score is :	0.11214888095855713

[0]	train-rmse:0.364852	valid-rmse:0.365558
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.112247	valid-rmse:0.118053
[100]	train-rmse:0.110117	valid-rmse:0.117275
[150]	train-rmse:0.108523	valid-rmse:0.117396
Stopping. Best iteration:
[108]	train-rmse:0.10976	valid-rmse:0.117079

The validation set score is :	0.1170787513256073

[0]	train-rmse:0.364974	valid-rmse:0.365235
Multiple eval

In [11]:
rmse(val_pred, xgb.DMatrix(val_x, val_y))

0.039265536316123222

In [12]:
accuracy_score([round(x) for x in val_pred], val_y)

0.99869046639796

In [13]:
sum([round(x) for x in val_pred]) / len(val_x)

0.0011055359052397796

In [14]:
CTR = sum(val_y) / len(val_x)
print(CTR)

0.0006646376573167722


In [15]:
avgCTR = sum(train['click']) / len(train)
print(avgCTR)

0.0007375623256619447


In [16]:
def compare_performance(metrics_list, best_metrics):
    return metrics_list[0] >= best_metrics[0] and metrics_list[1] >= best_metrics[1]

In [17]:
def ortb(theta, c, _lambda):
    return math.sqrt(c / _lambda * theta + c * c) - c

In [18]:
new_val = val.copy()
last_paided_index = 0
_lambda = 7 * math.pow(10, -5)
results = []

for para in range(980, 990, 10):
    bidprices = [ortb(x, para, _lambda) for x in val_pred]
    new_val['bidprice'] = bidprices
#     new_val.loc[new_val.advertiser == 2259, 'bidprice'] = 15
#     new_val.loc[new_val.advertiser == 2261, 'bidprice'] = 15   
    budget = 6250
    suc_bids = new_val.query('bidprice >= payprice and bidprice >= slotprice')
    cost = 0
    clicks = 0
    imps = 0
    i = 0
    for index, row in suc_bids.iterrows():
        if cost <= budget:
            cost += row['payprice'] / 1000
            clicks += row['click']
            imps += 1
            i += 1
            last_paided_index = max(last_paided_index, i)
    eCPC = cost / clicks if clicks > 0 else float('inf')
    metrics_list = [clicks, clicks / imps * 100, cost, cost / imps * 1000, eCPC]
    results.append([_lambda, para, clicks, clicks / imps * 100, cost, cost / imps * 1000, eCPC])
    
results = pd.DataFrame(results)
results = results.rename(columns={0: "lambda", 1: "c", 2: "clicks",
    3: "CTR", 4: "cost", 5: "avg CPM", 6: "eCPC"})
results.to_csv(os.path.join('ortb_xgboost.csv'), index=False)

In [19]:
"""
Computes some basic statistics based on provided dataset such as Clicks, Imps and CTR.
"""
def calc_statistics(data):
    cols = data[['click', 'advertiser', 'payprice']]
    grouped_cols = cols.groupby('advertiser')
    new_features = grouped_cols.count()
    new_features['Imps'] = new_features['click']

    new_features.rename(columns={'click':'Clicks', 'payprice':'Cost'}, inplace = True)

    new_features[['Clicks', 'Cost']] = grouped_cols.sum()[['click', 'payprice']]
    new_features['Cost'] = new_features['Cost'] / 1000

    new_features[['Avg CPM']] = grouped_cols.mean()[['payprice']]
    new_features['CTR'] = new_features['Clicks'] / new_features['Imps'] * 100
    new_features['eCPC'] = new_features['Cost'] / new_features['Clicks']

    new_features = new_features.reindex(sorted(new_features.columns), axis=1)
    return new_features

In [20]:
results.loc[results.clicks == max(results['clicks'])]

Unnamed: 0,lambda,c,clicks,CTR,cost,avg CPM,eCPC
0,7e-05,980,166,0.130098,6181.054,48.442381,37.235265


In [21]:
suc_bids_statistics = calc_statistics(suc_bids[:last_paided_index])
print(suc_bids_statistics)
print('clicks: ' + str(sum(suc_bids_statistics['Clicks'])))

              Avg CPM       CTR  Clicks      Cost   Imps       eCPC
advertiser                                                         
1458        37.178113  0.221696      44   737.874  19847  16.769864
2259        40.912807  0.000000       0   255.255   6239        inf
2261        34.599192  0.040404       2   171.266   4950  85.633000
2821        50.527533  0.084069      12   721.230  14274  60.102500
2997        62.455328  0.422352      26   384.475   6156  14.787500
3358        50.957616  0.186488      22   601.147  11797  27.324864
3386        54.254416  0.057784      21  1971.714  36342  93.891143
3427        47.015986  0.206265      32   729.406  15514  22.793937
3476        48.784724  0.056103       7   608.687  12477  86.955286
clicks: 166


In [22]:
val_click_0 = new_val.query('advertiser == 2821 and click == 0')[['click', 'bidprice', 'payprice']]
val_click_1 = new_val.query('advertiser == 2821 and click == 1')[['click', 'bidprice', 'payprice']]

In [23]:
val_click_1.query('bidprice < payprice')

Unnamed: 0,click,bidprice,payprice
29442,1,161.678662,248
51363,1,107.352768,142
55240,1,102.41548,160
90338,1,28.838286,91
93048,1,158.220771,247
123201,1,23.904802,160
135205,1,30.101404,154
145721,1,86.40729,215
259465,1,29.925446,231
266153,1,194.445129,221


In [24]:
val_click_0.query('bidprice >= payprice')

Unnamed: 0,click,bidprice,payprice
45,0,34.640808,2
78,0,100.496293,94
105,0,86.259647,16
106,0,663.985833,16
122,0,71.495956,50
126,0,553.121972,45
158,0,184.622502,175
190,0,40.246952,31
241,0,40.707037,31
247,0,120.380826,48


In [25]:
val_statistics = calc_statistics(val)
print(val_statistics)
print('clicks: ' + str(sum(val_statistics['Clicks'])))

              Avg CPM       CTR  Clicks      Cost   Imps        eCPC
advertiser                                                          
1458        68.875627  0.078585      49  4294.602  62353   87.644939
2259        93.856297  0.011965       2  1568.808  16715  784.404000
2261        89.658745  0.022140       3  1214.876  13550  404.958667
2821        90.363355  0.086783      23  2394.900  26503  104.126087
2997        62.950777  0.420984      26   388.784   6176   14.953231
3358        84.824099  0.069826      23  2794.021  32939  121.479174
3386        76.780958  0.049413      28  4350.793  56665  155.385464
3427        75.259251  0.073730      37  3776.735  50183  102.073919
3476        77.077083  0.028321      11  2993.751  38841  272.159182
clicks: 202


In [26]:
test_pred = model = KFold_model(train_x, train_y, test_x)

[0]	train-rmse:0.365102	valid-rmse:0.365082
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.113773	valid-rmse:0.113173
[100]	train-rmse:0.111488	valid-rmse:0.112403
[150]	train-rmse:0.10977	valid-rmse:0.112407
[200]	train-rmse:0.108408	valid-rmse:0.112359
Stopping. Best iteration:
[173]	train-rmse:0.109178	valid-rmse:0.112149

The validation set score is :	0.11214888095855713

[0]	train-rmse:0.364852	valid-rmse:0.365558
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.112247	valid-rmse:0.118053
[100]	train-rmse:0.110117	valid-rmse:0.117275
[150]	train-rmse:0.108523	valid-rmse:0.117396
Stopping. Best iteration:
[108]	train-rmse:0.10976	valid-rmse:0.117079

The validation set score is :	0.1170787513256073

[0]	train-rmse:0.364974	valid-rmse:0.365235
Multiple eval

In [27]:
best_para = 980
submission = test.copy()[['advertiser', 'bidid']]
bidprices = np.asarray([ortb(x, best_para, _lambda) for x in test_pred])
submission['bidprice'] = bidprices
submission.set_index(['bidid'], inplace=True)
# submission.loc[submission.advertiser == 2259, 'bidprice'] = 15
# submission.loc[submission.advertiser == 2261, 'bidprice'] = 15
submission.drop('advertiser', axis=1, inplace=True)
submission.to_csv('Group_5.csv')