In [1]:
import csv
import math
import itertools
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from subprocess import check_output
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import mean_squared_error

In [2]:
train = pd.read_csv("Desktop/Webeco/dataset/train.csv")
validation = pd.read_csv("Desktop/Webeco/dataset/validation.csv")

In [3]:
trainy = pd.read_csv("Desktop/Webeco/dataset/train.csv")["click"]
validationy = pd.read_csv("Desktop/Webeco/dataset/validation.csv")["click"]

def Columns_builder(df1,df2):
    for columns in df1:
        if columns in df2:
            continue
        else:
            missing_columns = columns
            ind = df1.columns.get_loc(missing_columns)
            df2.insert(ind,missing_columns,0.0)
    return df2


trainx = pd.read_csv("Desktop/Webeco/dataset/cleantrain.csv")
validationx = pd.read_csv("Desktop/Webeco/dataset/cleanvalidation.csv")

validationx = Columns_builder(trainx,validationx)

trainx = np.array(trainx)
triany = np.array(trainy)
trainy = [int(numeric_string) for numeric_string in trainy]
validationx = np.array(validationx)
validationy = np.array(validationy)
validationy = [int(numeric_string) for numeric_string in validationy]

In [4]:
#train logistc regression

In [5]:
model = LogisticRegression(class_weight = "balanced")
trainedlr = model.fit(trainx,trainy)

In [6]:
prob = trainedlr.predict_proba(validationx)

pClick = pd.DataFrame(prob)
predicty = trainedlr.predict(validationx)
precision = precision_score(validationy, predicty, average='micro')
correctpred = sum(predicty == validationy)

In [7]:
# apply negative downsampling to work out weights in order for probability of click to have the same ratio as training data, technique is called 
#model recalibration 

temp = len(train) / (2 * np.bincount(train.click))
w = temp[0]/temp[1]
print(w)

LRprob =[]
for p in pClick[1]:
    LRprob.append( p / (p + ((1-p)/w)))
LRprob[:5]

0.000754533880574


[0.00054017400040170952,
 0.0042812693741413523,
 0.0010873878524663902,
 0.00085829272510297753,
 0.00046335760608331327]

In [8]:
pClick[:5]

Unnamed: 0,0,1
0,0.582652,0.417348
1,0.149288,0.850712
2,0.409382,0.590618
3,0.467619,0.532381
4,0.619432,0.380568


In [9]:
print(correctpred,"correct clicks has been predicted in this LR model")

224234 correct clicks has been predicted in this LR model


In [10]:
print("The precision of lr model is",precision)

The precision of lr model is 0.748072554037


In [11]:
#ORTB

In [12]:
lambdas = [5.2e-10,5.2e-9,5.2e-8,5.2e-7,5.2e-6,5.2e-5,5.2e-4,5.2e-3,5.2e-2,5.2e-1]
c = np.arange(10,101,10).tolist()
parameters = list(itertools.product(c, lambdas))

In [13]:
LRctr = np.asarray(LRprob)

In [14]:
LRctr

array([ 0.00054017,  0.00428127,  0.00108739, ...,  0.0004593 ,
        0.00027177,  0.00063145])

In [15]:
def ortb1(c,lambdas,predictctr):
    impression = 0.0
    clicks = 0
    cost = 0.0
    budget = 2500000  
    bidortb1 = []
    
    for pctr in predictctr:
        ortb1 = math.sqrt(c/lambdas*pctr + c**2)-c
        bidortb1.append(ortb1)
    
    true_bids = bidortb1 >= validation.payprice
    for i in range(0, len(true_bids)):
        if true_bids[i] == True:
            impression += 1.0
            clicks += validation.click[i]
            cost += validation.payprice[i]
        if cost >= budget:
            break
    return impression, clicks, cost

In [16]:
def ortb2(c,lambdas,predictctr):
    impression = 0.0
    clicks = 0
    cost = 0.0
    budget = 2500000  
    bidortb2 = []
    
    for pctr in predictctr:
        ortb2 = c*(math.pow((pctr+math.sqrt((c**2)*(lambdas**2)+pctr**2))/(c*lambdas),1/3)-math.pow(c*lambdas/(pctr+math.sqrt((c**2)*(lambdas**2)+pctr**2)),1/3))
        bidortb2.append(ortb2)
    
    true_bids = bidortb2 >= validation.payprice
    for i in range(0, len(true_bids)):
        if true_bids[i] == True:
            impression += 1.0
            clicks += validation.click[i]
            cost += validation.payprice[i]
        if cost >= budget:
            break
    return impression, clicks, cost

In [17]:
def compute(predictctr,ortb):
    df = pd.DataFrame()
    
    imp = []
    clik = []
    ct = []
    clamb = []
    for parameter in parameters:
        c = parameter[0]
        lam = parameter[1]
        [imps, clicks, cost] = ortb(c,lam,predictctr)
        imp.append(imps)
        clik.append(clicks)
        ct.append(cost)
        clamb.append(parameter)
    df['C_Lambda'] = clamb
    df['impressions'] = imp
    df['total_cost'] = ct
    df['clicks'] = clik
    df['CTR'] = (df.clicks / df.impressions * 100).round(2).astype(str)
    df['CPM'] = (df.total_cost / df.impressions * 1000).round(2).astype(str)
    df['CPC'] = (df.total_cost / df.clicks).round(2).astype(str)
    return df

In [18]:
ortb1_function = compute(LRctr,ortb1)

In [19]:
ortb2_function = compute(LRctr,ortb2)

In [20]:
ortb1_function.sort_values("CTR",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
5,"(10, 5.2e-05)",4279.0,28565.0,8,0.19,6675.63,3570.62
45,"(50, 5.2e-05)",6621.0,66065.0,12,0.18,9978.1,5505.42
95,"(100, 5.2e-05)",7176.0,78216.0,13,0.18,10899.67,6016.62
55,"(60, 5.2e-05)",6799.0,69574.0,12,0.18,10232.98,5797.83
85,"(90, 5.2e-05)",7115.0,77039.0,13,0.18,10827.69,5926.08
15,"(20, 5.2e-05)",5333.0,42034.0,9,0.17,7881.87,4670.44
65,"(70, 5.2e-05)",6927.0,72621.0,12,0.17,10483.76,6051.75
75,"(80, 5.2e-05)",7042.0,75329.0,12,0.17,10697.1,6277.42
25,"(30, 5.2e-05)",5944.0,52106.0,10,0.17,8766.15,5210.6
35,"(40, 5.2e-05)",6377.0,61270.0,11,0.17,9607.97,5570.0


In [21]:
ortb2_function.sort_values("CTR",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
46,"(50, 0.00052)",401.0,1665.0,6,1.5,4152.12,277.5
56,"(60, 0.00052)",407.0,1724.0,6,1.47,4235.87,287.33
66,"(70, 0.00052)",408.0,1730.0,6,1.47,4240.2,288.33
76,"(80, 0.00052)",409.0,1736.0,6,1.47,4244.5,289.33
96,"(100, 0.00052)",412.0,1754.0,6,1.46,4257.28,292.33
86,"(90, 0.00052)",411.0,1748.0,6,1.46,4253.04,291.33
26,"(30, 0.00052)",362.0,1422.0,5,1.38,3928.18,284.4
36,"(40, 0.00052)",394.0,1623.0,5,1.27,4119.29,324.6
16,"(20, 0.00052)",289.0,972.0,2,0.69,3363.32,486.0
95,"(100, 5.2e-05)",11702.0,162321.0,18,0.15,13871.22,9017.83


In [22]:
ortb1_function.sort_values("clicks",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
44,"(50, 5.2e-06)",87225.0,2500010.0,95,0.11,28661.62,26315.89
34,"(40, 5.2e-06)",84589.0,2283046.0,89,0.11,26989.87,25652.2
54,"(60, 5.2e-06)",83373.0,2500004.0,88,0.11,29985.77,28409.14
84,"(90, 5.2e-06)",76678.0,2500013.0,87,0.11,32604.05,28735.78
94,"(100, 5.2e-06)",75307.0,2500035.0,85,0.11,33197.91,29412.18
64,"(70, 5.2e-06)",80578.0,2500021.0,84,0.1,31026.1,29762.15
74,"(80, 5.2e-06)",78551.0,2500013.0,82,0.1,31826.62,30487.96
24,"(30, 5.2e-06)",77628.0,1934584.0,78,0.1,24921.21,24802.36
14,"(20, 5.2e-06)",68170.0,1502087.0,62,0.09,22034.43,24227.21
3,"(10, 5.2e-07)",49951.0,2500018.0,49,0.1,50049.41,51020.78


In [23]:
ortb2_function.sort_values("clicks",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
44,"(50, 5.2e-06)",74308.0,2500071.0,76,0.1,33644.71,32895.67
34,"(40, 5.2e-06)",81930.0,2500030.0,76,0.09,30514.22,32895.13
24,"(30, 5.2e-06)",94092.0,2464370.0,75,0.08,26191.07,32858.27
54,"(60, 5.2e-06)",69386.0,2500031.0,68,0.1,36030.77,36765.16
94,"(100, 5.2e-06)",60797.0,2500019.0,68,0.11,41120.76,36764.99
84,"(90, 5.2e-06)",62140.0,2500004.0,66,0.11,40231.8,37878.85
64,"(70, 5.2e-06)",66226.0,2500066.0,64,0.1,37750.52,39063.53
3,"(10, 5.2e-07)",73876.0,2500017.0,63,0.09,33840.72,39682.81
74,"(80, 5.2e-06)",63889.0,2500013.0,63,0.1,39130.57,39682.75
23,"(30, 5.2e-07)",46383.0,2500003.0,47,0.1,53899.12,53191.55


In [24]:
def ortb1opt(c,lambdas,predictctr):
    impression = 0.0
    clicks = 0
    cost = 0.0
    budget = 2500000  
    bidortb1 = []
    
    for pctr in predictctr:
        if pctr <= 0.00023:
            ortb1 = 0
        else:
            ortb1 = math.sqrt(c/lambdas*pctr + c**2)-c
        bidortb1.append(ortb1)
    
    true_bids = bidortb1 >= validation.payprice
    for i in range(0, len(true_bids)):
        if true_bids[i] == True:
            impression += 1.0
            clicks += validation.click[i]
            cost += validation.payprice[i]
        if cost >= budget:
            break
    return impression, clicks, cost

In [25]:
ortb1_opt = compute(LRctr,ortb1opt)

In [26]:
def ortb2opt(c,lambdas,predictctr):
    impression = 0.0
    clicks = 0
    cost = 0.0
    budget = 2500000  
    bidortb2 = []
    
    for pctr in predictctr:
        if pctr <= 0.0023:
            bidprice = 0
        else:
            bidprice = c*(math.pow((pctr+math.sqrt((c**2)*(lambdas**2)+pctr**2))/(c*lambdas),1/3)-math.pow(c*lambdas/(pctr+math.sqrt((c**2)*(lambdas**2)+pctr**2)),1/3))
        bidortb2.append(bidprice)
    
    true_bids = bidortb2 >= validation.payprice
    for i in range(0, len(true_bids)):
        if true_bids[i] == True:
            impression += 1.0
            clicks += validation.click[i]
            cost += validation.payprice[i]
        if cost >= budget:
            break
    return impression, clicks, cost

In [27]:
ortb2_opt = compute(LRctr,ortb2opt)

In [28]:
ortb1_opt.sort_values("clicks",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
44,"(50, 5.2e-06)",85681.0,2500001.0,95,0.11,29178.01,26315.8
34,"(40, 5.2e-06)",82415.0,2265409.0,89,0.11,27487.82,25454.03
54,"(60, 5.2e-06)",81968.0,2500003.0,88,0.11,30499.74,28409.12
84,"(90, 5.2e-06)",75456.0,2500008.0,87,0.12,33132.0,28735.72
94,"(100, 5.2e-06)",74103.0,2500039.0,85,0.11,33737.35,29412.22
64,"(70, 5.2e-06)",79208.0,2500006.0,84,0.11,31562.54,29761.98
74,"(80, 5.2e-06)",77295.0,2500026.0,82,0.11,32343.95,30488.12
24,"(30, 5.2e-06)",75543.0,1918305.0,78,0.1,25393.55,24593.65
14,"(20, 5.2e-06)",66205.0,1487525.0,62,0.09,22468.47,23992.34
3,"(10, 5.2e-07)",49143.0,2500086.0,48,0.1,50873.7,52085.12


In [29]:
ortb2_opt.sort_values("clicks",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
0,"(10, 5.2e-10)",10838.0,829595.0,72,0.66,76545.03,11522.15
30,"(40, 5.2e-10)",10838.0,829595.0,72,0.66,76545.03,11522.15
32,"(40, 5.2e-08)",10838.0,829595.0,72,0.66,76545.03,11522.15
40,"(50, 5.2e-10)",10838.0,829595.0,72,0.66,76545.03,11522.15
41,"(50, 5.2e-09)",10838.0,829595.0,72,0.66,76545.03,11522.15
42,"(50, 5.2e-08)",10838.0,829595.0,72,0.66,76545.03,11522.15
43,"(50, 5.2e-07)",10824.0,825483.0,72,0.67,76264.14,11465.04
1,"(10, 5.2e-09)",10838.0,829595.0,72,0.66,76545.03,11522.15
51,"(60, 5.2e-09)",10838.0,829595.0,72,0.66,76545.03,11522.15
52,"(60, 5.2e-08)",10838.0,829595.0,72,0.66,76545.03,11522.15


In [30]:
ortb1_opt.sort_values("CTR",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
5,"(10, 5.2e-05)",4271.0,28557.0,8,0.19,6686.26,3569.62
95,"(100, 5.2e-05)",7157.0,78186.0,13,0.18,10924.41,6014.31
45,"(50, 5.2e-05)",6603.0,66037.0,12,0.18,10001.06,5503.08
55,"(60, 5.2e-05)",6780.0,69544.0,12,0.18,10257.23,5795.33
85,"(90, 5.2e-05)",7096.0,77009.0,13,0.18,10852.45,5923.77
15,"(20, 5.2e-05)",5320.0,42016.0,9,0.17,7897.74,4668.44
75,"(80, 5.2e-05)",7023.0,75299.0,12,0.17,10721.77,6274.92
25,"(30, 5.2e-05)",5927.0,52080.0,10,0.17,8786.91,5208.0
65,"(70, 5.2e-05)",6908.0,72591.0,12,0.17,10508.25,6049.25
35,"(40, 5.2e-05)",6360.0,61244.0,11,0.17,9629.56,5567.64


In [31]:
ortb2_opt.sort_values("CTR",ascending= False).head(10)

Unnamed: 0,C_Lambda,impressions,total_cost,clicks,CTR,CPM,CPC
46,"(50, 0.00052)",352.0,1608.0,6,1.7,4568.18,268.0
56,"(60, 0.00052)",358.0,1667.0,6,1.68,4656.42,277.83
76,"(80, 0.00052)",360.0,1679.0,6,1.67,4663.89,279.83
66,"(70, 0.00052)",359.0,1673.0,6,1.67,4660.17,278.83
86,"(90, 0.00052)",362.0,1691.0,6,1.66,4671.27,281.83
96,"(100, 0.00052)",363.0,1697.0,6,1.65,4674.93,282.83
26,"(30, 0.00052)",313.0,1365.0,5,1.6,4361.02,273.0
36,"(40, 0.00052)",345.0,1566.0,5,1.45,4539.13,313.2
16,"(20, 0.00052)",240.0,915.0,2,0.83,3812.5,457.5
23,"(30, 5.2e-07)",10435.0,729937.0,70,0.67,69950.84,10427.67
