In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sql
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.plotly as py
import scipy as sp
from scipy import stats
from scipy.stats import skew, kurtosis
%matplotlib inline

In [2]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
validation_data = pd.read_csv("validation.csv")

In [3]:
def split_data_os(data):
    return data.split('_')[0]
def split_data_browser(data):
    return data.split('_')[1]
def create_os_browser(data):
    os = data['useragent'].apply(split_data_os)
    browser = data['useragent'].apply(split_data_browser)
    os=np.array(os)
    browser = np.array(browser)
    df_os = pd.DataFrame(columns=['OS'], data = os)
    df_browser = pd.DataFrame(columns=['Browser'], data = browser)
    data.insert(0,'OS',df_os)
    data.insert(1,'Browser',df_browser)

In [4]:
create_os_browser(train_data)
create_os_browser(test_data)
create_os_browser(validation_data)

# Data Preprocessing

In [5]:
cols = ['weekday','hour','region','advertiser','OS','Browser','adexchange','slotformat', 'slotvisibility', 'creative']

In [6]:
def get_unique(data):
    cols = data.columns
    col_unique = {}
    for col in cols:
        unique = data[col].unique()
        col_unique[col]=unique
        print(col,":")
        print(unique)
        print()
    return col_unique

In [7]:
col_unique = get_unique(train_data)

OS :
['windows' 'mac' 'android' 'linux' 'ios' 'other']

Browser :
['ie' 'chrome' 'other' 'safari' 'maxthon' 'firefox' 'opera' 'theworld'
 'sogou']

click :
[0 1]

weekday :
[5 1 3 6 4 2 0]

hour :
[22 20 13 23  6 17 12 16 14  0 18 21  2  1 19 10 11 15  8  5  9  3  4  7]

bidid :
['b7bea80521fdecd95d2d761a38c91c3f09618066'
 '4f51205475678f5a124bc76b2c54163bf8eaa7eb'
 'b604e3fd054a658ab7ced4285ebf2ef54d2bd890' ...,
 '2a9d622f70b8c61c418c97fbab18a4c088c7768a'
 '3f90fdf1527c224d4b684af565668719833283f7'
 '0721d40588ea0fb9f48843e1164d818738245043']

userid :
['2e880fb7d690cf7377b2e42e701728e3f3c0e4c1'
 '3a1fe01360ff8100e7d006b83b77a3e4c01d928c'
 '801d18a056b6fe6b06a794aef17fb0d6daff2414' ...,
 'bd2bbb9b1cc25ec32a82e58eb8bab12dfdbf5b34'
 '502b1ccf0dbc6e228f8ef2427b16d5c7cc9d937c'
 'fa8971421a2797bd0345d7dbb8cca0be01cd40ae']

useragent :
['windows_ie' 'windows_chrome' 'mac_other' 'windows_other' 'android_safari'
 'mac_safari' 'windows_maxthon' 'windows_firefox' 'mac_chrome'
 'linux_firefox' '

advertiser :
[3427 2821 1458 2259 3386 3358 3476 2261 2997]

usertag :
['null' '10052,10006,13866,10110' '13866,10063,10111' ...,
 '10063,10006,10076,10083,10059,10075,10110'
 '10048,13496,14273,10083,13776,10111,13403,10063,10133,10116,10057,10059,10079,10076,10077,10093,10075,10074,10102,10024,10006,10148,16706,10052,10127,11680'
 '10024,13403,10074,13800,10063,10006,10110']



In [8]:
cols_len = {}
for col in cols:
    temp = col_unique[col]
    cols_len[col] = len(temp)

In [9]:
cols_len

{'Browser': 9,
 'OS': 6,
 'adexchange': 5,
 'advertiser': 9,
 'creative': 131,
 'hour': 24,
 'region': 35,
 'slotformat': 4,
 'slotvisibility': 11,
 'weekday': 7}

In [10]:
def onehot_encoding(data,cols):
    for col in cols:
        print("Complete: "+col)
        df_temp = pd.get_dummies(data[col],prefix=col)
        data = data.join(df_temp)
    return data

In [11]:
train = onehot_encoding(train_data,cols)
train.head()

Complete: weekday
Complete: hour
Complete: region
Complete: advertiser
Complete: OS
Complete: Browser
Complete: adexchange
Complete: slotformat
Complete: slotvisibility
Complete: creative


Unnamed: 0,OS,Browser,click,weekday,hour,bidid,userid,useragent,IP,region,...,creative_e1af08818a6cd6bbba118bb54a651961,creative_e1b0b6fb39abeb138c0b1e37c5f6d777,creative_e87d7633d474589c2e2e3ba4eda53f6c,creative_f1691b7571803d5a46adcb1f39f94d44,creative_f49541b4e0999d0c934ee3eea142a60e,creative_f65c8bdb41e9015970bac52baa813239,creative_fa8f0532dd5144b5fa748459e8d90b49,creative_fb5afa9dba1274beaf3dad86baf97e89,creative_fe222c13e927077ad3ea087a92c0935c,creative_ff5123fb9333ca095034c62fdaaf51aa
0,windows,ie,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,...,0,0,0,0,0,0,0,0,0,0
1,windows,chrome,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,...,0,0,0,0,0,0,0,0,0,0
2,windows,ie,0,3,13,b604e3fd054a658ab7ced4285ebf2ef54d2bd890,801d18a056b6fe6b06a794aef17fb0d6daff2414,windows_ie,59.46.106.*,40,...,0,0,0,0,0,0,0,0,0,0
3,windows,ie,0,6,23,0348beeae93e561584c3b50fc9e7746a33048ad7,0d6eaf2259699990e38a1fc5116f112070b9ecdc,windows_ie,114.250.226.*,1,...,0,0,0,0,0,0,0,0,0,0
4,windows,ie,0,5,6,268149c1789bce2bc9798ffd97ec431219bafeb3,a239d9bb642460d974ba67f85e63b8d3e214da0e,windows_ie,183.63.192.*,216,...,0,0,0,0,0,0,0,0,0,0


In [12]:
validation = onehot_encoding(validation_data,cols)
validation.head()

Complete: weekday
Complete: hour
Complete: region
Complete: advertiser
Complete: OS
Complete: Browser
Complete: adexchange
Complete: slotformat
Complete: slotvisibility
Complete: creative


Unnamed: 0,OS,Browser,click,weekday,hour,bidid,userid,useragent,IP,region,...,creative_e1af08818a6cd6bbba118bb54a651961,creative_e1b0b6fb39abeb138c0b1e37c5f6d777,creative_e87d7633d474589c2e2e3ba4eda53f6c,creative_f1691b7571803d5a46adcb1f39f94d44,creative_f49541b4e0999d0c934ee3eea142a60e,creative_f65c8bdb41e9015970bac52baa813239,creative_fa8f0532dd5144b5fa748459e8d90b49,creative_fb5afa9dba1274beaf3dad86baf97e89,creative_fe222c13e927077ad3ea087a92c0935c,creative_ff5123fb9333ca095034c62fdaaf51aa
0,windows,ie,0,4,20,bbcb813b6166538503d8b33a5602d7d72f6019dc,663169f66491c98c69f1f94a5c48fa34aa9fe06f,windows_ie,211.144.203.*,79,...,0,0,0,0,0,0,0,0,0,0
1,windows,chrome,0,1,21,5a07316c49477cb5d9b4d5aa39c27d6c3be7f92d,a23f0cfab6592c137f796e68fa752fceba08bb1b,windows_chrome,58.247.250.*,79,...,0,0,0,0,0,0,0,0,0,0
2,windows,ie,0,4,8,f6ece71dae81d6b16bfb24ad6dd5611472d4c673,5105a013ea54a9706146033e6a138d5234a3803e,windows_ie,117.12.111.*,2,...,0,0,0,0,0,0,0,0,0,0
3,windows,chrome,0,5,15,b4d5c57c9b38ff5a12954fa01e11931b4e6bfbbb,577e9a768ce17e73c61ba7022db927f9ee761830,windows_chrome,61.187.224.*,201,...,0,0,0,0,0,0,0,0,0,0
4,windows,chrome,0,1,18,0899bf144249458ea9c89188473694bf44c7ca15,e4bf79c562745d671b19f2edf7fda89c2e25987f,windows_chrome,117.41.145.*,134,...,0,0,0,0,0,0,0,0,0,0


In [13]:
test = onehot_encoding(test_data,cols)
test.head()

Complete: weekday
Complete: hour
Complete: region
Complete: advertiser
Complete: OS
Complete: Browser
Complete: adexchange
Complete: slotformat
Complete: slotvisibility
Complete: creative


Unnamed: 0,OS,Browser,weekday,hour,bidid,userid,useragent,IP,region,city,...,creative_e1af08818a6cd6bbba118bb54a651961,creative_e1b0b6fb39abeb138c0b1e37c5f6d777,creative_e87d7633d474589c2e2e3ba4eda53f6c,creative_f1691b7571803d5a46adcb1f39f94d44,creative_f49541b4e0999d0c934ee3eea142a60e,creative_f65c8bdb41e9015970bac52baa813239,creative_fa8f0532dd5144b5fa748459e8d90b49,creative_fb5afa9dba1274beaf3dad86baf97e89,creative_fe222c13e927077ad3ea087a92c0935c,creative_ff5123fb9333ca095034c62fdaaf51aa
0,windows,chrome,0,12,366c563de7d90feb9d4dab53e795a93fb3157387,75045dd2f2136c93fe55fe6c446ec1527ed8f0bb,windows_chrome,27.197.36.*,146,159,...,0,0,0,0,0,0,0,0,0,0
1,android,safari,3,14,29167d4caa719788b5a342dbaa25151d53121f80,11279eb1f8f7a88f877db911673522b6ff202aa7,android_safari,124.126.227.*,1,1,...,0,0,0,0,0,0,0,0,0,0
2,windows,ie,5,19,ff8bc3f4d44a3ea60c5f3a3a8fbe7cd98fb2966e,4771a819a3f5b86776d8a9456f4f2506578f78d8,windows_ie,116.116.104.*,27,34,...,0,0,0,0,0,0,0,0,0,0
3,windows,ie,0,21,844c2da00d45315f20b748ec131c26ee99a7cbc7,5360671379f3204afa9cc0edd8fd2e2096cb09eb,windows_ie,115.46.145.*,238,245,...,0,0,0,0,0,0,0,0,0,0
4,windows,ie,2,20,c6017f0ad0c44d7d0c9b62583ea863f28941c0ca,d791b5f6306b9d299f81daa5448ced7e5bfa010f,windows_ie,221.199.203.*,27,35,...,0,0,0,0,0,0,0,0,0,0


In [14]:
not_contains = ['weekday','hour','region','slotvisibility','slotformat','advertiser','OS','Browser','adexchange','bidid','userid','useragent','IP','city','domain','url','urlid','slotid','creative','bidprice','payprice','keypage','slotformat', 'slotvisibility', 'creative','slotprice']
not_contains_test = ['weekday','hour','region','slotvisibility','slotformat','advertiser','OS','Browser','adexchange','bidid','userid','useragent','IP','city','domain','url','urlid','slotid','creative','keypage','slotformat', 'slotvisibility', 'creative','slotprice']

In [15]:
train = train.drop(not_contains,axis=1)
train.head()

Unnamed: 0,click,slotwidth,slotheight,usertag,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,...,creative_e1af08818a6cd6bbba118bb54a651961,creative_e1b0b6fb39abeb138c0b1e37c5f6d777,creative_e87d7633d474589c2e2e3ba4eda53f6c,creative_f1691b7571803d5a46adcb1f39f94d44,creative_f49541b4e0999d0c934ee3eea142a60e,creative_f65c8bdb41e9015970bac52baa813239,creative_fa8f0532dd5144b5fa748459e8d90b49,creative_fb5afa9dba1274beaf3dad86baf97e89,creative_fe222c13e927077ad3ea087a92c0935c,creative_ff5123fb9333ca095034c62fdaaf51aa
0,0,200,200,null,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,300,250,null,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,250,250,10052100061386610110,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,160,600,138661006310111,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,728,90,null,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
validation = validation.drop(not_contains,axis=1)
validation.head()

Unnamed: 0,click,slotwidth,slotheight,usertag,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,...,creative_e1af08818a6cd6bbba118bb54a651961,creative_e1b0b6fb39abeb138c0b1e37c5f6d777,creative_e87d7633d474589c2e2e3ba4eda53f6c,creative_f1691b7571803d5a46adcb1f39f94d44,creative_f49541b4e0999d0c934ee3eea142a60e,creative_f65c8bdb41e9015970bac52baa813239,creative_fa8f0532dd5144b5fa748459e8d90b49,creative_fb5afa9dba1274beaf3dad86baf97e89,creative_fe222c13e927077ad3ea087a92c0935c,creative_ff5123fb9333ca095034c62fdaaf51aa
0,0,160,600,1386610111,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,950,90,1007510057100241005210083100631000610110,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,300,250,1386610024100591006313776100831000610111,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,336,280,1386610057100061006310110,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,200,200,10006100631377610110,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
test = test.drop(not_contains_test,axis=1)
test.head()

Unnamed: 0,slotwidth,slotheight,usertag,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,creative_e1af08818a6cd6bbba118bb54a651961,creative_e1b0b6fb39abeb138c0b1e37c5f6d777,creative_e87d7633d474589c2e2e3ba4eda53f6c,creative_f1691b7571803d5a46adcb1f39f94d44,creative_f49541b4e0999d0c934ee3eea142a60e,creative_f65c8bdb41e9015970bac52baa813239,creative_fa8f0532dd5144b5fa748459e8d90b49,creative_fb5afa9dba1274beaf3dad86baf97e89,creative_fe222c13e927077ad3ea087a92c0935c,creative_ff5123fb9333ca095034c62fdaaf51aa
0,300,250,"10024,10077,10075,10063,10031,10102,10006,1304...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,320,50,,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,336,280,"13866,10006,10024,10059,10048,10063,10067,1008...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,960,90,10075130421000610110137761003110063,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,300,250,10063,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
def getAllUserTag(data):
    taglist = []
    for tags in data["usertag"].values:
        if type(tags)is str:
            t = tags.split(',')
            taglist.extend(t)
    tagset = set(taglist)
    return tagset
def addUserTagOneHot(data,tagset):
    df_empty = pd.DataFrame(columns=tagset)
    for tag in tagset:
        temp = np.zeros(len(data))
        temp = temp.astype(np.int8)
        df_empty[tag] = temp
    return df_empty
def processing_user_tag(data):
    tagset = getAllUserTag(data)
    df_usertag = addUserTagOneHot(data,tagset)
    usertag = data['usertag'].values
    row = 0
    for tag in usertag:
        if tag == 'null':
            df_usertag[tag][row] = 1
        else:
            t = tag.split(',')
            for col in t:
                df_usertag[col][row] = 1
        row += 1
    return df_usertag

In [19]:
df_usertag_train = processing_user_tag(train)
df_usertag_train.head()

Unnamed: 0,10138,10067,10102,13403,11379,10125,11092,10083,13866,10133,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df_usertag_validation = processing_user_tag(validation)
df_usertag_validation.head()

Unnamed: 0,10138,10067,10102,13403,11379,10125,11092,10083,13866,10133,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
df_usertag_test = processing_user_tag(test)
df_usertag_test.head()

Unnamed: 0,10138,10067,10102,13403,11379,10125,11092,10083,13866,10133,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
train = train.join(df_usertag_train)
train.head()

Unnamed: 0,click,slotwidth,slotheight,usertag,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,200,200,null,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,300,250,null,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,250,250,10052100061386610110,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,160,600,138661006310111,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,728,90,null,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [23]:
validation = validation.join(df_usertag_validation)
validation.head()

Unnamed: 0,click,slotwidth,slotheight,usertag,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,160,600,1386610111,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,950,90,1007510057100241005210083100631000610110,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,300,250,1386610024100591006313776100831000610111,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,0,336,280,1386610057100061006310110,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,200,200,10006100631377610110,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
test = test.join(df_usertag_test)
test.head()

Unnamed: 0,slotwidth,slotheight,usertag,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,300,250,"10024,10077,10075,10063,10031,10102,10006,1304...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,320,50,,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,336,280,"13866,10006,10024,10059,10048,10063,10067,1008...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
3,960,90,10075130421000610110137761003110063,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,300,250,10063,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
def save_processed_data(train,validation,test):
    train.to_csv('train_processed_1.csv')
    print('saved train')
    validation.to_csv('validation_processed_1.csv')
    print('saved validation')
    test.to_csv('test_processed_1.csv')
    print('saved test')

In [26]:
def delete_train_columns(train,validation,test):
    train_columns = np.array(train.columns)
    validation_columns = np.array(validation.columns)
    test_columns = np.array(test.columns)
    delete_columns = []
    for col in train_columns:
        if col not in validation_columns and col != 'click':
            delete_columns.append(col)
    for col in train_columns:
        if col not in test_columns and col != 'click':
            delete_columns.append(col)
    train = train.drop(delete_columns, axis=1)
    return train
def delete_validation_columns(train,validation,test):
    train_columns = np.array(train.columns)
    validation_columns = np.array(validation.columns)
    test_columns = np.array(test.columns)
    delete_columns = []
    for col in validation_columns:
        if col not in train_columns and col != 'click':
            delete_columns.append(col)
    for col in validation_columns:
        if col not in test_columns and col != 'click':
            delete_columns.append(col)
    validation = validation.drop(delete_columns, axis=1)
    return validation
def delete_test_columns(train,validation,test):
    train_columns = np.array(train.columns)
    validation_columns = np.array(validation.columns)
    test_columns = np.array(test.columns)
    delete_columns = []
    for col in test_columns:
        if col not in train_columns and col != 'click':
            delete_columns.append(col)
    for col in test_columns:
        if col not in validation_columns and col != 'click':
            delete_columns.append(col)
    test = test.drop(delete_columns, axis=1)
    return test

In [27]:
train = delete_train_columns(train,validation,test)
validation = delete_validation_columns(train,validation,test)
test = delete_test_columns(train,validation,test)

In [28]:
train = train.drop('usertag',axis=1)
test = test.drop('usertag',axis=1)
validation = validation.drop('usertag',axis=1)

In [29]:
train.head()

Unnamed: 0,click,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,200,200,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,300,250,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,250,250,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,160,600,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,728,90,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
validation.head()

Unnamed: 0,click,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,0,160,600,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,950,90,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,300,250,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,336,280,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,200,200,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
test.head()

Unnamed: 0,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,...,10684,11576,11632,16593,11724,11944,10111,14273,10116,10024
0,300,250,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,320,50,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,336,280,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,960,90,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,300,250,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
cols = ['slotwidth','slotheight']

In [33]:
train = onehot_encoding(train,cols)
train.head()

Complete: slotwidth
Complete: slotheight


Unnamed: 0,click,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,0,200,200,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,0,300,250,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,250,250,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,160,600,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,728,90,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
validation = onehot_encoding(validation,cols)
validation.head()

Complete: slotwidth
Complete: slotheight


Unnamed: 0,click,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,0,160,600,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,950,90,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,300,250,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,336,280,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,200,200,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [35]:
test = onehot_encoding(test,cols)
test.head()

Complete: slotwidth
Complete: slotheight


Unnamed: 0,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,300,250,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,320,50,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,336,280,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,960,90,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,300,250,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [36]:
train = delete_train_columns(train,validation,test)
validation = delete_validation_columns(train,validation,test)
test = delete_test_columns(train,validation,test)

In [37]:
train.head()

Unnamed: 0,click,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,0,200,200,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,0,300,250,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,250,250,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,160,600,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,728,90,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
validation.head()

Unnamed: 0,click,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,0,160,600,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,950,90,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,300,250,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,336,280,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,200,200,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [39]:
test.head()

Unnamed: 0,slotwidth,slotheight,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,300,250,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,320,50,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,336,280,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,960,90,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,300,250,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [40]:
#save_processed_data(train,validation,test)

In [41]:
train = train.drop(cols,axis=1)
train.head()

Unnamed: 0,click,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
validation = validation.drop(cols,axis=1)
validation.head()

Unnamed: 0,click,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [43]:
test = test.drop(cols,axis=1)
test.head()

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,hour_0,hour_1,hour_2,...,slotheight_100,slotheight_125,slotheight_150,slotheight_200,slotheight_230,slotheight_240,slotheight_250,slotheight_280,slotheight_300,slotheight_600
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [44]:
train_x = train.drop('click',axis=1).values
train_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0]], dtype=int16)

In [45]:
train_y = train['click'].values
train_y

array([0, 0, 0, ..., 0, 0, 0])

In [46]:
validation_x = validation.drop('click',axis=1).values
validation_x

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [47]:
validation_y = validation['click'].values
validation_y

array([0, 0, 0, ..., 0, 0, 0])

In [48]:
test_x = test.values
test_x

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ..., 
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [49]:
print(train_x.shape)
print(validation_x.shape)
print(test_x.shape)

(2430981, 342)
(303925, 342)
(303375, 342)


# Model testing Method

In [176]:
#Bidding strategy testing for validation dataset
def test_model(data):
    data = data[data['bidprice_pre'] > data['payprice']]
    click = data['click'].values
    payprice = data['payprice'].values
    cnt_click = 0
    cost = 0
    imps = 0
    for i in range(len(data)):
        cnt_click += click[i]
        cost += payprice[i]/1000
        imps += 1
        if cost > 6250:
            break
    ctr = cnt_click / imps
    cpc = cost / cnt_click
    result = {'cost:':cost,'ctr:':ctr,'cpc:':cpc,'click:':cnt_click,'imps:':imps}
    return result
def test_model_pre(bid_price):
    validation_data['bidprice_pre'] = bid_price
    result = test_model(validation_data)
    print(result)
    return result

In [56]:
def insert_bidprice_to_dataset_and_save(bid_price,data,filename):
    df_bidprice = pd.DataFrame(columns=['bidprice'], data = bid_price)
    df_bidid = data['bidid']
    #data.insert(0,'bidprice_pre',df_bidprice)
    result = df_bidprice
    result.insert(0,'bidid',df_bidid)
    result.to_csv(filename, index=False)

In [57]:
def get_base_price(data):
    clicks_arr = []
    clicks_price = {}
    clicks = data['click'].values
    payprice = data['payprice'].values
    bidprice = data['bidprice_pre']
    n = int(max(bidprice))
    for i in range(n):
        print(i)
        count = 0
        cost = 0
        for j in range(len(payprice)):
            if payprice[j] < i:
                count += clicks[j]
                cost += payprice[j]/1000
            if cost > 6250:
                break
        if count not in clicks_arr:
            clicks_arr.append(count)
            clicks_price[count] = i
    clicks_arr.sort()
    return clicks_price[clicks_arr[len(clicks_arr)-1]]

In [58]:
def get_base_bid_4(pCTR,data):
    payprice = data['payprice'].values
    click = data['click'].values
    click_arr = []
    click_price = {}
    for base_bid in range(300):
        bidprice = base_bid * pCTR / avgCTR
        cost = 0
        cnt_click = 0
        count = 0
        cost = 0
        for i in range(len(payprice)):
            if payprice[i] < bidprice[i]:
                count += click[i]
                cost += payprice[i]/1000
            if cost > 6250:
                break
        if count not in click_arr:
            click_arr.append(count)
            click_price[count] = base_bid
    click_arr.sort()
    return click_price[click_arr[len(click_arr)-1]]

# Method 1: Logistic with linear bidding strategy

base_bid uses the average of payprice in train dataset

In [62]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(C=1000.0, random_state=0)
logistic_model.fit(train_x, train_y)
print("Training End")

Training End


In [63]:
avgCTR = np.mean(train_data['click'].values)
avgCTR

0.00073756232566194466

In [64]:
base_bid = np.mean(train_data['payprice'].values)
base_bid

78.151416238958674

In [65]:
pCTR_test = logistic_model.predict_proba(test_x)[:,1]
pCTR_test

array([  7.40712094e-04,   3.98157787e-03,   3.95997953e-04, ...,
         3.30050233e-05,   1.32721439e-03,   1.05161213e-05])

In [66]:
pCTR_validation = logistic_model.predict_proba(validation_x)[:,1]
pCTR_validation

array([  5.48131266e-04,   2.64834490e-04,   1.89202921e-04, ...,
         4.42955400e-04,   4.78770470e-05,   2.46233049e-04])

In [67]:
bidprice_test = base_bid * pCTR_test / avgCTR
bidprice_test

array([  78.48516275,  421.88427837,   41.9595738 , ...,    3.49718149,
        140.63039941,    1.1142784 ])

In [68]:
bidprice_validation = base_bid * pCTR_validation / avgCTR
bidprice_validation

array([ 58.07947782,  28.06161562,  20.04776505, ...,  46.93514112,
         5.07300725,  26.09062425])

In [69]:
np.mean(bidprice_test)

77.828657845293748

In [70]:
test_model_pre(bidprice_validation)

{'ctr:': 0.0012478385653421752, 'cpc:': 37.34398571430021, 'click:': 140, 'imps:': 112194}


In [71]:
insert_bidprice_to_dataset_and_save(bidprice_test,test_data,'method_1.csv')

# Method 2: Logistic Regression with Linear bidding strategy

base_bid uses get_base_price() method

In [72]:
validation_data['bidprice_pre'] = bidprice_validation

In [73]:
base_bid = get_base_bid_4(pCTR_validation,validation_data)
base_bid

90

In [74]:
bidprice_validation = base_bid * pCTR_validation / avgCTR
bidprice_validation

array([ 66.88494279,  32.31605423,  23.08721891, ...,  54.05100642,
         5.84212897,  30.04623966])

In [75]:
test_model_pre(bidprice_validation)

{'ctr:': 0.0011518617765868095, 'cpc:': 42.355770833351919, 'click:': 144, 'imps:': 125015}


In [76]:
insert_bidprice_to_dataset_and_save(bidprice_test,test_data,'method_2.csv')

# Method 3: SVM model with Linear bidding strategy

base_bid uses get_base_price() method

In [None]:
#from sklearn.svm import SVC
#svm = SVC()
#svm.fit(train_x,train_y)
#print('Training End')

# Method 4: XGBoost model with Linear bidding strategy

XGBoost with default parameters

In [None]:
from xgboost.sklearn import XGBClassifier 
xclas = XGBClassifier()
xclas.fit(train_x, train_y, eval_metric='rmse', verbose = True, eval_set = [(validation_x, validation_y)],early_stopping_rounds=100) 
print('Training End')

In [None]:
pCTR_validation = xclas.predict_proba(validation_x)[:,1]
pCTR_validation

In [None]:
base_bid_validation = get_base_bid_4(pCTR_validation,validation_data)
base_bid_validation

In [None]:
bidprice_validation = base_bid_validation * pCTR_validation / avgCTR
bidprice_validation

In [None]:
test_model_pre(bidprice_validation)

In [None]:
bidprice_validation = 78 * pCTR_validation / avgCTR
bidprice_validation

In [None]:
test_model_pre(bidprice_validation)

In [None]:
pCTR_test = xclas.predict_proba(test_x)[:,1]
pCTR_test

In [None]:
base_bid_validation

In [None]:
bidprice_test = base_bid_validation * pCTR_test / avgCTR
bidprice_test

In [None]:
np.mean(bidprice_test)

In [None]:
insert_bidprice_to_dataset_and_save(bidprice_test,test_data,'method_4.csv')

# Method 5: XGBoost model with Linear bidding strategy

XGBoost with special parameters

In [None]:
#validation 151 159
from xgboost.sklearn import XGBClassifier 
xclas = XGBClassifier(
    ooster='gbtree',
    learning_rate = 0.5,
    n_estimators=50,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.85,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1, 
    seed=27,
    n_jobs=4)
xclas.fit(train_x, train_y, eval_metric='rmse', verbose = True, eval_set = [(validation_x, validation_y)],early_stopping_rounds=100) 
print('Training End')

In [1]:
#171 test 169 validation
from xgboost.sklearn import XGBClassifier 
xclas = XGBClassifier(
    ooster='gbtree',
    learning_rate = 0.5,
    n_estimators=50,
    max_depth=10,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.85,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1, 
    seed=27,
    n_jobs=4)
xclas.fit(train_x, train_y, eval_metric='rmse', verbose = True, eval_set = [(validation_x, validation_y)],early_stopping_rounds=100) 
print('Training End')

NameError: name 'train_x' is not defined

In [93]:
len(pCTR_validation)

303925

In [53]:
pCTR_validation = xclas.predict_proba(validation_x)[:,1]
pCTR_validation

array([  4.71709018e-06,   4.71709018e-06,   4.71709018e-06, ...,
         4.71709018e-06,   4.71709018e-06,   4.71709018e-06], dtype=float32)

In [61]:
base_bid_validation = get_base_bid_4(pCTR_validation,validation_data)
base_bid_validation

0

In [None]:
bidprice_validation = base_bid_validation * pCTR_validation / avgCTR
bidprice_validation

In [None]:
np.mean(bidprice_validation)

In [None]:
test_model_pre(bidprice_validation)

In [None]:
pCTR_test = xclas.predict_proba(test_x)[:,1]
pCTR_test

In [None]:
bidprice_test = base_bid_validation * pCTR_test / avgCTR
bidprice_test

In [None]:
insert_bidprice_to_dataset_and_save(bidprice_test,test_data,'method_5.csv')

# Method 6: XGBoost model with Linear bidding strategy¶

Use 40000 samples as training data to train best XGBoost model with best parameters

In [159]:
from sklearn.utils import resample
def get_sample(data):
    majority = data[data.click == 0]
    minority = data[data.click == 1]
    majority = majority.sample(100000-len(minority))
    data = pd.concat([minority, majority],axis=0)
    return data

In [160]:
pCTR_validation = np.zeros(len(validation))
pCTR_test = np.zeros(len(test))
for i in range(10):
    train_temp = get_sample(train)
    train_x = train_temp.drop('click',axis=1).values
    train_y = train_temp['click'].values
    xclas = XGBClassifier(
    ooster='gbtree',
    learning_rate = 0.5,
    n_estimators=50,
    max_depth=10,
    min_child_weight=9,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.85,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
    n_jobs=4)
    xclas.fit(train_x, train_y, eval_metric='rmse', verbose = True, eval_set = [(validation_x, validation_y)],early_stopping_rounds=100)
    pctr_v = xclas.predict_proba(validation_x)[:,1]
    pctr_t = xclas.predict_proba(test_x)[:,1]
    pCTR_validation = pCTR_validation + pctr_v
    pCTR_test = pCTR_test + pctr_t
    xclas = 0
pCTR_validation = pCTR_validation / 10
pCTR_test = pCTR_test / 10

[0]	validation_0-rmse:0.275305
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166168
[2]	validation_0-rmse:0.106064
[3]	validation_0-rmse:0.072364
[4]	validation_0-rmse:0.054951
[5]	validation_0-rmse:0.046411
[6]	validation_0-rmse:0.041976
[7]	validation_0-rmse:0.0391
[8]	validation_0-rmse:0.038613
[9]	validation_0-rmse:0.038444
[10]	validation_0-rmse:0.038149
[11]	validation_0-rmse:0.03819
[12]	validation_0-rmse:0.038414
[13]	validation_0-rmse:0.038452
[14]	validation_0-rmse:0.038779
[15]	validation_0-rmse:0.038517
[16]	validation_0-rmse:0.038807
[17]	validation_0-rmse:0.038769
[18]	validation_0-rmse:0.038823
[19]	validation_0-rmse:0.03879
[20]	validation_0-rmse:0.03904
[21]	validation_0-rmse:0.039052
[22]	validation_0-rmse:0.039109
[23]	validation_0-rmse:0.039171
[24]	validation_0-rmse:0.039433
[25]	validation_0-rmse:0.039817
[26]	validation_0-rmse:0.039679
[27]	validation_0-rmse:0.039937
[28]	validation_0-rmse:0.039829
[29]	validation_0-rms

[49]	validation_0-rmse:0.040344
[0]	validation_0-rmse:0.275259
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166046
[2]	validation_0-rmse:0.105945
[3]	validation_0-rmse:0.072403
[4]	validation_0-rmse:0.05499
[5]	validation_0-rmse:0.046369
[6]	validation_0-rmse:0.042107
[7]	validation_0-rmse:0.039171
[8]	validation_0-rmse:0.038508
[9]	validation_0-rmse:0.038428
[10]	validation_0-rmse:0.037664
[11]	validation_0-rmse:0.037279
[12]	validation_0-rmse:0.037585
[13]	validation_0-rmse:0.038054
[14]	validation_0-rmse:0.0385
[15]	validation_0-rmse:0.038305
[16]	validation_0-rmse:0.038369
[17]	validation_0-rmse:0.038476
[18]	validation_0-rmse:0.03885
[19]	validation_0-rmse:0.038763
[20]	validation_0-rmse:0.038965
[21]	validation_0-rmse:0.038834
[22]	validation_0-rmse:0.038747
[23]	validation_0-rmse:0.038962
[24]	validation_0-rmse:0.039047
[25]	validation_0-rmse:0.039517
[26]	validation_0-rmse:0.039622
[27]	validation_0-rmse:0.039807
[28]	validation_0-rm

[48]	validation_0-rmse:0.040887
[49]	validation_0-rmse:0.040792


In [163]:
pCTR_validation

array([ 0.00095145,  0.0017712 ,  0.00087431, ...,  0.02750509,
        0.00433335,  0.00420803])

In [164]:
pCTR_test

array([ 0.0323883 ,  0.0347326 ,  0.00103412, ...,  0.00265007,
        0.00833148,  0.00110702])

In [166]:
base_bid_validation = get_base_bid_4(pCTR_validation,validation_data)
base_bid_validation

6

In [167]:
bidprice_validation = base_bid_validation * pCTR_validation / avgCTR
bidprice_validation

array([   7.73999547,   14.40851049,    7.11241469, ...,  223.75134566,
         35.25136306,   34.23194971])

In [168]:
test_model_pre(bidprice_validation)

{'ctr:': 0.0014144151052716786, 'cpc:': 37.136548192784005, 'click:': 166, 'imps:': 117363}


In [172]:
bidprice_test = base_bid_validation * pCTR_test / avgCTR
bidprice_test

array([ 263.47578009,  282.5464676 ,    8.41247683, ...,   21.5580295 ,
         67.77577103,    9.00550308])

In [173]:
np.mean(bidprice_test)

97.196700476931383

In [174]:
insert_bidprice_to_dataset_and_save(bidprice_test,test_data,'method_6.csv')

In [133]:
len(get_sample(train))

100000

# Method 6: Loop 100 times and get best pCTR

In [186]:
def get_pctr(train,validation,test,validation_x,test_x,validation_data):
    validation_pCTR_result = 0
    test_pCTR_result = 0
    base_bid_result = 0
    click_cnt = 0
    for j in range(50):
        pCTR_validation = np.zeros(len(validation))
        pCTR_test = np.zeros(len(test))
        for i in range(10):
            train_temp = get_sample(train)
            train_x = train_temp.drop('click',axis=1).values
            train_y = train_temp['click'].values
            xclas = XGBClassifier(
            ooster='gbtree',
            learning_rate = 0.5,
            n_estimators=50,
            max_depth=10,
            min_child_weight=9,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.85,
            objective= 'binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=27,
            n_jobs=4)
            xclas.fit(train_x, train_y, eval_metric='rmse', verbose = True, eval_set = [(validation_x, validation_y)],early_stopping_rounds=100)
            pctr_v = xclas.predict_proba(validation_x)[:,1]
            pctr_t = xclas.predict_proba(test_x)[:,1]
            pCTR_validation = pCTR_validation + pctr_v
            pCTR_test = pCTR_test + pctr_t
            xclas = 0
        pCTR_validation = pCTR_validation / 10
        pCTR_test = pCTR_test / 10
        base_bid_validation = get_base_bid_4(pCTR_validation,validation_data)
        bidprice_validation = base_bid_validation * pCTR_validation / avgCTR
        test_result = test_model_pre(bidprice_validation)
        temp_click = test_result['click:']
        if temp_click > click_cnt:
            click_cnt = temp_click
            validation_pCTR_result = pCTR_validation
            test_pCTR_result = pCTR_test
            base_bid_result = base_bid_validation
    return validation_pCTR_result,test_pCTR_result,base_bid_result
        

In [None]:
pCTR_validation,pCTR_test,base_bid = get_pctr(train,validation,test,validation_x,test_x,validation_data)

[0]	validation_0-rmse:0.275259
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166145
[2]	validation_0-rmse:0.106094
[3]	validation_0-rmse:0.072378
[4]	validation_0-rmse:0.055051
[5]	validation_0-rmse:0.046511
[6]	validation_0-rmse:0.042407
[7]	validation_0-rmse:0.039863
[8]	validation_0-rmse:0.039472
[9]	validation_0-rmse:0.039006
[10]	validation_0-rmse:0.03881
[11]	validation_0-rmse:0.038366
[12]	validation_0-rmse:0.038459
[13]	validation_0-rmse:0.038367
[14]	validation_0-rmse:0.038869
[15]	validation_0-rmse:0.038663
[16]	validation_0-rmse:0.038768
[17]	validation_0-rmse:0.038863
[18]	validation_0-rmse:0.038869
[19]	validation_0-rmse:0.039142
[20]	validation_0-rmse:0.039326
[21]	validation_0-rmse:0.039237
[22]	validation_0-rmse:0.039188
[23]	validation_0-rmse:0.039387
[24]	validation_0-rmse:0.039588
[25]	validation_0-rmse:0.039958
[26]	validation_0-rmse:0.040097
[27]	validation_0-rmse:0.040214
[28]	validation_0-rmse:0.04028
[29]	validation_0-

[48]	validation_0-rmse:0.040761
[49]	validation_0-rmse:0.040704
[0]	validation_0-rmse:0.275335
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166347
[2]	validation_0-rmse:0.106331
[3]	validation_0-rmse:0.072606
[4]	validation_0-rmse:0.055584
[5]	validation_0-rmse:0.046828
[6]	validation_0-rmse:0.04273
[7]	validation_0-rmse:0.040236
[8]	validation_0-rmse:0.039732
[9]	validation_0-rmse:0.039132
[10]	validation_0-rmse:0.038818
[11]	validation_0-rmse:0.038376
[12]	validation_0-rmse:0.038675
[13]	validation_0-rmse:0.038652
[14]	validation_0-rmse:0.039164
[15]	validation_0-rmse:0.038974
[16]	validation_0-rmse:0.039133
[17]	validation_0-rmse:0.039023
[18]	validation_0-rmse:0.039201
[19]	validation_0-rmse:0.039266
[20]	validation_0-rmse:0.039437
[21]	validation_0-rmse:0.039579
[22]	validation_0-rmse:0.039651
[23]	validation_0-rmse:0.039591
[24]	validation_0-rmse:0.039852
[25]	validation_0-rmse:0.039939
[26]	validation_0-rmse:0.040004
[27]	validation_0

[46]	validation_0-rmse:0.040867
[47]	validation_0-rmse:0.040882
[48]	validation_0-rmse:0.040897
[49]	validation_0-rmse:0.041043
{'cost:': 6155.1020000021044, 'ctr:': 0.0014201021446964318, 'cpc:': 37.07892771085605, 'click:': 166, 'imps:': 116893}
[0]	validation_0-rmse:0.275263
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166128
[2]	validation_0-rmse:0.106022
[3]	validation_0-rmse:0.072396
[4]	validation_0-rmse:0.05518
[5]	validation_0-rmse:0.046408
[6]	validation_0-rmse:0.042076
[7]	validation_0-rmse:0.039346
[8]	validation_0-rmse:0.038704
[9]	validation_0-rmse:0.038193
[10]	validation_0-rmse:0.03779
[11]	validation_0-rmse:0.037486
[12]	validation_0-rmse:0.037788
[13]	validation_0-rmse:0.037541
[14]	validation_0-rmse:0.038117
[15]	validation_0-rmse:0.037874
[16]	validation_0-rmse:0.038071
[17]	validation_0-rmse:0.038067
[18]	validation_0-rmse:0.03837
[19]	validation_0-rmse:0.038382
[20]	validation_0-rmse:0.038541
[21]	validation_0-rmse:0.03

[41]	validation_0-rmse:0.040755
[42]	validation_0-rmse:0.041146
[43]	validation_0-rmse:0.041048
[44]	validation_0-rmse:0.04085
[45]	validation_0-rmse:0.041063
[46]	validation_0-rmse:0.041182
[47]	validation_0-rmse:0.041164
[48]	validation_0-rmse:0.041005
[49]	validation_0-rmse:0.041257
[0]	validation_0-rmse:0.275331
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.16621
[2]	validation_0-rmse:0.106089
[3]	validation_0-rmse:0.072446
[4]	validation_0-rmse:0.055055
[5]	validation_0-rmse:0.046195
[6]	validation_0-rmse:0.042074
[7]	validation_0-rmse:0.039557
[8]	validation_0-rmse:0.039169
[9]	validation_0-rmse:0.038783
[10]	validation_0-rmse:0.038518
[11]	validation_0-rmse:0.037973
[12]	validation_0-rmse:0.037934
[13]	validation_0-rmse:0.038277
[14]	validation_0-rmse:0.038844
[15]	validation_0-rmse:0.038601
[16]	validation_0-rmse:0.038582
[17]	validation_0-rmse:0.038426
[18]	validation_0-rmse:0.038728
[19]	validation_0-rmse:0.038407
[20]	validation_0-

[39]	validation_0-rmse:0.041375
[40]	validation_0-rmse:0.041431
[41]	validation_0-rmse:0.041471
[42]	validation_0-rmse:0.041904
[43]	validation_0-rmse:0.041907
[44]	validation_0-rmse:0.041947
[45]	validation_0-rmse:0.042156
[46]	validation_0-rmse:0.042115
[47]	validation_0-rmse:0.041934
[48]	validation_0-rmse:0.042093
[49]	validation_0-rmse:0.042089
{'cost:': 6140.8970000021291, 'ctr:': 0.0014010866965109524, 'cpc:': 37.44449390245201, 'click:': 164, 'imps:': 117052}
[0]	validation_0-rmse:0.275321
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166226
[2]	validation_0-rmse:0.106207
[3]	validation_0-rmse:0.072713
[4]	validation_0-rmse:0.05546
[5]	validation_0-rmse:0.0468
[6]	validation_0-rmse:0.04272
[7]	validation_0-rmse:0.039948
[8]	validation_0-rmse:0.039448
[9]	validation_0-rmse:0.039068
[10]	validation_0-rmse:0.038549
[11]	validation_0-rmse:0.038466
[12]	validation_0-rmse:0.03848
[13]	validation_0-rmse:0.038614
[14]	validation_0-rmse:0.0387

[34]	validation_0-rmse:0.03979
[35]	validation_0-rmse:0.040106
[36]	validation_0-rmse:0.040247
[37]	validation_0-rmse:0.040268
[38]	validation_0-rmse:0.04035
[39]	validation_0-rmse:0.040248
[40]	validation_0-rmse:0.04015
[41]	validation_0-rmse:0.040367
[42]	validation_0-rmse:0.040735
[43]	validation_0-rmse:0.040692
[44]	validation_0-rmse:0.040799
[45]	validation_0-rmse:0.04113
[46]	validation_0-rmse:0.041155
[47]	validation_0-rmse:0.040967
[48]	validation_0-rmse:0.040786
[49]	validation_0-rmse:0.040795
[0]	validation_0-rmse:0.275234
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.16603
[2]	validation_0-rmse:0.105898
[3]	validation_0-rmse:0.072255
[4]	validation_0-rmse:0.054762
[5]	validation_0-rmse:0.045865
[6]	validation_0-rmse:0.041694
[7]	validation_0-rmse:0.039082
[8]	validation_0-rmse:0.038538
[9]	validation_0-rmse:0.037885
[10]	validation_0-rmse:0.037147
[11]	validation_0-rmse:0.037481
[12]	validation_0-rmse:0.037386
[13]	validation_0-rms

[33]	validation_0-rmse:0.040587
[34]	validation_0-rmse:0.040483
[35]	validation_0-rmse:0.040586
[36]	validation_0-rmse:0.040653
[37]	validation_0-rmse:0.040776
[38]	validation_0-rmse:0.040675
[39]	validation_0-rmse:0.040525
[40]	validation_0-rmse:0.040457
[41]	validation_0-rmse:0.040738
[42]	validation_0-rmse:0.041122
[43]	validation_0-rmse:0.04092
[44]	validation_0-rmse:0.040807
[45]	validation_0-rmse:0.04123
[46]	validation_0-rmse:0.041237
[47]	validation_0-rmse:0.041236
[48]	validation_0-rmse:0.041233
[49]	validation_0-rmse:0.041309
{'cost:': 6147.5660000021317, 'ctr:': 0.0014103166802000085, 'cpc:': 37.257975757588675, 'click:': 165, 'imps:': 116995}
[0]	validation_0-rmse:0.275258
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166244
[2]	validation_0-rmse:0.106159
[3]	validation_0-rmse:0.072391
[4]	validation_0-rmse:0.054949
[5]	validation_0-rmse:0.046056
[6]	validation_0-rmse:0.041947
[7]	validation_0-rmse:0.039092
[8]	validation_0-rmse:0

[28]	validation_0-rmse:0.039984
[29]	validation_0-rmse:0.039749
[30]	validation_0-rmse:0.039625
[31]	validation_0-rmse:0.039924
[32]	validation_0-rmse:0.040267
[33]	validation_0-rmse:0.04013
[34]	validation_0-rmse:0.040074
[35]	validation_0-rmse:0.040224
[36]	validation_0-rmse:0.040096
[37]	validation_0-rmse:0.040386
[38]	validation_0-rmse:0.040313
[39]	validation_0-rmse:0.040319
[40]	validation_0-rmse:0.040252
[41]	validation_0-rmse:0.040469
[42]	validation_0-rmse:0.040767
[43]	validation_0-rmse:0.041082
[44]	validation_0-rmse:0.040962
[45]	validation_0-rmse:0.041329
[46]	validation_0-rmse:0.041425
[47]	validation_0-rmse:0.041292
[48]	validation_0-rmse:0.041296
[49]	validation_0-rmse:0.041149
[0]	validation_0-rmse:0.275345
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166282
[2]	validation_0-rmse:0.106293
[3]	validation_0-rmse:0.072947
[4]	validation_0-rmse:0.055608
[5]	validation_0-rmse:0.046902
[6]	validation_0-rmse:0.042767
[7]	validation

[26]	validation_0-rmse:0.04008
[27]	validation_0-rmse:0.040201
[28]	validation_0-rmse:0.040421
[29]	validation_0-rmse:0.039853
[30]	validation_0-rmse:0.039684
[31]	validation_0-rmse:0.039893
[32]	validation_0-rmse:0.040197
[33]	validation_0-rmse:0.040367
[34]	validation_0-rmse:0.040521
[35]	validation_0-rmse:0.040597
[36]	validation_0-rmse:0.040762
[37]	validation_0-rmse:0.040777
[38]	validation_0-rmse:0.04061
[39]	validation_0-rmse:0.040681
[40]	validation_0-rmse:0.040536
[41]	validation_0-rmse:0.040479
[42]	validation_0-rmse:0.041022
[43]	validation_0-rmse:0.04098
[44]	validation_0-rmse:0.040839
[45]	validation_0-rmse:0.040907
[46]	validation_0-rmse:0.040775
[47]	validation_0-rmse:0.040682
[48]	validation_0-rmse:0.040727
[49]	validation_0-rmse:0.040702
{'cost:': 6154.662000002103, 'ctr:': 0.0014007755513418404, 'cpc:': 37.528426829281116, 'click:': 164, 'imps:': 117078}
[0]	validation_0-rmse:0.275292
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-r

[21]	validation_0-rmse:0.039798
[22]	validation_0-rmse:0.039797
[23]	validation_0-rmse:0.039673
[24]	validation_0-rmse:0.039947
[25]	validation_0-rmse:0.040191
[26]	validation_0-rmse:0.040218
[27]	validation_0-rmse:0.04056
[28]	validation_0-rmse:0.040546
[29]	validation_0-rmse:0.040394
[30]	validation_0-rmse:0.040195
[31]	validation_0-rmse:0.04057
[32]	validation_0-rmse:0.040776
[33]	validation_0-rmse:0.040825
[34]	validation_0-rmse:0.040693
[35]	validation_0-rmse:0.040862
[36]	validation_0-rmse:0.04096
[37]	validation_0-rmse:0.041011
[38]	validation_0-rmse:0.040687
[39]	validation_0-rmse:0.040611
[40]	validation_0-rmse:0.040545
[41]	validation_0-rmse:0.04063
[42]	validation_0-rmse:0.041095
[43]	validation_0-rmse:0.04108
[44]	validation_0-rmse:0.041437
[45]	validation_0-rmse:0.041761
[46]	validation_0-rmse:0.041586
[47]	validation_0-rmse:0.041843
[48]	validation_0-rmse:0.041891
[49]	validation_0-rmse:0.041901
[0]	validation_0-rmse:0.275229
Will train until validation_0-rmse hasn't impr

[20]	validation_0-rmse:0.038868
[21]	validation_0-rmse:0.038849
[22]	validation_0-rmse:0.039081
[23]	validation_0-rmse:0.03921
[24]	validation_0-rmse:0.039284
[25]	validation_0-rmse:0.039571
[26]	validation_0-rmse:0.03971
[27]	validation_0-rmse:0.040017
[28]	validation_0-rmse:0.039887
[29]	validation_0-rmse:0.039817
[30]	validation_0-rmse:0.03971
[31]	validation_0-rmse:0.040415
[32]	validation_0-rmse:0.040442
[33]	validation_0-rmse:0.040414
[34]	validation_0-rmse:0.0405
[35]	validation_0-rmse:0.040604
[36]	validation_0-rmse:0.040608
[37]	validation_0-rmse:0.040792
[38]	validation_0-rmse:0.040527
[39]	validation_0-rmse:0.040696
[40]	validation_0-rmse:0.040637
[41]	validation_0-rmse:0.040806
[42]	validation_0-rmse:0.041088
[43]	validation_0-rmse:0.041183
[44]	validation_0-rmse:0.040864
[45]	validation_0-rmse:0.041173
[46]	validation_0-rmse:0.041193
[47]	validation_0-rmse:0.041065
[48]	validation_0-rmse:0.041209
[49]	validation_0-rmse:0.041214
{'cost:': 6142.1540000021314, 'ctr:': 0.00140

[15]	validation_0-rmse:0.038927
[16]	validation_0-rmse:0.039196
[17]	validation_0-rmse:0.039019
[18]	validation_0-rmse:0.039181
[19]	validation_0-rmse:0.039314
[20]	validation_0-rmse:0.039685
[21]	validation_0-rmse:0.039879
[22]	validation_0-rmse:0.039984
[23]	validation_0-rmse:0.040101
[24]	validation_0-rmse:0.040173
[25]	validation_0-rmse:0.04039
[26]	validation_0-rmse:0.040489
[27]	validation_0-rmse:0.040575
[28]	validation_0-rmse:0.040723
[29]	validation_0-rmse:0.040318
[30]	validation_0-rmse:0.040003
[31]	validation_0-rmse:0.040545
[32]	validation_0-rmse:0.040766
[33]	validation_0-rmse:0.040879
[34]	validation_0-rmse:0.040875
[35]	validation_0-rmse:0.040844
[36]	validation_0-rmse:0.041007
[37]	validation_0-rmse:0.041249
[38]	validation_0-rmse:0.040876
[39]	validation_0-rmse:0.040806
[40]	validation_0-rmse:0.040764
[41]	validation_0-rmse:0.040735
[42]	validation_0-rmse:0.041202
[43]	validation_0-rmse:0.041209
[44]	validation_0-rmse:0.041229
[45]	validation_0-rmse:0.041636
[46]	vali

[14]	validation_0-rmse:0.038117
[15]	validation_0-rmse:0.038239
[16]	validation_0-rmse:0.038274
[17]	validation_0-rmse:0.038077
[18]	validation_0-rmse:0.038577
[19]	validation_0-rmse:0.038531
[20]	validation_0-rmse:0.03873
[21]	validation_0-rmse:0.038619
[22]	validation_0-rmse:0.038654
[23]	validation_0-rmse:0.038772
[24]	validation_0-rmse:0.039126
[25]	validation_0-rmse:0.039634
[26]	validation_0-rmse:0.039489
[27]	validation_0-rmse:0.039545
[28]	validation_0-rmse:0.039637
[29]	validation_0-rmse:0.03949
[30]	validation_0-rmse:0.039243
[31]	validation_0-rmse:0.039653
[32]	validation_0-rmse:0.039861
[33]	validation_0-rmse:0.04001
[34]	validation_0-rmse:0.039936
[35]	validation_0-rmse:0.039878
[36]	validation_0-rmse:0.040296
[37]	validation_0-rmse:0.040154
[38]	validation_0-rmse:0.039896
[39]	validation_0-rmse:0.039959
[40]	validation_0-rmse:0.039705
[41]	validation_0-rmse:0.040055
[42]	validation_0-rmse:0.040322
[43]	validation_0-rmse:0.040344
[44]	validation_0-rmse:0.040307
[45]	valida

[9]	validation_0-rmse:0.039359
[10]	validation_0-rmse:0.038563
[11]	validation_0-rmse:0.038238
[12]	validation_0-rmse:0.038507
[13]	validation_0-rmse:0.038466
[14]	validation_0-rmse:0.038788
[15]	validation_0-rmse:0.038686
[16]	validation_0-rmse:0.038899
[17]	validation_0-rmse:0.038808
[18]	validation_0-rmse:0.038882
[19]	validation_0-rmse:0.038677
[20]	validation_0-rmse:0.039207
[21]	validation_0-rmse:0.039218
[22]	validation_0-rmse:0.039282
[23]	validation_0-rmse:0.039296
[24]	validation_0-rmse:0.039361
[25]	validation_0-rmse:0.039594
[26]	validation_0-rmse:0.03958
[27]	validation_0-rmse:0.039829
[28]	validation_0-rmse:0.039941
[29]	validation_0-rmse:0.039867
[30]	validation_0-rmse:0.039719
[31]	validation_0-rmse:0.039963
[32]	validation_0-rmse:0.040148
[33]	validation_0-rmse:0.040064
[34]	validation_0-rmse:0.040166
[35]	validation_0-rmse:0.040166
[36]	validation_0-rmse:0.040318
[37]	validation_0-rmse:0.040223
[38]	validation_0-rmse:0.039979
[39]	validation_0-rmse:0.039972
[40]	valid

[8]	validation_0-rmse:0.038917
[9]	validation_0-rmse:0.038843
[10]	validation_0-rmse:0.038557
[11]	validation_0-rmse:0.038186
[12]	validation_0-rmse:0.037954
[13]	validation_0-rmse:0.038221
[14]	validation_0-rmse:0.038637
[15]	validation_0-rmse:0.038625
[16]	validation_0-rmse:0.038653
[17]	validation_0-rmse:0.038651
[18]	validation_0-rmse:0.038733
[19]	validation_0-rmse:0.039117
[20]	validation_0-rmse:0.039348
[21]	validation_0-rmse:0.039299
[22]	validation_0-rmse:0.039298
[23]	validation_0-rmse:0.039257
[24]	validation_0-rmse:0.039412
[25]	validation_0-rmse:0.039605
[26]	validation_0-rmse:0.039528
[27]	validation_0-rmse:0.039763
[28]	validation_0-rmse:0.039556
[29]	validation_0-rmse:0.039249
[30]	validation_0-rmse:0.039216
[31]	validation_0-rmse:0.039422
[32]	validation_0-rmse:0.03958
[33]	validation_0-rmse:0.039677
[34]	validation_0-rmse:0.039723
[35]	validation_0-rmse:0.039996
[36]	validation_0-rmse:0.039886
[37]	validation_0-rmse:0.039949
[38]	validation_0-rmse:0.039742
[39]	valida

[3]	validation_0-rmse:0.073126
[4]	validation_0-rmse:0.056269
[5]	validation_0-rmse:0.047704
[6]	validation_0-rmse:0.043665
[7]	validation_0-rmse:0.040746
[8]	validation_0-rmse:0.040336
[9]	validation_0-rmse:0.040025
[10]	validation_0-rmse:0.039655
[11]	validation_0-rmse:0.039066
[12]	validation_0-rmse:0.039156
[13]	validation_0-rmse:0.03925
[14]	validation_0-rmse:0.039808
[15]	validation_0-rmse:0.039556
[16]	validation_0-rmse:0.039798
[17]	validation_0-rmse:0.039612
[18]	validation_0-rmse:0.039771
[19]	validation_0-rmse:0.03978
[20]	validation_0-rmse:0.040088
[21]	validation_0-rmse:0.040197
[22]	validation_0-rmse:0.040009
[23]	validation_0-rmse:0.040007
[24]	validation_0-rmse:0.04029
[25]	validation_0-rmse:0.040809
[26]	validation_0-rmse:0.040695
[27]	validation_0-rmse:0.040891
[28]	validation_0-rmse:0.040966
[29]	validation_0-rmse:0.040701
[30]	validation_0-rmse:0.040519
[31]	validation_0-rmse:0.040893
[32]	validation_0-rmse:0.041145
[33]	validation_0-rmse:0.041178
[34]	validation_0-

[2]	validation_0-rmse:0.105973
[3]	validation_0-rmse:0.07231
[4]	validation_0-rmse:0.054958
[5]	validation_0-rmse:0.046368
[6]	validation_0-rmse:0.042321
[7]	validation_0-rmse:0.039425
[8]	validation_0-rmse:0.039175
[9]	validation_0-rmse:0.038749
[10]	validation_0-rmse:0.038418
[11]	validation_0-rmse:0.037928
[12]	validation_0-rmse:0.03826
[13]	validation_0-rmse:0.038576
[14]	validation_0-rmse:0.039048
[15]	validation_0-rmse:0.038623
[16]	validation_0-rmse:0.038902
[17]	validation_0-rmse:0.038878
[18]	validation_0-rmse:0.039104
[19]	validation_0-rmse:0.039274
[20]	validation_0-rmse:0.039488
[21]	validation_0-rmse:0.039438
[22]	validation_0-rmse:0.039283
[23]	validation_0-rmse:0.039363
[24]	validation_0-rmse:0.039604
[25]	validation_0-rmse:0.039939
[26]	validation_0-rmse:0.040068
[27]	validation_0-rmse:0.040304
[28]	validation_0-rmse:0.040316
[29]	validation_0-rmse:0.040099
[30]	validation_0-rmse:0.039715
[31]	validation_0-rmse:0.040105
[32]	validation_0-rmse:0.040505
[33]	validation_0-

[49]	validation_0-rmse:0.040841
[0]	validation_0-rmse:0.275351
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166294
[2]	validation_0-rmse:0.106373
[3]	validation_0-rmse:0.072691
[4]	validation_0-rmse:0.055524
[5]	validation_0-rmse:0.046797
[6]	validation_0-rmse:0.042733
[7]	validation_0-rmse:0.040128
[8]	validation_0-rmse:0.039643
[9]	validation_0-rmse:0.039474
[10]	validation_0-rmse:0.038677
[11]	validation_0-rmse:0.038526
[12]	validation_0-rmse:0.038542
[13]	validation_0-rmse:0.038671
[14]	validation_0-rmse:0.038979
[15]	validation_0-rmse:0.038749
[16]	validation_0-rmse:0.039154
[17]	validation_0-rmse:0.03924
[18]	validation_0-rmse:0.039382
[19]	validation_0-rmse:0.039529
[20]	validation_0-rmse:0.039892
[21]	validation_0-rmse:0.039849
[22]	validation_0-rmse:0.039993
[23]	validation_0-rmse:0.039948
[24]	validation_0-rmse:0.040271
[25]	validation_0-rmse:0.040381
[26]	validation_0-rmse:0.04046
[27]	validation_0-rmse:0.040589
[28]	validation_0-

[48]	validation_0-rmse:0.040657
[49]	validation_0-rmse:0.040751
[0]	validation_0-rmse:0.275268
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.16622
[2]	validation_0-rmse:0.10617
[3]	validation_0-rmse:0.07248
[4]	validation_0-rmse:0.055394
[5]	validation_0-rmse:0.046864
[6]	validation_0-rmse:0.042508
[7]	validation_0-rmse:0.039702
[8]	validation_0-rmse:0.03887
[9]	validation_0-rmse:0.038318
[10]	validation_0-rmse:0.037525
[11]	validation_0-rmse:0.037755
[12]	validation_0-rmse:0.037781
[13]	validation_0-rmse:0.038234
[14]	validation_0-rmse:0.038663
[15]	validation_0-rmse:0.038816
[16]	validation_0-rmse:0.039026
[17]	validation_0-rmse:0.038864
[18]	validation_0-rmse:0.038968
[19]	validation_0-rmse:0.039081
[20]	validation_0-rmse:0.039303
[21]	validation_0-rmse:0.039332
[22]	validation_0-rmse:0.039561
[23]	validation_0-rmse:0.039326
[24]	validation_0-rmse:0.039312
[25]	validation_0-rmse:0.039369
[26]	validation_0-rmse:0.039511
[27]	validation_0-rm

[43]	validation_0-rmse:0.040998
[44]	validation_0-rmse:0.040976
[45]	validation_0-rmse:0.041218
[46]	validation_0-rmse:0.041105
[47]	validation_0-rmse:0.040969
[48]	validation_0-rmse:0.040879
[49]	validation_0-rmse:0.040858
[0]	validation_0-rmse:0.275297
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.16623
[2]	validation_0-rmse:0.106188
[3]	validation_0-rmse:0.072549
[4]	validation_0-rmse:0.055452
[5]	validation_0-rmse:0.046636
[6]	validation_0-rmse:0.042484
[7]	validation_0-rmse:0.040209
[8]	validation_0-rmse:0.039595
[9]	validation_0-rmse:0.039739
[10]	validation_0-rmse:0.039045
[11]	validation_0-rmse:0.03875
[12]	validation_0-rmse:0.038799
[13]	validation_0-rmse:0.038927
[14]	validation_0-rmse:0.039445
[15]	validation_0-rmse:0.039044
[16]	validation_0-rmse:0.039065
[17]	validation_0-rmse:0.039072
[18]	validation_0-rmse:0.039254
[19]	validation_0-rmse:0.039284
[20]	validation_0-rmse:0.039351
[21]	validation_0-rmse:0.039448
[22]	validation_0-

[42]	validation_0-rmse:0.041162
[43]	validation_0-rmse:0.041172
[44]	validation_0-rmse:0.041016
[45]	validation_0-rmse:0.041341
[46]	validation_0-rmse:0.041329
[47]	validation_0-rmse:0.041107
[48]	validation_0-rmse:0.041419
[49]	validation_0-rmse:0.041325
[0]	validation_0-rmse:0.275291
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166092
[2]	validation_0-rmse:0.10599
[3]	validation_0-rmse:0.072584
[4]	validation_0-rmse:0.055341
[5]	validation_0-rmse:0.046481
[6]	validation_0-rmse:0.042545
[7]	validation_0-rmse:0.039588
[8]	validation_0-rmse:0.039099
[9]	validation_0-rmse:0.03885
[10]	validation_0-rmse:0.03827
[11]	validation_0-rmse:0.038181
[12]	validation_0-rmse:0.03839
[13]	validation_0-rmse:0.038452
[14]	validation_0-rmse:0.039027
[15]	validation_0-rmse:0.038896
[16]	validation_0-rmse:0.03938
[17]	validation_0-rmse:0.038919
[18]	validation_0-rmse:0.038958
[19]	validation_0-rmse:0.038934
[20]	validation_0-rmse:0.039321
[21]	validation_0-rms

[37]	validation_0-rmse:0.040675
[38]	validation_0-rmse:0.04058
[39]	validation_0-rmse:0.040541
[40]	validation_0-rmse:0.040568
[41]	validation_0-rmse:0.040643
[42]	validation_0-rmse:0.041075
[43]	validation_0-rmse:0.040983
[44]	validation_0-rmse:0.040916
[45]	validation_0-rmse:0.041279
[46]	validation_0-rmse:0.041259
[47]	validation_0-rmse:0.041145
[48]	validation_0-rmse:0.040935
[49]	validation_0-rmse:0.041009
[0]	validation_0-rmse:0.27523
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166113
[2]	validation_0-rmse:0.105913
[3]	validation_0-rmse:0.072325
[4]	validation_0-rmse:0.055035
[5]	validation_0-rmse:0.046167
[6]	validation_0-rmse:0.042092
[7]	validation_0-rmse:0.03921
[8]	validation_0-rmse:0.038745
[9]	validation_0-rmse:0.038652
[10]	validation_0-rmse:0.037947
[11]	validation_0-rmse:0.037793
[12]	validation_0-rmse:0.037907
[13]	validation_0-rmse:0.037998
[14]	validation_0-rmse:0.038248
[15]	validation_0-rmse:0.038117
[16]	validation_0-r

[35]	validation_0-rmse:0.040323
[36]	validation_0-rmse:0.040417
[37]	validation_0-rmse:0.04021
[38]	validation_0-rmse:0.040322
[39]	validation_0-rmse:0.040565
[40]	validation_0-rmse:0.040629
[41]	validation_0-rmse:0.040618
[42]	validation_0-rmse:0.040917
[43]	validation_0-rmse:0.041046
[44]	validation_0-rmse:0.040985
[45]	validation_0-rmse:0.041183
[46]	validation_0-rmse:0.04106
[47]	validation_0-rmse:0.040917
[48]	validation_0-rmse:0.040999
[49]	validation_0-rmse:0.040798
[0]	validation_0-rmse:0.275241
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166094
[2]	validation_0-rmse:0.105948
[3]	validation_0-rmse:0.072475
[4]	validation_0-rmse:0.055233
[5]	validation_0-rmse:0.046317
[6]	validation_0-rmse:0.042069
[7]	validation_0-rmse:0.039373
[8]	validation_0-rmse:0.039077
[9]	validation_0-rmse:0.038634
[10]	validation_0-rmse:0.038391
[11]	validation_0-rmse:0.03788
[12]	validation_0-rmse:0.038102
[13]	validation_0-rmse:0.038261
[14]	validation_0-r

[30]	validation_0-rmse:0.038926
[31]	validation_0-rmse:0.039242
[32]	validation_0-rmse:0.039405
[33]	validation_0-rmse:0.039403
[34]	validation_0-rmse:0.039346
[35]	validation_0-rmse:0.039536
[36]	validation_0-rmse:0.039651
[37]	validation_0-rmse:0.039742
[38]	validation_0-rmse:0.03983
[39]	validation_0-rmse:0.039845
[40]	validation_0-rmse:0.039728
[41]	validation_0-rmse:0.039971
[42]	validation_0-rmse:0.040334
[43]	validation_0-rmse:0.040432
[44]	validation_0-rmse:0.040402
[45]	validation_0-rmse:0.040523
[46]	validation_0-rmse:0.040516
[47]	validation_0-rmse:0.040375
[48]	validation_0-rmse:0.04042
[49]	validation_0-rmse:0.040357
[0]	validation_0-rmse:0.275229
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166057
[2]	validation_0-rmse:0.105877
[3]	validation_0-rmse:0.072176
[4]	validation_0-rmse:0.054794
[5]	validation_0-rmse:0.045989
[6]	validation_0-rmse:0.041712
[7]	validation_0-rmse:0.039043
[8]	validation_0-rmse:0.038497
[9]	validation_0-

[29]	validation_0-rmse:0.039788
[30]	validation_0-rmse:0.039527
[31]	validation_0-rmse:0.039859
[32]	validation_0-rmse:0.0403
[33]	validation_0-rmse:0.040474
[34]	validation_0-rmse:0.040375
[35]	validation_0-rmse:0.040779
[36]	validation_0-rmse:0.040843
[37]	validation_0-rmse:0.040849
[38]	validation_0-rmse:0.04081
[39]	validation_0-rmse:0.040778
[40]	validation_0-rmse:0.040661
[41]	validation_0-rmse:0.040743
[42]	validation_0-rmse:0.040946
[43]	validation_0-rmse:0.041111
[44]	validation_0-rmse:0.040565
[45]	validation_0-rmse:0.040852
[46]	validation_0-rmse:0.041016
[47]	validation_0-rmse:0.041093
[48]	validation_0-rmse:0.040968
[49]	validation_0-rmse:0.040886
[0]	validation_0-rmse:0.275281
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166197
[2]	validation_0-rmse:0.106104
[3]	validation_0-rmse:0.072623
[4]	validation_0-rmse:0.055343
[5]	validation_0-rmse:0.046818
[6]	validation_0-rmse:0.042846
[7]	validation_0-rmse:0.040028
[8]	validation_0-

[24]	validation_0-rmse:0.039482
[25]	validation_0-rmse:0.039863
[26]	validation_0-rmse:0.040096
[27]	validation_0-rmse:0.040281
[28]	validation_0-rmse:0.040219
[29]	validation_0-rmse:0.040073
[30]	validation_0-rmse:0.039901
[31]	validation_0-rmse:0.040126
[32]	validation_0-rmse:0.04037
[33]	validation_0-rmse:0.040389
[34]	validation_0-rmse:0.04035
[35]	validation_0-rmse:0.040446
[36]	validation_0-rmse:0.040636
[37]	validation_0-rmse:0.04068
[38]	validation_0-rmse:0.040348
[39]	validation_0-rmse:0.040752
[40]	validation_0-rmse:0.040842
[41]	validation_0-rmse:0.041113
[42]	validation_0-rmse:0.041403
[43]	validation_0-rmse:0.041253
[44]	validation_0-rmse:0.041302
[45]	validation_0-rmse:0.041506
[46]	validation_0-rmse:0.041474
[47]	validation_0-rmse:0.041621
[48]	validation_0-rmse:0.041306
[49]	validation_0-rmse:0.04116
[0]	validation_0-rmse:0.275244
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.166266
[2]	validation_0-rmse:0.106161
[3]	validatio

[23]	validation_0-rmse:0.039546
[24]	validation_0-rmse:0.039803
[25]	validation_0-rmse:0.04029
[26]	validation_0-rmse:0.040181
[27]	validation_0-rmse:0.04045
[28]	validation_0-rmse:0.04018
[29]	validation_0-rmse:0.039968
[30]	validation_0-rmse:0.039722
[31]	validation_0-rmse:0.040015
[32]	validation_0-rmse:0.040357
[33]	validation_0-rmse:0.040507
[34]	validation_0-rmse:0.040706
[35]	validation_0-rmse:0.04071
[36]	validation_0-rmse:0.040825
[37]	validation_0-rmse:0.040821
[38]	validation_0-rmse:0.040644
[39]	validation_0-rmse:0.040686
[40]	validation_0-rmse:0.040901
[41]	validation_0-rmse:0.041036
[42]	validation_0-rmse:0.041344
[43]	validation_0-rmse:0.041238
[44]	validation_0-rmse:0.041241
[45]	validation_0-rmse:0.041476
[46]	validation_0-rmse:0.041418
[47]	validation_0-rmse:0.04122
[48]	validation_0-rmse:0.041395
[49]	validation_0-rmse:0.041265
[0]	validation_0-rmse:0.275327
Will train until validation_0-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:0.16623
[2]	validation

[18]	validation_0-rmse:0.039317
[19]	validation_0-rmse:0.039174
[20]	validation_0-rmse:0.039579
[21]	validation_0-rmse:0.039558
[22]	validation_0-rmse:0.039895
[23]	validation_0-rmse:0.039923
[24]	validation_0-rmse:0.040113
[25]	validation_0-rmse:0.040522
[26]	validation_0-rmse:0.040666
[27]	validation_0-rmse:0.040968
[28]	validation_0-rmse:0.04088
[29]	validation_0-rmse:0.040713
[30]	validation_0-rmse:0.040396
[31]	validation_0-rmse:0.040766
[32]	validation_0-rmse:0.040864
[33]	validation_0-rmse:0.040969
[34]	validation_0-rmse:0.040916
[35]	validation_0-rmse:0.0411
[36]	validation_0-rmse:0.041154
[37]	validation_0-rmse:0.041153
[38]	validation_0-rmse:0.041208
[39]	validation_0-rmse:0.041161
[40]	validation_0-rmse:0.041068
[41]	validation_0-rmse:0.041132
[42]	validation_0-rmse:0.041271
[43]	validation_0-rmse:0.041306
[44]	validation_0-rmse:0.041533
[45]	validation_0-rmse:0.041783
[46]	validation_0-rmse:0.041562
[47]	validation_0-rmse:0.041493
[48]	validation_0-rmse:0.04157
[49]	validat

[17]	validation_0-rmse:0.038716
[18]	validation_0-rmse:0.038809
[19]	validation_0-rmse:0.03882
[20]	validation_0-rmse:0.03908
[21]	validation_0-rmse:0.039224
[22]	validation_0-rmse:0.039118
[23]	validation_0-rmse:0.039237
[24]	validation_0-rmse:0.039489
[25]	validation_0-rmse:0.039707
[26]	validation_0-rmse:0.039813
[27]	validation_0-rmse:0.039946
[28]	validation_0-rmse:0.0399
[29]	validation_0-rmse:0.039819
[30]	validation_0-rmse:0.03958
[31]	validation_0-rmse:0.040161
[32]	validation_0-rmse:0.040113
[33]	validation_0-rmse:0.040254
[34]	validation_0-rmse:0.04032
[35]	validation_0-rmse:0.04035
[36]	validation_0-rmse:0.040337
[37]	validation_0-rmse:0.040346
[38]	validation_0-rmse:0.040265
[39]	validation_0-rmse:0.040328
[40]	validation_0-rmse:0.040285
[41]	validation_0-rmse:0.040444
[42]	validation_0-rmse:0.040682
[43]	validation_0-rmse:0.040527
[44]	validation_0-rmse:0.040514
[45]	validation_0-rmse:0.040817
[46]	validation_0-rmse:0.040593
[47]	validation_0-rmse:0.040462
[48]	validation

[12]	validation_0-rmse:0.038165
[13]	validation_0-rmse:0.0381
[14]	validation_0-rmse:0.038645
[15]	validation_0-rmse:0.038511
[16]	validation_0-rmse:0.038489
[17]	validation_0-rmse:0.038825
[18]	validation_0-rmse:0.038774
[19]	validation_0-rmse:0.038989
[20]	validation_0-rmse:0.038971
[21]	validation_0-rmse:0.039212
[22]	validation_0-rmse:0.039425
[23]	validation_0-rmse:0.039355
[24]	validation_0-rmse:0.039609
[25]	validation_0-rmse:0.039787
[26]	validation_0-rmse:0.039936
[27]	validation_0-rmse:0.040282
[28]	validation_0-rmse:0.040447
[29]	validation_0-rmse:0.040126
[30]	validation_0-rmse:0.040154
[31]	validation_0-rmse:0.040415
[32]	validation_0-rmse:0.040633
[33]	validation_0-rmse:0.040521
[34]	validation_0-rmse:0.040344
[35]	validation_0-rmse:0.040464
[36]	validation_0-rmse:0.040576
[37]	validation_0-rmse:0.040716
[38]	validation_0-rmse:0.040805
[39]	validation_0-rmse:0.040855
[40]	validation_0-rmse:0.040525
[41]	validation_0-rmse:0.040692
[42]	validation_0-rmse:0.040944
[43]	valid

[11]	validation_0-rmse:0.037858
[12]	validation_0-rmse:0.037817
[13]	validation_0-rmse:0.038413
[14]	validation_0-rmse:0.038638
[15]	validation_0-rmse:0.038477
[16]	validation_0-rmse:0.038862
[17]	validation_0-rmse:0.038967
[18]	validation_0-rmse:0.038895
[19]	validation_0-rmse:0.038806
[20]	validation_0-rmse:0.03903
[21]	validation_0-rmse:0.039019
[22]	validation_0-rmse:0.039039
[23]	validation_0-rmse:0.039018
[24]	validation_0-rmse:0.039313
[25]	validation_0-rmse:0.039328
[26]	validation_0-rmse:0.039625
[27]	validation_0-rmse:0.039763
[28]	validation_0-rmse:0.039704
[29]	validation_0-rmse:0.039248
[30]	validation_0-rmse:0.039274
[31]	validation_0-rmse:0.039598
[32]	validation_0-rmse:0.039975
[33]	validation_0-rmse:0.039783
[34]	validation_0-rmse:0.039673
[35]	validation_0-rmse:0.039895
[36]	validation_0-rmse:0.040102
[37]	validation_0-rmse:0.040402
[38]	validation_0-rmse:0.040294
[39]	validation_0-rmse:0.040211
[40]	validation_0-rmse:0.040018
[41]	validation_0-rmse:0.040203
[42]	vali

[6]	validation_0-rmse:0.042401
[7]	validation_0-rmse:0.039777
[8]	validation_0-rmse:0.038978
[9]	validation_0-rmse:0.038762
[10]	validation_0-rmse:0.03799
[11]	validation_0-rmse:0.037609
[12]	validation_0-rmse:0.03777
[13]	validation_0-rmse:0.038036
[14]	validation_0-rmse:0.038527
[15]	validation_0-rmse:0.038251
[16]	validation_0-rmse:0.03829
[17]	validation_0-rmse:0.038214
[18]	validation_0-rmse:0.038496
[19]	validation_0-rmse:0.038477
[20]	validation_0-rmse:0.038737
[21]	validation_0-rmse:0.038723
[22]	validation_0-rmse:0.038912
[23]	validation_0-rmse:0.038781
[24]	validation_0-rmse:0.038945
[25]	validation_0-rmse:0.038939
[26]	validation_0-rmse:0.039124
[27]	validation_0-rmse:0.039274
[28]	validation_0-rmse:0.039029
[29]	validation_0-rmse:0.038979
[30]	validation_0-rmse:0.038977
[31]	validation_0-rmse:0.039359
[32]	validation_0-rmse:0.039721
[33]	validation_0-rmse:0.039683
[34]	validation_0-rmse:0.039705
[35]	validation_0-rmse:0.039933
[36]	validation_0-rmse:0.040029
[37]	validation

In [179]:
pCTR_validation

array([ 0.00107158,  0.0009308 ,  0.00080768, ...,  0.03502777,
        0.00366385,  0.00418926])

In [180]:
pCTR_test

{'click:': 163,
 'cost:': 6132.9900000021134,
 'cpc:': 37.625705521485358,
 'ctr:': 0.0013972825854013972,
 'imps:': 116655}

In [181]:
base_bid_validation = get_base_bid_4(pCTR_validation,validation_data)
base_bid_validation

6

In [182]:
bidprice_validation = base_bid_validation * pCTR_validation / avgCTR
bidprice_validation

array([   8.71718003,    7.57200257,    6.57038273, ...,  284.94764338,
         29.80509794,   34.07925144])

In [183]:
test_model_pre(bidprice_validation)

{'cost:': 6164.1890000021331, 'ctr:': 0.0014338753040583792, 'cpc:': 36.691601190488889, 'click:': 168, 'imps:': 117165}


{'click:': 168,
 'cost:': 6164.1890000021331,
 'cpc:': 36.691601190488889,
 'ctr:': 0.0014338753040583792,
 'imps:': 117165}