In [None]:
# Colab setup: install turicreate into the runtime via a shell escape.
# NOTE(review): the version is not pinned, so a later re-run may install a different release.
!pip install turicreate

In [None]:
# Mount Google Drive at /content/drive; every data and model path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [11]:
import pandas as pd
import turicreate as tc
from turicreate.toolkits.classifier import boosted_trees_classifier
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import log_loss

In [None]:
# Load the training CSV from Drive into a turicreate SFrame.
data_frame = tc.SFrame.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')
# Alternative kept for reference: read only the first 100000 rows of a local train.csv.
# data_frame = tc.SFrame.read_csv('train.csv', nrows=100000)

In [None]:
# Sanity check: size of the loaded training frame.
data_frame.shape

In [None]:
# Count rows per site_id, order by that count (largest first) and keep positions 1..9 of the ranking.
# NOTE(review): the slice starts at 1, so the single most frequent site_id (position 0) is
# excluded and nine sites are kept — confirm that skipping the top entry is intentional.
top_sites = data_frame.groupby('site_id', operations={'count': tc.aggregate.COUNT()}).sort('count', ascending=False)[1:10]

In [None]:
# Display the selected site_id values together with their row counts.
top_sites

# Creating subframes

In [None]:
def build_subframes(df, column, names):
  """Split ``df`` into one filtered subframe per entry of ``names``.

  For each ``name`` the result of ``df.filter_by(name, column)`` is collected,
  and a progress line (numbered from 1) is printed after every subframe.

  Args:
    df: object exposing ``filter_by(values, column_name)``.
    column: name of the column to filter on.
    names: iterable of values; one subframe is produced per value.

  Returns:
    list of the filtered subframes, in the iteration order of ``names``.
  """
  collected = []
  for position, name in enumerate(names, start=1):
    collected.append(df.filter_by(name, column))
    print('build_subframes: ' + str(position) + ' subframe processed')
  return collected

In [None]:
# One subframe per selected site_id, in the same order as the rows of top_sites.
subframes = build_subframes(data_frame, 'site_id', top_sites['site_id'])

# Functions that replace the date & time stamp with weekday & hour

In [12]:
def get_date(date):
    """Format the leading ``YYMMDD`` digits of ``date`` as ``20YY-MM-DD``.

    ``date`` is converted with ``str`` first, so both strings and integers
    are accepted. The century prefix ``20`` is fixed; any characters after
    the sixth are ignored.
    """
    stamp = str(date)
    year, month, day = '20' + stamp[:2], stamp[2:4], stamp[4:6]
    return year + '-' + month + '-' + day

In [13]:
def add_weekday_hour(data_frame):
    """Derive 'weekday' and a shortened 'hour' column on ``data_frame``, in place.

    The 'hour' column is read as a string stamp. Its first six characters are
    turned into a date (via ``get_date``) whose ``weekday()`` becomes the new
    'weekday' column; 'hour' is then overwritten with everything from the
    seventh character onwards (kept as a string).

    Because 'hour' is overwritten, calling this twice on the same frame would
    operate on the already shortened values. Returns ``None``.
    """
    stamps = data_frame['hour'].astype(str)
    calendar_dates = stamps.apply(lambda stamp: get_date(stamp)).str_to_datetime()
    data_frame['weekday'] = calendar_dates.apply(lambda moment: moment.weekday())
    print('add_weekday_hour: half completed')
    data_frame['hour'] = stamps.apply(lambda stamp: stamp[6:])
    print('add_weekday_hour: completed')

# Functions for count-based features (the `ctr_*` helpers)


In [14]:
def ctr_count(df, column):
    """Replace ``column`` of ``df`` with a normalised occurrence count, in place.

    Despite the name, no click information is used: every value is mapped to
    (rows holding that value) / (largest such row count + 1), so each result
    lies strictly between 0 and 1. The counts come from ``df`` itself.
    Returns ``None``.
    """
    value_counts = df.groupby(column, operations={'count': tc.aggregate.COUNT()}).sort('count', ascending=False)
    # Sorted in descending order, so the first row carries the largest count.
    denominator = value_counts['count'][0] + 1
    count_by_value = value_counts.to_dataframe().set_index(column).to_dict(orient='dict')['count']
    df[column] = df[column].apply(lambda value: count_by_value[value] / denominator)

In [15]:
def ctr_features(df):
    """Apply ``ctr_count`` to each site/app/device identifier column of ``df``.

    The nine columns are rewritten in place, one after another, and a progress
    line is printed after each. Returns ``None``.
    """
    encoded_columns = (
        'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category',
        'device_id', 'device_ip', 'device_model',
    )
    for column in encoded_columns:
        ctr_count(df, column)
        print('ctr_features: ' + column + ' column done.')

# Applying model

In [None]:
def apply_model(df):
    """Train and return a boosted-trees classifier on ``df`` with target 'click'.

    Training uses the 'log_loss' metric, 100 iterations and
    ``validation_set=None``.

    NOTE(review): no feature list is passed, so presumably every non-target
    column of ``df`` (including 'id' if it is still present) is used as a
    feature — confirm against the turicreate documentation.
    """
    training_options = {
        'metric': 'log_loss',
        'max_iterations': 100,
        'validation_set': None,
    }
    return boosted_trees_classifier.create(df, 'click', **training_options)

In [None]:
# Feature-engineer every subframe in place, then export it to Drive as
# subframe_site_id_<index>.csv (index = position in `subframes`).
for i, subframe in enumerate(subframes):
  print(subframe.shape)
  add_weekday_hour(subframe)
  ctr_features(subframe)
  subframe.export_csv('/content/drive/My Drive/Colab Notebooks/subframe_site_id_' + str(i) + '.csv')
  print('subframe ' + str(i) + ' exported')

In [None]:
# Train one classifier per preprocessed subframe and save each to Drive as
# model_site_id_<index>, where index is the subframe's position in `subframes`.
models = []
for i, subframe in enumerate(subframes):
  trained = apply_model(subframe)
  models.append(trained)
  trained.save('/content/drive/My Drive/Colab Notebooks/model_site_id_' + str(i))

# Ensemble functions

In [16]:
def logistic_function(x):
    """Logistic curve centred at 0.5 with steepness 8: ``1 / (1 + exp(4 - 8x))``.

    Maps 0.5 to 0.5 and is the inverse of ``1/2 - log(1/y - 1) / 8``.
    Works element-wise on anything ``np.exp`` accepts.
    """
    exponent = 4 - 8 * x
    return 1 / (1 + np.exp(exponent))
def inversed_logistic(x):
    """Inverse of ``1 / (1 + exp(4 - 8t))``: returns ``1/2 - log(1/x - 1) / 8``.

    Maps 0.5 to 0.5. Inputs of exactly 0 or 1 are not handled specially.
    """
    odds_against = 1 / x - 1
    return 1 / 2 - np.log(odds_against) / 8

In [17]:
def reverse_class(frame):
    """Turn a (class, probability) row into a single probability value.

    When ``frame['class']`` equals 0 the complement ``1 - probability`` is
    returned; for any other class the probability is returned unchanged.
    Presumably 'probability' refers to the predicted class, which would make
    the result the probability of class 1 — confirm against the classifier
    output format.
    """
    predicted_negative = frame['class'] == 0
    confidence = frame['probability']
    return 1 - confidence if predicted_negative else confidence

In [51]:
def sum_row(df):
  """Return the sum of all values in ``df``, looked up key by key.

  ``df`` is iterated to obtain its keys (column names) and ``df[key]`` is
  added for each, starting from 0, so an empty mapping yields 0.

  Uses the builtin ``sum`` rather than a local accumulator that shadowed it.
  """
  return sum(df[column] for column in df)

In [50]:
def predict(models, X):
    """Combine the predictions of several classifiers into one probability column.

    For every model, ``model.classify(X)`` is reduced to one probability per
    row with ``reverse_class`` and mapped through ``inversed_logistic``; the
    result is stored as a column named after the model's position. The
    per-row mean of those columns is then mapped back with
    ``logistic_function``.

    Args:
        models: sequence of trained classifiers exposing ``classify``.
        X: frame of features passed unchanged to every model.

    Returns:
        The 'summary' column holding the combined value for each row of ``X``.
    """
    logit_columns = tc.SFrame()
    for index, model in enumerate(models):
        class_probability = model.classify(X).apply(reverse_class)
        # The frame has `index` columns at this point, so the new column is named str(index).
        logit_columns[str(index)] = inversed_logistic(class_probability)
        print('predict: ' + str(index) + ' model processed')

    row_totals = np.array(logit_columns.apply(sum_row, dtype=float))
    # Counted before 'summary' is added, so this is the number of models.
    model_count = len(logit_columns.column_names())
    logit_columns['summary'] = logistic_function(row_totals / model_count)
    return logit_columns['summary']

# Test model

## Load models if necessary

In [19]:
# Reload the nine saved classifiers (model_site_id_0 .. model_site_id_8) from Drive.
models = [
  tc.load_model('/content/drive/My Drive/Colab Notebooks/model_site_id_' + str(i))
  for i in range(9)
]

## Preprocess for the `test_frame`

In [54]:
# Load the test CSV from Drive into a turicreate SFrame.
test_frame = tc.SFrame.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')
# Alternative kept for reference: read only the first 100000 rows of a local test.csv.
# test_frame = tc.SFrame.read_csv('test.csv', nrows=100000)

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,int,int,int,str,str,str,str,str,str,str,str,str,int,int,int,int,int,int,int,int,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [55]:
# Keep the ids in their own frame; the predicted 'click' column is attached to it later.
results_frame = test_frame[['id']]

In [56]:
# NOTE(review): the return value is discarded and no inplace flag is passed. The CSV has
# 23 columns and test_frame.shape later reports 24 (23 + 'weekday'), which indicates that
# 'id' is still present in test_frame and this call only displays a copy without it.
# The training frames were never stripped of 'id' either, so actually removing it here
# may make the saved models fail on a missing feature — confirm before "fixing" this line.
test_frame.remove_column('id')

hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category
14103100,1005,0,235ba823,f6ebf28e,f028772b,ecad2386,7801e8d9,07d7df22
14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22
14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22
14103100,1005,0,85f751fd,c4e18dd6,50e219e0,51cedd4e,aefc06bd,0f2161f8
14103100,1005,0,85f751fd,c4e18dd6,50e219e0,9c13b419,2347f47a,f95efa07
14103100,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,07d7df22
14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22
14103100,1005,0,85f751fd,c4e18dd6,50e219e0,388d9bfb,2347f47a,cef3e649
14103100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,07d7df22
14103100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22

device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20
a99f214a,69f45779,0eb711ec,1,0,8330,320,50,761,3,175,100075
a99f214a,e8d44657,ecb851b2,1,0,22676,320,50,2616,0,35,100083
a99f214a,10fb085b,1f0bc64f,1,0,22676,320,50,2616,0,35,100083
a99f214a,422d257a,542422a7,1,0,18648,320,50,1092,3,809,100156
a99f214a,078c6b38,1f0bc64f,1,0,23160,320,50,2667,0,47,-1
a99f214a,e75922ff,68b6db2c,1,0,6563,320,50,572,2,39,-1
a99f214a,f1e8683d,d4897fef,1,0,22813,320,50,2647,2,39,100148
3772665a,a4a540c1,a2140f4f,1,3,23214,300,250,2675,3,939,100058
a99f214a,dc17d849,ac9ad752,1,0,23642,320,50,2709,3,39,-1
a99f214a,feb189df,8a4875bd,1,0,15699,320,50,1722,0,35,-1

C21
23
51
51
61
221
32
23
100
23
79


In [57]:
# Same date/time preprocessing as for the training subframes; mutates test_frame in place.
add_weekday_hour(test_frame)

add_weekday_hour: half completed
add_weekday_hour: completed


In [58]:
# Count-based encoding of the identifier columns; mutates test_frame in place.
# NOTE(review): the counts are computed from the whole test frame here, whereas for training
# they were computed separately inside each site_id subframe, so the encoded values are not
# on the same basis — confirm this is acceptable.
ctr_features(test_frame)

ctr_features: site_id column done.
ctr_features: site_domain column done.
ctr_features: site_category column done.
ctr_features: app_id column done.
ctr_features: app_domain column done.
ctr_features: app_category column done.
ctr_features: device_id column done.
ctr_features: device_ip column done.
ctr_features: device_model column done.


In [59]:
# Sanity check: size of the test frame after preprocessing.
test_frame.shape

(4577464, 24)

In [60]:
# Sanity check: number of models that predict() will average over.
len(models)

9

## Predict on the test set with the resulting ensemble

In [61]:
# Attach the ensemble's combined probability for each test row as the 'click' column.
results_frame['click'] = predict(models, test_frame)

predict: 0 model processed
predict: 1 model processed
predict: 2 model processed
predict: 3 model processed
predict: 4 model processed
predict: 5 model processed
predict: 6 model processed
predict: 7 model processed
predict: 8 model processed


In [62]:
# Preview the first rows of the id / click output.
results_frame.head()

id,click
10000174058809263569,0.2054974036509299
10000182526920855428,0.1772596605707312
10000554139829213984,0.1956781528028018
10001094637809798845,0.1399792188826671
10001377041558670745,0.2428502145037776
10001521204153353724,0.1984725779077584
10001911056707023378,0.3142020241997974
10001982898844213216,0.2011840085574231
10002000217531288531,0.127989502388257
10002107385290585663,0.2428127558785451


In [64]:
# Write the id / click predictions to Drive as CSV.
results_frame.export_csv('/content/drive/My Drive/Colab Notebooks/results_frame.csv')

In [65]:
# Sanity check: one output row per test row, with the two columns id and click.
results_frame.shape

(4577464, 2)