In [1]:
from fastai.basics import *
from dataprocess_lib0 import *
from IPython.core.pylabtools import figsize
import gc, json
from pandas.io.json import json_normalize
from datetime import datetime
gc.enable()

In [2]:
PATH=Path('../data/Google Analytics Customer Revenue Prediction')

In [4]:
%%time
tr1 = pd.read_pickle(PATH/'tr1_clean')
tr2 = pd.read_pickle(PATH/'tr2_clean')
tr3 = pd.read_pickle(PATH/'tr3_clean')
tr4 = pd.read_pickle(PATH/'tr4_clean')

CPU times: user 1.46 s, sys: 516 ms, total: 1.98 s
Wall time: 2.25 s


In [5]:
val = tr4.copy()
val['target'] = np.nan
val['ret'] = np.nan

In [6]:
train_val = pd.concat([tr1, tr2, tr3, val], axis=0, sort=False).reset_index(drop=True)
train_val['interval_dates'] = train_val['interval_dates'].dt.days
train_val['first_ses_from_the_period_start'] = train_val['first_ses_from_the_period_start'].dt.days
train_val['last_ses_from_the_period_end'] = train_val['last_ses_from_the_period_end'].dt.days

In [7]:
train_val.shape

(1417575, 42)

In [45]:
# train_val.dtypes

## LGB model

#### Parameters of 'isReturned' classficator

In [32]:
params_lgb1 = {
        "objective" : "binary",
        "metric" : "binary_logloss",
        "max_leaves": 256,
        "num_leaves" : 15,
        "min_child_samples" : 1,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.9,
        "feature_fraction" : 0.8,
        "bagging_frequency" : 1           
    }

#### Parameters of 'how_much_returned_will_pay' regressor

In [33]:
params_lgb2 = {
        "objective" : "regression",
        "metric" : "rmse", 
        "max_leaves": 256,
        "num_leaves" : 9,
        "min_child_samples" : 1,
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.9,
        "feature_fraction" : 0.8,
        "bagging_frequency" : 1      
    }

#### Validation: Averaging of 10 [Classificator*Regressor] values

In [26]:
# change objects to category type
cat_train(train_val)

In [27]:
train = train_val[train_val['target'].notnull()]

In [28]:
dev = train_val[train_val['target'].isnull()]

In [29]:
target_cols = ['target', 'ret', 'fullVisitorId']

dtrain = lgb.Dataset(train.drop(target_cols, axis=1), label=train['ret'])

dtrain_ret = lgb.Dataset(train.drop(target_cols, axis=1)[train['ret']==1], 
                         label=train['target'][train['ret']==1])

In [30]:
pr_lgb_sum = 0

In [34]:
print('Training and predictions')
for i in range(10):
    print('Interation number ', i)
    lgb_model1 = lgb.train(params_lgb1, dtrain, num_boost_round=1200)
    pr_lgb = lgb_model1.predict(dev.drop(target_cols, axis=1))
    
    lgb_model2 = lgb.train(params_lgb2, dtrain_ret, num_boost_round=368)
    pr_lgb_ret = lgb_model2.predict(dev.drop(target_cols, axis=1))
    
    pr_lgb_sum = pr_lgb_sum + pr_lgb*pr_lgb_ret

pr_final_lgb = pr_lgb_sum/10

Training and predictions
Interation number  0
Interation number  1
Interation number  2
Interation number  3
Interation number  4
Interation number  5
Interation number  6
Interation number  7
Interation number  8
Interation number  9


In [35]:
targets = tr4['target']

In [43]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_lgb = sqrt(mean_squared_error(targets, pr_final_lgb))

In [44]:
rmse_lgb

0.3166215786279411

## Text vectorization + LGB

#### Text vectorization

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
def vectorize_text(tmp, max_features=None):
    # tmp: pandas series 
    # output: numpy 2d-array (len(tmp), max_features)   
    corpus = tmp[tmp.notnull()]
    idx =corpus.index.values
    vectorizer = CountVectorizer(max_features=max_features)
    vectorizer.fit(corpus)
    X = vectorizer.transform(corpus)
    mat = np.zeros((len(tmp),max_features), dtype='int')
    mat[idx,:] = X.toarray()    
    print(vectorizer.get_feature_names())
    #print(type(vectorizer))
    return mat, vectorizer

In [127]:
train_val = pd.read_pickle(PATH/'train_val_clean')
train_val.shape

(1417575, 42)

In [148]:
train_val['combined'] = train_val['source'].str.cat(train_val['referralPath'], sep=" "). \
                        str.cat(train_val['networkDomain'],sep=" ") 

In [149]:
tmp = train_val['combined']
mat_combined, _ = vectorize_text(tmp, max_features=200)

['2145', '3bb', '419', '4spnk9', 'about', 'ac', 'actcorp', 'ad', 'ads', 'advertise', 'airtelbroadband', 'alphabet', 'amazonaws', 'analytics', 'and', 'app', 'ar', 'as5580', 'asianet', 'at', 'au', 'awards', 'bbtec', 'bell', 'benefits', 'bezeqint', 'bg', 'blog', 'br', 'brand', 'brasiltelecom', 'btcentralplus', 'by', 'c10b14f9a69ff71b1b7a', 'ca', 'cable', 'can', 'cantv', 'chello', 'cn', 'co', 'cogentco', 'com', 'comcast', 'comcastbusiness', 'comments', 'copyright', 'cox', 'creators', 'cs', 'cz', 'de', 'deals', 'dev', 'direct', 'discounts', 'do', 'doubleclick', 'edu', 'en', 'es', 'fl', 'forum', 'fr', 'free', 'gb', 'get', 'go', 'golang', 'google', 'googleads', 'googleplex', 'googletopia', 'gopher', 'gr', 'gvt', 'he', 'head', 'hinet', 'home', 'how', 'hr', 'htm', 'html', 'hu', 'id', 'il', 'in', 'index', 'infinitum', 'inpage_launch', 'intl', 'ip', 'ipconnect', 'it', 'items', 'iw', 'ja', 'jobs', 'jp', 'lineups', 'logo', 'mail', 'megared', 'mobile', 'mountain', 'music', 'mx', 'ne', 'net', 'nl', '

In [150]:
train_val = train_val.join(pd.DataFrame(mat_combined, dtype='object', 
                                    columns=['sourc_referralP_network_{}'.format(i) for i in range(200)]))

In [151]:
train_val.drop(['source', 'referralPath', 'networkDomain', 'combined'], axis=1, inplace=True)

In [152]:
from fastai.tabular import *
import gc
gc.collect()

1038

In [157]:
int_cols = []
for col in train_val.columns:
    try:
        train_val[col].dropna().astype(int)
        int_cols.append(col)
    except:
        print('error in transfering {} type'.format(col))

error in transfering fullVisitorId type
error in transfering city type
error in transfering operatingSystem type
error in transfering metro type
error in transfering region type
error in transfering channelGrouping type
error in transfering country type
error in transfering medium type
error in transfering keyword type
error in transfering browser type
error in transfering deviceCategory type
error in transfering continent type


In [160]:
cat_cols = set(train_val.columns)-set(int_cols)
cat_cols

{'browser',
 'channelGrouping',
 'city',
 'continent',
 'country',
 'deviceCategory',
 'fullVisitorId',
 'keyword',
 'medium',
 'metro',
 'operatingSystem',
 'region'}

In [139]:
train_val['gclId'] = train_val['gclId'].astype(str)
train_val['gclId'][train_val['gclId']!='nan'] = 1 
train_val['gclId'][train_val['gclId']=='nan'] = 0 
train_val.rename(columns={'gclId':'gclId_captured'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [305]:
cat_train(train_val)

In [315]:
train2 = train_val.iloc[range(0, (len(train_val))-len(tr4))]
train2.shape

(1051373, 239)

In [316]:
dev2 = train_val.iloc[range(len(train_val)-len(tr4), len(train_val))]
dev2.shape

(366202, 239)

In [317]:
target_cols = ['target', 'ret', 'fullVisitorId']

dtrain2 = lgb.Dataset(train2.drop(target_cols, axis=1), train2['ret'])

dtrain_ret2 = lgb.Dataset(train2.drop(target_cols, axis=1)[train2['ret']==1], \
                         label=train2['target'][train2['ret']==1] )

In [318]:
pr_lgb_sum2 = 0

In [319]:
print('Training and predictions')
for i in range(10):
    print('Interation number ', i)
    lgb_model3 = lgb.train(params_lgb1, dtrain2, num_boost_round=1200)
    pr_lgb2 = lgb_model3.predict(dev2.drop(target_cols, axis=1))
    
    lgb_model4 = lgb.train(params_lgb2, dtrain_ret2, num_boost_round=368)
    pr_lgb_ret2 = lgb_model4.predict(dev2.drop(target_cols, axis=1))
    
    pr_lgb_sum2 = pr_lgb_sum2 + pr_lgb2 * pr_lgb_ret2

pr_final_lgb2 = pr_lgb_sum2 / 10

Training and predictions
Interation number  0
Interation number  1
Interation number  2
Interation number  3
Interation number  4
Interation number  5
Interation number  6
Interation number  7
Interation number  8
Interation number  9


In [320]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rmse_lgb = sqrt(mean_squared_error(targets, pr_final_lgb2))
rmse_lgb

0.3160319850511174