In [None]:
# necessary packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datetime import datetime
from pprint import pprint
from os.path import join as pjoin

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

import math
import os
from scipy.stats import kurtosis, skew
from IPython import embed
from IPython.terminal.embed import InteractiveShellEmbed

import random
import seaborn as sns
import matplotlib.pyplot as plt
import shap

# Default size used for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (15,7)
# Directory that holds the pickled train/test chunks read by load_data().
data_root = '../input/build-my-data'
# Show which files are available in that directory.
print(os.listdir(data_root))

## 1 - data reading 

In [None]:
def load_data(data='train',n=2):
    """Load up to ``n`` pickled chunks of one split and stack them into one frame.

    Chunks are read from ``data_root`` as ``train_<i>.pkl`` or ``test_<i>.pkl``
    for ``i = 0, 1, ...``.

    Args:
        data: split to read, ``'train'`` or ``'test'``.
        n: number of chunks requested. At most 9 train chunks
            (``train_0`` .. ``train_8``) and 3 test chunks
            (``test_0`` .. ``test_2``) are read, whatever ``n`` is.

    Returns:
        The chunks concatenated in chunk order, each keeping its own row
        index; an empty DataFrame when no chunk is requested.

    Raises:
        ValueError: if ``data`` is neither ``'train'`` nor ``'test'``.
    """
    max_chunks = {'train': 9, 'test': 3}
    if data not in max_chunks:
        raise ValueError(f"data must be 'train' or 'test', got {data!r}")

    # NOTE(review): unpickling can execute arbitrary code; only read trusted files.
    parts = [
        pd.read_pickle(pjoin(data_root, f'{data}_{i}.pkl'))
        for i in range(min(n, max_chunks[data]))
    ]
    if not parts:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated frame for every chunk.
    return pd.concat(parts)

%time
df_train = load_data(n=9)
df_test = load_data('test',n=4)        

In [None]:
# Optional null-count checks, left disabled:
#df_train.isnull().sum()
#df_test.isnull().sum()

# Stack the train rows on top of the test rows and renumber the index from 0.
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

# Shapes of the combined frame and of its two sources.
print(dict(all=df_all.shape, df_train=df_train.shape, df_test=df_test.shape))
df_all.head()

In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` extended with group-level aggregate columns.

    Each new column broadcasts a per-group statistic back onto every row of
    the group (``groupby(...).transform``), so the number of rows does not
    change and the caller's frame is left untouched.
    """
    # (new column, grouping key, source column, statistic), in creation order.
    aggregate_specs = (
        ('month_unique_user_count', 'Date_Month', 'fullVisitorId', 'nunique'),
        ('day_unique_user_count', 'Date_Day', 'fullVisitorId', 'nunique'),
        ('sum_pageviews_per_network_domain', 'geoNetwork_networkDomain', 'totals_pageviews', 'sum'),
        ('mean_pageviews_per_network_domain', 'geoNetwork_networkDomain', 'totals_pageviews', 'mean'),
        ('sum_hits_per_day', 'Date_Day', 'totals_hits', 'sum'),
        ('count_pageviews_per_region', 'geoNetwork_region', 'totals_pageviews', 'count'),
        ('mean_pageviews_per_region', 'geoNetwork_region', 'totals_pageviews', 'mean'),
        ('sum_hits_per_network_domain', 'geoNetwork_networkDomain', 'totals_hits', 'sum'),
        ('sum_hits_per_region', 'geoNetwork_region', 'totals_hits', 'sum'),
        ('sum_hits_per_country', 'geoNetwork_country', 'totals_hits', 'sum'),
        ('user_pageviews_sum', 'fullVisitorId', 'totals_pageviews', 'sum'),
        ('user_hits_sum', 'fullVisitorId', 'totals_hits', 'sum'),
    )

    enriched = df.copy()
    for new_col, key, source, stat in aggregate_specs:
        enriched[new_col] = enriched.groupby(key)[source].transform(stat)

    print("Done........")
    return enriched

In [None]:
def rmse(y_true, y_pred):
    """Root mean squared error of ``y_pred`` against ``y_true``, rounded to 5 decimals."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return round(root, 5)

# Integer-encode the categorical (object dtype) attributes.
def encode_data(df ,verbose=False):
    """Return a copy of ``df`` whose object-dtype columns are label-encoded.

    The columns ``fullVisitorId``, ``visitId``, ``visitStartTime`` and
    ``date`` are never encoded. With ``verbose=True`` the name of every
    encoded column is printed.
    """
    untouched = ['fullVisitorId','visitId','visitStartTime','date']
    encoded = df.copy()
    for name in encoded.columns:
        # Skip anything that is not an object column, and the protected columns.
        if encoded[name].dtype != 'object' or name in untouched:
            continue
        if verbose:
            print(name)
        encoder = LabelEncoder()
        encoder.fit(list(encoded[name].unique()))
        encoded[name] = encoder.transform(encoded[name])
    return encoded

# Label-encode the combined train+test frame; the second argument (verbose=True)
# prints the name of every column that gets encoded.
# NOTE: this rebinds df_all to the encoded copy, so a second run of the cell
# finds those columns already numeric and leaves them as they are.
df_all=encode_data(df_all,True)

In [None]:
def generate_label(label,id_dfx):
    """Build the revenue target rows for the visitors listed in ``id_dfx``.

    Args:
        label: session rows of the label window; must contain
            ``fullVisitorId`` and ``totals_transactionRevenue``.
        id_dfx: iterable of visitor ids taken from the training window.

    Returns:
        DataFrame holding only ``fullVisitorId`` and
        ``totals_transactionRevenue`` with a fresh index: first the rows of
        ``label`` whose visitor is in ``id_dfx`` (original order), then one
        zero-revenue row for every id of ``id_dfx`` that has no row in
        ``label``, in order of first appearance in ``id_dfx``.
    """
    col_label=['fullVisitorId','totals_transactionRevenue']
    # Materialise once so any iterable (Series, list, set, ...) can be reused.
    train_ids = pd.Series(list(id_dfx)).drop_duplicates()

    # Keep the label-window rows of known visitors and only the two label
    # columns, in the order they already have in `label`.
    keep = [c for c in label.columns if c in col_label]
    label = label.loc[label.fullVisitorId.isin(train_ids), keep].reset_index(drop=True)

    # Visitors seen in training but absent from the label window get revenue 0.
    # Filtering the id Series keeps its order; going through set() made the row
    # order depend on hash iteration order.
    missing = train_ids[~train_ids.isin(label.fullVisitorId)]
    df_label_0 = pd.DataFrame({
        'fullVisitorId': missing.to_numpy(),
        'totals_transactionRevenue': np.zeros(len(missing), dtype=np.int64),
    })

    # Append the zero-revenue rows after the observed ones.
    return pd.concat([label, df_label_0]).reset_index(drop=True)

## 2 - Time-series data splitting

In [None]:
# Date ranges of the competition files:
#   train_v2.csv             - from August 1st 2016 to April 30th 2018.
#   test_v2.csv              - from May 1st 2018 to October 15th 2018.
#   sample_submission_v2.csv - from December 1st 2018 to January 31st 2019
# Test split, left disabled:
#df_test_x = df_all[(df_all.date >= "2018-05-01") & (df_all.date <= "2018-10-15")].copy()
#df_test_x =df_test_x.reset_index(drop=True)

# Fold 1 features: every session dated on or before 2016-12-30, re-indexed from 0.
df_train1_x = df_all.loc[df_all.date <= "2016-12-30"].reset_index(drop=True)
id_train1 = df_train1_x["fullVisitorId"].drop_duplicates()

# Fold 1 label window: sessions dated 2017-01-01 through 2017-03-01 (both ends
# included), reduced to one target set for the fold 1 visitors.
label_1 = df_all.loc[df_all.date.between("2017-01-01", "2017-03-01")]
label_1 = generate_label(label_1, id_train1)

# Feature engineering on the fold 1 training window.
df_train1_x = feature_engineering(df_train1_x)

In [None]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
def three_month_after(train_begin,train_end,label_begin,label_end,verbose=False):
    """Slice ``df_all`` into a train range and a label range moved 3 months later.

    The four arguments are ``'%Y-%m-%d'`` date strings describing a reference
    train range and a reference label range. Every boundary is pushed forward
    by three calendar months, then the global ``df_all`` is filtered on its
    ``date`` column with both bounds included.

    With ``verbose=True`` each original boundary and its shifted value are
    printed.

    Returns:
        ``(df_x, label)``: the rows of the shifted train range with a fresh
        index, and the result of ``generate_label`` for the rows of the
        shifted label range and the visitor ids found in ``df_x``.
    """
    date_format = '%Y-%m-%d'

    def parse(text):
        # Date string -> datetime.
        return datetime.strptime(text, date_format)

    def shift(text):
        # Date string moved three calendar months later, as a date string.
        return (parse(text) + relativedelta(months=3)).strftime(date_format)

    train_from, train_to = shift(train_begin), shift(train_end)
    label_from, label_to = shift(label_begin), shift(label_end)

    if verbose:
        report = (
            ("train---------------------------------", (train_begin, train_from), (train_end, train_to)),
            ("label---------------------------------", (label_begin, label_from), (label_end, label_to)),
        )
        for banner, *pairs in report:
            print(banner)
            for original, moved in pairs:
                print( 'intial:', parse(original))
                print( 'After 3 Month:', moved)

    # Rows inside the shifted train range, re-indexed from 0.
    df_x = df_all[df_all.date.between(train_from, train_to)].copy().reset_index(drop=True)
    # Rows inside the shifted label range.
    label_rows = df_all[df_all.date.between(label_from, label_to)].copy()
    id_train = df_x.fullVisitorId.drop_duplicates()

    # Targets for the visitors of the train range.
    label = generate_label(label_rows, id_train).copy()

    return df_x,label

In [None]:
# train2: the reference ranges below are each moved 3 months later, giving
# train 2016-11-01..2017-03-30 and labels 2017-04-01..2017-06-01.
df_train2_x, label_2 = three_month_after(
    "2016-08-01", "2016-12-30",  # reference train range
    "2017-01-01", "2017-03-01",  # reference label range
    verbose=True,
)
# Feature engineering on the train2 window.
df_train2_x = feature_engineering(df_train2_x)

In [None]:
# train3: the reference ranges below are each moved 3 months later, giving
# train 2017-02-01..2017-06-30 and labels 2017-07-01..2017-09-01.
df_train3_x, label_3 = three_month_after(
    "2016-11-01", "2017-03-30",  # reference train range
    "2017-04-01", "2017-06-01",  # reference label range
    verbose=True,
)
# Feature engineering on the train3 window.
df_train3_x = feature_engineering(df_train3_x)

In [None]:
# train4: the reference ranges below are each moved 3 months later, giving
# train 2017-05-01..2017-09-30 and labels 2017-10-01..2017-12-01.
df_train4_x, label_4 = three_month_after(
    "2017-02-01", "2017-06-30",  # reference train range
    "2017-07-01", "2017-09-01",  # reference label range
    verbose=True,
)
# Feature engineering on the train4 window.
df_train4_x = feature_engineering(df_train4_x)

In [None]:
# train5: the reference ranges below are each moved 3 months later, giving
# train 2017-08-01..2017-12-30 and labels 2018-01-01..2018-03-01.
df_train5_x, label_5 = three_month_after(
    "2017-05-01", "2017-09-30",  # reference train range
    "2017-10-01", "2017-12-01",  # reference label range
    verbose=True,
)
# Feature engineering on the train5 window.
df_train5_x = feature_engineering(df_train5_x)

In [None]:
# train6: the reference ranges below are each moved 3 months later, giving
# train 2017-11-01..2018-03-30 and labels 2018-04-01..2018-06-01.
df_train6_x, label_6 = three_month_after(
    "2017-08-01", "2017-12-30",  # reference train range
    "2018-01-01", "2018-03-01",  # reference label range
    verbose=True,
)
# Feature engineering on the train6 window.
df_train6_x = feature_engineering(df_train6_x)

## 3- Data Aggregating

In [None]:
def group_by_fullVistorsId(df_x,y):
    """Collapse session rows to one row per ``fullVisitorId`` for features and target.

    Args:
        df_x: session-level frame holding ``fullVisitorId``, every column of
            ``cat_cols`` and ``eng_cols`` below, and the columns whose name
            contains ``"totals"`` (``totals_transactionRevenue`` included).
        y: frame with ``fullVisitorId`` and ``totals_transactionRevenue``.

    Returns:
        ``(df_x_agg, y_agg)``, both sorted by ``fullVisitorId`` with a fresh
        index. ``df_x_agg`` holds the per-visitor sum of the "totals" and
        engineered columns plus the last non-null value of each categorical
        column; ``y_agg`` holds the per-visitor revenue sum. Revenue is
        ``log1p``-transformed in both frames.
    """
    cat_cols=["channelGrouping","device_browser","device_deviceCategory","device_operatingSystem","geoNetwork_city",
          "geoNetwork_continent","geoNetwork_country","geoNetwork_metro", "geoNetwork_networkDomain",
          "geoNetwork_region","geoNetwork_subContinent", "trafficSource_adContent",
          "trafficSource_adwordsClickInfo.adNetworkType","trafficSource_adwordsClickInfo.gclId",
          "trafficSource_adwordsClickInfo.slot","trafficSource_campaign","trafficSource_keyword",
          "trafficSource_medium","trafficSource_referralPath","trafficSource_source","customDimensions_value"]

    eng_cols= ['sum_hits_per_country', 'month_unique_user_count', 'sum_pageviews_per_network_domain', 'mean_pageviews_per_region',
    'sum_hits_per_network_domain','count_pageviews_per_region', 'sum_hits_per_day','mean_pageviews_per_network_domain',
    'user_pageviews_sum', 'user_hits_sum', 'day_unique_user_count', 'sum_hits_per_region']

    # Categorical columns: keep each visitor's last non-null value.
    last_cols = cat_cols + ["fullVisitorId"]
    df_x_agg_last = (
        df_x[last_cols]
        .groupby("fullVisitorId", as_index=False).last()
        .sort_values("fullVisitorId").reset_index(drop=True)
    )

    # Numeric columns: read the "totals" column names from the frame being
    # aggregated, so the function does not depend on a notebook-level global.
    num_cols = [item for item in df_x.columns if "totals" in item]
    sum_cols = num_cols + eng_cols + ["fullVisitorId"]
    df_x_agg_sum = (
        df_x[sum_cols]
        .groupby("fullVisitorId", as_index=False).sum()
        .sort_values("fullVisitorId").reset_index(drop=True)
    )
    y_agg = (
        y[["fullVisitorId", 'totals_transactionRevenue']]
        .groupby("fullVisitorId", as_index=False).sum()
        .sort_values("fullVisitorId").reset_index(drop=True)
    )

    # Revenue is modelled on the log1p scale, both as a feature and as the target.
    df_x_agg_sum['totals_transactionRevenue'] = np.log1p(df_x_agg_sum['totals_transactionRevenue'])
    y_agg['totals_transactionRevenue'] = np.log1p(y_agg['totals_transactionRevenue'])

    # Put the summed and the "last value" columns side by side, one row per visitor.
    df_x_agg = (
        pd.merge(df_x_agg_sum, df_x_agg_last, how='left', on="fullVisitorId")
        .sort_values("fullVisitorId").reset_index(drop=True)
    )
    print("Done.........")
    return df_x_agg,y_agg

In [None]:
# Aggregate windows 1-3 to one row per visitor (features and labels).
df_train1_agg, label1_agg = group_by_fullVistorsId(df_train1_x, label_1)
df_train2_agg, label2_agg = group_by_fullVistorsId(df_train2_x, label_2)
df_train3_agg, label3_agg = group_by_fullVistorsId(df_train3_x, label_3)

In [None]:
# Aggregate windows 4-6 to one row per visitor (features and labels).
df_train4_agg, label4_agg = group_by_fullVistorsId(df_train4_x, label_4)
df_train5_agg, label5_agg = group_by_fullVistorsId(df_train5_x, label_5)
df_train6_agg, label6_agg = group_by_fullVistorsId(df_train6_x, label_6)

In [None]:
# Drop the non-training attributes: the visitor id is removed from the features
# and the target is reduced to the revenue column.
# Features and target are matched by row position from here on; both frames
# were sorted by fullVisitorId when they were aggregated.

def _to_xy(features, target):
    """Return ``(features without fullVisitorId, revenue Series)``.

    Inputs that were already converted are passed through unchanged, so this
    cell can be re-run without raising KeyError.
    """
    features = features.drop(columns=["fullVisitorId"], errors="ignore")
    if isinstance(target, pd.DataFrame):
        target = target["totals_transactionRevenue"]
    return features, target

#fold1
#train
df_train1_agg, label1_agg = _to_xy(df_train1_agg, label1_agg)
#validation
df_train2_agg, label2_agg = _to_xy(df_train2_agg, label2_agg)

#fold2
#train : df_train2_agg,label2_agg
#validation
df_train3_agg, label3_agg = _to_xy(df_train3_agg, label3_agg)

#fold3
#train :df_train3_agg,label3_agg
#validation
df_train4_agg, label4_agg = _to_xy(df_train4_agg, label4_agg)

#fold4
#train :df_train4_agg,label4_agg
#validation
df_train5_agg, label5_agg = _to_xy(df_train5_agg, label5_agg)

#fold5
#train :df_train5_agg,label5_agg
#validation
df_train6_agg, label6_agg = _to_xy(df_train6_agg, label6_agg)

## 4- Grid Search

In [None]:
#params to do un grid search
#max_depth= [3,10,15,30]
import lightgbm as lgb
# Candidate values for the grid search. The grid-search loop iterates over
# n_estimators and learning_rate.
n_estimators = [100,500,1000,2000]
# Single definition: an earlier [0.001,0.01,0.1] assignment was overwritten by
# this one before it was ever read.
learning_rate=[0.005,0.01,0.1,0.5]
# Further candidates, defined here for later experiments.
min_child_samples=[40,50,60]
num_leaves=[28,31,35,38]


In [None]:
import lightgbm as lgb
print('Starting Grid Search ...')
mean=[]    # mean validation RMSE over the folds, one entry per combination
avg=[]     # standard deviation of the validation RMSE (name kept for the plotting cell)
axis_x=[]  # "<learning_rate>_<n_estimators>" label of each combination

# Walk-forward folds: fit on window k, validate on window k+1.
grid_folds = [
    ((df_train1_agg, label1_agg), (df_train2_agg, label2_agg)),
    ((df_train2_agg, label2_agg), (df_train3_agg, label3_agg)),
    ((df_train3_agg, label3_agg), (df_train4_agg, label4_agg)),
    ((df_train4_agg, label4_agg), (df_train5_agg, label5_agg)),
    ((df_train5_agg, label5_agg), (df_train6_agg, label6_agg)),
]

for i in learning_rate:
    for j in n_estimators:
        rmse_val=[]
        print("learning_rate : ",i," n_estimators: ",j)
        params = {"objective" : "regression", "metric" : "rmse", "max_depth": 12,"learning_rate" : i,"n_estimators": j}
        gbm = lgb.LGBMRegressor(**params, nthread = 4, n_jobs = -1,early_stopping_rounds=100,silent=True)

        for (fit_x, fit_y), (val_x, val_y) in grid_folds:
            # The same estimator object is re-fit on every fold.
            gbm.fit(fit_x, fit_y,
                    eval_set=[(fit_x, fit_y),(val_x, val_y)],
                    eval_metric='rmse',verbose=False)
            # 'valid_1' is the key under which the second eval_set entry is read back.
            rmse_val.append(gbm.best_score_['valid_1']['rmse'])

        arr=np.array(rmse_val)
        print(rmse_val)
        print("mean: ",arr.mean()," std: ",arr.std())
        mean.append(arr.mean())
        avg.append(arr.std())
        axis_x.append(str(i)+"_"+str(j))

In [None]:
import matplotlib.pyplot as plt

# Mean and spread of the validation RMSE for every grid-search combination.
plt.plot(range(len(mean)), mean)
# `avg` holds the standard deviation of the fold scores, hence the 'std' legend entry.
plt.plot(range(len(avg)), avg)
plt.xticks(range(len(avg)), axis_x, rotation=90)
plt.title('grid search validation RMSE')
plt.ylabel('RMSE')
plt.xlabel('parameters')
plt.legend(['mean', 'std'])
plt.show()

## 5 - Training and Cross-Validation

In [None]:
# Earlier setting, left disabled:
#params = {"objective" : "regression", "metric" : "rmse", "max_depth": 12, "min_child_samples": 20, "num_leaves" : 31, "learning_rate" : 0.01}
params = {
    "objective": "regression",
    "metric": "rmse",
    "max_depth": 4,
    "learning_rate": 0.01,
    "n_estimators": 1500,
    "min_child_samples": 500,
    'subsample': 0.9,
}
# NOTE(review): both nthread and n_jobs are passed; presumably they configure the
# same thread count in LightGBM - confirm which one takes effect.
gbm = lgb.LGBMRegressor(
    **params,
    nthread=4,
    n_jobs=-1,
    early_stopping_rounds=100,
)

In [None]:
# Per-fold scores, keyed by fold name; filled in by the fold cells.
train_rmse, valid_rmse = {}, {}

In [None]:
# fold1: fit on window 1; evaluate on the training data and on window 2.
gbm.fit(
    df_train1_agg, label1_agg,
    eval_set=[(df_train1_agg, label1_agg), (df_train2_agg, label2_agg)],
    eval_metric='rmse',
    verbose=False,
)

# Best RMSE recorded for each evaluation set of this fold.
train_rmse['fold1'] = gbm.best_score_['training']['rmse']
valid_rmse['fold1'] = gbm.best_score_['valid_1']['rmse']

In [None]:
# fold2: fit on window 2; evaluate on the training data and on window 3.
gbm.fit(
    df_train2_agg, label2_agg,
    eval_set=[(df_train2_agg, label2_agg), (df_train3_agg, label3_agg)],
    eval_metric='rmse',
    verbose=False,
)
# Best RMSE recorded for each evaluation set of this fold.
train_rmse['fold2'] = gbm.best_score_['training']['rmse']
valid_rmse['fold2'] = gbm.best_score_['valid_1']['rmse']

In [None]:
# fold3: fit on window 3; evaluate on the training data and on window 4.
gbm.fit(
    df_train3_agg, label3_agg,
    eval_set=[(df_train3_agg, label3_agg), (df_train4_agg, label4_agg)],
    eval_metric='rmse',
    verbose=False,
)
# Best RMSE recorded for each evaluation set of this fold.
train_rmse['fold3'] = gbm.best_score_['training']['rmse']
valid_rmse['fold3'] = gbm.best_score_['valid_1']['rmse']

In [None]:
# fold4: fit on window 4; evaluate on the training data and on window 5.
gbm.fit(
    df_train4_agg, label4_agg,
    eval_set=[(df_train4_agg, label4_agg), (df_train5_agg, label5_agg)],
    eval_metric='rmse',
    verbose=False,
)
# Best RMSE recorded for each evaluation set of this fold.
train_rmse['fold4'] = gbm.best_score_['training']['rmse']
valid_rmse['fold4'] = gbm.best_score_['valid_1']['rmse']

In [None]:
# fold5: fit on window 5; evaluate on the training data and on window 6.
gbm.fit(
    df_train5_agg, label5_agg,
    eval_set=[(df_train5_agg, label5_agg), (df_train6_agg, label6_agg)],
    eval_metric='rmse',
    verbose=False,
)
# Best RMSE recorded for each evaluation set of this fold.
train_rmse['fold5'] = gbm.best_score_['training']['rmse']
valid_rmse['fold5'] = gbm.best_score_['valid_1']['rmse']

In [None]:
# Plot the training and validation RMSE recorded for each fold.
# The dict views are turned into lists so the plotted values are plain sequences.
plt.plot(range(len(train_rmse)), list(train_rmse.values()))
plt.plot(range(len(valid_rmse)), list(valid_rmse.values()))
plt.xticks(range(len(train_rmse)), list(train_rmse.keys()), rotation=90)

# The stored scores are RMSE values, so the labels say RMSE.
plt.title('model RMSE per fold')
plt.ylabel('RMSE')
plt.xlabel('fold')
plt.legend(['train', 'valid'])
plt.show()

In [None]:
# Plot feature importances, limited to 30 features.
# NOTE: gbm is re-fit in every fold cell, so this shows the most recent fit only.
lgb.plot_importance(gbm, max_num_features=30)

In [None]:
# Scratch list of candidate engineered-feature names. It is not assigned to a
# variable, so it is only shown as the cell output. Some names are repeated
# (the *_hits_per_network_domain entries), and several match the disabled
# transforms kept as comments at the end of this cell.
['month_unique_user_count','day_unique_user_count','weekday_unique_user_count','weekofyear_unique_user_count',
    'sum_pageviews_per_network_domain','count_pageviews_per_network_domain','mean_pageviews_per_network_domain',
    'sum_hits_per_network_domain','count_hits_per_network_domain','mean_hits_per_network_domain',
    'mean_hits_per_day','sum_hits_per_day',
    'sum_pageviews_per_region','count_pageviews_per_region','mean_pageviews_per_region',
    'sum_hits_per_network_domain','count_hits_per_network_domain','mean_hits_per_network_domain',
    'sum_hits_per_region','count_hits_per_region','mean_hits_per_region',
    'sum_hits_per_country','count_hits_per_country','mean_hits_per_country',
    'user_pageviews_sum','user_hits_sum',
    'user_pageviews_count','user_hits_count',
    'user_pageviews_sum_to_mean','user_hits_sum_to_mean']

#[sum_hits_per_country month_unique_user_count sum_pageviews_per_network_domain mean_pageviews_per_region
#sum_hits_per_network_domain count_pageviews_per_region sum_hits_per_day mean_pageviews_per_network_domain
#user_pageviews_sum user_hits_sum day_unique_user_count sum_hits_per_region]
   
#print("13..............")
#df['user_pageviews_to_region'] = df['user_pageviews_sum'] / df['mean_pageviews_per_region']
#df['user_hits_to_region'] = df['user_hits_sum'] / df['mean_hits_per_region']
#df['user_pageviews_sum_to_mean'] = df['user_pageviews_sum'] / df['user_pageviews_sum'].mean()
#df['user_hits_sum_to_mean'] = df['user_hits_sum'] / df['user_hits_sum'].mean()
#df['user_pageviews_count'] = df.groupby('fullVisitorId')['totals_pageviews'].transform('count')
#df['user_hits_count'] = df.groupby('fullVisitorId')['totals_hits'].transform('count')
#df['count_hits_per_country'] = df.groupby('geoNetwork_country')['totals_hits'].transform('count')
#df['mean_hits_per_country'] = df.groupby('geoNetwork_country')['totals_hits'].transform('mean')
#df['count_hits_per_region'] = df.groupby('geoNetwork_region')['totals_hits'].transform('count')
#df['mean_hits_per_region'] = df.groupby('geoNetwork_region')['totals_hits'].transform('mean')
#df['count_hits_per_network_domain'] = df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('count')
#df['mean_hits_per_network_domain'] = df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('mean')
#df['mean_hits_per_day'] = df.groupby(['Date_Day'])['totals_hits'].transform('mean')
#df['sum_pageviews_per_region'] = df.groupby('geoNetwork_region')['totals_pageviews'].transform('sum')
#df['count_pageviews_per_network_domain'] = df.groupby('geoNetwork_networkDomain')['totals_pageviews'].transform('count')
#df['sum_hits_per_network_domain'] = df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('sum')
#df['count_hits_per_network_domain'] = df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('count')
#df['mean_hits_per_network_domain'] = df.groupby('geoNetwork_networkDomain')['totals_hits'].transform('mean')