### Data Fields
- fullVisitorId - A unique identifier for each user of the Google Merchandise Store.

- channelGrouping - The channel via which the user came to the Store.

- date - The date on which the user visited the Store.

- device - The specifications for the device used to access the Store.

- geoNetwork - This section contains information about the geography of the user.

- sessionId - A unique identifier for this visit to the store.

- socialEngagementType - Engagement type, either "Socially Engaged" or "Not Socially Engaged".

- totals - This section contains aggregate values across the session.

- trafficSource - This section contains information about the Traffic Source from which the session originated.

- visitId - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.

- visitNumber - The session number for this user. If this is the first session, then this is set to 1.

- visitStartTime - The timestamp (expressed as POSIX time).

### Removed Data Fields
Some fields were censored to remove target leakage. The major censored fields are listed below.

- hits - This row and nested fields are populated for any and all types of hits. Provides a record of all page visits.

- customDimensions - This section contains any user-level or session-level custom dimensions that are set for a session. This is a repeated field and has an entry for each dimension that is set.

- totals - Multiple sub-columns were removed from the totals field.

Data (116 MB)

In [None]:
import os
import json
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import gc
import os

from load_df import *
from tqdm import tqdm_notebook, tnrange
from pandas.io.json import json_normalize
from sklearn import model_selection, preprocessing, metrics
from plotly import tools

# Initialise plotly's notebook mode so figures can be rendered in output cells.
py.init_notebook_mode(connected=True)
# Seaborn colour palette kept in a module-level name for the plotting cells.
color = sns.color_palette()
# Directory the raw train_v2.csv / test_v2.csv files are read from.
feat_orignal_dir = "./datasetsV2/data/"
submission_dir = 'submission/'
# Order CUDA devices by PCI bus id and make only device 1 visible to this process.
os.environ['CUDA_DEVICE_ORDER']="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES']='1'

%matplotlib

In [None]:
%%time
# Load the raw train / test tables
train_path = os.path.join(feat_orignal_dir, 'train_v2.csv')
test_path = os.path.join(feat_orignal_dir, 'test_v2.csv')

df_train, df_test = get_df(train_path, test_path)

for _loaded in (df_train, df_test):
    print(_loaded.shape)
# Columns present in train but absent from test
print(set(df_train.columns) - set(df_test.columns))

In [None]:
def downCast_dtype(df):
    """Narrow 64-bit numeric columns of ``df`` to 32 bits.

    Columns whose dtype is exactly ``float64`` are cast to ``float32`` and
    columns whose dtype is exactly ``int64`` are cast to ``int32``; all other
    columns are left as they are. The frame is updated in place and returned.

    NOTE(review): the integer cast does not check the value range -- confirm
    that every int64 column fits into int32.
    """
    narrowing = (('float64', np.float32), ('int64', np.int32))
    # Decide which columns to touch before any cast is applied.
    targets = [
        ([name for name in df if df[name].dtype == wide], narrow)
        for wide, narrow in narrowing
    ]
    for names, narrow in targets:
        df[names] = df[names].astype(narrow)
    return df

In [None]:
def remove_const_cols(df_train,df_test):
    """Drop uninformative and always-excluded columns from both frames.

    A column is considered constant when ``df_test`` holds a single distinct
    value in it (NaN counted as a value); ``totals.visits`` is exempt from
    this check. On top of the constant columns, a fixed set of columns is
    always removed, and ``trafficSource.campaignCode`` is additionally
    removed from the training frame.

    The fixed set is now dropped even when no constant column is found, and
    columns missing from a frame are skipped instead of raising ``KeyError``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame

    Returns
    -------
    (df_train, df_test) : new frames without the dropped columns.
    """
    # Constant columns are detected on the test frame only.
    const_cols = [c for c in tqdm_notebook(df_test.columns)
                  if c != 'totals.visits' and df_test[c].nunique(dropna=False) == 1]
    always_drop = ['trafficSource.adwordsClickInfo.page',
                   'totals.totalTransactionRevenue',
                   'totals.transactions']
    dropCols = const_cols + always_drop
    df_test = df_test.drop(columns=dropCols, errors='ignore')
    df_train = df_train.drop(columns=dropCols + ['trafficSource.campaignCode'],
                             errors='ignore')
    return df_train,df_test

df_train, df_test = remove_const_cols(df_train, df_test)
# Order each frame chronologically, treat missing revenue as zero and tag the
# origin of every row. Revenue stays on its raw scale here; the log1p
# transform is applied later when the regression target is built.
for _part, _from_test in ((df_train, False), (df_test, True)):
    _part.sort_values(['date'], ascending=True, inplace=True)
    _part['totals.transactionRevenue'] = _part['totals.transactionRevenue'].fillna(0)
    _part['is_test'] = _from_test



In [None]:
# Stack train and test so the feature engineering runs once over both,
# then shrink the 64-bit numeric columns.
df_data = downCast_dtype(pd.concat([df_train, df_test]))

print(df_train.shape, df_test.shape)
print(df_data.shape)

In [None]:
from datetime import date,timedelta

def getDateFeatures(df):
    """Derive calendar and session-gap features from ``visitStartTime``.

    ``visitStartTime`` is parsed as POSIX seconds and replaced by:

    * ``month``, ``week`` (ISO week), ``day``, ``weekOfday`` (Monday = 0);
    * ``is_holiday``: 1 when the date is in the fixed month-day list below or
      falls on a Saturday/Sunday, plus the same flag evaluated one and two
      calendar days before and after the visit;
    * the number of distinct ``fullVisitorId`` per month, day-of-month and
      weekday;
    * ``next_session_1`` / ``next_session_2``: whole hours between this row
      and the previous / next row of the same visitor in frame order (NaN
      when there is no such row; ``next_session_2`` is negative when the
      next row is later in time);
    * ``days_to_side``: the smaller of the day-of-month and the days left in
      the month.

    Finally ``day`` is coarsened to 0 (day <= 7), 2 (day >= 24) or 1, and the
    ``visitStartTime`` column is dropped. The frame is modified in place and
    returned.

    Fixes over the previous version: every listed holiday is honoured (the
    loop used to keep only the last entry), the neighbouring-day flags look
    at calendar days rather than neighbouring rows, the ISO week no longer
    relies on the removed ``Series.dt.week``, session gaps no longer turn
    missing neighbours into a huge integer sentinel, and leap years are
    handled through ``days_in_month``.
    """
    holiday = ['01-01','02-01','02-12','02-14','02-22','03-07','03-17','04-01','05-01','05-14',
               '06-01','06-21','06-14','07-01','08-01','09-01','10-01','10-31','11-01','11-11',
               '12-24','12-25']
    # Encode each 'MM-DD' as month * 100 + day for a vectorised membership test.
    holiday_codes = [int(m) * 100 + int(d) for m, d in (h.split('-') for h in holiday)]
    one_day = pd.Timedelta(days=1)

    def _is_day_off(stamps):
        # 1 where the date is a listed holiday or a Saturday/Sunday, else 0.
        listed = (stamps.dt.month * 100 + stamps.dt.day).isin(holiday_codes)
        weekend = stamps.dt.weekday >= 5
        return (listed | weekend).astype(int)

    df['visitStartTime'] = pd.to_datetime(df['visitStartTime'],unit='s')
    visit = df['visitStartTime']

    df['month'] = visit.dt.month
    if hasattr(visit.dt, 'isocalendar'):
        # Series.dt.week is gone in pandas 2.0; isocalendar() is its replacement.
        iso_week = visit.dt.isocalendar().week
        df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['week'] = visit.dt.week
    df['day'] = visit.dt.day
    df["weekOfday"] = visit.dt.weekday

    df['is_holiday'] = _is_day_off(visit)
    # Whether the one/two days before and after the visit are days off.
    df['prev_day_is_holiday'] = _is_day_off(visit - one_day)
    df['pre2_day_is_holiday'] = _is_day_off(visit - 2 * one_day)
    df['next_day_is_holiday'] = _is_day_off(visit + one_day)
    df['next2_day_is_holiday'] = _is_day_off(visit + 2 * one_day)

    df['month.unique.user.count'] = df.groupby('month')['fullVisitorId'].transform('nunique')
    df['day.unique.user.count'] = df.groupby('day')['fullVisitorId'].transform('nunique')
    df['weekday.unique.user.count'] = df.groupby('weekOfday')['fullVisitorId'].transform('nunique')

    # Hours to the neighbouring rows of the same visitor; total_seconds()
    # keeps missing neighbours as NaN and is independent of the time unit.
    by_visitor = df.groupby('fullVisitorId')['visitStartTime']
    df['next_session_1'] = (visit - by_visitor.shift(1)).dt.total_seconds() // 3600
    df['next_session_2'] = (visit - by_visitor.shift(-1)).dt.total_seconds() // 3600

    # Distance of the day to the nearer end of its month.
    df["days_to_side"] = np.minimum(visit.dt.day, visit.dt.days_in_month - visit.dt.day)
    # Coarsen the day of month into early / middle / late.
    df['day'] = np.select([df['day'] <= 7, df['day'] >= 24], [0, 2], default=1)
    df.drop(columns=['visitStartTime'],inplace=True)

    return df
# Add the calendar / session-gap features to the stacked train+test frame.
df_data = getDateFeatures(df_data)
print(df_data.shape)

In [None]:
def getLagFeatures(df, time_col, group_cols, shifts):
    """Add time-delta columns between each row and its shifted group neighbour.

    The frame is sorted by ``time_col`` (a copy is made, the input is not
    modified). For every ``shift`` in ``shifts`` a float32 column named
    ``'<group_cols joined by _>_delta_shift_<shift>'`` is added holding the
    difference between the row's ``time_col`` and the value ``shift`` rows
    away inside the same ``group_cols`` group, sign-flipped so that lags and
    leads are oriented the same way. Rows without such a neighbour get 0.

    Returns the sorted frame with the new columns.
    """
    ordered = df.sort_values(by=time_col)
    prefix = '_'.join(group_cols)
    for step in shifts:
        column = prefix + ('_delta_shift_%d' % step)
        neighbour = ordered.groupby(group_cols)[time_col].shift(step)
        delta = (neighbour - ordered[time_col]).astype(np.float32)
        ordered[column] = delta
        # flip the sign for lags so both directions read as elapsed time
        ordered[column] = (ordered[column] * -1 * np.sign(step)).fillna(0)
    return ordered

# Shift offsets -6..6 without the zero shift.
lags = list(range(-6, 0)) + list(range(1, 7))

# for df in [train_df,test_df]:
#     df = add_grouped_time_delta_features(df,'visitStartTime',['fullVisitorId'],lags)
#     df = add_grouped_time_delta_features(df,'totals.visits',['fullVisitorId','month','week_day'],lags)
#     df = add_grouped_time_delta_features(df,'totals.visits',['fullVisitorId','month','day'],lags)
#     df = add_grouped_time_delta_features(df,'totals.visits',['fullVisitorId','month','hour'],lags)
#     df = add_grouped_time_delta_features(df,'totals.visits',['fullVisitorId','month'],lags)

#     df = add_grouped_time_delta_features(df,'totals.hits',['fullVisitorId','month','week_day'],lags)
#     df = add_grouped_time_delta_features(df,'totals.hits',['fullVisitorId','month','day'],lags)
#     df = add_grouped_time_delta_features(df,'totals.hits',['fullVisitorId','month','hour'],lags)
#     df = add_grouped_time_delta_features(df,'totals.hits',['fullVisitorId','month'],lags)

#     df = add_grouped_time_delta_features(df,'totals.pageviews',['fullVisitorId','month','week_day'],lags)
#     df = add_grouped_time_delta_features(df,'totals.pageviews',['fullVisitorId','month','day'],lags)
#     df = add_grouped_time_delta_features(df,'totals.pageviews',['fullVisitorId','month','hour'],lags)
#     df = add_grouped_time_delta_features(df,'totals.pageviews',['fullVisitorId','month'],lags)

# print(train_df.shape)
# print(test_df.shape)

In [None]:
def getTotalFeatures(df):
    """Add traffic-volume aggregates built from the ``totals.*`` counters.

    Missing ``totals.pageviews`` become 1.0 and missing ``totals.newVisits`` /
    ``totals.bounces`` become 0. Then, broadcast back onto every row:

    * per-visitor sums of visits, hits and pageviews, grouped by
      (visitor, month, day), (visitor, month, weekOfday) and (visitor, month);
    * hits-per-pageview ratios of those sums;
    * per-visitor sum and mean of ``visitNumber`` on the same groupings and
      the pageviews-per-visitNumber ratios;
    * site-wide mean and sum of hits / pageviews per (month, day), per month
      and (hits only) per (month, weekOfday).

    Column names, including their historical spelling, and the column order
    are part of the saved feature files and are kept unchanged. The frame is
    modified in place and returned.
    """
    for column, default in (('totals.pageviews', 1.0),
                            ('totals.newVisits', 0),
                            ('totals.bounces', 0)):
        df[column] = df[column].fillna(default)

    periods = ('day', 'week', 'month')
    visitor_keys = {
        'day': ['fullVisitorId', 'month', 'day'],
        'week': ['fullVisitorId', 'month', 'weekOfday'],
        'month': ['fullVisitorId', 'month'],
    }

    def _spread(keys, column, how):
        # Broadcast a per-group statistic back onto every row of the group.
        return df.groupby(keys)[column].transform(how)

    def _ratio(numerator, denominator):
        # Element-wise ratio of two existing columns.
        return df[numerator].values / df[denominator].values

    # Visits, hits and pageviews of each visitor per period.
    for stem in ('visits', 'hits', 'pageviews'):
        for period in periods:
            df['sum_%s_per_%s_visitor' % (stem, period)] = _spread(
                visitor_keys[period], 'totals.' + stem, 'sum')

    # Hits generated per pageview.
    hit_ratio_names = ('hits/pageViews_per_day_visitor',
                       'hits/pageViews_per_week_visitor',
                       'hits/pageviews_per_month_visitor')
    for period, name in zip(periods, hit_ratio_names):
        df[name] = _ratio('sum_hits_per_%s_visitor' % period,
                          'sum_pageviews_per_%s_visitor' % period)

    # Summed session numbers of each visitor per period.
    for period in periods:
        df['sum_visitNumber_per_%s_visitor' % period] = _spread(
            visitor_keys[period], 'visitNumber', 'sum')

    # Mean session number.
    mean_names = ('mean_vistiNumber_per_day_visitor',
                  'mean_visitNumebr_per_week_visitor',
                  'mean_visitNumber_per_month_visitor')
    for period, name in zip(periods, mean_names):
        df[name] = _spread(visitor_keys[period], 'visitNumber', 'mean')

    # Pageviews per session number.
    for period in periods:
        df['pageviews/visitNumber_per_%s_visitor' % period] = _ratio(
            'sum_pageviews_per_%s_visitor' % period,
            'sum_visitNumber_per_%s_visitor' % period)

    # Site-wide statistics, independent of the visitor.
    site_wide = (
        ('hits', 'day', ['month', 'day']),
        ('hits', 'month', ['month']),
        ('hits', 'week_day', ['month', 'weekOfday']),
        ('pageviews', 'day', ['month', 'day']),
        ('pageviews', 'month', ['month']),
    )
    for stem, label, keys in site_wide:
        for how in ('mean', 'sum'):
            df['%s.%s.per.%s' % (how, stem, label)] = _spread(keys, 'totals.' + stem, how)

    return df
# Add the totals-based aggregates to the stacked train+test frame.
df_data = getTotalFeatures(df_data)
print(df_data.shape)  

In [None]:
# Categorical feature handling: remember the most frequent levels of each column.
def _most_frequent(column, k):
    """Return the ``k`` most frequent non-null values of ``df_data[column]``.

    Reads the index of ``value_counts()`` directly; the previous
    ``reset_index()['index']`` lookup fails on pandas >= 2.0, where the
    reset column is named after the series instead of ``'index'``.
    """
    return list(df_data[column].value_counts().index[:k])

device_browers = _most_frequent('device.browser', 30)
device_os = _most_frequent('device.operatingSystem', 15)

geoNetwork_cities = _most_frequent('geoNetwork.city', 10)
geoNetwork_country = _most_frequent('geoNetwork.country', 20)
geoNetwork_metro = _most_frequent('geoNetwork.metro', 40)
geoNetwork_networkDomain = _most_frequent('geoNetwork.networkDomain', 40)

def browser_mapping(x):
    """Return ``x`` lower-cased when it is listed in ``device_browers``, else 'others'."""
    return x.lower() if x in device_browers else 'others'

def geoNetwork_city(x):
    """Return ``x`` lower-cased when it is listed in ``geoNetwork_cities``, else 'others'."""
    if x not in geoNetwork_cities:
        return 'others'
    return x.lower()
    
def geoNetwork_countries(x):
    """Return ``x`` lower-cased when it is listed in ``geoNetwork_country``, else 'others'."""
    if x not in geoNetwork_country:
        return 'others'
    return x.lower()

def adcontents_mapping(x):
    """Bucket an ad-content string into a coarse category.

    Checked in order: contains 'google' -> 'google'; contains 'placement'
    (or the misspelt 'placememnt') -> 'placement'; contains '(not set)' or
    'nan' -> returned unchanged; contains 'ad' -> 'ad'; otherwise 'others'.
    """
    if 'google' in x:
        return 'google'
    if any(spelling in x for spelling in ('placement', 'placememnt')):
        return 'placement'
    if '(not set)' in x or 'nan' in x:
        return x
    return 'ad' if 'ad' in x else 'others'
    
def device_operatingSystem(x):
    """Return ``x`` lower-cased when it is listed in ``device_os``, else 'others'."""
    return x.lower() if x in device_os else 'others'

def traficSource_referralPath(x):
    """Bucket a referral path string.

    Checked in order:

    * exactly '/' -> '/';
    * contains 'yt/about/' -> returned unchanged;
    * contains 'google' -> '/google';
    * contains 'mail' -> '/mail';
    * contains any of the fragments in ``kept_fragments`` -> returned unchanged;
    * anything else -> '/others'.

    The previous version tested the bare string ``'using-the-logo'`` (always
    true) instead of ``'using-the-logo' in x``, so the final '/others' bucket
    could never be reached and every unmatched path was returned verbatim.
    """
    kept_fragments = ('yt/advertise/', 'offer/2145', 'yt/creators/', 'pagead/ads',
                      '/intl/', 'shirt', '/analytics/app/', 'using-the-logo', '/moma')
    if x == '/':
        return '/'
    if 'yt/about/' in x:
        return x
    if 'google' in x:
        return '/google'
    if 'mail' in x:
        return '/mail'
    if any(fragment in x for fragment in kept_fragments):
        return x
    return '/others'
    
def source_mapping(x):
    """Bucket a traffic-source string by the first keyword it contains.

    Checked in order: 'google' -> 'google'; '(direct)' -> ``x`` unchanged;
    'youtube' -> 'youtube'; '(not set)' or 'nan' -> ``x`` unchanged; then the
    keywords of ``ordered_keywords`` in sequence, each mapping to itself.
    Strings matching nothing become 'others'. The order matters because the
    test is a substring test (e.g. 'lunametrics' must be tried before
    'metrics', 'mysearch' before 'search').
    """
    ordered_keywords = (
        'yahoo', 'facebook', 'reddit', 'bing', 'quora', 'outlook', 'linkedin',
        'pinterest', 'ask', 'siliconvalley', 'lunametrics', 'amazon', 'mysearch',
        'qiita', 'messenger', 'twitter', 't.co', 'vk.com', 'search', 'edu',
        'mail', 'ad', 'golang', 'direct', 'dealspotr', 'sashihara', 'phandroid',
        'baidu', 'mdn', 'duckduckgo', 'seroundtable', 'metrics', 'sogou',
        'businessinsider', 'github', 'gophergala', 'yandex', 'msn', 'dfa',
        'feedly', 'arstechnica', 'squishable', 'flipboard', 't-online.de',
        'sm.cn', 'wow', 'partners',
    )
    if 'google' in x:
        return 'google'
    if '(direct)' in x:
        return x
    if 'youtube' in x:
        return 'youtube'
    if '(not set)' in x or 'nan' in x:
        return x
    for keyword in ordered_keywords:
        if keyword in x:
            return keyword
    return 'others'

for df in [df_data]:
    # Collapse rare / noisy levels of the categorical columns. The flag says
    # whether the value is lower-cased before it reaches the mapping helper.
    _column_mappers = (
        ('device.browser', browser_mapping, False),
        ('device.operatingSystem', device_operatingSystem, False),
        ('trafficSource.adContent', adcontents_mapping, True),
        ('trafficSource.source', source_mapping, True),
        ('geoNetwork.city', geoNetwork_city, False),
        ('geoNetwork.country', geoNetwork_countries, False),
        ('trafficSource.referralPath', traficSource_referralPath, False),
    )
    for _column, _mapper, _lowered in _column_mappers:
        df[_column] = df[_column].map(
            lambda v, fn=_mapper, low=_lowered: fn(str(v).lower() if low else str(v))
        ).astype('str')

    # Merge hierarchically related features: region membership, market segment.
    print("... process device ...")
    _pairs = (
        ('source.country', 'trafficSource.source', 'geoNetwork.country'),
        ('campaign.medium', 'trafficSource.campaign', 'trafficSource.medium'),
        ('browser.category', 'device.browser', 'device.deviceCategory'),
        ('browser.os', 'device.browser', 'device.operatingSystem'),
    )
    for _combined, _left, _right in _pairs:
        df[_combined] = df[_left] + '_' + df[_right]

#将设备中具有从属关系的特征整合
def getCustormFeature(df):
    """Add string-concatenated interaction features between categorical columns.

    New columns, in creation order:

    * device category / browser / operating system, each joined with
      ``channelGrouping``;
    * one combined key over the seven ``geoNetwork.*`` columns;
    * every ``geoNetwork.*`` column crossed with browser, device category,
      operating system and traffic source (named ``'<geo>_<other>'``);
    * ``content.source`` and ``medium.source``, which join ad content /
      medium with the existing ``source.country`` column.

    All parts are joined with ``'_'``. The combined geo key used to omit the
    separator between region and sub-continent; it is now consistent with
    the other parts. The frame is modified in place and returned.
    """
    print('... custom ...')
    channel = df['channelGrouping']
    df['device_deviceCategory_channelGrouping'] = df['device.deviceCategory'] + "_" + channel
    df['channelGrouping_browser'] = df['device.browser'] + "_" + channel
    df['channelGrouping_OS'] = df['device.operatingSystem'] + "_" + channel

    geo_cols = ['geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
                'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
                'geoNetwork.subContinent']
    # Build the full geography key left to right with a uniform separator.
    geo_key = df[geo_cols[0]]
    for col in geo_cols[1:]:
        geo_key = geo_key + "_" + df[col]
    df['city_continent_country_metro_networkDomain_region_subContinent'] = geo_key

    other_cols = ['device.browser', 'device.deviceCategory',
                  'device.operatingSystem', 'trafficSource.source']
    for i in geo_cols:
        for j in other_cols:
            df[i + "_" + j] = df[i] + "_" + df[j]

    df['content.source'] = df['trafficSource.adContent'] + "_" + df['source.country']
    df['medium.source'] = df['trafficSource.medium'] + "_" + df['source.country']

    return df

# Add the categorical interaction features to the stacked train+test frame.
df_data = getCustormFeature(df_data)

print(df_data.shape)

In [None]:
#设备&网络特征
def getDeviceFeature(df,groups=None,feaCols=None):
    """Add per-group sum and mean of numeric columns, broadcast to every row.

    For each grouping column in ``groups`` and each value column in
    ``feaCols`` two columns are added: ``'sum_<value>_By<group>'`` and
    ``'mean_<value>_By<group>'``. Afterwards every missing value in the whole
    frame is replaced by 0 (in place).

    Parameters
    ----------
    df : pandas.DataFrame, modified in place.
    groups : iterable of column names to group by; ``None`` means none
        (previously ``None`` raised ``TypeError``).
    feaCols : iterable of column names to aggregate; ``None`` means none.

    Returns
    -------
    (df, feCols) : the frame and the list of created column names in
        creation order.
    """
    groups = [] if groups is None else groups
    feaCols = [] if feaCols is None else feaCols
    feCols = []
    for group in groups:
        for fe in feaCols:
            for stat in ('sum', 'mean'):
                name = '%s_%s_By%s' % (stat, fe, group)
                df[name] = df.groupby(group)[fe].transform(stat)
                feCols.append(name)
    df.fillna(0,inplace=True)
    return df,feCols
deviceCols = []
# Grouping keys (device / network) and the session counters aggregated over them.
deviceGroups = [
    'device.browser',
    'device.operatingSystem',
    'device.isMobile',
    'device.deviceCategory',
    'geoNetwork.networkDomain',
]
totalCols = [
    'totals.hits',
    'totals.visits',
    'totals.pageviews',
    'totals.timeOnSite',
    'totals.sessionQualityDim',
]
_aggregated, deviceCols = getDeviceFeature(df_data, groups=deviceGroups, feaCols=totalCols)

df_data = downCast_dtype(_aggregated)

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
excluded_cols = ['fullVisitorId','date','totals.transactionRevenue','visitStartTime']
# Every object-typed column outside the excluded ones is treated as categorical.
cat_cols = [
    name for name in df_data.columns
    if name not in excluded_cols and df_data[name].dtype == "object"
]

def label_encoding(df_data):
    """Replace each column listed in ``cat_cols`` by integer label codes.

    Values are converted to strings first, so missing values are encoded as
    their string form. Boolean columns are turned into integers before the
    string conversion. The frame is modified in place and returned.
    """
    for name in tqdm_notebook(cat_cols):
        if df_data[name].dtype == bool:
            # booleans become 0/1 before being encoded
            df_data[name] = df_data[name].astype(int)
        as_text = list(df_data[name].values.astype('str'))
        df_data[name] = LabelEncoder().fit_transform(as_text)

    return df_data

df_data = label_encoding(df_data)

### 特征文件保存

In [None]:
# Infinite values (e.g. from the ratio features) and missing values become 0.
df_data.replace([np.inf,-np.inf],np.nan,inplace=True)
df_data.fillna(0,inplace=True)

# Create the output directory up front so the export cannot fail on a missing
# folder after the whole feature pipeline has run.
os.makedirs('./features/', exist_ok=True)

df_train = df_data[df_data.is_test==0]
df_test = df_data[df_data.is_test==1]
df_train.to_csv('./features/'+'df_train.csv',index=False)
df_test.to_csv('./features/'+'df_test.csv',index=False)
print(df_train.shape)
print(df_test.shape)

### Performance Metrics

In [None]:
from sklearn import metrics
def score_model(pred_val,val_df):
    '''
    Visitor-level RMSE between log1p(actual revenue) and log1p(predicted revenue).

    input :
        pred_val: numpy array of per-session predictions on the log1p scale;
                  negative values are treated as 0. The array is not modified
                  (the previous version clipped it in place).
        val_df: DataFrame with "fullVisitorId" and "totals.transactionRevenue"
                columns, row-aligned with pred_val
    return :
        score: root mean squared error over visitors, after summing actual
               and predicted revenue per fullVisitorId
    '''
    # Clip on a copy so the caller's predictions stay untouched.
    clipped = np.clip(np.asarray(pred_val, dtype=float), 0, None)
    per_session = pd.DataFrame({
        "fullVisitorId": val_df["fullVisitorId"].values,
        "transactionRevenue": val_df["totals.transactionRevenue"].values,
        "PredictedRevenue": np.expm1(clipped),
    })

    # A list (not a tuple) of columns is required by current pandas versions.
    per_visitor = per_session.groupby("fullVisitorId")[["transactionRevenue", "PredictedRevenue"]].sum().reset_index()

    log_error = (np.log1p(per_visitor["transactionRevenue"].values)
                 - np.log1p(per_visitor["PredictedRevenue"].values))
    score = np.sqrt(np.mean(np.square(log_error)))

    return score

### 数据集加载

In [None]:
# Read the identifier and date columns as text. The builtin ``str`` replaces
# ``np.str``, an alias of it that was removed in NumPy 1.24.
_text_cols = {'fullVisitorId': str, 'date': str}
df_train = pd.read_csv('./features/'+'df_train.csv',dtype=_text_cols)
df_test = pd.read_csv('./features/'+'df_test.csv',dtype=_text_cols)
print(df_train.shape,df_test.shape)

### Classification With GroupKFold
   * lgbclf
   * xgbclf
   * catclf

In [None]:
excluded_cols = ['date','fullVisitorId','totals.transactionRevenue','visitStartTime']
tr_features = [name for name in df_train.columns if name not in excluded_cols]

_revenue = df_train['totals.transactionRevenue']
# Classification target: did the session produce any revenue?
train_label = (_revenue > 0).astype(np.uint8)


def _log_revenue(value):
    """Return log1p of a positive revenue, 0 for anything else."""
    amount = float(value)
    if amount > 0:
        return np.log1p(amount)
    return 0


# Regression target on the log1p scale.
train_y = _revenue.apply(_log_revenue)

# Check how imbalanced the classification target is (share of each class in %).
train_label.value_counts()/train_label.shape[0]*100

In [None]:
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as catb

def get_folds(df=None, n_splits=5):
    """Build visitor-grouped cross-validation folds.

    The sorted unique ``fullVisitorId`` values are split with ``GroupKFold``
    and every split is translated back into positional row indices of ``df``,
    so all rows of one visitor land on the same side of a split.

    Returns a list of ``[train_positions, validation_positions]`` pairs, one
    per fold.
    """
    visitor_of_row = df['fullVisitorId']
    visitors = np.array(sorted(visitor_of_row.unique()))
    row_positions = np.arange(df.shape[0])

    splitter = GroupKFold(n_splits=n_splits)
    return [
        [
            row_positions[visitor_of_row.isin(visitors[trn_vis])],
            row_positions[visitor_of_row.isin(visitors[val_vis])],
        ]
        for trn_vis, val_vis in splitter.split(X=visitors, y=visitors, groups=visitors)
    ]

In [None]:
#注意分类样本存在不平衡
def clf_getOutFold(clf,x_train,y_train,x_test,n_folds=5):
    """Collect out-of-fold class probabilities with visitor-grouped CV.

    For every fold produced by ``get_folds`` the classifier is refitted on
    the development rows with early stopping on the validation rows, then
    used to predict the validation rows and the whole test set at its best
    iteration (``num_iteration=clf.best_iteration_``).

    Only the columns listed in the module-level ``tr_features`` are used as
    model inputs.

    Parameters
    ----------
    clf : classifier exposing ``fit``, ``predict_proba`` and ``best_iteration_``.
    x_train, x_test : DataFrames containing ``fullVisitorId`` and ``tr_features``.
    y_train : Series of binary labels aligned with ``x_train``.
    n_folds : number of folds (default 5, the value that used to be hard-coded).

    Returns
    -------
    (clf, oof_test_pred, oof_train_pred) : the classifier as fitted on the
        last fold, the test probabilities averaged over folds with shape
        (n_test, 2), and the out-of-fold train probabilities with shape
        (n_train, 2).
    """
    folds = get_folds(df=x_train, n_splits=n_folds)
    train_features = x_train[tr_features]
    test_features = x_test[tr_features]

    oof_train_pred = np.zeros((x_train.shape[0], 2))
    fold_test_pred = np.empty((x_test.shape[0], 2, n_folds))

    for i, (dev_index, val_index) in enumerate(folds):
        x_dev = train_features.iloc[dev_index]
        y_dev = y_train.iloc[dev_index]
        x_val = train_features.iloc[val_index]
        y_val = y_train.iloc[val_index]

        clf.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=100,verbose=100)

        fold_test_pred[:,:,i] = clf.predict_proba(test_features,num_iteration=clf.best_iteration_)
        oof_train_pred[val_index,:] = clf.predict_proba(x_val,num_iteration=clf.best_iteration_)

    # Average the per-fold test probabilities.
    oof_test_pred = fold_test_pred.mean(axis=-1)

    print("-"*50+str("clf training done！")+"-"*50)

    return clf,oof_test_pred,oof_train_pred

lgb_clf_params = dict(
    num_leaves=31,
    learning_rate=0.03,
    n_estimators=5000,
    class_weight='balanced',  # balance the two classes
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=20,
    objective='binary',
    device='gpu',
    gpu_platform_id=1,
    gpu_device_id=1,
)
lgb_clf = lgb.LGBMClassifier(**lgb_clf_params)

lgb_clf,clf_test_pred,clf_train_pred = clf_getOutFold(lgb_clf,df_train,train_label,df_test)
# pd.DataFrame(data=clf_test_pred,columns=['lgb_proba_0','lgb_proba_1'],dtype=np.float32).to_csv(feat_orignal_dir+'lgbclf_testpred.csv',index=None)
# pd.DataFrame(data=clf_train_pred,columns=['lgb_proba_0','lgb_proba_1'],dtype=np.float32).to_csv(feat_orignal_dir+'lgbclf_trainpred.csv',index=None)
# Add the classifier probabilities as features
for _k, _proba_col in enumerate(('lgb_proba_0', 'lgb_proba_1')):
    df_train[_proba_col] = clf_train_pred[:, _k]
    df_test[_proba_col] = clf_test_pred[:, _k]

## XGB Classifier
def clf_getOutFold2(clf,x_train,y_train,x_test,n_folds=10):
    """Collect out-of-fold class probabilities with visitor-grouped CV.

    Same procedure as ``clf_getOutFold`` but ``predict_proba`` is called
    without a ``num_iteration`` argument.

    For every fold produced by ``get_folds`` the classifier is refitted on
    the development rows with early stopping on the validation rows, then
    used to predict the validation rows and the whole test set. Only the
    columns listed in the module-level ``tr_features`` are used as inputs.

    Parameters
    ----------
    clf : classifier exposing ``fit`` (accepting ``eval_set``,
        ``early_stopping_rounds`` and ``verbose``) and ``predict_proba``.
    x_train, x_test : DataFrames containing ``fullVisitorId`` and ``tr_features``.
    y_train : Series of binary labels aligned with ``x_train``.
    n_folds : number of folds (default 10, the value that used to be hard-coded).

    Returns
    -------
    (clf, oof_test_pred, oof_train_pred) : the classifier as fitted on the
        last fold, the test probabilities averaged over folds with shape
        (n_test, 2), and the out-of-fold train probabilities with shape
        (n_train, 2).
    """
    folds = get_folds(df=x_train, n_splits=n_folds)
    train_features = x_train[tr_features]
    test_features = x_test[tr_features]

    oof_train_pred = np.zeros((x_train.shape[0], 2))
    fold_test_pred = np.empty((x_test.shape[0], 2, n_folds))

    for i, (dev_index, val_index) in enumerate(folds):
        x_dev = train_features.iloc[dev_index]
        y_dev = y_train.iloc[dev_index]
        x_val = train_features.iloc[val_index]
        y_val = y_train.iloc[val_index]

        clf.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=100,verbose=100)

        fold_test_pred[:,:,i] = clf.predict_proba(test_features)
        oof_train_pred[val_index] = clf.predict_proba(x_val)

    # Average the per-fold test probabilities.
    oof_test_pred = fold_test_pred.mean(axis=-1)

    print("-"*50+str("clf training done！")+"-"*50)

    return clf,oof_test_pred,oof_train_pred

xgb_clf_params = dict(
    learning_rate=0.01,
    n_estimators=5000,
    scale_pos_weight=77.4,  # balance the two classes
    objective='binary:logistic',
    max_depth=22,
    min_child_weight=57,
    subsample=0.67,
    colsample_bytree=0.054,
    colsample_bylevel=0.50,
    n_jobs=-1,
    random_state=20,
)

xgb_clf = xgb.XGBClassifier(**xgb_clf_params)

xgb_clf,clf_test_pred,clf_train_pred = clf_getOutFold2(xgb_clf,df_train,train_label,df_test)

# pd.DataFrame(data=clf_test_pred,columns=['xgb_proba_0','xgb_proba_1'],dtype=np.float32).to_csv(feat_orignal_dir+'xgbclf_testpred.csv',index=None)
# pd.DataFrame(data=clf_train_pred,columns=['xgb_proba_0','xgb_proba_1'],dtype=np.float32).to_csv(feat_orignal_dir+'xgbclf_trainpred.csv',index=None)

# Add the classifier probabilities as features
for _k, _proba_col in enumerate(('xgb_proba_0', 'xgb_proba_1')):
    df_train[_proba_col] = clf_train_pred[:, _k]
    df_test[_proba_col] = clf_test_pred[:, _k]

### Catboost classifier
import catboost as catb
cat_clf_params = dict(
    n_estimators=5000,
    learning_rate=0.02,
    max_depth=10,
    scale_pos_weight=77.4,  # balance the two classes
    loss_function='Logloss',
    eval_metric='Logloss',
    random_state=20,
    bagging_temperature=0.2,
    od_type='Iter',
    od_wait=20,
)
cat_clf = catb.CatBoostClassifier(**cat_clf_params)

cat_clf,clf_test_pred,clf_train_pred = clf_getOutFold2(cat_clf,df_train,train_label,df_test)

# pd.DataFrame(data=clf_test_pred,columns=['cat_proba_0','cat_proba_1'],dtype=np.float32).to_csv(feat_orignal_dir+'catclf_testpred.csv',index=None)
# pd.DataFrame(data=clf_train_pred,columns=['cat_proba_0','cat_proba_1'],dtype=np.float32).to_csv(feat_orignal_dir+'catclf_trainpred.csv',index=None)

# Add the classifier probabilities as features
for _k, _proba_col in enumerate(('cat_proba_0', 'cat_proba_1')):
    df_train[_proba_col] = clf_train_pred[:, _k]
    df_test[_proba_col] = clf_test_pred[:, _k]

# Free the classifiers and their prediction buffers before the regression stage.
del cat_clf,xgb_clf,lgb_clf,clf_test_pred,clf_train_pred
gc.collect()

### Regressions With GroupKFold

In [None]:
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as catb
# Recompute the feature list from the current columns of df_train, so any
# column added to df_train since the list was last built is included.
tr_features = [_f for _f in df_train.columns if _f not in excluded_cols]

#GroupKFold 交叉验证输出
def get_out_fold(model,x_train,y_train,x_test,n_folds=10):
    """Collect out-of-fold revenue predictions with visitor-grouped CV.

    For every fold produced by ``get_folds`` the regressor is refitted on the
    development rows with early stopping on the validation rows, then used to
    predict the validation rows and the whole test set at its best iteration
    (``num_iteration=model.best_iteration_``). Only the columns listed in the
    module-level ``tr_features`` are used as inputs.

    Predictions are made on the log1p scale (as ``y_train`` is expected to
    be), clipped at 0 and mapped back with ``expm1``; test predictions are
    averaged over folds after that back-transformation. The train
    out-of-fold predictions are now clipped the same way as the test
    predictions, which used to be the only ones clipped.

    Parameters
    ----------
    model : regressor exposing ``fit``, ``predict`` and ``best_iteration_``.
    x_train, x_test : DataFrames containing ``fullVisitorId`` and ``tr_features``.
    y_train : Series of targets aligned with ``x_train``.
    n_folds : number of folds (default 10, the value that used to be hard-coded).

    Returns
    -------
    (model, oof_test_pred, oof_train_pred) : the regressor as fitted on the
        last fold and two column vectors of shape (n_test, 1) / (n_train, 1).
    """
    folds = get_folds(df=x_train, n_splits=n_folds)
    train_features = x_train[tr_features]
    test_features = x_test[tr_features]

    oof_train_pred = np.zeros((x_train.shape[0],))
    fold_test_pred = np.empty((n_folds, x_test.shape[0]))

    for i, (dev_index, val_index) in enumerate(folds):
        x_dev = train_features.iloc[dev_index]
        y_dev = y_train.iloc[dev_index]
        x_val = train_features.iloc[val_index]
        y_val = y_train.iloc[val_index]

        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=100,verbose=100)

        fold_test_pred[i, :] = model.predict(test_features,num_iteration=model.best_iteration_)
        oof_train_pred[val_index] = model.predict(x_val,num_iteration=model.best_iteration_)

    # Negative log-scale predictions would turn into negative revenue.
    oof_test_pred = np.expm1(np.clip(fold_test_pred, 0, None)).mean(axis=0)
    oof_train_pred = np.expm1(np.clip(oof_train_pred, 0, None))

    print("-"*50+str("model training done！")+"-"*50)

    return model,oof_test_pred.reshape(-1, 1),oof_train_pred.reshape(-1,1)

# LightGBM regression settings.
# NOTE(review): 'subsample' / 'bagging_fraction' and 'colsample_bytree' /
# 'feature_fraction' look like overlapping settings with different values --
# confirm which one LightGBM actually applies.
lgb_params={
    'learning_rate': 0.03,
    'objective':'regression',
    'n_estimators':5000,
    'metric':'rmse',
    'num_leaves': 31,
    'verbose': 1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "random_state":42,
    'max_depth': 15,
    'lambda_l2': 0.02,
    'lambda_l1': 0.004,
    'device': 'gpu',
    'gpu_platform_id':1,
    'gpu_device_id': 1,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.7,
    'min_child_samples': 21
}
lgb_est = lgb.LGBMRegressor(**lgb_params)
# Out-of-fold predictions against the log1p revenue target.
lgb_est,oof_lgb_test_pred,oof_lgb_train_pred = get_out_fold(lgb_est,df_train,train_y,df_test)
# lgb_est.booster_.save_model('lgb_est_session.txt')

# lgb_train_pred = pd.DataFrame({"fullVisitorId":train_df['fullVisitorId'].values,"lgb_train_pred":oof_lgb_train_pred.reshape(-1,)})
# lgb_test_pred = pd.DataFrame({"fullVisitorId":test_df['fullVisitorId'].values,"lgb_test_pred":oof_lgb_test_pred.reshape(-1,)})
# lgb_train_pred.to_csv('lgb_train_session_pred.csv',index=False)
# lgb_test_pred.to_csv('lgb_test_session_pred.csv',index=False)

# Feature importance analysis (lgb_est is the model as fitted on the last fold)
fig, ax = plt.subplots(figsize=(12,18))
lgb.plot_importance(lgb_est,max_num_features=100, height=0.8, ax=ax)
ax.grid(False)
plt.title("LGBM - Feature Importance", fontsize=10)
plt.show()

In [None]:
def get_out_fold2(model,x_train,y_train,x_test):
    """Fit `model` fold by fold and collect out-of-fold and test predictions.

    The model is re-fitted on each of the 10 folds returned by `get_folds`,
    using the module-level `tr_features` column list. Every fold predicts its
    validation rows (out-of-fold train predictions) and the whole test set;
    the test predictions are averaged over folds.

    Raw predictions are clipped at 0 and mapped through `expm1` before being
    returned, so the model is presumably trained on a log1p-transformed
    target (TODO confirm against the caller's `y_train`).

    Args:
        model: estimator whose `fit` accepts `eval_set`,
            `early_stopping_rounds` and `verbose`, and which has `predict`.
        x_train: training frame containing the `tr_features` columns (also
            passed as-is to `get_folds`).
        y_train: target supporting positional `.iloc` indexing, aligned
            row-for-row with `x_train`.
        x_test: test frame containing the `tr_features` columns.

    Returns:
        Tuple `(model, test_pred, train_pred)` where `model` is the estimator
        as fitted on the last fold, `test_pred` has shape `(ntest, 1)` and
        `train_pred` has shape `(ntrain, 1)`.
    """
    NFOLDS = 10
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    folds = get_folds(df=x_train, n_splits=NFOLDS)

    # Select the feature columns once instead of re-slicing on every fold.
    train_x = x_train[tr_features]
    test_x = x_test[tr_features]

    # Leaf-index extraction (currently disabled):
    #     train_index_features = np.zeros((ntrain,model.n_estimators))
    #     test_index_features = np.zeros((ntest,model.n_estimators))

    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((NFOLDS, ntest))

    for i, (dev_index, val_index) in enumerate(folds):
        x_dev, y_dev = train_x.iloc[dev_index], y_train.iloc[dev_index]
        x_val, y_val = train_x.iloc[val_index], y_train.iloc[val_index]

        model.fit(
            x_dev, y_dev,
            eval_set=[(x_dev, y_dev), (x_val, y_val)],
            early_stopping_rounds=100,
            verbose=100,
        )

        oof_test_pred_skf[i, :] = model.predict(test_x)
        oof_train_pred[val_index] = model.predict(x_val)
    #     train_index_features = model.predict(x_train[tr_features],pred_leaf=True)
    #     test_index_features = model.predict(x_test[tr_features],pred_leaf=True)

    # Clip negative predictions at 0 for BOTH outputs before inverting the
    # transform. Previously only the test predictions were clipped, so the
    # train-side predictions could come out negative after expm1 while the
    # test-side ones could not.
    oof_test_pred_skf[oof_test_pred_skf < 0] = 0
    oof_train_pred[oof_train_pred < 0] = 0

    oof_test_pred = np.expm1(oof_test_pred_skf).mean(axis=0)
    oof_train_pred = np.expm1(oof_train_pred)

    print("-"*50+str("model training done！")+"-"*50)

    return model,oof_test_pred.reshape(-1, 1),oof_train_pred.reshape(-1,1)

# Session-level XGBoost regressor, trained fold by fold through get_out_fold2.
xgb_params = {
    "objective": "reg:linear",
    "booster": "gbtree",
    "learning_rate": 0.02,
    "n_estimators": 5000,
    "max_depth": 22,
    "min_child_weight": 57,
    "gamma": 1.45,
    "alpha": 0.0,
    "lambda": 0.0,
    "subsample": 0.67,
    "colsample_bytree": 0.054,
    "colsample_bylevel": 0.50,
    "n_jobs": -1,
    "random_state": 20,
}
xgb_est = xgb.XGBRegressor(**xgb_params)
xgb_est, oof_xgb_test_pred, oof_xgb_train_pred = get_out_fold2(
    xgb_est, df_train, train_y, df_test
)

# Optional persistence of the fitted model and its session-level predictions.
# xgb_est.save_model('xgb_est_session.model')
# xgb_train_pred = pd.DataFrame({"fullVisitorId":train_df['fullVisitorId'].values,"xgb_train_pred":oof_xgb_train_pred.reshape(-1,)})
# xgb_test_pred = pd.DataFrame({"fullVisitorId":test_df['fullVisitorId'].values,"xgb_test_pred":oof_xgb_test_pred.reshape(-1,)})
# xgb_train_pred.to_csv('xgb_train_session_pred.csv',index=False)
# xgb_test_pred.to_csv('xgb_test_session_pred.csv',index=False)

# Feature-importance analysis: plot up to 100 features of the fitted model.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(xgb_est, max_num_features=100, height=0.8, ax=ax)
ax.grid(False)
ax.set_title("XGB - Feature Importance", fontsize=10)
plt.show()

In [None]:
# Session-level CatBoost regressor, trained fold by fold through get_out_fold2.
cat_params = dict(
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=7,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_state=42,
    bagging_temperature=0.2,
    od_type='Iter',
    od_wait=20,
)
cat_est = catb.CatBoostRegressor(**cat_params)
cat_est, oof_cat_test_pred, oof_cat_train_pred = get_out_fold2(
    cat_est, df_train, train_y, df_test
)

# Optional persistence of the fitted model and its session-level predictions.
# cat_est.save_model('catb_est_session.mlmodel',format="coreml")
# cat_train_pred = pd.DataFrame({"fullVisitorId":train_df['fullVisitorId'].values,"cat_train_pred":oof_cat_train_pred.reshape(-1,)})
# cat_test_pred = pd.DataFrame({"fullVisitorId":test_df['fullVisitorId'].values,"cat_test_pred":oof_cat_test_pred.reshape(-1,)})
# cat_train_pred.to_csv('cat_train_session_pred.csv',index=False)
# cat_test_pred.to_csv('cat_test_session_pred.csv',index=False)

# Last expression of the cell, so the notebook displays the importance table.
cat_est.get_feature_importance(prettified=True)

In [None]:
# Blend the three session-level models with fixed weights (LGBM / XGB / CatBoost).
LGB_WEIGHT, XGB_WEIGHT, CAT_WEIGHT = 0.2, 0.4, 0.4

oof_test_pred = (
    oof_lgb_test_pred * LGB_WEIGHT
    + oof_xgb_test_pred * XGB_WEIGHT
    + oof_cat_test_pred * CAT_WEIGHT
)
oof_train_pred = (
    oof_lgb_train_pred * LGB_WEIGHT
    + oof_xgb_train_pred * XGB_WEIGHT
    + oof_cat_train_pred * CAT_WEIGHT
)

# The per-model predictions and estimators are no longer needed; release them.
del (
    oof_lgb_test_pred, oof_xgb_test_pred, oof_cat_test_pred,
    oof_lgb_train_pred, oof_xgb_train_pred, oof_cat_train_pred,
    lgb_est, xgb_est, cat_est,
)
gc.collect()

# Snapshot the session-level feature frames to disk (no index column).
df_train.to_csv('./features/df_trainV2.csv', index=None)
df_test.to_csv('./features/df_testV2.csv', index=None)

### Build user-level predictions (创建基于用户的预测)

In [None]:
# Reload the session-level feature frames saved to ./features/.
# 'fullVisitorId' and 'date' are forced to string dtype so they are not parsed
# as numbers. The builtin `str` is used because the `np.str` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
df_train = pd.read_csv('./features/df_trainV2.csv', dtype={'fullVisitorId': str, 'date': str}, low_memory=False)
df_test = pd.read_csv('./features/df_testV2.csv', dtype={'fullVisitorId': str, 'date': str}, low_memory=False)

In [None]:
# Add the blended session-level revenue predictions to both frames as a feature.
df_train['X_pred'] = oof_train_pred
df_test['X_pred'] = oof_test_pred

# Keep only test sessions whose 'date' value compares >= '20180815'.
df_test = df_test.loc[df_test['date'] >= '20180815']

tr_features = [col for col in df_train.columns if col not in excluded_cols]


def _mean_features_per_user(sessions):
    """Average every feature column over each visitor's sessions."""
    grouped = sessions[tr_features + ['fullVisitorId']].groupby('fullVisitorId')
    return grouped.mean().reset_index()


def _session_preds_per_user(sessions):
    """Return, per visitor, a dict {'pred_0': ..., 'pred_1': ...} of session predictions."""
    per_user = sessions[['fullVisitorId', 'X_pred']].groupby('fullVisitorId')\
        .apply(lambda grp: list(grp.X_pred))
    return per_user.apply(
        lambda preds: {'pred_' + str(pos): value for pos, value in enumerate(preds)}
    )


train_data = _mean_features_per_user(df_train)
test_data = _mean_features_per_user(df_test)

train_pred_list = _session_preds_per_user(df_train)
test_pred_list = _session_preds_per_user(df_test)

print(train_data.shape)
print(test_data.shape)

# One row per visitor, one 'pred_<i>' column per session position; visitors
# with fewer sessions get NaN in the trailing columns.
train_all_preds = pd.DataFrame(train_pred_list.tolist(), index=train_data['fullVisitorId'])
test_all_preds = pd.DataFrame(test_pred_list.tolist(), index=test_data['fullVisitorId'])

print(train_all_preds.shape, test_all_preds.shape)

In [None]:
# Add to test_all_preds every column that exists in train_all_preds but is
# missing from it, filled with NaN.
missing_in_test = [
    name for name in train_all_preds.columns
    if name not in test_all_preds.columns
]
for name in missing_in_test:
    test_all_preds[name] = np.nan
print(train_all_preds.shape, test_all_preds.shape)

In [None]:
# Feature engineering on the per-session predicted revenue.
def _add_pred_stats(all_preds):
    """Append per-visitor summary statistics of the session predictions, in place.

    Only the raw 'pred_<i>' session columns feed the statistics, so that:
      * 'pred_max' / 'pred_min' really are the max / min (both used to be
        computed with .mean()),
      * a statistic added earlier never leaks into one added later, and
      * train and test are computed identically (they previously added
        'pred_sum' and 'pred_median' in different orders, so each saw a
        different set of columns).
    Selecting by name also keeps the result unchanged if this cell is re-run.
    """
    prefix = 'pred_'
    session_cols = [
        col for col in all_preds.columns
        if str(col).startswith(prefix) and str(col)[len(prefix):].isdigit()
    ]
    # Column selection returns a copy, so the assignments below cannot feed
    # back into the statistics.
    sessions = all_preds[session_cols]

    all_preds['pred_max'] = np.log1p(sessions.max(axis=1))
    all_preds['pred_min'] = np.log1p(sessions.min(axis=1))
    all_preds['pred_sum'] = np.log1p(sessions.fillna(0).sum(axis=1))
    all_preds['pred_median'] = np.log1p(sessions.median(axis=1))
    # Fraction of session slots that are empty for this visitor.
    all_preds['pred_nullrate'] = sessions.isnull().mean(axis=1)


_add_pred_stats(train_all_preds)
_add_pred_stats(test_all_preds)

train_data = train_data.merge(train_all_preds.reset_index(),on='fullVisitorId',how='left')
test_data = test_data.merge(test_all_preds.reset_index(),on='fullVisitorId',how='left')

In [None]:
# User-level target: total transaction revenue summed over each visitor's sessions.
user_train_y = (
    df_train
    .groupby('fullVisitorId')[['totals.transactionRevenue']]
    .sum()
    .reset_index()
)

print(train_data.shape, test_data.shape, user_train_y.shape, sep='\n')

# The session-level training frame is not used after this point; free it.
del df_train
gc.collect()

In [None]:
def modelXGBAndLGB(trn_data,te_data,trnUsery):
    """Train user-level XGBoost and LightGBM regressors with 5-fold CV.

    Both models are fitted on log1p('totals.transactionRevenue') for each
    fold returned by `get_folds`. Per fold, the LightGBM validation
    predictions fill the out-of-fold vector, and the test predictions of
    both models (each clipped at 0) are averaged 50/50 and accumulated over
    folds. XGBoost contributes only to the test predictions, not to the
    out-of-fold vector or the importances.

    Args:
        trn_data: user-level training frame. Must contain 'fullVisitorId' and
            'totals.pageviews' (both are passed to `get_folds`); every column
            except 'fullVisitorId' is used as a feature.
        te_data: user-level test frame containing the same feature columns.
        trnUsery: frame with a 'totals.transactionRevenue' column, aligned
            row-for-row (positionally) with `trn_data`.

    Returns:
        Tuple `(reg, xg, sub_preds, vis_importances)`: the LightGBM and
        XGBoost estimators as fitted on the last fold, the blended test
        predictions (log1p scale), and a frame of per-fold LightGBM gain
        importances with columns 'feature', 'gain' and 'fold'.
    """
    # get_folds is expected to return a sized sequence of (train_idx, val_idx) pairs.
    folds = get_folds(df=trn_data[['fullVisitorId','totals.pageviews']], n_splits=5)
    n_folds = len(folds)

    oof_preds = np.zeros(trn_data.shape[0])
    sub_preds = np.zeros(te_data.shape[0])
    trn_features = [f for f in trn_data.columns if f!='fullVisitorId']

    # Loop-invariant inputs: compute once instead of on every fold.
    log_target = np.log1p(trnUsery['totals.transactionRevenue'])
    # Selecting by trn_features guarantees the same feature order as in training.
    te_x = te_data[trn_features]

    params={'learning_rate': 0.03,
        'n_estimators':2000,
        'objective':'regression',
        'metric':'rmse',
        'num_leaves': 32,
        'verbose': 1,
        "subsample": 0.99,
        "colsample_bytree": 0.99,
        "random_state":42,
        'max_depth': 15,
        'lambda_l2': 0.02,
        'lambda_l1': 0.004,
        'min_child_samples': 21
       }
    xgb_params = {
            'objective': 'reg:linear',
            'booster': 'gbtree',
            'learning_rate': 0.02,
            'n_estimators':2000,
            'max_depth': 22,
            'min_child_weight': 57,
            'gamma' : 1.45,
            'alpha': 0.0,
            'lambda': 0.0,
            'subsample': 0.67,
            'colsample_bytree': 0.054,
            'colsample_bylevel': 0.50,
            'n_jobs': -1,
            'random_state': 456
        }
    xg = xgb.XGBRegressor(**xgb_params)
    reg = lgb.LGBMRegressor(**params)

    fold_importances = []
    for fold_, (trn_, val_) in enumerate(folds):
        trn_x, trn_y = trn_data[trn_features].iloc[trn_], log_target.iloc[trn_]
        val_x, val_y = trn_data[trn_features].iloc[val_], log_target.iloc[val_]

        print('-'*40+str("XGBoost is training"+'-'*40))
        xg.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            early_stopping_rounds=100,
            eval_metric='rmse',
            verbose=100
        )
        print('-'*40+str("LGBM is training"+'-'*40))
        reg.fit(
            trn_x, trn_y,
            eval_set=[(trn_x, trn_y), (val_x, val_y)],
            eval_names=['TRAIN', 'VALID'],
            early_stopping_rounds=100,
            eval_metric='rmse',
            verbose=100
        )

        imp_df = pd.DataFrame()
        imp_df['feature'] = trn_x.columns
        imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
        imp_df['fold'] = fold_ + 1
        fold_importances.append(imp_df)

        # Out-of-fold predictions come from LightGBM only; negatives are clipped to 0.
        oof_preds[val_] = reg.predict(val_x, num_iteration=reg.best_iteration_)
        oof_preds[oof_preds < 0] = 0

        _preds = reg.predict(te_x, num_iteration=reg.best_iteration_)
        _preds[_preds < 0] = 0

        pre = xg.predict(te_x)
        pre[pre<0]=0

        # Equal-weight blend of the two models, averaged over folds.
        sub_preds += (_preds / n_folds) * 0.5 + (pre / n_folds) * 0.5

    # Concatenate once rather than growing a frame inside the loop.
    if fold_importances:
        vis_importances = pd.concat(fold_importances, axis=0, sort=False)
    else:
        vis_importances = pd.DataFrame()

    # Report the out-of-fold RMSE; it used to be computed and then discarded.
    # NumPy is used so the block does not depend on a bare
    # `mean_squared_error` name being in scope.
    oof_rmse = float(np.sqrt(np.mean((np.asarray(log_target) - oof_preds) ** 2)))
    print('OOF RMSE (LGBM, log1p revenue): %.5f' % oof_rmse)

    return reg,xg,sub_preds,vis_importances

# Train the user-level models. `sub_preds` holds the blended LightGBM/XGBoost
# test predictions and `vis_importances` the per-fold LightGBM gain importances.
lgb_est,xgb_est,sub_preds,vis_importances = modelXGBAndLGB(train_data,test_data,user_train_y)

# Optional: persist the fitted user-level models, then release them.
# lgb_est.booster_.save_model('lgb_est_user.txt')
# xgb_est.save_model('xgb_est_user.model')
# del lgb_est,xgb_est
# gc.collect()

### Submission

In [None]:
import datetime

# Minute-resolution timestamp used to make the submission file name unique.
flagtime = datetime.datetime.now().strftime('%Y%m%d%H%M')

# Plot LightGBM gain importances (log scale), ordered by each feature's mean
# gain across folds; only the first 300 rows of the sorted frame are drawn.
vis_importances['gain_log'] = np.log1p(vis_importances['gain'])
mean_gain = vis_importances[['gain', 'feature']].groupby('feature').mean()
vis_importances['mean_gain'] = vis_importances['feature'].map(mean_gain['gain'])

plt.figure(figsize=(8, 25))
sns.barplot(x='gain_log', y='feature', data=vis_importances.sort_values('mean_gain', ascending=False).iloc[:300])

test_data['PredictedLogRevenue'] = sub_preds
# test_data[['fullVisitorId','PredictedLogRevenue']].to_csv(submission_dir+'submission_tuning2_'+flagtime+'.csv', index=False)

# One row per distinct visitor in df_test. Series.unique() keeps first-appearance
# order, so the submission's row order is reproducible across runs; the previous
# list(set(...)) produced a hash-dependent order.
df_sub = pd.DataFrame({'fullVisitorId': df_test['fullVisitorId'].unique()})
df_sub = df_sub.merge(test_data[['fullVisitorId','PredictedLogRevenue']],on=['fullVisitorId'],how='left')
# Visitors without a prediction get a predicted log-revenue of 0.
df_sub = df_sub.fillna(0)
df_sub.to_csv(submission_dir+'submission_tuning2_'+flagtime+'.csv', index=False)