In [None]:
import os
import json
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import gc
import os

from load_df import *
from tqdm import tqdm_notebook, tnrange
from pandas.io.json import json_normalize
from sklearn import model_selection, preprocessing, metrics
from plotly import tools

# Enable inline rendering of plotly figures in the notebook.
py.init_notebook_mode(connected=True)
color = sns.color_palette()
# Directory holding the raw train/test CSVs and directory for submission files.
feat_orignal_dir = "features_orignal/"
submission_dir = 'submission/'
# Restrict CUDA to the device with PCI bus index 1 (ordering fixed by PCI bus id).
os.environ['CUDA_DEVICE_ORDER']="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES']='1'

%matplotlib inline

In [None]:
%%time
# Load the raw data
train_path = feat_orignal_dir + 'train.csv'
test_path = feat_orignal_dir + 'test.csv'

# get_df comes from the star-import of load_df; presumably it reads both CSVs
# into DataFrames -- TODO confirm its parsing/dtype choices.
df_train,df_test = get_df(train_path,test_path)

print(df_train.shape)
print(df_test.shape)

In [None]:
def remove_const_cols(train_df,test_df):
    """Drop columns that hold a single value in the training frame.

    A column counts as constant when it has exactly one distinct value in
    ``train_df`` (NaN included); 'totals.visits' is always kept.  When at least
    one constant column is found, those columns plus "sessionId" and
    'trafficSource.adwordsClickInfo.page' are dropped from both frames, and
    'trafficSource.campaignCode' is additionally dropped from the training
    frame.  When no constant column is found both frames are returned as-is.

    Returns:
        (train_df, test_df) after dropping.
    """
    constant_names = []
    for name in tqdm_notebook(train_df.columns):
        if train_df[name].nunique(dropna=False) == 1 and name != 'totals.visits':
            constant_names.append(name)

    if not constant_names:
        return train_df, test_df

    shared_drop = constant_names + ["sessionId", 'trafficSource.adwordsClickInfo.page']
    pruned_train = train_df.drop(shared_drop + ['trafficSource.campaignCode'], axis=1)
    pruned_test = test_df.drop(shared_drop, axis=1)
    return pruned_train, pruned_test
df_train,df_test = remove_const_cols(df_train,df_test)
# Order both frames chronologically by the 'date' column.
df_train.sort_values(['date'],ascending=True,inplace=True)
# Target transform: log1p of positive revenue, 0 otherwise (NaN compares False and becomes 0).
df_train['totals.transactionRevenue'] = df_train['totals.transactionRevenue'].apply(lambda x:np.log1p(float(x)) if float(x) > 0 else 0)
df_test.sort_values(['date'],ascending=True,inplace=True)
# Flag test rows; unless df_train already carries an 'is_test' column, train rows
# will hold NaN in it after the concat below.
df_test['is_test'] = True

# Stack train on top of test. The original row indexes are kept, so index labels
# may repeat between the two parts.
df_data = pd.concat((df_train,df_test))

print(df_train.shape,df_test.shape)
print(df_data.shape)

### Categorical Features & Label Encoding

In [None]:
# Categorical feature handling: collect the most frequent values of each column.
# value_counts() is sorted by descending frequency, so the first N index entries
# are the N most common values.  Reading them straight from the index avoids
# depending on the auto-generated 'index' column name of reset_index(), which is
# not stable across pandas versions.
device_browers = list(df_data['device.browser'].value_counts().index[:30])
device_os = list(df_data['device.operatingSystem'].value_counts().index[:15])

geoNetwork_cities = list(df_data['geoNetwork.city'].value_counts().index[:10])
geoNetwork_country = list(df_data['geoNetwork.country'].value_counts().index[:20])
geoNetwork_metro = list(df_data['geoNetwork.metro'].value_counts().index[:40])
geoNetwork_networkDomain = list(df_data['geoNetwork.networkDomain'].value_counts().index[:40])

def browser_mapping(x):
    """Lower-case ``x`` when it is listed in ``device_browers``; otherwise return 'others'."""
    return x.lower() if x in device_browers else 'others'

def geoNetwork_city(x):
    """Lower-case ``x`` when it is listed in ``geoNetwork_cities``; otherwise return 'others'."""
    return x.lower() if x in geoNetwork_cities else 'others'
    
def geoNetwork_countries(x):
    """Lower-case ``x`` when it is listed in ``geoNetwork_country``; otherwise return 'others'."""
    return x.lower() if x in geoNetwork_country else 'others'

def adcontents_mapping(x):
    """Collapse an ad-content string into a coarse bucket.

    Checks are substring tests applied in order; the first match wins:
    'google' -> 'google'; 'placement' (or the misspelling 'placememnt') ->
    'placement'; '(not set)' or 'nan' -> the input unchanged; 'ad' -> 'ad';
    anything else -> 'others'.
    """
    if 'google' in x:
        return 'google'
    if 'placement' in x or 'placememnt' in x:
        return 'placement'
    if '(not set)' in x or 'nan' in x:
        # Missing-value markers are passed through untouched.
        return x
    return 'ad' if 'ad' in x else 'others'
    
def device_operatingSystem(x):
    """Lower-case ``x`` when it is listed in ``device_os``; otherwise return 'others'."""
    return x.lower() if x in device_os else 'others'

def traficSource_referralPath(x):
    """Bucket a referral path string.

    Rules are applied in order and the first match wins:
      * exactly '/'                      -> '/'
      * contains 'yt/about/'             -> path unchanged
      * contains 'google'                -> '/google'
      * contains 'mail'                  -> '/mail'
      * contains any marker in ``kept_verbatim`` -> path unchanged
      * anything else                    -> '/others'

    The 'using-the-logo' rule is a real substring test here.  Written as a bare
    string condition it is always true, which makes every remaining path come
    back unchanged and leaves the '/others' bucket unreachable.
    """
    if x == '/':
        return '/'
    if 'yt/about/' in x:
        return x
    if 'google' in x:
        return '/google'
    if 'mail' in x:
        return '/mail'

    # Paths containing one of these markers are kept as their own category.
    kept_verbatim = (
        'yt/advertise/',
        'offer/2145',
        'yt/creators/',
        'pagead/ads',
        '/intl/',
        'shirt',
        '/analytics/app/',
        'using-the-logo',
        '/moma',
    )
    if any(marker in x for marker in kept_verbatim):
        return x
    return '/others'
    
def source_mapping(x):
    """Collapse a traffic-source string into a coarse source label.

    All checks are substring tests evaluated in a fixed order; the first hit
    decides the result.  'google' and 'youtube' map to their own label,
    '(direct)', '(not set)' and 'nan' are returned unchanged, then the markers
    in ``ordered_markers`` are tried one by one and the matching marker itself
    is returned.  Unmatched input yields 'others'.
    """
    if 'google' in x:
        return 'google'
    if '(direct)' in x:
        return x
    if 'youtube' in x:
        return 'youtube'
    if '(not set)' in x or 'nan' in x:
        return x

    # Order matters: earlier markers shadow later ones (e.g. 'lunametrics'
    # before 'metrics', 'mysearch' before 'search', 'ad' before 'baidu').
    ordered_markers = (
        'yahoo', 'facebook', 'reddit', 'bing', 'quora', 'outlook', 'linkedin',
        'pinterest', 'ask', 'siliconvalley', 'lunametrics', 'amazon',
        'mysearch', 'qiita', 'messenger', 'twitter', 't.co', 'vk.com',
        'search', 'edu', 'mail', 'ad', 'golang', 'direct', 'dealspotr',
        'sashihara', 'phandroid', 'baidu', 'mdn', 'duckduckgo', 'seroundtable',
        'metrics', 'sogou', 'businessinsider', 'github', 'gophergala',
        'yandex', 'msn', 'dfa', 'feedly', 'arstechnica', 'squishable',
        'flipboard', 't-online.de', 'sm.cn', 'wow', 'partners',
    )
    return next((marker for marker in ordered_markers if marker in x), 'others')

# Apply the bucketing helpers column by column. Every value is first converted
# with str(), so missing values reach the helpers as the text 'nan'.
for df in [df_data]:  
    df['device.browser'] = df['device.browser'].map(lambda x:browser_mapping(str(x))).astype('str')
    df['device.operatingSystem'] = df['device.operatingSystem'].map(lambda x:device_operatingSystem(str(x))).astype('str')
    # adContent and source are lower-cased before matching.
    df['trafficSource.adContent'] = df['trafficSource.adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
    df['trafficSource.source'] = df['trafficSource.source'].map(lambda x:source_mapping(str(x).lower())).astype('str')
    df['geoNetwork.city'] = df['geoNetwork.city'].map(lambda x:geoNetwork_city(str(x))).astype('str')
    df['geoNetwork.country'] = df['geoNetwork.country'].map(lambda x:geoNetwork_countries(str(x))).astype('str')
    df['trafficSource.referralPath'] = df['trafficSource.referralPath'].map(lambda x:traficSource_referralPath(str(x))).astype('str')

# Combine hierarchically related features: geographic hierarchy, marketing segments.
# Each new column is the '_'-joined text of two existing columns.
for df in [df_data]:
    print("... process device ...")
    df['source.country'] = df['trafficSource.source'] + '_' + df['geoNetwork.country']
    df['campaign.medium'] = df['trafficSource.campaign'] + '_' + df['trafficSource.medium']
    df['browser.category'] = df['device.browser'] + '_' + df['device.deviceCategory']
    df['browser.os'] = df['device.browser'] + '_' + df['device.operatingSystem']

# Merge device / geography / traffic features that are hierarchically related.
def custom(df):
    """Add '_'-joined interaction columns to ``df`` and return it.

    New columns:
      * device category / browser / OS, each combined with 'channelGrouping';
      * one key joining all seven 'geoNetwork.*' columns;
      * every 'geoNetwork.*' column crossed with browser, device category,
        operating system and traffic source (named '<geo>_<other>');
      * ad content and medium, each combined with 'source.country'.

    ``df`` is modified in place.  It must already contain 'source.country'.
    All inputs are expected to be string columns.
    """
    print('... custom ...')
    df['device_deviceCategory_channelGrouping'] = df['device.deviceCategory'] + "_" + df['channelGrouping']
    df['channelGrouping_browser'] = df['device.browser'] + "_" + df['channelGrouping']
    df['channelGrouping_OS'] = df['device.operatingSystem'] + "_" + df['channelGrouping']

    geo_cols = ['geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country',
                'geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region',
                'geoNetwork.subContinent']

    # Join every geo level with the same separator, including between region
    # and subContinent, so two different (region, subContinent) pairs cannot
    # collapse into the same key.
    geo_key = df[geo_cols[0]]
    for geo_col in geo_cols[1:]:
        geo_key = geo_key + "_" + df[geo_col]
    df['city_continent_country_metro_networkDomain_region_subContinent'] = geo_key

    for i in geo_cols:
        for j in ['device.browser','device.deviceCategory', 'device.operatingSystem', 'trafficSource.source']:
            df[i + "_" + j] = df[i] + "_" + df[j]

    df['content.source'] = df['trafficSource.adContent'] + "_" + df['source.country']
    df['medium.source'] = df['trafficSource.medium'] + "_" + df['source.country']

    return df

# Add the interaction columns and report the new frame size.
df_data = custom(df_data)

print(df_data.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Identifier, time, target and flag columns that must not be label-encoded
# (the same list is used later to exclude columns from the model features).
excluded_cols = ['date','fullVisitorId','totals.transactionRevenue','visitId','visitStartTime','is_test']
# Every remaining object-dtype column is treated as categorical.
cat_cols = [col for col in df_data.columns if df_data[col].dtype =="object" and col not in excluded_cols]

# label encode the categorical variables
def label_encoding(df):
    """Replace every column listed in ``cat_cols`` with integer label codes.

    Each column gets its own ``LabelEncoder`` fitted on the string form of the
    column's values, so missing values are encoded as the category 'nan'.
    Boolean columns are first turned into 0/1 integers.

    ``df`` is modified in place and also returned.
    """
    for col in tqdm_notebook(cat_cols):
        if df[col].dtype == bool:
            # Convert booleans to integers, reading from the frame being
            # encoded rather than from an unrelated global frame.
            df[col] = df[col].astype(int)

        as_text = list(df[col].values.astype('str'))
        df[col] = LabelEncoder().fit_transform(as_text)

    return df

# Encode all categorical columns of the combined train+test frame in one pass,
# so both parts share the same code for each category.
df_data = label_encoding(df_data)

In [None]:
def downCast_dtype(df):
    '''
        Narrow numeric columns: float64 -> float32 and int64 -> int32.

        The frame is updated in place and returned.  Columns of any other
        dtype are left untouched.
    '''
    wide_floats = []
    wide_ints = []
    for name in df:
        column_dtype = df[name].dtype
        if column_dtype == 'float64':
            wide_floats.append(name)
        if column_dtype == 'int64':
            wide_ints.append(name)

    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df
# Shrink 64-bit numeric columns to 32-bit to reduce memory use.
df_data = downCast_dtype(df_data)

### Time Series Features

In [None]:
# from datetime import date,timedelta

# holiday = ['01-01','02-01','02-12','02-14','02-22','03-07','03-17','04-01','05-01','05-14','06-01','06-21','06-14','07-01','08-01','09-01','10-01','10-31','11-01','12-24','12-25']

# df_data['visitStartTime'] = pd.to_datetime(df_data['visitStartTime'],unit='s')
# df_data['month'] = df_data['visitStartTime'].dt.month
# df_data['week'] = df_data['visitStartTime'].dt.week
# df_data['day'] = df_data['visitStartTime'].dt.day
# df_data['hour'] = df_data['visitStartTime'].dt.hour
# df_data["week_day"] = df_data['visitStartTime'].dt.weekday
# for h in holiday:
#     h_month,h_day = h.split('-')
#     df_data['is_holiday'] = ((df_data['month']==int(h_month))&(df_data['day']==int(h_day))|(df_data['week_day']==5)|(df_data['week_day']==6))*1

# #某日前两天、和后两天是否为假期
# df_data['prev_day_is_holiday'] = df_data['is_holiday'].shift().fillna(0)
# df_data['pre2_day_is_holiday'] = df_data['is_holiday'].shift(2).fillna(0)
# df_data['next_day_is_holiday'] = df_data['is_holiday'].shift(-1).fillna(0)
# df_data['next2_day_is_holiday'] = df_data['is_holiday'].shift(-2).fillna(0)

# df_data['month.unique.user.count'] = df_data.groupby('month')['fullVisitorId'].transform('nunique')
# df_data['day.unique.user.count'] = df_data.groupby('day')['fullVisitorId'].transform('nunique')
# df_data['weekday.unique.user.count'] = df_data.groupby('week_day')['fullVisitorId'].transform('nunique')

# df_data['next_session_1'] = (
#     df_data['visitStartTime'] - df_data[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime'].shift(1)
# ).astype(np.int64) // 1e9 // 60 // 60
# df_data['next_session_2'] = (
#     df_data['visitStartTime'] - df_data[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime'].shift(-1)
# ).astype(np.int64) // 1e9 // 60 // 60

# df_data.drop(columns=['visitStarTime'],axis=1,inplace=True)

In [None]:
from datetime import date,timedelta
from funcUtils import *
# Build the kernel weight table PrEp.
# Row i of PrOriginalEp holds i weights (columns 0..i-1); the rest of the row is 0.
PrOriginalEp = np.zeros((2000,2000))
PrOriginalEp[1,0] = 1
PrOriginalEp[2,range(2)] = [0.5,0.5]
for i in range(3,2000):
    scale = (i-1)/2.
    # i evenly spaced points covering [-1, 1]
    x = np.arange(-(i+1)/2.+1, (i+1)/2., step=1)/scale
    # Parabolic (Epanechnikov-shaped) weights 3/4*(1-x^2), normalised to sum to 1;
    # the two end points get weight 0.
    y = 3./4.*(1-x**2)
    y = y/np.sum(y)
    PrOriginalEp[i, range(i)] = y
PrEp = PrOriginalEp.copy()
for i in range(3, 2000):
    # Blend each weight p as (p*i + 1)/(i + 1), which lifts the zero end weights
    # above 0. Rows 1 and 2 are left as they are.
    # NOTE(review): how feat_kernelMedian consumes this table is not visible here.
    PrEp[i,:i] = (PrEp[i,:i]*i+1)/(i+1)

def dateGap(x,y):
    """Return the signed number of days from ``y`` to the date encoded in ``x``.

    Args:
        x: a date written as YYYYMMDD, either as a string ('20170802') or as an
           integer (20170802).
        y: a ``datetime.date`` reference day.

    Returns:
        ``(x - y).days`` -- negative when ``x`` lies before ``y``.
    """
    # str() lets integer-typed date columns work as well as string ones.
    stamp = str(x)
    year, month, day = int(stamp[:4]), int(stamp[4:6]), int(stamp[6:])
    return (date(year, month, day) - y).days

def datePrepro(df):
    """Derive calendar and session-gap features from 'visitStartTime'.

    ``df`` must contain 'visitStartTime' (POSIX seconds) and 'fullVisitorId'.
    The frame is modified in place and returned.  Columns added:

      * month, week (ISO week number), hour, week_day (Monday=0);
      * day: day of month bucketed to 0 (<=7), 1 (8..23) or 2 (>=24);
      * is_holiday: 1 when the month-day is in the holiday list or the day is
        a Saturday/Sunday, else 0;
      * prev_day/pre2_day/next_day/next2_day_is_holiday: ``is_holiday`` of the
        neighbouring ROWS (not neighbouring calendar days);
      * month/day/weekday.unique.user.count: distinct visitors per group;
      * next_session_1 / next_session_2: whole hours between this session and
        the same visitor's previous / next row;
      * days_to_side: min(day of month, days remaining in the month).

    'visitStartTime' is dropped before returning.
    """
    holiday = ['01-01','02-01','02-12','02-14','02-22','03-07','03-17','04-01','05-01','05-14','06-01','06-21','06-14','07-01','08-01','09-01','10-01','10-31','11-01','12-24','12-25']

    df['visitStartTime'] = pd.to_datetime(df['visitStartTime'],unit='s')
    visit_ts = df['visitStartTime']

    df['month'] = visit_ts.dt.month
    # Series.dt.week was removed from newer pandas; prefer isocalendar() when
    # available and fall back to .week on old versions.
    if hasattr(visit_ts.dt, 'isocalendar'):
        df['week'] = visit_ts.dt.isocalendar().week.astype('int64')
    else:
        df['week'] = visit_ts.dt.week
    df['day'] = visit_ts.dt.day
    df['hour'] = visit_ts.dt.hour
    df["week_day"] = visit_ts.dt.weekday

    # Test membership against the whole holiday list at once.  Assigning the
    # flag inside a per-holiday loop would keep only the last entry's result.
    on_listed_holiday = visit_ts.dt.strftime('%m-%d').isin(holiday)
    on_weekend = df['week_day'].isin([5, 6])
    df['is_holiday'] = (on_listed_holiday | on_weekend).astype(int)

    # Holiday flag of the two preceding / following rows.
    # NOTE(review): shift() moves by row, so these only mean "previous/next
    # day" if the frame has exactly one row per day -- confirm intent.
    df['prev_day_is_holiday'] = df['is_holiday'].shift().fillna(0)
    df['pre2_day_is_holiday'] = df['is_holiday'].shift(2).fillna(0)
    df['next_day_is_holiday'] = df['is_holiday'].shift(-1).fillna(0)
    df['next2_day_is_holiday'] = df['is_holiday'].shift(-2).fillna(0)

    df['month.unique.user.count'] = df.groupby('month')['fullVisitorId'].transform('nunique')
    df['day.unique.user.count'] = df.groupby('day')['fullVisitorId'].transform('nunique')
    df['weekday.unique.user.count'] = df.groupby('week_day')['fullVisitorId'].transform('nunique')

    # Hours between this session and the visitor's neighbouring row.  The
    # difference is converted through numpy to nanosecond integers, so rows
    # without a neighbour (NaT) become a very large negative sentinel.
    per_visitor_ts = df[['fullVisitorId', 'visitStartTime']].groupby('fullVisitorId')['visitStartTime']
    for feat_name, offset in (('next_session_1', 1), ('next_session_2', -1)):
        gap_ns = (visit_ts - per_visitor_ts.shift(offset)).values.astype('timedelta64[ns]').astype(np.int64)
        df[feat_name] = gap_ns // 1e9 // 60 // 60

    # Distance of the day from the month boundaries, using the real month
    # length so leap-year Februaries are handled.
    day_of_month = visit_ts.dt.day
    df["days_to_side"] = np.minimum(day_of_month, visit_ts.dt.days_in_month - day_of_month)

    # Coarsen day-of-month into early / middle / late buckets (after the
    # per-day visitor count above has used the exact day).
    df['day'] = df['day'].apply(lambda x:0 if x<=7 else 2 if x>=24 else 1)
    df.drop(columns=['visitStartTime'],inplace=True)

    return df

def getFeatures(df_label,df_train):
    """Build the model matrix for one label window from the history before it.

    Args:
        df_label: sessions in the label window; must carry 'fullVisitorId',
            'visitStartTime' and whatever else ``datePrepro`` needs.
        df_train: historical sessions with a 'day_gap' column (days relative
            to the window start, negative = earlier) plus the ``cat_cols``.

    Steps:
        1. ``datePrepro`` adds calendar columns to both frames.
        2. The ``feat_*`` helpers (from ``funcUtils``, not shown here) are
           called once per statistic; each receives the label frame, a history
           slice, grouping keys, a source column and the output column name,
           and returns the label frame -- presumably with that column joined
           on the keys (TODO confirm against funcUtils).
        3. hits/pageviews ratios are derived from the statistics.
        4. Missing values are filled with 0, the categorical columns are
           attached per visitor and remaining gaps are filled with -1.

    Returns:
        The enriched label frame, one row per input label row.
    """
    df_train = datePrepro(df_train)
    df_label = datePrepro(df_label)

    # Kernel-weighted medians per (visitor, day bucket, weekday) over the full history.
    for col, feat_name in (('totals.hits', 'hits_weekday'),
                           ('totals.visits', 'visits_weekday'),
                           ('totals.pageviews', 'pageviews_weekday'),
                           ('visitNumber', 'visitNumber_weekday')):
        df_label = feat_kernelMedian(df_label,df_train,['fullVisitorId','day','week_day'],
                                     col,PrEp,feat_name)

    # Quantities to aggregate.
    targetCols = ['totals.hits','totals.visits','totals.pageviews','visitNumber','totals.transactionRevenue']
    # (name infix, extra grouping key) for the per-visitor sub-groupings.
    sub_groups = (('month', 'month'), ('week', 'week_day'), ('holiday', 'is_holiday'))
    plain_stats = ((feat_mean, 'mean'), (feat_max, 'max'), (feat_min, 'min'),
                   (feat_std, 'std'), (feat_count, 'count'))

    for i in [21,35,63,140,280,365]:
        # History restricted to the last i days before the window.
        df_select = df_train[df_train.day_gap>=-i].copy()
        for col in targetCols:
            print(col)
            # Per-visitor statistics.
            df_label = feat_median(df_label,df_select,['fullVisitorId'],col,'%s_median_%s'%(col,i))
            df_label = feat_mean(df_label,df_select,['fullVisitorId'],col,'%s_mean_%s'%(col,i))
            df_label = feat_kernelMedian(df_label,df_select,['fullVisitorId'],col,PrEp,'%s_kernelMed_%s'%(col,i))
            for feat_fn, stat in plain_stats[1:]:
                df_label = feat_fn(df_label,df_select,['fullVisitorId'],col,'%s_%s_%i'%(col,stat,i))

            # Per-visitor statistics split by month / weekday / holiday flag.
            for tag, extra_key in sub_groups:
                df_label = feat_kernelMedian(df_label,df_select,['fullVisitorId',extra_key],col,PrEp,
                                             '%s_%s_kernelMed_%s'%(col,tag,i))
                for feat_fn, stat in plain_stats:
                    df_label = feat_fn(df_label,df_select,['fullVisitorId',extra_key],col,
                                       '%s_%s_%s_%i'%(col,tag,stat,i))

        # hits per pageview for each statistic.
        for stat in ('median', 'mean', 'max', 'kernelMed'):
            df_label['hits/pageviews_%s_%i'%(stat,i)] = (
                df_label['totals.hits_%s_%i'%(stat,i)].values
                / df_label['totals.pageviews_%s_%i'%(stat,i)].values)
        for tag, _ in sub_groups:
            for stat in ('mean', 'max', 'kernelMed'):
                df_label['hits/pageviews_%s_%s_%i'%(tag,stat,i)] = (
                    df_label['totals.hits_%s_%s_%i'%(tag,stat,i)].values
                    / df_label['totals.pageviews_%s_%s_%i'%(tag,stat,i)].values)

    df_label.fillna(0,inplace=True)

    # Attach the categorical columns once per visitor.  The history has many
    # rows per visitor, so merging it directly would repeat every label row
    # once per historical session; keep only each visitor's last row (in
    # df_train's existing order) before joining.
    visitor_cats = df_train[['fullVisitorId']+cat_cols].drop_duplicates(subset=['fullVisitorId'],keep='last')
    df_label = df_label.merge(visitor_cats,on=['fullVisitorId'],how='left').fillna(-1)

    return df_label

In [None]:
%time
# Sliding time windows: for each window start, sessions before it are the
# feature history and the following `nday` days are the labelled samples.
# for windows in [14,28,42]:
t2017 = date(2017,8,2)
nday = 14
# nday = windows
all_data = []
print("-"*50+str("训练集构造")+"-"*50)
# Build the training set.
# Day offsets relative to t2017 are computed once; moving the window start back
# by i days simply adds i to every offset, so the row-wise apply is not repeated
# in each iteration.  A plain array is used so the assignment is positional.
base_gap = df_data['date'].apply(lambda x:dateGap(x,t2017)).values
for i in range(nday,nday*(365//nday+1),nday):
# for i in range(nday,nday*(2),nday):
    delta = timedelta(days=i)
    t_begin = t2017 - delta
    print(t_begin)
    df_data['day_gap'] = base_gap + i
    df_feature = df_data[df_data.day_gap<0].copy()
#     df_label = df_data[(df_data.day_gap>=0)&(df_data.day_gap<nday)][['fullVisitorId','date','day_gap','totals.transactionRevenue']].copy()
    df_label = df_data[(df_data.day_gap>=0)&(df_data.day_gap<nday)][
                    ['fullVisitorId','date','visitStartTime','day_gap','totals.transactionRevenue']].copy()
    data_temp = getFeatures(df_label,df_feature)
    all_data.append(data_temp)

train = pd.concat(all_data)
print("-"*50+str("测试集构造")+"-"*50)

# Build the test set: all test sessions are the labels, all train sessions the history.
t_begin = date(2017,8,2)
print(t_begin)
df_label = df_data[df_data.is_test==True].copy()
df_train = df_data[df_data.is_test!=True].copy()
df_label['day_gap'] = df_label['date'].apply(lambda x:dateGap(x,t_begin))
df_train['day_gap'] = df_train['date'].apply(lambda x:dateGap(x,t_begin))
df_label = df_label[
    ['fullVisitorId','date','visitStartTime','day_gap','totals.transactionRevenue']].copy()

test = getFeatures(df_label,df_train)

# Save the feature files (create the output directory on a fresh checkout).
os.makedirs("./features",exist_ok=True)
train.to_csv("./features/train_{}.csv".format(nday),index=None)
test.to_csv("./features/test.csv",index=None)

### 模型训练

In [None]:
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as catb

# Model features: every column of `train` that is not in excluded_cols.
# NOTE(review): 'day_gap' is not in excluded_cols, so it is used as a feature
# if present in `train` -- confirm that is intended.
tr_features = [_f for _f in train.columns if _f not in excluded_cols]
train_y = train['totals.transactionRevenue']

def get_folds(df=None, n_splits=5):
    """Split row positions into folds so that a visitor never spans two folds.

    The distinct 'fullVisitorId' values are sorted and partitioned with
    ``GroupKFold``; each fold is returned as ``[train_positions,
    validation_positions]`` where the positions are integer row offsets
    into ``df``.
    """
    visitor_col = df['fullVisitorId']
    visitors = np.array(sorted(visitor_col.unique()))
    row_positions = np.arange(df.shape[0])

    def rows_of(visitor_subset):
        # Positions of all rows belonging to the given visitors.
        return row_positions[visitor_col.isin(visitor_subset)]

    splitter = GroupKFold(n_splits=n_splits)
    return [
        [rows_of(visitors[trn_vis]), rows_of(visitors[val_vis])]
        for trn_vis, val_vis in splitter.split(X=visitors, y=visitors, groups=visitors)
    ]
# Out-of-fold predictions with visitor-grouped cross-validation.
def get_out_fold(model,x_train,y_train,x_test,n_folds=10):
    """Fit ``model`` on each fold and collect out-of-fold and test predictions.

    Args:
        model: estimator whose ``fit`` accepts ``eval_set``,
            ``early_stopping_rounds`` and ``verbose`` and which exposes
            ``best_iteration_`` (the LightGBM sklearn interface is what the
            notebook passes in).
        x_train: training frame containing 'fullVisitorId' and ``tr_features``.
        y_train: target Series aligned with ``x_train`` (log1p scale).
        x_test: test frame containing ``tr_features``.
        n_folds: number of visitor-grouped folds (default 10).

    Returns:
        (model, test_pred, oof_train_pred).  Both prediction arrays have shape
        (n, 1) and are on the original scale (expm1 applied).  Test predictions
        are averaged over folds.  ``model`` is the same object after its LAST
        fold fit only.
    """
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    folds = get_folds(df=x_train, n_splits=n_folds)

    oof_train_pred = np.zeros((ntrain,))
    oof_test_pred_skf = np.empty((n_folds, ntest))
    x_test_feats = x_test[tr_features]

    for i, (dev_index, val_index) in enumerate(folds):
        x_dev = x_train[tr_features].iloc[dev_index]
        y_dev = y_train.iloc[dev_index]
        x_val = x_train[tr_features].iloc[val_index]
        y_val = y_train.iloc[val_index]

        model.fit(x_dev, y_dev,eval_set=[(x_dev,y_dev),(x_val,y_val)],early_stopping_rounds=500,verbose=100)

        oof_test_pred_skf[i, :] = model.predict(x_test_feats,num_iteration=model.best_iteration_)
        oof_train_pred[val_index] = model.predict(x_val,num_iteration=model.best_iteration_)

    # The target is log1p(revenue) >= 0, so negative predictions are clipped to
    # 0 before inverting the transform -- for the out-of-fold predictions as
    # well as the test predictions, keeping the two consistent.
    oof_test_pred_skf[oof_test_pred_skf<0] = 0
    oof_train_pred[oof_train_pred<0] = 0
    oof_test_pred = np.expm1(oof_test_pred_skf).mean(axis=0)
    oof_train_pred = np.expm1(oof_train_pred)

    print("-"*50+str("model training done！")+"-"*50)

    return model,oof_test_pred.reshape(-1, 1),oof_train_pred.reshape(-1,1)

# LightGBM regressor settings.
# NOTE(review): per the LightGBM parameter docs 'subsample' is an alias of
# 'bagging_fraction' and 'colsample_bytree' an alias of 'feature_fraction';
# both members of each pair are set here with different values (0.9 vs 0.8,
# 0.9 vs 0.7) -- confirm which one is meant to apply and drop the other.
lgb_params={
    'learning_rate': 0.03,
    'objective':'regression',
    'n_estimators':6000,
    'metric':'rmse',
    'num_leaves': 31,
    'verbose': 1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "random_state":42,
    'max_depth': 15,
    'lambda_l2': 0.02,
    'lambda_l1': 0.004,
    'device': 'gpu',
    'gpu_platform_id':1,
    'gpu_device_id': 1,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.7,
    'min_child_samples': 21
}
lgb_est = lgb.LGBMRegressor(**lgb_params)
# Cross-validated fit; returns the estimator plus test and out-of-fold predictions.
lgb_est,oof_lgb_test_pred,oof_lgb_train_pred = get_out_fold(lgb_est,train,train_y,test)
# lgb_est.booster_.save_model('lgb_est_session.txt')

# Pair each prediction with its visitor id.
lgb_train_pred = pd.DataFrame({"fullVisitorId":train['fullVisitorId'].values,"lgb_train_pred":oof_lgb_train_pred.reshape(-1,)})
lgb_test_pred = pd.DataFrame({"fullVisitorId":test['fullVisitorId'].values,"lgb_test_pred":oof_lgb_test_pred.reshape(-1,)})
# lgb_train_pred.to_csv('lgb_train_session_pred.csv',index=False)
# lgb_test_pred.to_csv('lgb_test_session_pred.csv',index=False)

# Feature importance analysis
fig, ax = plt.subplots(figsize=(12,18))
lgb.plot_importance(lgb_est,max_num_features=100, height=0.8, ax=ax)
ax.grid(False)
plt.title("LGBM - Feature Importance", fontsize=10)
plt.show()

In [None]:
def stacking(clf,train_data,test_data,clf_name,class_num=1):
    """Cross-validated training that returns out-of-fold and averaged test predictions.

    Relies on module-level names that must exist before the call: ``folds``
    (number of folds), ``kf`` (iterable of (train_index, test_index) pairs)
    and ``weight_df`` (frame with a "weight" column aligned to ``train_data``).

    Args:
        clf: module/object exposing ``Dataset`` and ``train`` (the notebook
            passes the ``lightgbm`` module).
        train_data: training frame; must contain "totals.transactionRevenue"
            and ``day_gap``.
        test_data: frame to predict for each fold.
        clf_name: label used in printed/logged messages.
        class_num: number of prediction columns (default 1).

    Returns:
        (oof_train_predictions, mean_test_predictions, score_split) where
        score_split is a 3-tuple of strings: mean RMSE over folds for all
        validation rows, for rows with day_gap<=6 and for rows with day_gap>6.

    Side effect: appends the scores to "score_cv.txt".

    NOTE(review): ``test_data`` is passed to predict() unchanged while the
    training matrix drops "totals.transactionRevenue" -- confirm the caller
    supplies matching columns.
    """
    train=np.zeros((train_data.shape[0],class_num))
    test=np.zeros((test_data.shape[0],class_num))
    test_pre=np.empty((folds,test_data.shape[0],class_num))
    cv_scores=[]
    for i,(train_index,test_index) in enumerate(kf):
        tr=train_data.iloc[train_index]
        te=train_data.iloc[test_index]
        # Score the two day_gap segments of the validation fold separately
        te_1=te[te.day_gap<=6].copy()
        te_2=te[te.day_gap>6].copy()
        te_1_x=te_1.drop(["totals.transactionRevenue"], axis=1)
        te_2_x=te_2.drop(["totals.transactionRevenue"], axis=1)
        te_1_y=te_1["totals.transactionRevenue"].values
        te_2_y=te_2["totals.transactionRevenue"].values
        print(te_1.shape)
        print(te_2.shape)

        tr_x=tr.drop(["totals.transactionRevenue"], axis=1)
        tr_y=tr['totals.transactionRevenue'].values
        te_x=te.drop(["totals.transactionRevenue"], axis=1)
        te_y=te['totals.transactionRevenue'].values

        # Per-row sample weights for this fold
        weight_train=weight_df.iloc[train_index]
        weight_test=weight_df.iloc[test_index]

        train_matrix = clf.Dataset(tr_x, label=tr_y,weight=weight_train["weight"])
        test_matrix = clf.Dataset(te_x, label=te_y,weight=weight_test["weight"])

        params = {
            # Feiyang: 10. changed 7 to 8
            'num_leaves': 2 ** 8 - 1,
            'objective': 'regression_l2',
            # Feiyang: 11. changed 8 to 9
            'max_depth': 9,
            'min_data_in_leaf': 50,
            # Feiyang: 12. changed 0.01 to 0.007 and adjusted num_round and early_stopping_rounds below accordingly
            'learning_rate': 0.007,
            'feature_fraction': 0.6,
            # Feiyang: 13. changed 0.75 to 0.8
            'bagging_fraction': 0.8,
            'bagging_freq': 1,
            'metric': 'rmse',
            'device': 'gpu',
            'gpu_platform_id':1,
            'gpu_device_id': 1,
            'num_threads': 4,
            'seed': 2018,
        }

        num_round = 6000
        early_stopping_rounds = 500
        if test_matrix:
            model = clf.train(params, train_matrix,num_round,valid_sets=test_matrix,
                              early_stopping_rounds=early_stopping_rounds
                              )
            # Out-of-fold predictions for this fold's validation rows
            pre= model.predict(te_x,num_iteration=model.best_iteration).reshape((te_x.shape[0],1))
            train[test_index]=pre
            test_pre[i, :]= model.predict(test_data, num_iteration=model.best_iteration).reshape((test_data.shape[0],1))
            pre_1=model.predict(te_1_x,num_iteration=model.best_iteration).reshape((te_1_x.shape[0],1))
            pre_2=model.predict(te_2_x,num_iteration=model.best_iteration).reshape((te_2_x.shape[0],1))
            # RMSE on the whole fold and on each day_gap segment
            cv_scores.append((mean_squared_error(te_y, pre)**0.5,mean_squared_error(te_1_y, pre_1)**0.5,mean_squared_error(te_2_y, pre_2)**0.5))

        print("%s now score is:"%clf_name,cv_scores)
    # Average the per-fold test predictions
    test[:]=test_pre.mean(axis=0)
    print("%s_score_list:"%clf_name,cv_scores)
    print("%s_score_mean:"%clf_name,np.mean(cv_scores))

    score_split=(str(round(np.mean([i[0] for i in cv_scores]),6)),str(round(np.mean([i[1] for i in cv_scores]),6)),str(round(np.mean([i[2] for i in cv_scores]),6)))
    with open("score_cv.txt", "a") as f:
        f.write("%s now score is:" % clf_name + str(cv_scores) + "\n")
        f.write("%s_score_mean:"%clf_name+str(np.mean(cv_scores))+"\n")
        f.write("score_split:"+str(score_split)+"\n")

    return train.reshape(-1,class_num),test.reshape(-1,class_num),score_split


def lgb(train, valid):
    """Run ``stacking`` with the ``lightgbm`` module and return its three results.

    NOTE(review): this function is named ``lgb``, the same name the notebook
    uses as the alias for the lightgbm module (``import lightgbm as lgb``).
    Once this cell runs, ``lgb.LGBMRegressor`` / ``lgb.plot_importance`` in
    other cells resolve to this function and fail on re-run -- consider
    renaming the function together with its call site.
    """
    xgb_train, xgb_test,cv_scores = stacking(lightgbm,train,valid,"lgb")
    return xgb_train, xgb_test,cv_scores

import lightgbm
from sklearn.model_selection import KFold

# NOTE(review): this driver refers to columns ("air_store_id", "hpg_store_id",
# "visit_date", "visitors", "id") and a variable (`slip`) that are not created
# anywhere in the visible notebook; it looks carried over from a different
# dataset and will presumably raise on the first drop() -- confirm and adapt
# before running.
folds = 5
seed = 2018

# Build the model inputs
train_data = train.drop(["air_store_id","hpg_store_id","visit_date"], axis=1)
test_data = test.drop(["air_store_id","hpg_store_id","visit_date"], axis=1)

# Per-row sample weights; both branches of the lambda yield 1, so every row gets weight 1.
weight_df=train[["day_gap"]].copy()
weight_df["weight"]=weight_df["day_gap"].apply(lambda x: 1 if x<=6 else 1)

# split() returns a generator, so `kf` can be iterated only once.
kf = KFold(n_splits=folds, shuffle=True, random_state=seed).split(train)
lgb_train, lgb_test,m = lgb(train_data,test_data)

# Write the offline (cross-validation) results
train["visitors_pre"]=lgb_train
score_result=mean_squared_error(train["visitors"], train["visitors_pre"])**0.5
train["visitors"] = np.clip(np.expm1(train["visitors"]), 0, 1000)
train["visitors_pre"] = np.clip(np.expm1(train["visitors_pre"]), 0, 1000)
train[["air_store_id","visit_date","visitors","visitors_pre"]].to_csv("./offline/offline_cv_%s_%s_%s.csv"%m,index=None)
with open("score_cv.txt", "a") as f:
    f.write("result score is:" + str(score_result) + "\n")
# Write the submission file
df_test["visitors"]=lgb_test
df_test["visitors"] = np.clip(np.expm1(df_test["visitors"]), 0, 1000)
df_test[["id","visitors"]].to_csv("./submission/sub_woo_cv_%s_weight_%s_%s_%s.csv"%(slip,m[0],m[1],m[2]),index=None)


In [None]:
# Preview the first rows of the assembled training frame.
train.head()

In [None]:
def get_totals_feature(df):
    '''
        Aggregate the 'totals.*' traffic counters and 'visitNumber'.

        ``df`` must already contain 'fullVisitorId', 'month', 'day', 'week'
        and 'week_day'.  The frame is modified in place and returned.

        Added columns:
          * per visitor and per day / week / month: sums of visits, hits,
            pageviews and visitNumber, the mean visitNumber, hits per
            pageview and pageviews per visitNumber;
          * across all visitors: mean and sum of hits per day, month and
            weekday, and of pageviews per day and month.

        Missing pageviews are filled with 1.0; missing newVisits and bounces
        with 0.  Existing column names are kept exactly as they are,
        including their historical spellings.
    '''
    df['totals.pageviews'] = df['totals.pageviews'].fillna(1.0)
    df['totals.newVisits'] = df['totals.newVisits'].fillna(0)
    df['totals.bounces'] = df['totals.bounces'].fillna(0)

    # Grouping keys for the three per-visitor time windows.
    day_keys = ['fullVisitorId','month','day']
    week_keys = ['fullVisitorId','month','week']
    month_keys = ['fullVisitorId','month']
    visitor_windows = (('day', day_keys), ('week', week_keys), ('month', month_keys))

    def add_visitor_sums(short_name, source_col):
        # sum_<short_name>_per_<window>_visitor for day, week and month.
        for window, keys in visitor_windows:
            df['sum_%s_per_%s_visitor'%(short_name,window)] = df.groupby(keys)[source_col].transform('sum')

    # Visits, hits and pageviews per visitor per day / week / month.
    add_visitor_sums('visits', 'totals.visits')
    add_visitor_sums('hits', 'totals.hits')
    add_visitor_sums('pageviews', 'totals.pageviews')

    # Hits generated per pageview.
    df['hits/pageViews_per_day_visitor'] = df['sum_hits_per_day_visitor'].values/df['sum_pageviews_per_day_visitor'].values
    df['hits/pageViews_per_week_visitor'] = df['sum_hits_per_week_visitor'].values/df['sum_pageviews_per_week_visitor'].values
    df['hits/pageviews_per_month_visitor'] =df['sum_hits_per_month_visitor'].values/df['sum_pageviews_per_month_visitor'].values

    # Session numbers per visitor per day / week / month.
    add_visitor_sums('visitNumber', 'visitNumber')

    # Mean session number.  The weekly mean groups by 'week', the same key as
    # every other per-week feature here, rather than by weekday.
    df['mean_vistiNumber_per_day_visitor'] = df.groupby(day_keys)['visitNumber'].transform('mean')
    df['mean_visitNumebr_per_week_visitor']= df.groupby(week_keys)['visitNumber'].transform('mean')
    df['mean_visitNumber_per_month_visitor'] = df.groupby(month_keys)['visitNumber'].transform('mean')

    # Pageviews per session number.
    for window, _ in visitor_windows:
        df['pageviews/visitNumber_per_%s_visitor'%window] = (
            df['sum_pageviews_per_%s_visitor'%window].values
            / df['sum_visitNumber_per_%s_visitor'%window].values)

    # Site-wide mean and sum per calendar grouping.
    global_groups = (
        ('hits', 'totals.hits', (('day', ['month','day']), ('month', ['month']), ('week_day', ['month','week_day']))),
        ('pageviews', 'totals.pageviews', (('day', ['month','day']), ('month', ['month']))),
    )
    for metric, source_col, windows in global_groups:
        for window, keys in windows:
            for agg in ('mean', 'sum'):
                df['%s.%s.per.%s'%(agg,metric,window)] = df.groupby(keys)[source_col].transform(agg)

    return df

# NOTE(review): get_totals_feature groups by 'month', 'day', 'week' and
# 'week_day'; in the visible cells those columns are only added to copies inside
# datePrepro, so df_data may not have them at this point -- confirm before running.
df_data = get_totals_feature(df_data)    

print(df_data.shape)