In [None]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import os
# Relative path of the data directory (presumably where the input files are
# read from; the loading code is not in this cell -- confirm).
DATA_PATH = './datasets/'
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
def downCast_dtype(df):
    """Shrink 64-bit numeric columns of ``df`` to 32-bit to save memory.

    ``float64`` columns are cast to ``float32``. ``int64`` columns are cast to
    ``int32`` only when every value fits in the int32 range; a column holding
    larger values is left as ``int64`` because the cast would silently wrap
    around and corrupt the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    int32_limits = np.iinfo(np.int32)

    def fits_int32(col):
        # An empty column has no value that could overflow.
        return col.empty or (
            col.min() >= int32_limits.min and col.max() <= int32_limits.max
        )

    float_cols = [c for c in df if df[c].dtype == 'float64']
    int_cols = [c for c in df if df[c].dtype == 'int64' and fits_int32(df[c])]

    # Skip the assignment entirely when there is nothing to convert.
    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)
    if int_cols:
        df[int_cols] = df[int_cols].astype(np.int32)
    return df

def dateUtils(df=None, timeCol='purchase_date', refDate=datetime.date(2018, 2, 28)):
    """Add calendar and recency features derived from a timestamp column.

    The following columns are written to ``df`` (in place):

    * ``week``      -- ISO week number of the timestamp.
    * ``year``      -- calendar year of the timestamp.
    * ``day_gap``   -- whole days between the timestamp's date and ``refDate``
      (negative for dates before ``refDate``).
    * ``month_gap`` -- ``day_gap`` floor-divided by 30.
    * ``day_diff``  -- difference in ``day_gap`` between a row and the previous
      row of the same ``card_id``, in the frame's current row order (the frame
      is not sorted here); NaN for the first row of each card.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timeCol`` and a ``card_id`` column.
    timeCol : str
        Name of the column holding the timestamps (anything
        ``pandas.to_datetime`` accepts).
    refDate : datetime.date
        Reference date the gaps are measured from.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    dateHandle = pd.to_datetime(df[timeCol])
    if dateHandle.dt.tz is not None:
        # Work on local wall-clock time, which is what taking ``.dt.date``
        # of a tz-aware column yields.
        dateHandle = dateHandle.dt.tz_localize(None)

    # ``Series.dt.week`` was removed in pandas 2.0; use isocalendar() when it
    # exists and fall back to the old accessor on pandas < 1.1.
    if hasattr(dateHandle.dt, 'isocalendar'):
        isoWeek = dateHandle.dt.isocalendar().week
        # isocalendar() returns a nullable UInt32 column; convert it back to a
        # plain numpy dtype (float only when missing dates force NaN).
        df['week'] = isoWeek.astype('float64' if isoWeek.isna().any() else 'int64')
    else:
        df['week'] = dateHandle.dt.week
    df['year'] = dateHandle.dt.year

    # Subtract midnight-normalised datetimes rather than python ``date``
    # objects so the result is a real timedelta column with a ``.dt`` accessor.
    reference = pd.Timestamp(refDate).normalize()
    dayGap = (dateHandle.dt.normalize() - reference).dt.days
    df['month_gap'] = dayGap // 30
    df['day_gap'] = dayGap

    # Gap between consecutive purchases of the same card_id. groupby().diff()
    # keeps the original row order, unlike groupby().apply(), whose output can
    # come back ordered by group and then be misaligned with the rows.
    df['day_diff'] = df.groupby('card_id')['day_gap'].diff(1).values
    return df

def label_encoding(df, encodCols):
    """Replace each column in ``encodCols`` with integer label codes.

    Every column is cast to ``str`` and encoded with a freshly fitted
    ``LabelEncoder``; the fitted encoders are not kept. ``df`` is modified in
    place and returned. Progress over the columns is shown with
    ``tqdm_notebook``.
    """
    for column in tqdm_notebook(encodCols):
        as_text = list(df[column].values.astype('str'))
        encoder = LabelEncoder().fit(as_text)
        df[column] = encoder.transform(as_text)
    return df
def getMeanStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group mean of ``fea`` to ``df_data`` as column ``name``.

    The mean is computed on ``df_feature`` grouped by ``group`` and left-joined
    onto ``df_data`` on ``group``, so rows whose group does not occur in
    ``df_feature`` get NaN. Returns the merged frame; ``df_data`` is not
    modified.
    """
    grouped_mean = df_feature.groupby(group)[fea].mean()
    stat_table = grouped_mean.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getStdStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group standard deviation of ``fea`` as column ``name``.

    Uses pandas' groupby ``std()`` on ``df_feature`` and left-joins the result
    onto ``df_data`` on ``group``. Returns the merged frame; ``df_data`` is
    not modified.
    """
    grouped_std = df_feature.groupby(group)[fea].std()
    stat_table = grouped_std.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getMaxStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group maximum of ``fea`` to ``df_data`` as ``name``.

    The maximum is taken over ``df_feature`` grouped by ``group`` and
    left-joined onto ``df_data`` on ``group``. Returns the merged frame;
    ``df_data`` is not modified.
    """
    grouped_max = df_feature.groupby(group)[fea].max()
    stat_table = grouped_max.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getMedianStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group median of ``fea`` to ``df_data`` as ``name``.

    The median is computed on ``df_feature`` grouped by ``group`` and
    left-joined onto ``df_data`` on ``group``. Returns the merged frame;
    ``df_data`` is not modified.
    """
    grouped_median = df_feature.groupby(group)[fea].median()
    stat_table = grouped_median.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getMinStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group minimum of ``fea`` to ``df_data`` as ``name``.

    The minimum is taken over ``df_feature`` grouped by ``group`` and
    left-joined onto ``df_data`` on ``group``. Returns the merged frame;
    ``df_data`` is not modified.
    """
    grouped_min = df_feature.groupby(group)[fea].min()
    stat_table = grouped_min.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getSumStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group sum of ``fea`` to ``df_data`` as ``name``.

    The sum is computed on ``df_feature`` grouped by ``group`` and left-joined
    onto ``df_data`` on ``group``. Returns the merged frame; ``df_data`` is
    not modified.
    """
    grouped_sum = df_feature.groupby(group)[fea].sum()
    stat_table = grouped_sum.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getCountsStaticsFeatures(df_data, df_feature, group, fea='', name=''):
    """Attach the per-group count of ``fea`` to ``df_data`` as ``name``.

    Uses pandas' groupby ``count()`` on ``df_feature`` (which counts non-null
    entries) and left-joins the result onto ``df_data`` on ``group``. Returns
    the merged frame; ``df_data`` is not modified.
    """
    grouped_count = df_feature.groupby(group)[fea].count()
    stat_table = grouped_count.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
# Find the main merchant / merchant type each user swipes their credit card at.
def getCategoryFrequenceMax(df_data, df_feature, group, fea='', name=''):
    """Attach each group's most frequent value of ``fea`` as column ``name``.

    For every ``group`` in ``df_feature`` the first entry of
    ``value_counts(dropna=False)`` is taken, so missing values take part in
    the count. The result is left-joined onto ``df_data`` on ``group``.
    Returns the merged frame; ``df_data`` is not modified.
    """
    def most_frequent(values):
        return values.value_counts(dropna=False).index[0]

    top_values = df_feature.groupby(group)[fea].apply(most_frequent)
    stat_table = top_values.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
def getCategoryCounts(df_data, df_feature, group, fea='', name=''):
    """Attach the number of distinct ``fea`` values per group as ``name``.

    Missing values count as a distinct value of their own. The per-group
    counts from ``df_feature`` are left-joined onto ``df_data`` on ``group``.
    Returns the merged frame; ``df_data`` is not modified.
    """
    def distinct_count(values):
        return values.nunique(dropna=False)

    distinct = df_feature.groupby(group)[fea].apply(distinct_count)
    stat_table = distinct.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
# Share of a group's historical records that belong to its most-visited shop.
def getCategoryFrequenceMaxRatio(df_data, df_feature, group, fea='', name=''):
    """Attach the share of each group's rows taken by its top ``fea`` value.

    For every ``group`` in ``df_feature`` the first count of
    ``value_counts(dropna=False)`` is divided by the number of rows in the
    group. The ratio is left-joined onto ``df_data`` on ``group`` as column
    ``name``. Returns the merged frame; ``df_data`` is not modified.
    """
    def top_share(values):
        counts = values.value_counts(dropna=False)
        return counts.iloc[0] / len(values)

    shares = df_feature.groupby(group)[fea].apply(top_share)
    stat_table = shares.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')
# Ratio of distinct categories (shops) visited to the number of historical records.
def getCategoryCountsRatio(df_data, df_feature, group, fea='', name=''):
    """Attach distinct-``fea``-values per row for each group as ``name``.

    For every ``group`` in ``df_feature`` the number of distinct ``fea``
    values (missing values included as one value) is divided by the number of
    rows in the group. The ratio is left-joined onto ``df_data`` on ``group``.
    Returns the merged frame; ``df_data`` is not modified.
    """
    def distinct_share(values):
        return values.nunique(dropna=False) / len(values)

    shares = df_feature.groupby(group)[fea].apply(distinct_share)
    stat_table = shares.reset_index().rename(columns={fea: name})
    return df_data.merge(stat_table, on=group, how='left')