# 0. 路徑

In [90]:
import threading # 因為等等要載入大量資料所以要multi thread
from multiprocessing import Queue

from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier,ExtraTreesClassifier

from xgboost import XGBClassifier

from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.metrics import accuracy_score

import datetime
import numpy as np
import pandas as pd
from numpy import loadtxt

import os
import argparse
from tqdm import tqdm
import matplotlib.pyplot as plt


plt.rcParams['font.family']='SimHei' #顯示中文

#特殊符號: | 

In [2]:
'''
import gc
del big_object
gc.collect()
'''

'\nimport gc\ndel big_object\ngc.collect()\n'

# 1. 載入資料

In [22]:
# 載入資料 A

train = pd.read_csv('training-set.csv', encoding = "utf-8", header=None)
test  = pd.read_csv('testing-set.csv', encoding = "utf-8", header=None)

train_exc = pd.read_csv('exception_train.txt', encoding = "utf-8", header=None)
test_exc = pd.read_csv('exception_testing.txt', encoding = "utf-8", header=None)

train.columns=['FileID','label']
test.columns=['FileID','label']


# 確認排除的FileID在training set裡面找不到

for item in train_exc:
    print(train[train['FileID']==item])
    
for item in test_exc:
    print(test[test['FileID']==item])

Empty DataFrame
Columns: [FileID, label]
Index: []
Empty DataFrame
Columns: [FileID, label]
Index: []


In [3]:
# 將 FileID 切割，免得等等跑不動

train = train.sort_values('FileID')

n = 10
a = 0
b = int(len(train)/n)

train1 = pd.DataFrame()

for i in range(1, n+1):
    tr = train.iloc[a:b, :]
    tr['batch'] = i
    a = b
    b = b + int(len(train)/n)
    train1 = train1.append(tr)

print(train1.head(3))
print(train1.tail(3))
train1.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


                             FileID  label  batch
0  0000e2398b12121a85166fed5fe2a3da      0      1
1  0001fe8dce14ce099aa6ca8ea5026ea7      0      1
2  00027f50019000accc492e5684efc818      0      1
                                 FileID  label  batch
52507  fff57a378fbc3bc458b7cc9171ce7c31      0     10
52508  fff6851ddbf675b950e1184c988b984e      1     10
52509  fff6e0c69b020e121bce461aa1b4e447      1     10


(52510, 3)

In [None]:
# 載入資料 B 

import glob
path = '/data/examples/trend/data/query_log'

all_files = glob.glob(os.path.join(path, "*.csv")) # advisable to use os.path.join as this makes concatenation OS independent
file = [pd.read_csv(f, names=['FileID','CustomerID','QueryTs','ProductID']) for f in all_files]
raw  = pd.concat(file, ignore_index=True)

print(raw.shape)
raw.head()

In [5]:
raw['cnt'] = 1
raw['date'] = pd.to_datetime(raw['QueryTs'], unit='s').dt.date
raw['time'] = pd.to_datetime(raw['QueryTs'], unit='s').dt.time
raw['hour'] = pd.to_datetime(raw['QueryTs'], unit='s').dt.hour
raw['week'] = pd.to_datetime(raw['QueryTs'], unit='s').dt.weekday_name

raw.tail()

Unnamed: 0,FileID,CustomerID,QueryTs,ProductID,cnt,date,time,hour,week
83273105,ad141ec00374be238c2476b2f2c499e4,2956a9498da8ad0ce55015ce67ef4693,1493251196,55649,1,2017-04-26,23:59:56,23,Wednesday
83273106,f62bfd843961a7c4a69f403963867966,5065b2835b355439c2b765f22e2702f4,1493251196,55649,1,2017-04-26,23:59:56,23,Wednesday
83273107,e612c3a78dc931c7693327b93c22b5de,f09c46986311323fc3b75e2e6bf15688,1493251198,55649,1,2017-04-26,23:59:58,23,Wednesday
83273108,1a64991c9ef66ec6332262c26b5303b5,d6bfafcb8123cc30c5a9c1a9b3a43a06,1493251199,55649,1,2017-04-26,23:59:59,23,Wednesday
83273109,f62bfd843961a7c4a69f403963867966,8a3c4f62c058fd5d1ae4e174e7d4bf35,1493251199,55649,1,2017-04-26,23:59:59,23,Wednesday


In [6]:
train_raw = pd.merge(raw, train1, on=['FileID'])
train_raw['ProductID'] = train_raw['ProductID'].astype(str)   # 因為值中有數字文字
train_raw.replace({'ProductID':{'55649': '055649'}})
train_raw = train_raw.sort_values(['CustomerID','FileID','QueryTs'])

print(raw.shape, train.shape, train_raw.shape)
print(train_raw.ProductID.unique())
train_raw.head()

(83273110, 9) (52518, 2) (54242074, 11)
['7acab3' '55649' '055649' '634e6b' 'c76d58' 'c105a0' 'e47f04' '885fab'
 '26a5d0' 'a310bb' 'dd8d4a' 'd465fc' '533133' '262880' 'b93794' '8541a0'
 '218578' '3ea8c3' '05b409' '20f8a5' '0374c4' 'cc3a6a' '8452da' 'aaa9c8'
 '0cdb7a' '3c2be6' '75f310' 'fec24f']


Unnamed: 0,FileID,CustomerID,QueryTs,ProductID,cnt,date,time,hour,week,label,batch
6872637,f096e1c6e0cbaf10389fbf427b4d341f,0000006fa286976bf35ea17f1f19bc7a,1493364274,7acab3,1,2017-04-28,07:24:34,7,Friday,0,10
6872918,f096e1c6e0cbaf10389fbf427b4d341f,0000006fa286976bf35ea17f1f19bc7a,1493531993,7acab3,1,2017-04-30,05:59:53,5,Sunday,0,10
119508,19308434813502167aaef38f578981a4,00000145d9062eada528bace5fb4864e,1490544224,7acab3,1,2017-03-26,16:03:44,16,Sunday,0,1
32148560,ee6a1280be5c96d7b2461de6b7578180,00000145d9062eada528bace5fb4864e,1492708112,7acab3,1,2017-04-20,17:08:32,17,Thursday,0,10
32151096,ee6a1280be5c96d7b2461de6b7578180,00000145d9062eada528bace5fb4864e,1492962863,7acab3,1,2017-04-23,15:54:23,15,Sunday,0,10


In [None]:
'''
observe1 = train_raw.groupby(['FileID','CustomerID','ProductID'])[['cnt','label']].sum()
observe1 = observe1.sort_values('label', ascending=0)
aa = observe1.loc[observe1['label']==1]
aa.head()

observe2 = train_raw.groupby(['week'])[['cnt','label']].sum() 
observe2['fr_rate'] = observe2['label'] / observe2['cnt']
observe2 = observe2.sort_values('fr_rate', ascending=0)
print(observe2)

observe3 = train_raw.groupby(['ProductID'])[['cnt','label']].sum() 
observe3['fr_rate'] = observe3['label'] / observe3['cnt']
observe3 = observe3.sort_values('fr_rate', ascending=0)
print(observe3)
'''

In [None]:
# 其他常用資料整理
'''
for dfB in full_data:  
    dfB['ColumnA'] = dfB['ColumnA'].fillna(dfA['ColumnA'].median())#刪除空值，用中位數代替  

df['columnB'] = pd.qcut(df['columnA'], 4)  #分為四等份
'''

# 2A. Feature Engineering

In [7]:
# aggregate

train_agg = pd.DataFrame()
for i in range(10):
    
    tr = train_raw.loc[train_raw['batch'] == i+1]

    # 單日File被使用幾次
    DayFil    = tr.groupby(['FileID', 'date']).size()
    DayFilMax = DayFil.groupby(level=0).max()
    DayFilMin = DayFil.groupby(level=0).min()
    DayFilMea = DayFil.groupby(level=0).mean()


    DayCs  = tr.groupby(['FileID', 'date', 'CustomerID']).size() # 可以先拿掉
    DayPr  = tr.groupby(['FileID', 'date', 'ProductID']).size()
    Pr     = tr.groupby(['FileID', 'ProductID']).size()
    Cs     = tr.groupby(['FileID', 'CustomerID']).size()
    Day    = tr.groupby(['FileID', 'date']).size()

    # 單日File被多少客人使用
    DayTCs    = DayCs.groupby(level=(0,1)).size()
    DayTCsMax = DayTCs.groupby(level=0).max()
    DayTCsMin = DayTCs.groupby(level=0).min()
    DayTCsMea = DayTCs.groupby(level=0).mean()

    # 單日File被同一客人使用幾次
    DaySCsMax = DayCs.groupby(level=0).max()
    DaySCsMin = DayCs.groupby(level=0).min()
    DaySCsMea = DayCs.groupby(level=0).mean()

    # 單日File被多少產品使用
    DayTPr    = DayPr.groupby(level=(0,1)).size()
    DayTPrMax = DayTPr.groupby(level=0).max()
    DayTPrMin = DayTPr.groupby(level=0).min()
    DayTPrMea = DayTPr.groupby(level=0).mean()

    # 單日File被同一產品使用幾次
    DaySPrMax = DayPr.groupby(level=0).max()
    DaySPrMin = DayPr.groupby(level=0).min()
    DaySPrMea = DayPr.groupby(level=0).mean()


    TCs    = Cs.groupby(level=(0)).size() # 總共File被多少客人使用
    TPr    = Pr.groupby(level=(0)).size() # 總共File被多少產品使用
    Day    = Day.groupby(level=(0)).size() # 總共File被使用幾天
    Fil    = tr.groupby(['FileID']).size() # 總共File被使用幾次

    train_a = pd.concat([DayFilMax, DayFilMin, DayFilMea,
                           DayTCsMax, DayTCsMin, DayTCsMea,
                           DaySCsMax, DaySCsMin, DaySCsMea,
                           DayTPrMax, DayTPrMin, DayTPrMea,
                           DaySPrMax, DaySPrMin, DaySPrMea,
                           TCs,    TPr,   Day,    Fil,     ], axis=1)

    train_agg = train_agg.append(train_a)
    print(i+1)
    
train_agg.columns = ['DayFilMax', 'DayFilMin', 'DayFilMea',
                     'DayTCsMax', 'DayTCsMin', 'DayTCsMea',
                     'DaySCsMax', 'DaySCsMin', 'DaySCsMea',
                     'DayTPrMax', 'DayTPrMin', 'DayTPrMea',
                     'DaySPrMax', 'DaySPrMin', 'DaySPrMea',
                     'TCs',   'TPr',  'Day',  'Fil'      ]

train_agg = pd.DataFrame(train_agg).reset_index()
train_agg.head()

1
2
3
4
5
6
7
8
9
10


Unnamed: 0,FileID,DayFilMax,DayFilMin,DayFilMea,DayTCsMax,DayTCsMin,DayTCsMea,DaySCsMax,DaySCsMin,DaySCsMea,DayTPrMax,DayTPrMin,DayTPrMea,DaySPrMax,DaySPrMin,DaySPrMea,TCs,TPr,Day,Fil
0,0000e2398b12121a85166fed5fe2a3da,43,1,15.666667,2,1,1.333333,38,1,11.75,2,1,1.333333,38,1,11.75,3,2,3,47
1,0001fe8dce14ce099aa6ca8ea5026ea7,66,3,39.0,66,3,39.0,1,1,1.0,4,1,3.0,45,1,13.0,234,4,6,234
2,00027f50019000accc492e5684efc818,253,8,75.6,11,1,6.0,117,1,12.6,5,1,3.2,137,1,23.625,22,6,5,378
3,00028c9da3573ec50db74b44310ae507,339,4,61.333333,107,3,20.833333,55,1,2.944,5,1,2.5,271,1,24.533333,113,5,6,368
4,0003dc8130969abe688cadf5f14ea19f,115,1,39.166667,75,1,27.5,18,1,1.424242,5,1,2.666667,87,1,14.6875,130,5,6,235


In [8]:
train_agg.shape

(52510, 20)

In [None]:
# one-hot encoding

train1 = pd.DataFrame()
train2 = pd.DataFrame()
train3 = pd.DataFrame()

for i in range(10):
    
    tr = train_raw.loc[train_raw['batch'] == i+1]

    Pr = tr[['FileID', 'ProductID']]
    Hr = tr[['FileID', 'hour']]
    Wk = tr[['FileID', 'week']]

    train_prd = pd.concat([Pr, pd.get_dummies(Pr.ProductID)], 1).groupby(['FileID']).sum().reset_index()
    train_hur = pd.concat([Hr, pd.get_dummies(Hr.hour)], 1).groupby(['FileID']).sum().reset_index()
    train_wek = pd.concat([Wk, pd.get_dummies(Wk.week)], 1).groupby(['FileID']).sum().reset_index()

    '''
    train_dum = pd.concat([all_v2[['FileID','CustomerID']], train_prd, train_hur, train_wek], axis=1) 
    train_dum2= train_dum.groupby(['FileID']).sum()
    train_dum3= train_dum.groupby(['FileID']).std()
    '''
    
    train1 = train1.append(train_prd)
    train2 = train2.append(train_hur)
    train3 = train3.append(train_wek)
    
    print(i+1)
    
train_dum = pd.merge(train1, train2, on='FileID')
train_dum = pd.merge(train_dum, train3, on='FileID')

train_dum.head()

1
2
3
4
5
6
7
8
9
10


Unnamed: 0,0374c4,055649,05b409,0cdb7a,20f8a5,218578,262880,26a5d0,3c2be6,3ea8c3,...,21,22,23,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,0.0,38.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,43.0,3.0,0.0,0.0,1.0
1,0.0,44.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,12.0,21.0,66.0,48.0,33.0,45.0,39.0,0.0,3.0
2,0.0,11.0,,0.0,0.0,0.0,0.0,0.0,0.0,117.0,...,7.0,17.0,6.0,80.0,0.0,8.0,0.0,27.0,10.0,253.0
3,0.0,3.0,,0.0,0.0,0.0,0.0,0.0,0.0,64.0,...,2.0,1.0,4.0,0.0,4.0,339.0,6.0,9.0,4.0,6.0
4,0.0,20.0,,0.0,0.0,0.0,0.0,0.0,0.0,38.0,...,0.0,7.0,8.0,7.0,0.0,2.0,1.0,50.0,60.0,115.0


In [None]:
# 找回regular_X
# 新增300

In [None]:
# 使用間隔

def regular(x):
    return ( x%3600 <= 10 ) | ( x%3600 >= 3590 )

#print(regular(np.array([1,3599,3601,450,720,4150])))

def outlier(x):
    Q1 = np.percentile(x, 25)
    Q3 = np.percentile(x, 75)
    h = Q3 - Q1
    return ((Q3 + 0.5*h) >= x) & (x >= (Q1 - 0.5*h))

#print(outlier(np.array([1,100,100,100,100,100,100])))

train1 = pd.DataFrame()
train2 = pd.DataFrame()

for i in range(10):
    
    tr = train_raw.loc[train_raw['batch'] == i+1]

    tr = tr[['CustomerID','FileID','QueryTs','label']]

    tr['Delta1']    = tr.groupby(['FileID'])['QueryTs'].diff(1) # diff 1 or 2 or n : 差幾個interval
    tr['Delta1_d']  = tr.groupby(['FileID'])['Delta1'].diff(1)
    tr['Delta1_d']  = abs(tr.groupby(['FileID'])['Delta1'].diff(1)) # 絕對值
    
    tr['Rank']      = tr.groupby(['FileID'])['QueryTs'].rank() # 順序排名

    #crosstab 也可以 (樞紐)

    #train_freq['Delta1']    = train_freq.groupby(['FileID'])['QueryTs'].transform(lambda x: x-x.shift(1))
    #train_freq['Delta1_d']  = train_freq.groupby(['FileID'])['Delta1'].transform(lambda x: abs(x-x.shift(1)))
    #train_freq['Delta2']    = train_freq.groupby(['CustomerID','FileID'])['QueryTs'].transform(lambda x: x-x.shift(1))
    #train_freq['Delta2_d']  = train_freq.groupby(['CustomerID','FileID'])['Delta2'].transform(lambda x: abs(x-x.shift(1)))

    # 測: 是否為3600秒一單位)

    train_freq1 = tr.drop(['Delta1_d'], axis=1).dropna() #.loc[train_freq['label']==0]
    train_freq1['Regular'] = train_freq1['Delta1'].apply(regular)
    train_freq1mean = train_freq1.groupby(['FileID'])['Regular'].mean().reset_index()
    train_freq1max  = train_freq1.groupby(['FileID'])['Regular'].max().reset_index()
    train_freq1 = pd.merge(train_freq1mean, train_freq1max, on=['FileID'], how='outer')

    # 測: 規律性 (標準差)

    train_freq2 = tr.dropna() # .loc[train_freq['label']==0]
    train_freq2['Delta_OL'] = train_freq2.groupby(['CustomerID','FileID'])['Delta1_d'].transform(outlier)
    train_freq2 = train_freq2[train_freq2['Delta_OL'] != 0]
    train_freq2['Delta_sd'] = train_freq2.groupby(['CustomerID','FileID'])['Delta1_d'].transform(lambda x: np.std(x)/np.mean(x))
    train_freq2mean = train_freq2.groupby(['FileID'])['Delta_sd'].mean().fillna(0).reset_index()
    train_freq2max  = train_freq2.groupby(['FileID'])['Delta_sd'].max().fillna(0).reset_index()
    train_freq2 = pd.merge(train_freq2mean, train_freq2max, on=['FileID'], how='outer')

    train1 = train1.append(train_freq1)
    train2 = train2.append(train_freq2)
    print(i+1)

train_freq = pd.merge(train1, train2, on='FileID')
train_freq.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1
2
3
4
5
6


In [None]:
print(train_freq, train_dum.shape, train_agg.shape)

# 2B. 資料清整

In [None]:
# 全部合併

train_all = pd.merge(train_freq, train_dum, on=['FileID'], how='outer')
train_all = pd.merge(train_all, train_agg, on=['FileID'], how='outer')

train_all.head()

# index & column 相merge時: train_all = pd.merge(train_all, train_agg, how='left', left_on=['FileID'], right_index=True)

In [None]:
print(train_all.columns)
train_all.to_csv("train_all.csv")

In [23]:
# read
train_all = pd.read_csv("train_all_3m.csv")
train_all.shape

  interactivity=interactivity, compiler=compiler, result=result)


(52510, 85)

In [24]:
# 去除 outlier

train_all.columns
print(train_all.head())
print(train_all.isnull().sum())

   Unnamed: 0                            FileID  Regular_x Regular_y  \
0           0  0000e2398b12121a85166fed5fe2a3da   0.847826      True   
1           1  0001fe8dce14ce099aa6ca8ea5026ea7   0.004292      True   
2           2  00027f50019000accc492e5684efc818   0.551724      True   
3           3  00028c9da3573ec50db74b44310ae507   0.220708      True   
4           4  0003dc8130969abe688cadf5f14ea19f   0.047009      True   

   Delta_sd_x  Delta_sd_y  0374c4  055649  05b409  0cdb7a ...   DayTPrMax  \
0    0.720793    0.808290     0.0    38.0     NaN     0.0 ...           2   
1    0.000000    0.000000     0.0    44.0     NaN     0.0 ...           4   
2    1.462955    2.027292     0.0    11.0     NaN     0.0 ...           5   
3    0.471363    1.294782     0.0     3.0     NaN     0.0 ...           5   
4    0.298189    0.964996     0.0    20.0     NaN     0.0 ...           5   

   DayTPrMin  DayTPrMea  DaySPrMax  DaySPrMin  DaySPrMea  TCs  TPr  Day  Fil  
0          1   1.333333  

In [25]:
# 補值 (interval: 同一file同一customer之間的時間差)

train_all[['Regular_x']]  = train_all[['Regular_x']].fillna(value=0)    # 沒有interval
train_all[['Regular_y']]  = train_all[['Regular_y']].fillna(value=0)    # 沒有interval
train_all[['Delta_sd_x']] = train_all[['Delta_sd_x']].fillna(value=100) # 沒有兩個以上interval
train_all[['Delta_sd_y']] = train_all[['Delta_sd_y']].fillna(value=100) # 沒有兩個以上interval
train_all                 = train_all.fillna(value=0)                   # 將所有產品補上0

check1 = len(train_all[train_all['Regular_x'].isnull()])
check2 = len(train_all[train_all['Regular_y'].isnull()])
check3 = len(train_all[train_all['Delta_sd_x'].isnull()])
check4 = len(train_all[train_all['Delta_sd_y'].isnull()])
check5 = len(train_all[train_all['20f8a5'].isnull()])
check6 = len(train_all[train_all['DayTCsMea'].isnull()])

print(check1)
print(check2)
print(check3)
print(check4)
print(check5)
print(check6)

train_all[['Regular_y']] = train_all[['Regular_y']].astype(float)

0
0
0
0
0
0


In [None]:
# Normalization
'''
def normalize(x, axis, method, minmax_range =(0,1)):
    if method == 'z-score':
        scale_a = preprocessing.scale(a, axis=axis)
    elif method== 'minmax':    
        scale_a = preprocessing.minmax_scale(a, axis=axis, feature_range=minmax_range) #default feature range 0~1
    return scale_a
axis =0
scale_a1 = normalize(a, axis, method = 'z-score')
scale_a2 = normalize(a, axis, method = 'minmax', minmax_range=(0,1))
print(scale_a1)
'''

# 3. 探索性資料分析

In [None]:
# 散布圖
'''
color = "rbg"
color = [color[y[i]] for i in range(len(y))]

plt.subplot(121)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.title('Actual')
plt.xlabel('X1')
plt.ylabel('X2')

plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=prediction)
plt.title('Prediction')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()
'''

In [None]:
# 長條圖

In [None]:
# feature correlation

colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
# Pairplots

g = sns.pairplot(train[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',
       u'FamilySize', u'Title']], hue='Survived', palette = 'seismic',size=1.2,diag_kind = 'kde',diag_kws=dict(shade=True),plot_kws=dict(s=10) )
g.set(xticklabels=[])

# 4. 評估模型

In [31]:
# 迭代 train model - 避免imbalance (test data不用拆，直接看AUC&ROC分數即可)

'''
def train_batch_generator(x, y, bs):
    badIndex = y[y == 1].index
    goodIndex = y[y == 0].index

    while(True):
        newBad_ind = shuffle(badIndex)
        newgood_ind = shuffle(goodIndex)

        newBad_ind = newBad_ind[:int(bs/2)]
        newgood_ind = newgood_ind[:int(bs/2)]

        batch_x = x.loc[newBad_ind]
        batch_y = y.loc[newBad_ind]

        batch_x = batch_x.append(x.loc[newgood_ind], ignore_index=True)
        batch_y = batch_y.append(y.loc[newgood_ind], ignore_index=True)

        yield batch_x, batch_y
'''

In [26]:
# 分成 train & test

train = pd.merge(train_all, train, on=['FileID'], how='left')
train = train.sort_values(['FileID'])

x = train.drop(['FileID','label'], axis=1)
y = train['label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state= 68, shuffle=False)

'''
gen = train_batch_generator(x_train, y_train, 10000)
bat_x, bat_y = next(gen)
print(bat_x, bat_y)
'''

'\ngen = train_batch_generator(x_train, y_train, 10000)\nbat_x, bat_y = next(gen)\nprint(bat_x, bat_y)\n'

In [45]:
y_tr = y_train.to_frame()
y_te = y_test.to_frame()

train_tr = pd.concat([x_train, y_tr], axis=1, join='outer')
train_te = pd.concat([x_test,  y_te], axis=1, join='outer')

In [46]:
print(x_train.shape)
print(y_train.shape)

print(x_test.shape)
print(y_test.shape)

print(train_tr.shape)
print(train_te.shape)

(35181, 84)
(35181,)
(17329, 84)
(17329,)
(35181, 85)
(17329, 85)


In [48]:
train_te.tail()

Unnamed: 0.1,Unnamed: 0,Regular_x,Regular_y,Delta_sd_x,Delta_sd_y,0374c4,055649,05b409,0cdb7a,20f8a5,...,DayTPrMin,DayTPrMea,DaySPrMax,DaySPrMin,DaySPrMea,TCs,TPr,Day,Fil,label
52443,52443,0.015625,1.0,0.170553,0.999552,0.0,26.0,0.0,0.0,0.0,...,2,4.333333,245,1,17.269231,354,7,6,449,1
52444,52444,0.130435,1.0,1.039173,1.436399,0.0,25.0,0.0,0.0,0.0,...,2,3.0,54,1,9.0,21,4,6,162,0
52445,52445,0.003597,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,2,3.166667,189,1,43.947368,835,4,6,835,0
52446,52446,0.0,0.0,1.0234,1.0234,0.0,0.0,0.0,0.0,0.0,...,1,1.0,32,32,32.0,1,1,1,32,1
52447,52447,0.40625,1.0,0.528375,1.052461,0.0,0.0,0.0,0.0,0.0,...,1,2.5,19,1,6.6,11,4,2,33,1


In [None]:
LogisticRegression()     # logistic
Ridge()                  # Ridge
Lasso()                  # Lasso
DecisionTreeClassifier() # Decision Tree

In [29]:
# SGD

from sklearn.model_selection import cross_val_score

model = SGDClassifier(loss='hinge', verbose=False, max_iter=1000, class_weight='balanced')
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc') # cv: 
print(scores)

'''
model = SGDClassifier(loss='log', penalty='l2', alpha=0.000001, l1_ratio=0.15, 
                      fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, 
                      epsilon=0.2,n_jobs=1, random_state=123, learning_rate='optimal', eta0=0, 
                      power_t=0.2, class_weight='balanced', warm_start=False, average=False, n_iter=2000)
model.fit(x_train, y_train)

probs = model.predict_proba(x_test)
auc   = metrics.roc_auc_score(y_test, [x[1] for x in probs])
print(auc)
'''

[ 0.82465851  0.77392795  0.80954103  0.81134081  0.76431997]


"\nmodel = SGDClassifier(loss='log', penalty='l2', alpha=0.000001, l1_ratio=0.15, \n                      fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, \n                      epsilon=0.2,n_jobs=1, random_state=123, learning_rate='optimal', eta0=0, \n                      power_t=0.2, class_weight='balanced', warm_start=False, average=False, n_iter=2000)\nmodel.fit(x_train, y_train)\n\nprobs = model.predict_proba(x_test)\nauc   = metrics.roc_auc_score(y_test, [x[1] for x in probs])\nprint(auc)\n"

In [77]:
# momentum
model = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', 
                      alpha=0.0001, batch_size='auto', learning_rate='constant', 
                      learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                      random_state=566, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                      nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, 
                      beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.fit(x_train, y_train)

probs = model.predict_proba(x_test)
auc   = metrics.roc_auc_score(y_test, [x[1] for x in probs])
print(auc)

0.832757592775


In [None]:
# SVM
'''
model = SVC(kernel='poly',coef0=0, degree=3)
model.fit(x_train, y_train)
print(model.roc_auc_score(x_test, y_test))
'''

In [19]:
# XGBoost

model = XGBClassifier()
model.fit(x_train, y_train 
          #, eval_set=[(x_train, y_train), (x_test, y_test)]
          #, eval_metric='auc'
          #, verbose=False
         )
   
probs1 = model.predict_proba(x_train)
probs2 = model.predict_proba(x_test)

auc1   = metrics.roc_auc_score(y_train, [x[1] for x in probs1])
auc2   = metrics.roc_auc_score(y_test, [x[1] for x in probs2])
    
print(auc1, auc2)

0.920483011854 0.913766705764


# 5A 組合模型 (Ensemble): KFold + Stacking 

In [49]:
# 模型評估時
train_xy = train_tr
test_xy  = train_te

train_x  = x_train
train_y  = y_train

test_x  = x_test
test_y  = y_test

'''
# 真正預測時
train_xy = train
test_xy  = test

train_x  = x
train_y  = y

test_x  = test
'''

'\n# 真正預測時\ntrain_xy = train\ntest_xy  = test\n\ntrain_x  = x\ntrain_y  = y\n\ntest_x  = test\n'

In [74]:

ntrain = train_xy.shape[0] 
ntest  = test_xy.shape[0]  
NFOLD = 5
SEED = 0

kf = KFold(n_splits= NFOLD, random_state=0)   # 參數中有default值的要

class SklearnHelper: 

    def __init__(self, clf, params, seed=SEED):   # 初始化参数  
        params['random_state'] = seed  
        self.clf = clf(**params)  

    def train(self, x, y):  
        self.clf.fit(x, y)  
  
    def predict(self, x):  
        return self.clf.predict(x)  
      
    def fit(self, x, y):  
        return self.clf.fit(x, y)  
      
    def feature_importances(self, x, y):  
        print(self.clf.fit(x, y).feature_importances_)  
  

In [96]:
kf.split(train_x)
for train_index, test_index in kf.split(train_x):
    print(train_index, test_index)

[ 7037  7038  7039 ..., 35178 35179 35180] [   0    1    2 ..., 7034 7035 7036]
[    0     1     2 ..., 35178 35179 35180] [ 7037  7038  7039 ..., 14070 14071 14072]
[    0     1     2 ..., 35178 35179 35180] [14073 14074 14075 ..., 21106 21107 21108]
[    0     1     2 ..., 35178 35179 35180] [21109 21110 21111 ..., 28142 28143 28144]
[    0     1     2 ..., 28142 28143 28144] [28145 28146 28147 ..., 35178 35179 35180]


In [82]:
def get_oof(clf, kf_x_tr, kf_y_tr, kf_x_te):
    
    oof_train    = np.zeros((ntrain,))  
    oof_test     = np.zeros((ntest,))  
    oof_test_skf = np.empty((NFOLDS, ntest))        # 創建一個內容隨機並且依賴與內存狀態的數  
  
    index=0
    for train_index, test_index in kf.split(train_x):  # kfold，i為次數 
        x_tr = kf_x_tr[train_index]  # index就是row的編號
        y_tr = kf_y_tr[train_index]  
        x_te = kf_x_tr[test_index]
  
        clf.train(x_tr, y_tr) 
  
        oof_train[test_index]  = clf.predict(x_te) 
        oof_test_skf[index, :] = clf.predict(kf_x_te) 
        index += 1
  
    oof_test[:] = oof_test_skf.mean(axis=0)  
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)  


# 5B 設定模型參數 

In [84]:

# Random Forest  
rf_params = {  
    'n_jobs': -1,  
    'n_estimators': 500,  
    'warm_start': True,   
     #'max_features': 0.2,  
    'max_depth': 6,  
    'min_samples_leaf': 2,  
    'max_features' : 'sqrt',  
    'verbose': 0  
}  
  
# Extra Trees  
et_params = {  
    'n_jobs': -1,  
    'n_estimators':500,  
    #'max_features': 0.5,  
    'max_depth': 8,  
    'min_samples_leaf': 2,  
    'verbose': 0  
}  
  
# AdaBoost   
ad_params = {  
    'n_estimators': 500,  
    'learning_rate' : 0.75  
}  
  
# Gradient Boost  
gb_params = {  
    'n_estimators': 500,  
    #'max_features': 0.2,  
    'max_depth': 5,  
    'min_samples_leaf': 2,  
    'verbose': 0  
}  
  
# SVM  
sv_params = {  
    'kernel':'poly',  
    'C' : 0.025  
    }  


# 5C Stacking 第一層

In [92]:
# 每種算法的特徵重要性

rf = SklearnHelper(clf=RandomForestClassifier,     seed=SEED, params=rf_params)  
et = SklearnHelper(clf=ExtraTreesClassifier,       seed=SEED, params=et_params)  
ad = SklearnHelper(clf=AdaBoostClassifier,         seed=SEED, params=ad_params)  
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)  
# sv = SklearnHelper(clf=SVC,                        seed=SEED, params=sv_params) 

rf_feature  = rf.feature_importances(train_x, train_y)  
et_feature  = et.feature_importances(train_x, train_y)  
ad_feature  = ad.feature_importances(train_x, train_y)  
gb_feature  = gb.feature_importances(train_x, train_y)
# sv_feature  = sv.feature_importances(train_x, train_y)


[  7.52030156e-04   3.93923353e-02   8.12895534e-04   2.50997783e-02
   4.40477753e-03   0.00000000e+00   2.58722242e-03   0.00000000e+00
   4.45869130e-05   2.90104585e-03   1.53417954e-04   3.73357846e-06
   0.00000000e+00   0.00000000e+00   4.57033378e-02   5.06917291e-03
   5.15835538e-03   1.98860858e-02   2.45135395e-04   1.40301477e-02
   1.63399709e-04   1.75305032e-04   7.83697699e-03   0.00000000e+00
   0.00000000e+00   9.40999918e-04   1.42358252e-02   6.53566031e-03
   0.00000000e+00   4.82984526e-03   7.21177861e-04   9.59817587e-03
   0.00000000e+00   1.21379092e-02   9.88523105e-03   7.12139497e-03
   9.60219624e-03   4.12418117e-03   4.06963045e-03   5.15233077e-03
   3.05484322e-03   6.42481215e-03   1.57793623e-02   9.64549649e-03
   6.92016540e-03   5.11054754e-03   5.98985039e-03   7.50162459e-03
   5.97963748e-03   4.46897937e-03   3.83672089e-03   4.20061273e-03
   3.11550542e-03   4.37653859e-03   4.58177241e-03   5.72965797e-03
   4.34246980e-03   1.25180704e-02

In [94]:
kf_x_tr = x_train.values 
kf_y_tr = y_train.values
kf_x_te = x_test.values 

In [95]:
# 用各模型預測，結果成為下一層的input  

et_oof_train, et_oof_test = get_oof(et, kf_x_tr, kf_y_tr, kf_x_te) # Extra Trees  
rf_oof_train, rf_oof_test = get_oof(rf, kf_x_tr, kf_y_tr, kf_x_te) # Random Forest  
ad_oof_train, ad_oof_test = get_oof(ad, kf_x_tr, kf_y_tr, kf_x_te) # AdaBoost   
gb_oof_train, gb_oof_test = get_oof(gb, kf_x_tr, kf_y_tr, kf_x_te) # Gradient Boost  
#sv_oof_train, sv_oof_test = get_oof(sv, kf_x_tr, kf_y_tr, kf_x_te) # Support Vector Classifier

x_train_new = np.concatenate(( et_oof_train, rf_oof_train, ad_oof_train, gb_oof_train), axis=1)
x_test_new  = np.concatenate(( et_oof_test,  rf_oof_test,  ad_oof_test,  gb_oof_test),  axis=1)

  warn("Warm-start fitting without increasing n_estimators does not "


In [97]:
x_train_new

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       ..., 
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  1.]])

# 5D Stacking 第二層

In [102]:
model = XGBClassifier()
model.fit(x_train_new, y_train)
predictions = model.predict(x_test_new)

probs1 = model.predict_proba(x_train_new)
probs2 = model.predict_proba(x_test_new)

auc1   = metrics.roc_auc_score(y_train, [i[1] for i in probs1])
auc2   = metrics.roc_auc_score(y_test , [i[1] for i in probs2])
    
print(auc1, auc2)

0.806244239924 0.779379503203


In [None]:
#voting model & results

#models.append(('logistic', model_logistic))
model_cart = DecisionTreeClassifier()
models.append(('cart', model_cart))

#model_svc = SVC(kernel='poly',coef0=0 ,degree=3) #coef/degree都暫選default

#models.append(('svm', model_svc))

#model_RF = RandomForestClassifier()

#models.append(('RF', model_RF))
model_XGB = XGBClassifier( early_stopping_rounds=6, eval_metric="auc",verbose=True)
models.append(('XGB', model_XGB))
model_MO = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', solver='adam', 
                      alpha=0.0001, batch_size='auto', learning_rate='constant', 
                      learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, 
                      random_state=566, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, 
                      nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, 
                      beta_1=0.9, beta_2=0.999, epsilon=1e-08)
models.append(('Momentumn', model_MO))

ensemble_model = VotingClassifier(estimators=models)
result_voting = cross_val_score(ensemble_model, X, Y, cv=kfold)
print('voting result:' + str(result_voting.mean()))

In [None]:
# 美美AUC 圖像

from sklearn.metrics import auc
from sklearn.linear_model import SGDClassifier

svm_clf = SGDClassifier(loss='hinge', verbose=False, max_iter=2000, class_weight='balanced')
svm_clf.fit(x_train, y_train)
score_roc = svm_clf.decision_function(x_test)

fpr, tpr, thresholds = metrics.roc_curve(y_test, score_roc)
roc_auc = auc(fpr, tpr)

fig = plt.figure()
fig.set_figwidth(5)
fig.set_figheight(5)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# 6. 上傳結果

In [None]:
FileID = test['FileID']
predictions = model.predict(test_x)

KaggleSubmission = pd.DataFrame({ 'FileID': FileID,
                                  'label': predictions })
KaggleSubmission.to_csv("KaggleSubmission.csv", index=False)

In [None]:
#import time

#list1=[]
#def func(i):
 #   time.sleep(np.random.randint(0,5))
  #  list1.append(i)