In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve

description = pd.read_csv('./column_description.csv',index_col = 'Column' ,encoding='big5')
train = pd.read_csv('./train_offline.csv',encoding='big5')
test = pd.read_csv('./test_offline.csv',encoding='big5')

In [2]:
description

Unnamed: 0_level_0,Description
Column,Unnamed: 1_level_1
User_id,用戶 ID
Merchant_id,商家 ID
Coupon_id,優惠券 ID (null 代表無優惠券消費)
Discount_rate,"優惠券折價：[0,1] 代表折扣率；x:y 代表滿 x 減 y 元"
Distance,"用戶經常活動地點離商家最近距離 (x * 500 公尺), 0 表示低於 500 公尺, 1..."
Date_received,優惠券取得時間
Date,購買商品時間 (如果 Date is null & Coupon_id is not nul...


In [3]:
train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1160742 entries, 0 to 1160741
Data columns (total 7 columns):
User_id          1160742 non-null int64
Merchant_id      1160742 non-null int64
Coupon_id        746969 non-null float64
Discount_rate    746969 non-null object
Distance         1090916 non-null float64
Date_received    746969 non-null float64
Date             456709 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 62.0+ MB


不去care那些沒優惠券的數據

In [5]:
train = train[~train.Coupon_id.isna()]

In [6]:
train.shape

(746969, 7)

In [7]:
test = test[~test.Coupon_id.isna()]

In [8]:
test.shape

(306313, 6)

> 總共給的train 數據，整體只有5%的人用消費券，模型選用時使用分層抽樣，避免imbalance

> 由上面的資料可以得知，總共的資料為1160742筆，但我們感興趣的資料是消費券是否被使用，所以可用資料應為Coupon_id的資料74696筆，共將沒有消費券的給0與有卻沒有使用的都表示為0，把有消費券且用的表示為1。

In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306313 entries, 0 to 594141
Data columns (total 6 columns):
User_id          306313 non-null int64
Merchant_id      306313 non-null int64
Coupon_id        306313 non-null float64
Discount_rate    306313 non-null object
Distance         270136 non-null float64
Date_received    306313 non-null float64
dtypes: float64(3), int64(2), object(1)
memory usage: 16.4+ MB


In [10]:
test.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
3,1439408,2632,8591.0,20:1,0.0,20160516.0
4,2029232,450,1532.0,30:5,0.0,20160530.0
5,2029232,6459,12737.0,20:1,0.0,20160519.0


In [11]:
#Creat target label
"""
According to the definition,
(1)buy with coupon within(include) 15 days ==> 1
(2)buy with coupon but out of 15 days ==> 0
(3)buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
    
train['label'] = train.apply(label, axis = 1)
train['label'].value_counts()

0    710665
1     36304
Name: label, dtype: int64

確認一下training 中Coupon_id、Merchant_id 裡面的值 跟 test中有無重複

In [12]:
train_Coupon = set(list(train['Coupon_id'].unique()))
test_Coupon = set(list(test['Coupon_id'].unique()))

In [13]:
len(train_Coupon.intersection(test_Coupon)) # 1905個重複項

1905

In [14]:
train_Merchant_id = set(list(train['Merchant_id'].unique()))
test_Merchant_id = set(list(test['Merchant_id'].unique()))

In [15]:
len(train_Merchant_id.intersection(test_Merchant_id)) # 2160個重複項

2160

看看重複的店家 佔資料裡面的多少

In [16]:
sum(train['Merchant_id'].apply(lambda x : 1 if x in list(train_Merchant_id.intersection(test_Merchant_id)) else 0))

546562

In [17]:
train.shape #只有將近20萬筆沒有重複

(746969, 8)

將重複的 保留 不重複地給予None 當作特徵，這邊在合併後 一起做

In [18]:
train.drop(['Date'],inplace= True,axis = 1) #將training 多的欄位去掉
all_data = pd.concat([train, test], ignore_index=True)#將training 跟Test 的資料共同處理

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [19]:
all_data.head()

Unnamed: 0,Coupon_id,Date_received,Discount_rate,Distance,Merchant_id,User_id,label
0,8591.0,20160217.0,20:1,0.0,2632,1439408,0.0
1,1078.0,20160319.0,20:1,0.0,2632,1439408,0.0
2,7610.0,20160429.0,200:20,0.0,3381,1832624,0.0
3,11951.0,20160129.0,200:20,1.0,3381,2029232,0.0
4,9776.0,20160129.0,10:5,2.0,3381,2223968,0.0


> 消費次數最多的前10名店家

In [20]:
all_data['Merchant_id'].value_counts().head(10)

3381    122834
450      63042
760      44976
5341     36162
2709     35330
1569     33600
7555     27713
4660     24867
6454     21575
3621     20779
Name: Merchant_id, dtype: int64

In [21]:
all_data['Coupon_id'].value_counts().head(10)

7610.0     46729
2418.0     29284
11951.0    26035
8555.0     26009
1480.0     24815
1807.0     24500
111.0      21760
2840.0     21693
5054.0     21402
14031.0    21059
Name: Coupon_id, dtype: int64

### 資料前處理

將 店家id(Merchant_id)與優惠券id(Coupon_id)的特性保留下來，因為用不用優惠券可能跟店家會是優惠券本生有關

In [22]:
len(all_data['Merchant_id'].unique())

5599

In [23]:
len(all_data['Coupon_id'].unique())

9738

直接one-hoteconding維度太大了，將在訓練跟測試集都有的部分保留，其餘填上None值

In [24]:
all_data['Merchant_id_repeat'] = all_data['Merchant_id'].apply(lambda x : str(x) if x in list(train_Merchant_id.intersection(test_Merchant_id)) else 'None')

In [25]:
all_data['Coupon_id_repeat'] = all_data['Coupon_id'].apply(lambda x : str(x) if x in list(train_Coupon.intersection(test_Coupon)) else 'None')

In [26]:
from sklearn.preprocessing import LabelEncoder
all_data['Merchant_id_repeat'] = LabelEncoder().fit_transform(all_data['Merchant_id_repeat'])
all_data['Coupon_id_repeat'] = LabelEncoder().fit_transform(all_data['Coupon_id_repeat'])

Discount_rate 處理，將它實際的rate 跟 前後滿多少 折多少， 都當作一個特徵

In [27]:
all_data['Discount_rate'] = all_data['Discount_rate'].fillna('0:0')
all_data['Discount_rate'] = all_data['Discount_rate'].apply(str)

In [28]:
all_data['Discount_rate'] = all_data['Discount_rate'].apply(lambda x:x.split(':'))

In [29]:
rate = pd.DataFrame(all_data['Discount_rate'].loc[all_data['Discount_rate'].apply(lambda x : len(x)) !=2])
raw_rate = pd.DataFrame(all_data['Discount_rate'].loc[all_data['Discount_rate'].apply(lambda x : len(x)) != 1])

In [30]:
raw_rate['Need_buy_price'] = raw_rate['Discount_rate'].apply(lambda x :x[0])
raw_rate['Discount_price'] = raw_rate['Discount_rate'].apply(lambda x :x[1])
raw_rate['Discount_rate'] = 1 - (raw_rate['Discount_price'].apply(int) / raw_rate['Need_buy_price'].apply(int))

In [31]:
raw_rate.fillna(0, inplace=True)

In [32]:
rate = pd.DataFrame(rate['Discount_rate'].apply(lambda x : float(x[0])))

In [33]:
raw_rate['index'] = raw_rate.index
rate['index'] = rate.index

In [34]:
raw_rate = raw_rate.append(rate,ignore_index=True).sort_values(by = ['index'])

# res = pd.merge(left,right, left_index=True, right_index=True, how='outer')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [35]:
raw_rate.index = raw_rate['index']

In [36]:
raw_rate.drop('index',axis = 1, inplace=True)

In [37]:
all_data = pd.merge(all_data ,raw_rate, left_index=True, right_index=True, how='outer')
all_data.drop(['Discount_rate_x'],axis = 1,inplace = True)

In [38]:
all_data.head()

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,User_id,label,Merchant_id_repeat,Coupon_id_repeat,Discount_price,Discount_rate_y,Need_buy_price
0,8591.0,20160217.0,0.0,2632,1439408,0.0,439,1687,1,0.95,20
1,1078.0,20160319.0,0.0,2632,1439408,0.0,439,116,1,0.95,20
2,7610.0,20160429.0,0.0,3381,1832624,0.0,647,1535,20,0.9,200
3,11951.0,20160129.0,1.0,3381,2029232,0.0,647,1905,20,0.9,200
4,9776.0,20160129.0,2.0,3381,2223968,0.0,647,1905,5,0.5,10


In [39]:
pd.options.display.float_format = '{:.4f}'.format
all_data.describe()

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,User_id,label,Merchant_id_repeat,Coupon_id_repeat,Discount_rate_y
count,1053282.0,1053282.0,947279.0,1053282.0,1053282.0,746969.0,1053282.0,1053282.0,1053282.0
mean,6815.3982,20160315.9383,3.2288,3953.3774,3687241.5191,0.0486,1253.0706,1770.413,0.8443
std,4174.276,177.1416,3.8195,2422.8522,2124955.8622,0.215,723.3704,352.1169,0.0904
min,1.0,20160101.0,0.0,2.0,4.0,0.0,0.0,0.0,0.2
25%,2840.0,20160129.0,0.0,1715.0,1842402.0,0.0,647.0,1905.0,0.8333
50%,7430.0,20160321.0,1.0,3381.0,3692321.5,0.0,1113.0,1905.0,0.85
75%,10323.0,20160514.0,6.0,6284.0,5529516.0,0.0,2014.0,1905.0,0.9
max,14045.0,20160615.0,10.0,8856.0,7361032.0,1.0,2160.0,1905.0,0.99


In [40]:
all_data.loc[all_data['Discount_rate_y'] < 0.5].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 23520 to 1022627
Data columns (total 11 columns):
Coupon_id             143 non-null float64
Date_received         143 non-null float64
Distance              29 non-null float64
Merchant_id           143 non-null int64
User_id               143 non-null int64
label                 127 non-null float64
Merchant_id_repeat    143 non-null int32
Coupon_id_repeat      143 non-null int32
Discount_price        33 non-null object
Discount_rate_y       143 non-null float64
Need_buy_price        33 non-null object
dtypes: float64(5), int32(2), int64(2), object(2)
memory usage: 12.3+ KB


In [41]:
all_data.loc[(all_data['Discount_rate_y'] < 0.5) & (~all_data['Discount_price'].isnull())]  #打五折以上的價格差不多在30~50之間

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,User_id,label,Merchant_id_repeat,Coupon_id_repeat,Discount_price,Discount_rate_y,Need_buy_price
23520,9675.0,20160425.0,0.0,3926,4278132,0.0,2160,1905,20,0.3333,30
24555,899.0,20160205.0,1.0,21,209308,0.0,2160,1905,20,0.3333,30
71498,9235.0,20160314.0,0.0,4153,5519592,0.0,2160,1905,30,0.4,50
208070,899.0,20160101.0,,21,5247943,0.0,2160,1905,20,0.3333,30
211091,899.0,20160411.0,10.0,21,6849595,0.0,2160,1905,20,0.3333,30
292635,899.0,20160108.0,3.0,21,1875718,0.0,2160,1905,20,0.3333,30
389412,899.0,20160114.0,0.0,21,3587899,0.0,2160,1905,20,0.3333,30
390679,10614.0,20160418.0,,6521,6962830,0.0,2160,1905,30,0.4,50
405873,10614.0,20160426.0,3.0,6521,4013509,0.0,2160,1905,30,0.4,50
453749,10614.0,20160418.0,,6521,5339991,0.0,2160,1905,30,0.4,50


In [42]:
all_data.loc[(all_data['Discount_rate_y'] < 0.5) & (~all_data['Discount_price'].isnull())]['Need_buy_price'].mode()

0    30
dtype: object

> 以眾數為30 但估計打到8折以上 不會是太貴的商品 故除以2填補 Need_buy_price缺失值

In [43]:
all_data['Need_buy_price'] = all_data['Need_buy_price'].fillna(15)

In [44]:
all_data['Discount_price'] = all_data.apply(lambda x : float(x['Discount_rate_y']) * int(x['Need_buy_price']), axis = 1)

Coupon_id 缺失值用None填補

In [45]:
all_data['Coupon_id'] = all_data['Coupon_id'].fillna('None').apply(str)

Distance 缺失值用距離平均填補

In [46]:
all_data['Distance'] = all_data['Distance'].fillna(all_data['Distance'].mean())

將Reeived 改成星期 1~7，並把是否為周末的特徵補上

In [47]:
import datetime
all_data['Date_received'] = all_data['Date_received'].fillna(0)
all_data['Date_received'] = all_data['Date_received'].apply(int)

In [48]:
all_data['Date_received'] = all_data['Date_received'].apply(str)

In [49]:
all_data['Date_received'] = all_data['Date_received'].apply(lambda x : pd.to_datetime(x) if x != '0' else '0')

In [50]:
all_data['Weekday'] = all_data['Date_received'].apply(lambda x: x.weekday() + 1 if x != '0' else 0)

In [51]:
all_data['Weekday'].value_counts()

7    192614
5    168071
1    162069
6    160211
4    132771
3    122754
2    114792
Name: Weekday, dtype: int64

In [52]:
all_data['received_weekend'] = all_data['Weekday'].apply(lambda x : 1  if x == 6 or x == 7 else 0)

In [53]:
all_data.head()

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,User_id,label,Merchant_id_repeat,Coupon_id_repeat,Discount_price,Discount_rate_y,Need_buy_price,Weekday,received_weekend
0,8591.0,2016-02-17,0.0,2632,1439408,0.0,439,1687,19.0,0.95,20,3,0
1,1078.0,2016-03-19,0.0,2632,1439408,0.0,439,116,19.0,0.95,20,6,1
2,7610.0,2016-04-29,0.0,3381,1832624,0.0,647,1535,180.0,0.9,200,5,0
3,11951.0,2016-01-29,1.0,3381,2029232,0.0,647,1905,180.0,0.9,200,5,0
4,9776.0,2016-01-29,2.0,3381,2223968,0.0,647,1905,5.0,0.5,10,5,0


將Merchant_id、Coupon_id 的數量(count)特徵補上

In [54]:
all_data['Merchant_count'] = all_data.groupby('Merchant_id')['Merchant_id'].transform('count')

In [55]:
all_data['Coupon_count'] = all_data.groupby('Coupon_id')['Coupon_id'].transform('count')

Distance 能不能搞點東西出來

In [56]:
all_data.head()

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,User_id,label,Merchant_id_repeat,Coupon_id_repeat,Discount_price,Discount_rate_y,Need_buy_price,Weekday,received_weekend,Merchant_count,Coupon_count
0,8591.0,2016-02-17,0.0,2632,1439408,0.0,439,1687,19.0,0.95,20,3,0,43,31
1,1078.0,2016-03-19,0.0,2632,1439408,0.0,439,116,19.0,0.95,20,6,1,43,12
2,7610.0,2016-04-29,0.0,3381,1832624,0.0,647,1535,180.0,0.9,200,5,0,122834,46729
3,11951.0,2016-01-29,1.0,3381,2029232,0.0,647,1905,180.0,0.9,200,5,0,122834,26035
4,9776.0,2016-01-29,2.0,3381,2223968,0.0,647,1905,5.0,0.5,10,5,0,122834,10345


Distance小於500公尺 為0，0對模型可能不太好，將所有值+1

In [57]:
all_data['Distance'] = all_data['Distance'] + 1

In [58]:
all_data['Distance'].describe()

count   1053282.0000
mean          4.2288
std           3.6222
min           1.0000
25%           1.0000
50%           3.0000
75%           6.0000
max          11.0000
Name: Distance, dtype: float64

In [59]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053282 entries, 0 to 1053281
Data columns (total 15 columns):
Coupon_id             1053282 non-null object
Date_received         1053282 non-null datetime64[ns]
Distance              1053282 non-null float64
Merchant_id           1053282 non-null int64
User_id               1053282 non-null int64
label                 746969 non-null float64
Merchant_id_repeat    1053282 non-null int32
Coupon_id_repeat      1053282 non-null int32
Discount_price        1053282 non-null float64
Discount_rate_y       1053282 non-null float64
Need_buy_price        1053282 non-null object
Weekday               1053282 non-null int64
received_weekend      1053282 non-null int64
Merchant_count        1053282 non-null int64
Coupon_count          1053282 non-null int64
dtypes: datetime64[ns](1), float64(4), int32(2), int64(6), object(2)
memory usage: 160.5+ MB


### 模型

In [60]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [61]:
ntrain = all_data[:train.shape[0]]
ntest = all_data[train.shape[0]:]

In [62]:
user_id_train = ntrain['User_id']
ntrain.drop(['User_id'],axis = 1,inplace = True)

user_id_test = ntest['User_id']
ntest.drop(['User_id'],axis = 1,inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [63]:
# x_train, x_test, y_train, y_test = train_test_split(ntrain, y_label,test_size = 0.2)

# clf = RandomForestClassifier(n_estimators = 20 , max_depth = 5)

# clf.fit(x_train, y_train)

# y_pred = clf.predict(x_test)

In [64]:
# acc = metrics.accuracy_score(y_test,y_pred)
# print("Accuracy:",acc)

上述沒用 分層抽樣會導致 模型全部拆 0，準確率也很高

In [65]:
#Naive model
def split_train_valid(row, date_cut = '20160416'):
    is_train = True if row < pd.to_datetime(date_cut,\
    format = "%Y%m%d") else False
    
    return is_train

ntrain['is_train'] = ntrain['Date_received'].apply(split_train_valid)
train = ntrain[ntrain['is_train']]
valid = ntrain[~ntrain['is_train']]
train.reset_index(drop = True, inplace = True)
valid.reset_index(drop = True, inplace = True)

print("Train size : {}, #positive: {}".format(len(train),train['label'].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size : 667753, #positive: 32472.0
Valid size: 79216, #positive: 3832.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [66]:
train.head()

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,label,Merchant_id_repeat,Coupon_id_repeat,Discount_price,Discount_rate_y,Need_buy_price,Weekday,received_weekend,Merchant_count,Coupon_count,is_train
0,8591.0,2016-02-17,1.0,2632,0.0,439,1687,19.0,0.95,20,3,0,43,31,True
1,1078.0,2016-03-19,1.0,2632,0.0,439,116,19.0,0.95,20,6,1,43,12,True
2,11951.0,2016-01-29,2.0,3381,0.0,647,1905,180.0,0.9,200,5,0,122834,26035,True
3,9776.0,2016-01-29,3.0,3381,0.0,647,1905,5.0,0.5,10,5,0,122834,10345,True
4,12034.0,2016-02-07,4.2288,2099,0.0,2160,1905,90.0,0.9,100,7,1,16824,16824,True


In [67]:
predictors = ['Distance', 'Discount_price', 'Discount_rate_y', 'Need_buy_price', 'Weekday','received_weekend',\
              'Merchant_id_repeat','Coupon_id_repeat','Merchant_count', 'Coupon_count']

In [68]:
import xgboost as xgb

In [103]:
def check_model(data, predictors):
    
    classifier = lambda: StackingClassifier(classifiers=[rf,model_lgb,model_xgb],meta_classifier=GBoost)
    
    
    
    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    return grid_search

In [104]:
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter alpha for estimator StackingClassifier(average_probas=False,
          classifiers=[RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=9,
            min_weight_fr...       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)],
          drop_last_proba=False,
          meta_classifier=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='huber', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=15, min_s...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          store_train_meta_features=False, use_clones=True,
          use_features_in_secondary=False, use_probas=False, verbose=0). Check the list of available parameters with `estimator.get_params().keys()`.

In [87]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

  Xt = transform.transform(Xt)


In [88]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.838, Accuracy: 0.952


In [89]:
import lightgbm as lgb

In [90]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [99]:
GBoost = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,
                                   max_depth=4,
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [100]:
model_lgb = lgb.LGBMClassifier(objective='classmethod',num_leaves=5,
                              learning_rate=0.1, n_estimators=50,
                               bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [101]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=9, min_samples_leaf=10, 
                           max_depth=4, bootstrap=False)

In [102]:
model_xgb = xgb.XGBClassifier(n_estimators=100,\
                                   max_depth=4, \
                                   learning_rate=0.1, \
                                   subsample=0.7, \
                                   colsample_bytree=0.7)

In [96]:
from mlxtend.classifier import StackingClassifier

In [105]:
stacking = StackingClassifier(classifiers=[rf,model_lgb,model_xgb],meta_classifier=GBoost)

In [None]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

### final predict

In [76]:
ntest['User_id'] = user_id_test.reset_index(drop = True)

In [77]:
targetset = ntest.copy()
print(targetset.shape)
ntest =ntest[~ntest.Coupon_id.isna()]
ntest.reset_index(drop=True, inplace=True)
testset = ntest[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 15)


  Xt = transform.transform(Xt)


(306313, 11)


In [78]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(float(x))))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(x)[0:10].replace('-',''))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [79]:
output.shape

(306313, 5)

In [80]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
# out.to_csv("baseline_example.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.205
1,1000020_8192_20160513,0.2032
2,1000065_1455_20160527,0.1057
3,1000085_8067_20160513,0.1682
4,1000086_2418_20160613,0.0522


In [81]:
out.to_csv('out5.csv',index = False)

In [82]:
out.shape

(304096, 2)