In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "./input/"


In [2]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [3]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()



 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [4]:
print(dfoff.shape)
dfoff=dfoff[dfoff['label']!=-1]
dfoff.shape

(1160742, 8)


(746969, 8)

In [5]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
# dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
# dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset
dfoff['weekday_type'] = dfoff['weekday'].astype('int').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('int').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [6]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [7]:
import math
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
#     print(row)
    """Convert discount to rate"""
    if ((row == 'null') or (row == 'nan')):
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [8]:
# Add Shopping_count
# offline
count_dfoff = dfoff['User_id'].value_counts().reset_index()
count_dfoff.columns=['User_id', 'Shopping_count']
dfoff = pd.merge(dfoff, count_dfoff, on=['User_id'], how='left')

# test data
count_dftest = dftest['User_id'].value_counts().reset_index()
count_dftest.columns=['User_id', 'Shopping_count']
dftest = pd.merge(dftest, count_dftest, on=['User_id'], how='left')


In [9]:
# Add Merchant_count
# offline
count_dfoff = dfoff['Merchant_id'].value_counts().reset_index()
count_dfoff.columns=['Merchant_id', 'Merchant_count']
dfoff = pd.merge(dfoff, count_dfoff, on=['Merchant_id'], how='left')

# test data
count_dftest = dftest['Merchant_id'].value_counts().reset_index()
count_dftest.columns=['Merchant_id', 'Merchant_count']
dftest = pd.merge(dftest, count_dftest, on=['Merchant_id'], how='left')

In [10]:
import datetime

## Add month and days
dfoff['month'] = dfoff['Date_received'].apply(lambda x: x/100%10).astype('int')
dfoff['day'] = dfoff['Date_received'].apply(lambda x: x%100).astype('int')

dftest['month'] = dftest['Date_received'].apply(lambda x: x/100%10).astype('int')
dftest['day'] = dftest['Date_received'].apply(lambda x:  x%100).astype('int')



In [11]:
# Add Coupon_count
# offline
couponcnt_dfoff = dfoff['Coupon_id'].value_counts().reset_index()
couponcnt_dfoff.columns=['Coupon_id', 'Coupon_count']
dfoff = pd.merge(dfoff, couponcnt_dfoff, on=['Coupon_id'], how='left')

# test data
couponcnt_dftest = dftest['Coupon_id'].value_counts().reset_index()
couponcnt_dftest.columns=['Coupon_id', 'Coupon_count']
dftest = pd.merge(dftest, couponcnt_dftest, on=['Coupon_id'], how='left')

In [12]:
dfoff.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_7,discount_rate,discount_man,discount_jian,discount_type,Shopping_count,Merchant_count,month,day,Coupon_count
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3,0,...,0,0.95,20,1,1,2,32,2,17,24
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6,1,...,0,0.95,20,1,1,2,32,3,19,8
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5,0,...,0,0.9,200,20,1,1,122781,4,29,46676
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5,0,...,0,0.9,200,20,1,1,122781,1,29,26035
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5,0,...,0,0.5,10,5,1,1,122781,1,29,10345


In [15]:



#1
train_Y = dfoff["label"]
targetPars = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type',
                     'Shopping_count',
                     'Merchant_count',
                      'Merchant_id', 'month','day','Coupon_count'
             ] + weekdaycols
lengthOfFeatures = len(targetPars)
train_X = dfoff[targetPars]
# train_X = MinMaxScaler().fit_transform(train_X)
# Y_test = dftest["label"]
test_X = dftest[targetPars]
# test_X = MinMaxScaler().fit_transform(test_X)
#2
from sklearn.model_selection import train_test_split
from sklearn import metrics

# from sklearn.preprocessing import StandardScaler
train_X['discount_rate']= StandardScaler().fit_transform(train_X['discount_rate'].values.reshape(-1, 1))
train_X['discount_man']= StandardScaler().fit_transform(train_X['discount_man'].values.reshape(-1, 1))
train_X['Shopping_count']= StandardScaler().fit_transform(train_X['Shopping_count'].values.reshape(-1, 1))
train_X['Distance']= StandardScaler().fit_transform(train_X['Distance'].values.reshape(-1, 1))

test_X['discount_rate']= StandardScaler().fit_transform(test_X['discount_rate'].values.reshape(-1, 1))
test_X['discount_man']= StandardScaler().fit_transform(test_X['discount_man'].values.reshape(-1, 1))
test_X['Shopping_count']= StandardScaler().fit_transform(test_X['Shopping_count'].values.reshape(-1, 1))
test_X['Distance']= StandardScaler().fit_transform(test_X['Distance'].values.reshape(-1, 1))


x_train, x_test, y_train, y_test = train_test_split(train_X, train_Y, test_size=0.25, random_state=4)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
lr.fit(x_train, y_train)
y_lr_pred = lr.predict_proba(x_test)

auc_score = roc_auc_score(y_true=y_test, y_score=y_lr_pred[:,1])
acc = metrics.accuracy_score(y_test,y_lr_pred.argmax(axis=1))
print("Accurary:",acc)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))




Accurary: 0.9513020568374718
Validation AUC: 0.689, Accuracy: 0.951


In [17]:
lr_pred = lr.predict_proba(test_X)[:,1]

targetset= dftest.copy()
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)

test1 = test_X.copy()
# print(test_X.head(5))
# test1 = test_X.copy()[['User_id', 'Coupon_id', 'Date_received']]
print(test1)
test1['pred_prob'] = lr_pred

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("lr.csv", header=["uid", "label"], index=False) # submission format
out.head()

        discount_rate  discount_type  discount_man  discount_jian  Distance  \
0            0.369881              1      2.329566             20 -0.394598   
1            1.217611              1     -0.589058              1 -0.426390   
2            1.217611              1     -0.589058              1 -0.426390   
3            0.030788              1     -0.364549              5 -0.426390   
4            1.217611              1     -0.589058              1 -0.426390   
5           -0.308304              1      0.084470             10  2.721015   
6            1.217611              1     -0.589058              1 -0.394598   
7            0.030788              1     -0.364549              5 -0.426390   
8            0.369881              1      2.329566             20  2.721015   
9           -0.816942              1     -0.589058              5 -0.426390   
10          -1.664673              1     -0.364549             10 -0.426390   
11           1.217611              1     -0.589058  

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.062357
1,1000020_8192_20160513,0.035553
2,1000065_1455_20160527,0.413364
3,1000085_8067_20160513,0.330208
4,1000086_2418_20160613,0.004557


In [18]:
# Random Forest

rf = RandomForestClassifier(n_estimators=10, min_samples_split=2, min_samples_leaf=1, 
                            max_features='sqrt', max_depth=6, bootstrap=True)


rf.fit(x_train, y_train)
y_rf_pred = rf.predict_proba(x_test)

auc_score = roc_auc_score(y_true=y_test, y_score=y_rf_pred[:,1])
acc = metrics.accuracy_score(y_test,y_rf_pred.argmax(axis=1))
print("Accurary:",acc)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

feats = pd.Series(data=rf.feature_importances_, index=train_X.columns)
feats = feats.sort_values(ascending=False)
print(feats)

# Accurary: 0.9522070439052601
# Validation AUC: 0.845, Accuracy: 0.952

## Grid Search
#设置参数
# parameters = [{'n_estimators':[10,100,1000],
# 'criterion':['entropy','gini'],
# 'max_depth':[10,50,100,200],
# 'min_samples_split':[2,5,10],
# 'min_weight_fraction_leaf':[0.0,0.1,0.2,0.3,0.4,0.5]
# }]
# gsearch1=GridSearchCV(estimator=rf, param_grid=parameters,scoring='roc_auc')
# gsearch1.fit(x_train, y_train)
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_  

Accurary: 0.952014265594962
Validation AUC: 0.868, Accuracy: 0.952
Coupon_count      0.246827
Distance          0.201248
Shopping_count    0.108867
discount_jian     0.104680
Merchant_count    0.082946
discount_rate     0.075925
discount_man      0.068197
month             0.045804
Merchant_id       0.033429
day               0.017663
discount_type     0.004879
weekday           0.003967
weekday_7         0.002959
weekday_type      0.001006
weekday_6         0.000578
weekday_5         0.000295
weekday_4         0.000288
weekday_1         0.000189
weekday_3         0.000136
weekday_2         0.000117
dtype: float64


In [19]:
rf_pred = rf.predict_proba(test_X)[:,1]
targetset= dftest.copy()
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)

test1 = test_X.copy()
test1['pred_prob'] = rf_pred

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("rf.csv", header=["uid", "label"], index=False) # submission format
out.head()

(306313, 4)


Unnamed: 0,uid,label
0,1000020_2705_20160519,0.06886
1,1000020_8192_20160513,0.059134
2,1000065_1455_20160527,0.09621
3,1000085_8067_20160513,0.093586
4,1000086_2418_20160613,0.017655


In [20]:
gdbt = GradientBoostingClassifier(subsample=0.75, n_estimators=250, max_features=lengthOfFeatures,
                                  max_depth=5, learning_rate=0.03, tol=0.1)
# gdbt.fit(train_X, train_Y)

gdbt.fit(x_train, y_train)
y_gdbt_pred = gdbt.predict_proba(x_test)

auc_score = roc_auc_score(y_true=y_test, y_score=y_gdbt_pred[:,1])
acc = metrics.accuracy_score(y_test,y_gdbt_pred.argmax(axis=1))
print("Accurary:",acc)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

feats = pd.Series(data=gdbt.feature_importances_, index=train_X.columns)
feats = feats.sort_values(ascending=False)
feats

# Accurary: 0.9531227408791761
# Validation AUC: 0.879, Accuracy: 0.953

# with Merchant_ID
# Accurary: 0.9531816453628784
# Validation AUC: 0.881, Accuracy: 0.953

# with month and day
# Accurary: 0.9533744236731765
# Validation AUC: 0.888, Accuracy: 0.953

Accurary: 0.9534172632976872
Validation AUC: 0.895, Accuracy: 0.953


Coupon_count      0.321996
Shopping_count    0.169660
Distance          0.128951
discount_man      0.087463
discount_rate     0.068342
Merchant_id       0.067696
Merchant_count    0.066892
month             0.032792
discount_jian     0.021853
day               0.020404
weekday           0.009566
weekday_4         0.001113
weekday_type      0.000712
weekday_5         0.000555
weekday_7         0.000544
discount_type     0.000468
weekday_3         0.000363
weekday_1         0.000238
weekday_6         0.000203
weekday_2         0.000190
dtype: float64

In [21]:
gdbt.fit(train_X, train_Y)
gdbt_pred = gdbt.predict_proba(test_X)[:,1]
targetset= dftest.copy()
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)

test1 = test_X.copy()
test1['pred_prob'] = gdbt_pred

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("gdbt.csv", header=["uid", "label"], index=False) # submission format
out.head()

(306313, 4)


Unnamed: 0,uid,label
0,1000020_2705_20160519,0.03647
1,1000020_8192_20160513,0.033258
2,1000065_1455_20160527,0.086248
3,1000085_8067_20160513,0.050156
4,1000086_2418_20160613,0.005484


In [22]:
# Blending
y_blending_pred = y_lr_pred*0.10  + y_gdbt_pred*0.60 + y_rf_pred*0.30
auc_score = roc_auc_score(y_true=y_test, y_score=y_blending_pred[:,1])
acc = metrics.accuracy_score(y_test,y_blending_pred.argmax(axis=1))
print("Accurary:",acc)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Accurary: 0.9527800238830907
Validation AUC: 0.886, Accuracy: 0.953


In [None]:
# Stacking
from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(tol=100, subsample=0.70, n_estimators=50, 
                                           max_features='sqrt', max_depth=4, learning_rate=0.3)
stacking = StackingClassifier(classifiers=[lr,gdbt, rf], meta_classifier=meta_estimator, use_probas=True, average_probas=False)

stacking.fit(x_train, y_train)
y_pred = stacking.predict_proba(x_test)

auc_score = roc_auc_score(y_true=y_test, y_score=y_pred[:,1])
acc = metrics.accuracy_score(y_test,y_pred.argmax(axis=1))
print("Accurary:",acc)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

feats = pd.Series(data=gdbt.feature_importances_, index=train_X.columns)
feats = feats.sort_values(ascending=False)
feats






In [None]:
stacking.fit(train_X, train_Y)
stacking_pred = gdbt.predict_proba(test_X)[:,1]
targetset= dftest.copy()
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)

test1 = test_X.copy()
test1['pred_prob'] = stacking_pred

output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("stacking.csv", header=["uid", "label"], index=False) # submission format
out.head()