In [33]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "data/ml100marathon-02-01"

## 添加label 目標column

In [139]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))

In [18]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
##output new train_offline csv
dfoff.reset_index(drop=True, inplace=True)
dfoff.to_csv(DATA_ROOT+"/train_offline_label.csv", index_label=False ) 
dfoff["label"].value_counts()
dfoff.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0
6,73611,2099,12034.0,100:10,,20160207.0,,0
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0


## ====================

In [34]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline_label.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=False)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(10)

(1160742, 8)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0
6,73611,2099,12034.0,100:10,,20160207.0,,0
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0


## ====Generate feature====

In [35]:
## coupon related feature

def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    #f = df[df['label'] != -1].copy()
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    

    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    
 
    
    return df

dfoff = dfoff[dfoff['label'] != -1].copy()
dfoff = processData(dfoff)
dftest = processData(dftest)
print(dftest.shape)


(306313, 10)


## other feature

In [36]:
## dfoff

t =  dfoff[['User_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('User_id').agg('sum').reset_index()

    
t1 = dfoff[['User_id','Coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
dfoff = dfoff[~np.isnan(dfoff['Coupon_id'])].copy()
#dftest['this_month_user_receive_all_coupon_count'].fillna(0)
#dftest['this_month_user_receive_same_coupon_count'].fillna(0)

t2 = dfoff[['User_id','Date_received']]
t2['this_day_user_receive_all_coupon_count'] = 1
t2 = t2.groupby(['User_id','Date_received']).agg('sum').reset_index()

t3 = dfoff[['User_id','Coupon_id','Date_received']]
t3['this_day_user_receive_same_coupon_count'] = 1
t3 = t3.groupby(['User_id','Coupon_id','Date_received']).agg('sum').reset_index()

t4 = dfoff[['User_id','Merchant_id','Coupon_id']]
t4 = t4[~np.isnan(t4.Coupon_id)][['User_id','Merchant_id']]
t4['user_merchant_received'] = 1
t4 = t4.groupby(['User_id','Merchant_id']).agg('sum').reset_index()
t4.drop_duplicates(inplace=True)

t5 = dfoff[['User_id','Merchant_id']]
t5['user_merchant_any'] = 1
t5 = t5.groupby(['User_id','Merchant_id']).agg('sum').reset_index()
t5.drop_duplicates(inplace=True)


t6= dfoff[~np.isnan(dfoff.Coupon_id)][['Merchant_id']]
t6['total_coupon'] = 1
t6= t6.groupby('Merchant_id').agg('sum').reset_index()


dfoff = pd.merge(dfoff,t,on='User_id',how='left')
dfoff = pd.merge(dfoff,t1,on=['User_id','Coupon_id'],how='left')
dfoff = pd.merge(dfoff,t2,on=['User_id','Date_received'],how='left')
dfoff = pd.merge(dfoff,t3,on=['User_id','Coupon_id','Date_received'],how='left')
dfoff = pd.merge(dfoff,t4,on=['User_id','Merchant_id'],how='left')
dfoff = pd.merge(dfoff,t5,on=['User_id','Merchant_id'],how='left')
dfoff = pd.merge(dfoff,t6,on=['Merchant_id'],how='left')

dfoff.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

(746969, 19)

In [37]:
##dftest
## this_month_user_receive_all_coupon_count
t =  dftest[['User_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('User_id').agg('sum').reset_index()

    

t1 = dftest[['User_id','Coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
dftest = dftest[~np.isnan(dftest['Coupon_id'])].copy()
#dftest['this_month_user_receive_all_coupon_count'].fillna(0)
#dftest['this_month_user_receive_same_coupon_count'].fillna(0)
dftest.head()




t2 = dftest[['User_id','Date_received']]
t2['this_day_user_receive_all_coupon_count'] = 1
t2 = t2.groupby(['User_id','Date_received']).agg('sum').reset_index()

t3 = dftest[['User_id','Coupon_id','Date_received']]
t3['this_day_user_receive_same_coupon_count'] = 1
t3 = t3.groupby(['User_id','Coupon_id','Date_received']).agg('sum').reset_index()


t4 = dftest[['User_id','Merchant_id','Coupon_id']]
t4 = t4[~np.isnan(t4.Coupon_id)][['User_id','Merchant_id']]
t4['user_merchant_received'] = 1
t4 = t4.groupby(['User_id','Merchant_id']).agg('sum').reset_index()
t4.drop_duplicates(inplace=True)

t5 = dftest[['User_id','Merchant_id']]
t5['user_merchant_any'] = 1
t5 = t5.groupby(['User_id','Merchant_id']).agg('sum').reset_index()
t5.drop_duplicates(inplace=True)

t6= dftest[~np.isnan(dftest.Coupon_id)][['Merchant_id']]
t6['total_coupon'] = 1
t6= t6.groupby('Merchant_id').agg('sum').reset_index()

dftest = pd.merge(dftest,t,on='User_id',how='left')
dftest = pd.merge(dftest,t1,on=['User_id','Coupon_id'],how='left')
dftest = pd.merge(dftest,t2,on=['User_id','Date_received'],how='left')
dftest = pd.merge(dftest,t3,on=['User_id','Coupon_id','Date_received'],how='left')
dftest = pd.merge(dftest,t4,on=['User_id','Merchant_id'],how='left')
dftest = pd.merge(dftest,t5,on=['User_id','Merchant_id'],how='left')
dftest = pd.merge(dftest,t6,on=['Merchant_id'],how='left')

dftest.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http:/

(306313, 17)

## ======Split Data ======

In [38]:
## Naive model
def split_train_valid(row, date_cut="20160415"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
#df = dfoff[dfoff['Date_received'] != 0].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
train.to_csv(DATA_ROOT+"/train.csv", index_label=False ) 
valid.reset_index(drop=True, inplace=True)
valid.to_csv(DATA_ROOT+"/valid.csv", index_label=False ) 
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 662393, #positive: 32277
Valid size: 84576, #positive: 4027


In [39]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance','this_month_user_receive_all_coupon_count','this_month_user_receive_same_coupon_count'
                    ,'this_day_user_receive_all_coupon_count','this_day_user_receive_same_coupon_count',
                    'user_merchant_received','user_merchant_any','total_coupon'
                   ] 
predictors = original_feature
print(len(original_feature),original_feature)

12 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'this_month_user_receive_all_coupon_count', 'this_month_user_receive_same_coupon_count', 'this_day_user_receive_all_coupon_count', 'this_day_user_receive_same_coupon_count', 'user_merchant_received', 'user_merchant_any', 'total_coupon']


In [40]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
rf.fit(train[predictors],train['label'].values)
feats = pd.Series(data=rf.feature_importances_, index=train[predictors].columns)
feats = feats.sort_values(ascending=False)
feats



total_coupon                                 0.307353
this_month_user_receive_all_coupon_count     0.154444
Distance                                     0.100859
this_month_user_receive_same_coupon_count    0.079483
user_merchant_received                       0.069449
this_day_user_receive_all_coupon_count       0.066814
user_merchant_any                            0.060503
discount_man                                 0.051546
discount_rate                                0.048786
discount_jian                                0.037826
this_day_user_receive_same_coupon_count      0.020551
discount_type                                0.002386
dtype: float64

In [41]:
rf_predicted = rf.predict_proba(valid[predictors])
rf_predicted

array([[0.99501219, 0.00498781],
       [1.        , 0.        ],
       [1.        , 0.        ],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [42]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=rf_predicted[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=rf_predicted.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.724, Accuracy: 0.942


In [199]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
MMEncoder = MinMaxScaler()
train_X = MMEncoder.fit_transform(train[predictors])
cross_val_score(estimator, train_X, train['label'].values, cv=5).mean()

  return self.partial_fit(X, y)


0.9505176316051269

In [43]:

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])
    
    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [48]:
#model = check_model(train, predictors)
model = SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=120, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)
model.fit(train[predictors],train['label'].values.ravel())



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=120,
       n_iter=None, n_iter_no_change=5, n_jobs=1, penalty='elasticnet',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [49]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

In [50]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.724, Accuracy: 0.863


In [51]:
train= train.drop(['Discount_rate'],axis=1)
valid= valid.drop(['Discount_rate'],axis=1)

In [15]:
cross_val_score(model, train_X, train['label'].values, cv=5).mean()

NameError: name 'train_X' is not defined

In [50]:
from xgboost import  XGBClassifier
xgb_model = XGBClassifier()
parameters = {'max_depth': [3],'n_estimators':[150]}
clf = GridSearchCV(xgb_model, parameters, scoring='roc_auc')
clf.fit(train[predictors], train['label'].values)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [3], 'n_estimators': [150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [52]:
from xgboost import  XGBClassifier
xgb =XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=700, n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)
xgb.fit(train[predictors],train['label'].values.ravel())


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=700, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [53]:
xgb_predicted = xgb.predict_proba(valid[predictors])


In [54]:
xgb_predicted

array([[9.9279875e-01, 7.2012534e-03],
       [9.9957955e-01, 4.2042407e-04],
       [9.9477619e-01, 5.2238065e-03],
       ...,
       [9.9925691e-01, 7.4309611e-04],
       [9.3871045e-01, 6.1289560e-02],
       [9.7702265e-01, 2.2977343e-02]], dtype=float32)

In [55]:
auc_score = roc_auc_score(y_true=valid.label, y_score=xgb_predicted[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=xgb_predicted.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.873, Accuracy: 0.955


In [56]:
dftest = dftest.fillna(0)
dftest.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,discount_rate,discount_man,discount_jian,discount_type,this_month_user_receive_all_coupon_count,this_month_user_receive_same_coupon_count,this_day_user_receive_all_coupon_count,this_day_user_receive_same_coupon_count,user_merchant_received,user_merchant_any,total_coupon
0,1439408,4663,11002.0,150:20,1.0,20160528.0,0.866667,150,20,1,3,1,1,1,1,1,11312
1,1439408,2632,8591.0,20:1,0.0,20160613.0,0.95,20,1,1,3,2,1,1,2,2,11
2,1439408,2632,8591.0,20:1,0.0,20160516.0,0.95,20,1,1,3,2,1,1,2,2,11
3,2029232,450,1532.0,30:5,0.0,20160530.0,0.833333,30,5,1,2,1,1,1,1,1,22210
4,2029232,6459,12737.0,20:1,0.0,20160519.0,0.95,20,1,1,2,1,1,1,1,1,16


In [57]:

targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
xgb_test_predicted = xgb.predict_proba(testset[predictors])
rf_test_predicted = rf.predict_proba(testset[predictors])
#stacking_test_predicted = stacking.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = xgb_test_predicted[:, 1]*0.9 + y_test_pred[:, 1]*0.06 +rf_test_predicted[:, 1]*0.04
print(test1.shape)

(306313, 17)
(306313, 13)


In [58]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [59]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("baseline_example0616-9.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.037732
1,1000020_8192_20160513,0.077602
2,1000065_1455_20160527,0.096831
3,1000085_8067_20160513,0.193161
4,1000086_2418_20160613,0.03736
