In [3]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "data/ml100marathon-02-01"

## 添加label 目標column

In [139]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))

In [18]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
##output new train_offline csv
dfoff.reset_index(drop=True, inplace=True)
dfoff.to_csv(DATA_ROOT+"/train_offline_label.csv", index_label=False ) 
dfoff["label"].value_counts()
dfoff.head(10)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0
6,73611,2099,12034.0,100:10,,20160207.0,,0
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0


## ====================

In [249]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline_label.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=False)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(10)

(1160742, 8)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0
6,73611,2099,12034.0,100:10,,20160207.0,,0
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0


## ====Generate feature====

In [250]:
## coupon related feature

def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    #f = df[df['label'] != -1].copy()
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    

    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    
 
    
    return df

dfoff = dfoff[dfoff['label'] != -1].copy()
dfoff = processData(dfoff)
dftest = processData(dftest)
print(dftest.shape)


(306313, 10)


In [251]:
##dfoff
## this_month_user_receive_all_coupon_count
t =  dfoff[['User_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('User_id').agg('sum').reset_index()

    
## this_month_user_receive_same_coupon_count
t1 = dfoff[['User_id','Coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
dfoff = pd.merge(dfoff,t,on='User_id',how='left')
dfoff = pd.merge(dfoff,t1,on=['User_id','Coupon_id'],how='left')
dfoff = dfoff[~np.isnan(dfoff['Coupon_id'])].copy()
dfoff[np.isnan(dfoff['this_month_user_receive_all_coupon_count'])] = 0
dfoff[np.isnan(dfoff['this_month_user_receive_same_coupon_count'])] = 0
print(dfoff.shape)
dfoff.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


(746969, 14)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,discount_rate,discount_man,discount_jian,discount_type,this_month_user_receive_all_coupon_count,this_month_user_receive_same_coupon_count
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,0.95,20,1,1,2,1
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,0.95,20,1,1,2,1
2,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,0.9,200,20,1,1,1
3,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,0.9,200,20,1,1,1
4,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,0.5,10,5,1,1,1


In [270]:
##dftest
## this_month_user_receive_all_coupon_count
t =  dftest[['User_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('User_id').agg('sum').reset_index()

    
## this_month_user_receive_same_coupon_count
t1 = dftest[['User_id','Coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
dftest = pd.merge(dftest,t,on='User_id',how='left')
dftest = pd.merge(dftest,t1,on=['User_id','Coupon_id'], how='left')
dftest= dftest[~np.isnan(dftest['Coupon_id'])].copy()
dftest[np.isnan(dftest['this_month_user_receive_all_coupon_count'])] = 0
dftest[np.isnan(dftest['this_month_user_receive_same_coupon_count'])] = 0
print(dftest.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


(306313, 16)


### merchant related feature

## Distinct Merchant_id
merchant3 = dfoff[['Merchant_id','Coupon_id','Distance','Date_received','Date']]
t = merchant3[['Merchant_id']]
t.drop_duplicates(inplace=True)

## total_sales
t1 = merchant3[merchant3.Date!='null'][['Merchant_id']]
t1['total_sales'] = 1
t1 = t1.groupby('Merchant_id').agg('sum').reset_index()


## sales_use_coupon
t2 = merchant3[(merchant3.Date!='null')&(merchant3.Coupon_id!='null')][['Merchant_id']]
t2['sales_use_coupon'] = 1
t2 = t2.groupby('Merchant_id').agg('sum').reset_index()

##total_coupon
t3 = merchant3[merchant3.Coupon_id!='null'][['Merchant_id']]
t3['total_coupon'] = 1
t3 = t3.groupby('Merchant_id').agg('sum').reset_index()




### other feature

## this_month_user_receive_all_coupon_count
t =  dftest[['User_id']]
t['this_month_user_receive_all_coupon_count'] = 1
t = t.groupby('User_id').agg('sum').reset_index()
other_feature1 = pd.merge(dftest,t,on='User_id')
## this_month_user_receive_same_coupon_count
t1 = dftest[['User_id','Coupon_id']]
t1['this_month_user_receive_same_coupon_count'] = 1
t1 = t1.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
dftest = pd.merge(other_feature1,t1,on=['User_id','Coupon_id'])
dftest

## ======Split Data ======

In [254]:
## Naive model
def split_train_valid(row, date_cut="20160415"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
#df = dfoff[dfoff['Date_received'] != 0].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
train.to_csv(DATA_ROOT+"/train.csv", index_label=False ) 
valid.reset_index(drop=True, inplace=True)
valid.to_csv(DATA_ROOT+"/valid.csv", index_label=False ) 
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 662393, #positive: 32277
Valid size: 84576, #positive: 4027


In [255]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance','this_month_user_receive_all_coupon_count','this_month_user_receive_same_coupon_count'
                   ] 
predictors = original_feature
print(len(original_feature),original_feature)

7 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'this_month_user_receive_all_coupon_count', 'this_month_user_receive_same_coupon_count']


In [256]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
rf.fit(train[predictors],train['label'].values)
feats = pd.Series(data=rf.feature_importances_, index=train[predictors].columns)
feats = feats.sort_values(ascending=False)
feats



this_month_user_receive_same_coupon_count    0.319688
this_month_user_receive_all_coupon_count     0.224014
Distance                                     0.192251
discount_man                                 0.106971
discount_jian                                0.078750
discount_rate                                0.072942
discount_type                                0.005385
dtype: float64

In [257]:
rf_predicted = rf.predict_proba(valid[predictors])
rf_predicted

array([[0.99331405, 0.00668595],
       [1.        , 0.        ],
       [0.99363799, 0.00636201],
       ...,
       [1.        , 0.        ],
       [0.92142136, 0.07857864],
       [0.97162365, 0.02837635]])

In [258]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=rf_predicted[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=rf_predicted.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.786, Accuracy: 0.953


In [199]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
MMEncoder = MinMaxScaler()
train_X = MMEncoder.fit_transform(train[predictors])
cross_val_score(estimator, train_X, train['label'].values, cv=5).mean()

  return self.partial_fit(X, y)


0.9505176316051269

In [282]:
from sklearn.ensemble import ExtraTreesClassifier

extra = ExtraTreesClassifier()
extra.fit(train[predictors],train['label'].values)




ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [283]:
extra_predicted = extra.predict_proba(valid[predictors])
extra_predicted

array([[0.99354244, 0.00645756],
       [1.        , 0.        ],
       [0.9934555 , 0.0065445 ],
       ...,
       [1.        , 0.        ],
       [0.9       , 0.1       ],
       [0.97619048, 0.02380952]])

In [284]:
auc_score = roc_auc_score(y_true=valid.label, y_score=extra_predicted[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=extra_predicted.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.778, Accuracy: 0.954


In [259]:

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])
    
    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

In [260]:
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.6min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [261]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

  Xt = transform.transform(Xt)


In [262]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.806, Accuracy: 0.952


In [263]:
train= train.drop(['Discount_rate'],axis=1)
valid= valid.drop(['Discount_rate'],axis=1)

In [208]:
cross_val_score(model, train_X, train['label'].values, cv=5).mean()

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.3min finished


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.3min finished


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.2min finished


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.3min finished


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.2min finished


0.951371240581941

In [None]:
from xgboost import  XGBClassifier
xgb_model = XGBClassifier(n_estimators=5)
parameters = {'max_depth': [3,4, 5, 6]}
clf = GridSearchCV(xgb_model, parameters, scoring='roc_auc')
clf.fit(train[predictors], train['label'].values)

In [265]:
from xgboost import  XGBClassifier
xgb =XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=5, n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)
xgb.fit(train[predictors],train['label'].values.ravel())


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=5, n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [266]:
xgb_predicted = xgb.predict_proba(valid[predictors])


In [267]:
xgb_predicted

array([[0.6867813 , 0.3132187 ],
       [0.6987501 , 0.30124992],
       [0.6987501 , 0.30124992],
       ...,
       [0.6987501 , 0.30124992],
       [0.6987501 , 0.30124992],
       [0.6580174 , 0.34198257]], dtype=float32)

In [268]:
auc_score = roc_auc_score(y_true=valid.label, y_score=xgb_predicted[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=xgb_predicted.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.817, Accuracy: 0.953


In [299]:

targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
xgb_test_predicted = xgb.predict_proba(testset[predictors])
#rf_test_predicted = estimator.predict_proba(testset[predictors])
#stacking_test_predicted = stacking.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = xgb_test_predicted[:, 1]*0.8 + y_test_pred[:, 1]*0.2
print(test1.shape)

(306313, 16)


  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


(306313, 8)


In [300]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [302]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("baseline_example0613-4.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.200537
1,1000020_8192_20160513,0.200537
2,1000065_1455_20160527,0.023462
3,1000085_8067_20160513,0.115476
4,1000086_2418_20160613,0.115476
