In [2]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler


DATA_ROOT = "data/Midterm_exam/"

In [3]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [4]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [5]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype(str).apply(lambda x : 1 if x in ["6.0","7.0"] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype(str).apply(lambda x : 1 if x in ["6.0","7.0"] else 0 ) # apply to testset

In [6]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [12]:
dfoff.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.5,10,5,1
6,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,1,...,0,0,0,0,0,1,0.9,100,10,1
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0,...,0,0,1,0,0,0,0.85,200,30,1
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1,...,0,0,0,0,1,0,0.9,200,20,1
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,...,1,0,0,0,0,0,0.9,200,20,1


In [13]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)
dfoff.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
5,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.5,10,5,1
6,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,1,...,0,0,0,0,0,1,0.9,100,10,1
7,163606,1569,5054.0,200:30,10.0,20160421.0,,0,4.0,0,...,0,0,1,0,0,0,0.85,200,30,1
8,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1,...,0,0,0,0,1,0,0.9,200,20,1
9,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,...,1,0,0,0,0,0,0.9,200,20,1


In [14]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [15]:
train.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type,is_train
0,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,1,0,0,0,0,0.95,20,1,1,True
1,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,1,...,0,0,0,1,0,0.95,20,1,1,True
2,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,1,0,0,0.9,200,20,1,True
3,2223968,3381,9776.0,10:5,2.0,20160129.0,,0,5.0,0,...,0,0,1,0,0,0.5,10,5,1,True
4,73611,2099,12034.0,100:10,99.0,20160207.0,,0,7.0,1,...,0,0,0,0,1,0.9,100,10,1,True
5,3273056,4833,7802.0,200:20,10.0,20160130.0,,0,6.0,1,...,0,0,0,1,0,0.9,200,20,1,True
6,94107,3381,7610.0,200:20,2.0,20160412.0,,0,2.0,0,...,0,0,0,0,0,0.9,200,20,1,True
7,253750,8390,7531.0,20:5,0.0,20160327.0,,0,7.0,1,...,0,0,0,0,1,0.75,20,5,1,True
8,376492,1041,13490.0,30:5,2.0,20160127.0,,0,3.0,0,...,1,0,0,0,0,0.833333,30,5,1,True
9,1964720,7884,6704.0,20:1,10.0,20160215.0,,0,1.0,0,...,0,0,0,0,0,0.95,20,1,1,True


In [31]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type']

In [32]:
train_X = train[original_feature]
train_y = train["label"]
val_X = valid[original_feature]
val_y = valid["label"]
train_X.head(20)


Unnamed: 0,discount_rate,discount_type,discount_man,discount_jian,Distance,weekday,weekday_type
0,0.95,1,20,1,0.0,3.0,0
1,0.95,1,20,1,0.0,6.0,1
2,0.9,1,200,20,1.0,5.0,0
3,0.5,1,10,5,2.0,5.0,0
4,0.9,1,100,10,99.0,7.0,1
5,0.9,1,200,20,10.0,6.0,1
6,0.9,1,200,20,2.0,2.0,0
7,0.75,1,20,5,0.0,7.0,1
8,0.833333,1,30,5,2.0,3.0,0
9,0.95,1,20,1,10.0,1.0,0


In [33]:
train_y.head(20)

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    1
13    0
14    0
15    0
16    0
17    0
18    0
19    0
Name: label, dtype: int64

In [48]:
import lightgbm as lgb
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l2','auc', 'huber'],
    'learning_rate': 0.01,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": 6,
    "num_leaves": 63,  
    "max_bin": 512,
    "num_iterations": 1000,
    "n_estimators": 1000
}

In [49]:
gbm = lgb.LGBMRegressor(**hyper_params)

In [50]:
gbm.fit(train_X, train_y,
        eval_set=[(val_X, val_y)],
        eval_metric='huber',
        early_stopping_rounds=500)

[1]	valid_0's l2: 0.0459896	valid_0's auc: 0.763183	valid_0's huber: 0.0229322
Training until validation scores don't improve for 500 rounds.
[2]	valid_0's l2: 0.0459488	valid_0's auc: 0.769711	valid_0's huber: 0.022913
[3]	valid_0's l2: 0.0459222	valid_0's auc: 0.791127	valid_0's huber: 0.0229005
[4]	valid_0's l2: 0.0458822	valid_0's auc: 0.790191	valid_0's huber: 0.0228816
[5]	valid_0's l2: 0.0458429	valid_0's auc: 0.78846	valid_0's huber: 0.022863
[6]	valid_0's l2: 0.0457953	valid_0's auc: 0.791147	valid_0's huber: 0.0228403
[7]	valid_0's l2: 0.0457588	valid_0's auc: 0.790783	valid_0's huber: 0.022823
[8]	valid_0's l2: 0.0457144	valid_0's auc: 0.79275	valid_0's huber: 0.022802
[9]	valid_0's l2: 0.0456712	valid_0's auc: 0.793211	valid_0's huber: 0.0227814
[10]	valid_0's l2: 0.0456552	valid_0's auc: 0.792385	valid_0's huber: 0.0227738
[11]	valid_0's l2: 0.0456282	valid_0's auc: 0.790606	valid_0's huber: 0.022761
[12]	valid_0's l2: 0.0455907	valid_0's auc: 0.790829	valid_0's huber: 0.0

[106]	valid_0's l2: 0.0437577	valid_0's auc: 0.792959	valid_0's huber: 0.0218541
[107]	valid_0's l2: 0.0437492	valid_0's auc: 0.793076	valid_0's huber: 0.0218499
[108]	valid_0's l2: 0.0437394	valid_0's auc: 0.792985	valid_0's huber: 0.021845
[109]	valid_0's l2: 0.0437356	valid_0's auc: 0.792799	valid_0's huber: 0.0218431
[110]	valid_0's l2: 0.0437319	valid_0's auc: 0.792609	valid_0's huber: 0.0218413
[111]	valid_0's l2: 0.0437233	valid_0's auc: 0.792793	valid_0's huber: 0.021837
[112]	valid_0's l2: 0.0437189	valid_0's auc: 0.792744	valid_0's huber: 0.0218348
[113]	valid_0's l2: 0.0437093	valid_0's auc: 0.79263	valid_0's huber: 0.02183
[114]	valid_0's l2: 0.0437015	valid_0's auc: 0.79279	valid_0's huber: 0.0218262
[115]	valid_0's l2: 0.0436932	valid_0's auc: 0.79261	valid_0's huber: 0.021822
[116]	valid_0's l2: 0.0436842	valid_0's auc: 0.79236	valid_0's huber: 0.0218175
[117]	valid_0's l2: 0.0436738	valid_0's auc: 0.792591	valid_0's huber: 0.0218122
[118]	valid_0's l2: 0.0436668	valid_0

[212]	valid_0's l2: 0.0433858	valid_0's auc: 0.79175	valid_0's huber: 0.0216673
[213]	valid_0's l2: 0.0433842	valid_0's auc: 0.791768	valid_0's huber: 0.0216665
[214]	valid_0's l2: 0.0433829	valid_0's auc: 0.791749	valid_0's huber: 0.0216658
[215]	valid_0's l2: 0.0433834	valid_0's auc: 0.791615	valid_0's huber: 0.021666
[216]	valid_0's l2: 0.0433817	valid_0's auc: 0.791661	valid_0's huber: 0.0216652
[217]	valid_0's l2: 0.0433821	valid_0's auc: 0.791647	valid_0's huber: 0.0216653
[218]	valid_0's l2: 0.0433805	valid_0's auc: 0.791651	valid_0's huber: 0.0216646
[219]	valid_0's l2: 0.0433796	valid_0's auc: 0.791669	valid_0's huber: 0.0216641
[220]	valid_0's l2: 0.0433788	valid_0's auc: 0.791742	valid_0's huber: 0.0216636
[221]	valid_0's l2: 0.0433777	valid_0's auc: 0.791713	valid_0's huber: 0.021663
[222]	valid_0's l2: 0.0433764	valid_0's auc: 0.791715	valid_0's huber: 0.0216624
[223]	valid_0's l2: 0.0433761	valid_0's auc: 0.791751	valid_0's huber: 0.0216622
[224]	valid_0's l2: 0.0433751	v

[320]	valid_0's l2: 0.0433644	valid_0's auc: 0.791367	valid_0's huber: 0.0216548
[321]	valid_0's l2: 0.0433654	valid_0's auc: 0.791335	valid_0's huber: 0.0216553
[322]	valid_0's l2: 0.0433662	valid_0's auc: 0.791271	valid_0's huber: 0.0216556
[323]	valid_0's l2: 0.0433658	valid_0's auc: 0.791273	valid_0's huber: 0.0216554
[324]	valid_0's l2: 0.0433659	valid_0's auc: 0.791255	valid_0's huber: 0.0216555
[325]	valid_0's l2: 0.0433658	valid_0's auc: 0.791247	valid_0's huber: 0.0216554
[326]	valid_0's l2: 0.0433654	valid_0's auc: 0.791241	valid_0's huber: 0.0216552
[327]	valid_0's l2: 0.0433654	valid_0's auc: 0.791225	valid_0's huber: 0.0216552
[328]	valid_0's l2: 0.0433666	valid_0's auc: 0.791206	valid_0's huber: 0.0216558
[329]	valid_0's l2: 0.0433676	valid_0's auc: 0.791195	valid_0's huber: 0.0216563
[330]	valid_0's l2: 0.0433673	valid_0's auc: 0.791184	valid_0's huber: 0.0216561
[331]	valid_0's l2: 0.0433681	valid_0's auc: 0.791072	valid_0's huber: 0.0216565
[332]	valid_0's l2: 0.043367

[430]	valid_0's l2: 0.0434126	valid_0's auc: 0.790854	valid_0's huber: 0.0216777
[431]	valid_0's l2: 0.0434131	valid_0's auc: 0.790857	valid_0's huber: 0.021678
[432]	valid_0's l2: 0.0434144	valid_0's auc: 0.79084	valid_0's huber: 0.0216786
[433]	valid_0's l2: 0.0434154	valid_0's auc: 0.79086	valid_0's huber: 0.0216791
[434]	valid_0's l2: 0.0434169	valid_0's auc: 0.790853	valid_0's huber: 0.0216798
[435]	valid_0's l2: 0.043418	valid_0's auc: 0.790817	valid_0's huber: 0.0216804
[436]	valid_0's l2: 0.0434188	valid_0's auc: 0.790806	valid_0's huber: 0.0216807
[437]	valid_0's l2: 0.0434203	valid_0's auc: 0.790778	valid_0's huber: 0.0216815
[438]	valid_0's l2: 0.0434207	valid_0's auc: 0.790769	valid_0's huber: 0.0216817
[439]	valid_0's l2: 0.0434213	valid_0's auc: 0.790768	valid_0's huber: 0.021682
[440]	valid_0's l2: 0.0434226	valid_0's auc: 0.790776	valid_0's huber: 0.0216826
[441]	valid_0's l2: 0.043423	valid_0's auc: 0.79077	valid_0's huber: 0.0216828
[442]	valid_0's l2: 0.0434238	valid

[540]	valid_0's l2: 0.0434659	valid_0's auc: 0.789987	valid_0's huber: 0.0217036
[541]	valid_0's l2: 0.0434664	valid_0's auc: 0.789989	valid_0's huber: 0.0217038
[542]	valid_0's l2: 0.0434667	valid_0's auc: 0.790019	valid_0's huber: 0.021704
[543]	valid_0's l2: 0.0434678	valid_0's auc: 0.790006	valid_0's huber: 0.0217045
[544]	valid_0's l2: 0.0434685	valid_0's auc: 0.790009	valid_0's huber: 0.0217049
[545]	valid_0's l2: 0.0434691	valid_0's auc: 0.790018	valid_0's huber: 0.0217052
[546]	valid_0's l2: 0.0434693	valid_0's auc: 0.790001	valid_0's huber: 0.0217052
[547]	valid_0's l2: 0.0434698	valid_0's auc: 0.789994	valid_0's huber: 0.0217055
[548]	valid_0's l2: 0.0434708	valid_0's auc: 0.789996	valid_0's huber: 0.021706
[549]	valid_0's l2: 0.0434714	valid_0's auc: 0.789983	valid_0's huber: 0.0217063
[550]	valid_0's l2: 0.0434719	valid_0's auc: 0.789974	valid_0's huber: 0.0217065
[551]	valid_0's l2: 0.0434718	valid_0's auc: 0.789986	valid_0's huber: 0.0217065
[552]	valid_0's l2: 0.0434724	

LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, boosting_type='gbdt',
       class_weight=None, colsample_bytree=1.0, feature_fraction=0.6,
       importance_type='split', learning_rate=0.01, max_bin=512,
       max_depth=6, metric=['l2', 'auc', 'huber'], min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
       n_jobs=-1, num_iterations=1000, num_leaves=63,
       objective='regression', random_state=None, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0, task='train', verbose=0)

In [43]:
from sklearn.metrics import mean_squared_log_error
y_pred = gbm.predict(train_X, num_iteration=gbm.best_iteration_)

In [56]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[original_feature].copy()

test_pred = gbm.predict(testset, num_iteration=gbm.best_iteration_)
print(test_pred)
test1 = testset.copy()
test1.head(20)
test1['pred_prob'] = test_pred
print(test1.shape)


(306313, 19)
[0.03014017 0.08239818 0.08239818 ... 0.05153422 0.03100414 0.10903009]
(306313, 8)


In [57]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [58]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
# out.to_csv("baseline_example.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.123516
1,1000020_8192_20160513,0.115793
2,1000065_1455_20160527,0.077146
3,1000085_8067_20160513,0.08062
4,1000086_2418_20160613,0.079252


In [59]:
out.to_csv("midterm_output.csv", header=["uid", "label"], index=False)

In [42]:
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=5, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type']


In [43]:
model = check_model(train, predictors)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.0min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [44]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

  Xt = transform.transform(Xt)


In [45]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.757, Accuracy: 0.952


In [46]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 19)
(306313, 8)


  Xt = transform.transform(Xt)


In [47]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [48]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
# out.to_csv("baseline_example.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.099644
1,1000020_8192_20160513,0.100839
2,1000065_1455_20160527,0.079201
3,1000085_8067_20160513,0.081605
4,1000086_2418_20160613,0.077722


In [51]:
out.to_csv("midterm_output.csv", header=["uid", "label"], index=False)