In [1]:
import pandas as pd
from scipy import sparse
import time

In [2]:
train = pd.read_csv('../data/train.data')

In [3]:
up = [(str(row.user_id), str(row.product_id)) for row in train.itertuples()]

In [4]:
df_X = train.drop(['user_id', 'eval_set', 'product_id', 'label'], 1)
y = train['label']

In [5]:
def df2csr(df):
    return sparse.csr_matrix(df.astype(float).values)
X = df2csr(df_X)

In [6]:
sz = 80000
df_X_train = df_X[0: sz]
df_X_test = df_X[sz: 2 * sz]

X_train = X[0: sz] 
X_test = X[sz: 2 * sz]

y_train = y[0: sz]
y_test = y[sz: 2 * sz]

In [7]:
import lightgbm as lgb

In [8]:
d_train = lgb.Dataset(df_X_train, y_train, categorical_feature=['aisle_id', 'department_id'])

In [9]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98
start_time = time.time()
bst = lgb.train(params, d_train, ROUNDS)
print(time.time() - start_time)

0.149171113968


In [18]:
bst_y_pred_prob = bst.predict(df_X_test)

In [11]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
start_time = time.time()
xgb.fit(X_train, y_train)
print(time.time() - start_time)



1.52723503113


In [15]:
xgb_y_pred_prob = xgb.predict_proba(X_test)[:, 1]

In [16]:
from sklearn.metrics import f1_score, roc_auc_score

In [21]:
xgb_y_pred_prob[0:10]

array([  3.18889834e-05,   3.18889834e-05,   3.18889834e-05,
         3.18889834e-05,   3.18889834e-05,   3.18889834e-05,
         3.18889834e-05,   3.18889834e-05,   3.18889834e-05,
         3.18889834e-05], dtype=float32)

In [19]:
#lightgbm
roc_auc_score(y_test, bst_y_pred_prob)

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [27]:
#xgboost
roc_auc_score(y_test, y_pred_prob)

0.76674046184766897

In [44]:
#xgboost
#y_prob = xgb.predict_proba(X)[:, 1]

#lightgbm
y_prob = bst.predict(df_X)

In [45]:
orders = pd.read_csv('../csv/orders.csv')
order_products_train = pd.read_csv('../csv/order_products__train.csv')

In [46]:
result = orders[orders.eval_set == 'train'].set_index('order_id').join(order_products_train.set_index('order_id'))

In [47]:
from collections import defaultdict
expect = defaultdict(set)
for row in result.itertuples():
    expect[str(row.user_id)].add(str(row.product_id))

In [48]:
def f1(prediction, expect, verbose=False):
    prediction = [int(p) for p in prediction]
    expect = set([int(e) for e in expect])
    
    if verbose:
        print(prediction)
        print(expect)
    tp = 0
    for c in prediction:
        if c in expect:
            tp += 1
    if tp != 0:
        precision = tp * 1.0 / len(prediction)
        recall = tp * 1.0 / len(expect)
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0
    return f1

In [51]:
def validate(threshold):
    prediction = defaultdict(set)
    for i, (p, row) in enumerate(zip(y_prob, up)):
        if p > threshold:
            prediction[row[0]].add(row[1])
    s = 0
    for i, (user_id, product_ids) in enumerate(expect.iteritems()):
        s += f1(prediction.get(user_id, []), product_ids)
    print('%s %s' % (threshold, s / len(expect)))

In [53]:
threshold = 0.14
#for i in range(10, 20, 1):
#    validate(i * 0.01) 

In [54]:
test = pd.read_csv('../data/test.data')

In [55]:
A = sparse.csr_matrix(test.drop(['user_id', 'eval_set', 'product_id'], 1).astype(float).values)

In [56]:
upt = [(str(row.user_id), str(row.product_id)) for row in test.itertuples()]

In [58]:
#xgboost
b = xgb.predict_proba(A)[:,1]
#lightgbm
b = bst.predict(A)

In [59]:
uo = {str(int(float(u))): str(int(float(o))) for o, u in orders.reset_index().where(orders.eval_set == 'test')[['order_id', 'user_id']].astype(str).values if o != 'nan'}

In [60]:
result = {o: set() for u,o in uo.iteritems() if o != 'nan'}

In [61]:
for i, (p, row) in enumerate(zip(b, upt)):
    if p > threshold:
        result[uo[row[0]]].add(row[1])

In [62]:
from __future__ import print_function
with open('../data/result_20170713.csv', 'w') as f:
    print('order_id,products', file=f)
    for k, v in result.iteritems():
        if len(v) == 0:
            print('%s,None' % k, file=f)
        else:
            print('%s,%s' % (k, ' '.join(v)), file=f)