In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import time

In [7]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score, roc_auc_score
# Shared 5-fold GroupKFold splitter; a later cell passes the user ids as the
# grouping key when it calls .split().
group_kfold = GroupKFold(n_splits=5)

In [3]:
def get_input(path, label=True):
    """Read a feature CSV and separate the model matrix from the id columns.

    Parameters
    ----------
    path : str
        Location of a CSV file that contains at least the columns
        ``user_id``, ``eval_set`` and ``product_id`` (plus ``label`` when
        ``label`` is true). Every remaining column is treated as a feature.
    label : bool, default True
        Whether the file carries a ``label`` column that should be returned.

    Returns
    -------
    tuple
        With ``label=True``:
        ``(features, labels, user_ids, product_ids, feature_columns)`` where
        ``features`` is a float ndarray, the next three are pandas Series and
        ``feature_columns`` is the Index of feature names.
        With ``label=False``: ``(features, user_ids, product_ids)``.
    """
    df = pd.read_csv(path)
    id_columns = ['user_id', 'eval_set', 'product_id']
    if label:
        # `axis` must be given by keyword: the positional form was deprecated
        # and later removed from DataFrame.drop.
        df_X = df.drop(id_columns + ['label'], axis=1)
        columns = df_X.columns
        return df_X.astype(float).values, df['label'], df['user_id'], df['product_id'], columns
    return df.drop(id_columns, axis=1).astype(float).values, df['user_id'], df['product_id']

In [4]:
# Load the labelled training table: feature matrix, labels, user ids,
# product ids and the feature column names.
X, y, user_ids, product_ids, columns = get_input('../data/train.data')

In [5]:
# Hold out one validation fold: only the first of the GroupKFold splits is
# used, with the user id as the grouping key.
train_index, val_index = next(group_kfold.split(X, y, user_ids.values))
# The splitter yields positional indices, so the label Series is indexed by
# position (.iloc) rather than by index label.
X_train, y_train = X[train_index], y.iloc[train_index]
X_val, y_val = X[val_index], y.iloc[val_index]

In [6]:
"""
d_train = lgb.Dataset(X, y)
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'num_leaves': 96,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.95,
        'bagging_freq': 5
    }
bst = lgb.train(params, d_train, 100)
bst.feature_importance()
"""

"\nd_train = lgb.Dataset(X, y)\nparams = {\n        'task': 'train',\n        'boosting_type': 'gbdt',\n        'objective': 'binary',\n        'metric': {'binary_logloss'},\n        'num_leaves': 96,\n        'feature_fraction': 0.9,\n        'bagging_fraction': 0.95,\n        'bagging_freq': 5\n    }\nbst = lgb.train(params, d_train, 100)\nbst.feature_importance()\n"

In [8]:
# Fit an XGBoost classifier with its default constructor arguments on the
# training fold; the LightGBM alternative is kept commented out.
#m = LGBMClassifier()
m = XGBClassifier()
# fit() is the last expression of the cell, so its return value is what the
# notebook echoes below.
m.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [9]:
#for f, importance in zip(columns, m.feature_importances_):
#    print('%s\t%s' % (f, importance))

In [15]:
# Keep column 1 of predict_proba for both folds -- presumably the probability
# of the positive (label == 1) class; TODO confirm against m.classes_.
y_train_pred_prob = m.predict_proba(X_train)[:, 1]
y_val_pred_prob = m.predict_proba(X_val)[:, 1]

In [12]:
from collections import defaultdict
def choose(r):
    """Select the product ids to predict for one user.

    Rows are ranked by predicted probability (highest first) and every prefix
    length ``0..len(r)`` is scored with ``f1_predict``, using the running sum
    of probabilities as the true-positive estimate. The prefix with the
    highest score wins; on ties the shorter prefix is kept.

    Parameters
    ----------
    r : iterable of tuple
        ``(label, probability, product_id)`` rows for a single user. Only the
        probability and the product id are read here. The input is not
        modified.

    Returns
    -------
    numpy.ndarray
        Product ids of the selected prefix (empty when the best prefix has
        length 0).
    """
    # sorted() builds a new list, so the caller's rows keep their order.
    ranked = sorted(r, key=lambda o: o[1], reverse=True)
    probs = np.array([row[1] for row in ranked])
    product_ids = np.array([row[2] for row in ranked])

    prob_sum = probs.sum()
    max_score, j = -1, -1
    tp = 0
    for i in range(len(ranked) + 1):
        # `tp` holds the summed probability of the first `i` ranked rows.
        score = f1_predict(tp, i, prob_sum)
        if score > max_score:
            max_score, j = score, i
        if i < len(ranked):
            tp += probs[i]
    return product_ids[:j]
def f1_predict(tp, k, prob_sum):
    """Return the F1 value implied by a true-positive estimate.

    ``tp`` is divided by ``k`` to get precision and by ``prob_sum`` to get
    recall; the harmonic mean of the two is returned. A ``tp`` of zero
    short-circuits to 0 so neither division is attempted.
    """
    if tp != 0:
        p = tp / k
        rec = tp / prob_sum
        return 2 * p * rec / (p + rec)
    return 0
def f1(a, b):
    """F1 score between two collections, compared as sets.

    ``a`` plays the role of the predictions (precision denominator) and ``b``
    the reference items (recall denominator). Returns 0 when the two sets
    share no element, which includes either of them being empty.
    """
    predicted = set(a)
    actual = set(b)
    tp = float(len(predicted & actual))
    if not tp:
        return 0
    precision, recall = tp / len(predicted), tp / len(actual)
    return 2 * precision * recall / (precision + recall)
def constuct_user(y, y_pred_proba, user_ids, product_ids):
    """Group row-aligned inputs by user.

    The four iterables are walked in lockstep; each row is stored as a
    ``(label, probability, product_id)`` tuple in the list belonging to its
    user id. The result is a ``defaultdict(list)`` keyed by user id, with rows
    kept in input order.
    """
    user = defaultdict(list)
    rows = zip(y, y_pred_proba, user_ids, product_ids)
    for label, prob, uid, pid in rows:
        bucket = user[uid]
        bucket.append((label, prob, pid))
    return user

In [13]:
def evaluate(y, y_pred_proba, user_ids, product_ids):
    """Print the mean per-user F1 of the products picked by ``choose``.

    Rows are grouped per user with ``constuct_user``. For each user the
    products selected by ``choose`` are compared, via ``f1``, against the
    product ids whose label equals 1, and the scores are averaged over users.

    Parameters
    ----------
    y, y_pred_proba, user_ids, product_ids : iterables
        Row-aligned labels, predicted probabilities, user ids and product ids.

    Raises
    ------
    ZeroDivisionError
        If the inputs contain no rows (there is no user to average over).
    """
    user = constuct_user(y, y_pred_proba, user_ids, product_ids)
    res = 0
    # Only the per-user rows are needed; .values() exists on both Python 2 and
    # Python 3 dicts, unlike the Python-2-only .iteritems().
    for r in user.values():
        candidates = choose(r)
        real = [rr[2] for rr in r if rr[0] == 1]
        res += f1(candidates, real)
    print(res / len(user))

In [14]:
# Mean per-user F1 on the held-out validation fold.
evaluate(y_val, y_val_pred_prob, user_ids[val_index], product_ids[val_index])

0.364154038085


In [16]:
# Same metric on the training fold, for comparison with the validation score.
evaluate(y_train, y_train_pred_prob, user_ids[train_index], product_ids[train_index])

0.364276968572


In [17]:
# Read the orders table and keep only the rows whose eval_set is 'test'.
orders_test = pd.read_csv('../csv/orders.csv').loc[lambda orders: orders.eval_set == 'test']

In [18]:
# Lookup from user id to that user's test order id; if a user id occurs more
# than once, the last row wins.
o, u = orders_test['order_id'], orders_test['user_id']
u2o = dict(zip(u, o))

In [19]:
# Load the test table; label=False because this call expects no `label`
# column and returns only features, user ids and product ids.
X_test, user_ids_test, product_ids_test = get_input('../data/test.data', False)

In [25]:
# Column 1 of predict_proba for every test row -- presumably the positive-class
# probability; TODO confirm against m.classes_.
y_test_pred_proba = m.predict_proba(X_test)[:, 1]

In [26]:
# Group test rows per user. The zeros stand in for the label slot of each
# tuple; choose() only reads the probability and the product id.
user_test = constuct_user(np.zeros(X_test.shape[0]), y_test_pred_proba, user_ids_test, product_ids_test)

In [27]:
from __future__ import print_function
# Write the submission file: a header, then one line per test user holding the
# user's order id and the chosen product ids separated by spaces, or the
# literal 'None' when no product is chosen.
with open('../data/result_20170731.csv', 'w') as f:
    print('order_id,products', file=f)
    # .items() works on Python 2 and 3 dicts; .iteritems() is Python 2 only.
    for user_id, r in user_test.items():
        order_id = u2o[user_id]
        v = [str(p) for p in choose(r).tolist()]
        products = ' '.join(v) if v else 'None'
        print('%s,%s' % (order_id, products), file=f)