In [2]:
import pandas as pd
import numpy as np
from scipy import sparse
import time

In [3]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV



In [4]:
def get_input(path, label=True):
    """Read a feature CSV and separate the feature matrix from the id columns.

    Parameters
    ----------
    path : str
        Location of a CSV file containing at least the columns ``user_id``,
        ``eval_set`` and ``product_id`` (plus ``label`` when ``label`` is true).
        Every remaining column is treated as a numeric feature.
    label : bool, default True
        Whether the file carries a ``label`` column that should be split off.

    Returns
    -------
    tuple
        ``(X, y, user_ids, product_ids, columns)`` when ``label`` is true,
        otherwise ``(X, user_ids, product_ids)``.  ``X`` is a float ndarray,
        the id/label members are pandas Series taken straight from the frame
        and ``columns`` is the Index of feature names in the order used by ``X``.
    """
    df = pd.read_csv(path)
    non_feature_columns = ['user_id', 'eval_set', 'product_id']
    if label:
        # ``axis`` is passed by keyword: the positional form ``drop(labels, 1)``
        # was deprecated by pandas and is rejected by pandas 2.x.
        df_X = df.drop(non_feature_columns + ['label'], axis=1)
        columns = df_X.columns
        return df_X.astype(float).values, df['label'], df['user_id'], df['product_id'], columns
    features = df.drop(non_feature_columns, axis=1)
    return features.astype(float).values, df['user_id'], df['product_id']

In [5]:
# Load the labelled training table: X is the float feature matrix, y the
# 'label' column, user_ids / product_ids the matching id columns and
# `columns` the feature names (see get_input).
X, y, user_ids, product_ids, columns = get_input('../data/train.data')

In [6]:
from collections import defaultdict

    
def choose(probs, pNone=None):
    """Pick the prediction size and "None" flag with the highest f1_predict score.

    Every candidate ``(k, predNone)`` with ``k`` in ``0..len(probs)`` is scored
    by ``f1_predict``.  All ``predNone=False`` candidates are scored first, then
    all ``predNone=True`` ones; a later candidate only replaces the incumbent
    when its score is strictly greater, so ties keep the earlier candidate.

    ``pNone`` is accepted but not used.

    Returns ``(k, predNone, score)`` for the winning candidate.
    """
    best_score, best_k, best_flag = -1, -1, False
    for flag in (False, True):
        for size in range(len(probs) + 1):
            candidate = f1_predict(probs, size, predNone=flag)
            if candidate > best_score:
                best_score, best_k, best_flag = candidate, size, flag
    return best_k, best_flag, best_score

def f1_predict(probs, k, predNone=False):
    """Estimate the F1 obtained by predicting the first ``k`` items of ``probs``.

    The estimate mixes two scenarios, weighted by ``pNone = prod(1 - probs)``
    (the probability that no item is positive if items are independent):

    * with weight ``1 - pNone`` the truth is assumed non-empty; precision and
      recall are approximated from the probability mass of the first ``k``
      items relative to the prediction size and to the total mass;
    * with weight ``pNone`` the truth is "None"; only an explicit "None"
      prediction scores, and it shares precision with the ``k`` items.

    Parameters
    ----------
    probs : array-like of float
        Per-item probabilities.  The first ``k`` entries are the ones taken as
        predicted, so callers are expected to pass them sorted descending.
    k : int
        Number of leading items included in the prediction.
    predNone : bool, default False
        Whether "None" is added to the prediction as an extra element.

    Returns
    -------
    float
        The estimated F1; ``0.0`` for the empty prediction
        (``k == 0`` without "None").
    """
    if k == 0 and not predNone:
        return 0.0
    probs = np.asarray(probs, dtype=float)
    pNone = (1 - probs).prod()
    top_mass = probs[:k].sum()
    total_mass = probs.sum()
    # float() keeps every division below a true division, including under
    # Python 2 where int / int would floor p2 to 0 for any k >= 1.
    n_predicted = float(k + 1 if predNone else k)

    # with 1-pNone probability the true label is not "None"
    p1 = top_mass / n_predicted
    # A zero total would give 0/0 (NaN plus a RuntimeWarning); recall is 0 then.
    r1 = top_mass / total_mass if total_mass > 0 else 0.0
    f11 = 2 * p1 * r1 / (p1 + r1) if p1 + r1 > 0 else 0.0

    # with pNone probability the true label is "None"
    p2 = (1.0 if predNone else 0.0) / n_predicted
    r2 = 1.0 if predNone else 0.0
    f12 = 2 * p2 * r2 / (p2 + r2) if p2 + r2 > 0 else 0.0

    return (1 - pNone) * f11 + pNone * f12

def f1_score(labels, k, predNone=False):
    """Compute the realised F1 of predicting the first ``k`` items (plus "None").

    Parameters
    ----------
    labels : sequence of 0/1
        Ground-truth flags, ordered like the ranked candidates, so that
        ``labels[:k]`` are the predicted items.
    k : int
        Number of leading items that were predicted.
    predNone : bool, default False
        Whether "None" was also predicted; it counts as one extra predicted
        element when computing precision.

    Returns
    -------
    float
        * truth non-empty and ``k > 0``: the usual F1 over the items;
        * truth empty and "None" predicted: F1 with precision ``1/(k+1)`` and
          recall 1;
        * otherwise ``0.0``.
    """
    n_true = sum(labels)  # computed once; the original re-summed it per use
    if n_true > 0 and k > 0:
        hits = sum(labels[:k])
        # float() forces true division: with integer labels Python 2 would
        # floor precision and recall to 0 or 1.
        p = hits / float(k + 1 if predNone else k)
        r = hits / float(n_true)
        if p + r > 0:
            return 2 * p * r / (p + r)
    if n_true == 0 and predNone:
        p = 1.0 / (k + 1)
        r = 1.0
        return 2 * p * r / (p + r)
    return 0.0


def constuct_user(y, y_pred_proba, user_ids, product_ids):
    """Group row-aligned predictions by user.

    The four inputs are iterated in parallel (stopping at the shortest one).
    Returns a ``defaultdict(list)`` mapping each user id to a list of
    ``(label, probability, product_id)`` tuples in input order.
    """
    per_user = defaultdict(list)
    for row in zip(y, y_pred_proba, user_ids, product_ids):
        truth, prob, uid, pid = row
        per_user[uid].append((truth, prob, pid))
    return per_user

In [7]:
# Split by user so that all rows of a user land on the same side of the split.
group_kfold = GroupKFold(n_splits=5)

# Only one train/validation split is used: the last of the five. Taking it
# directly avoids slicing X and y for the four folds that were being overwritten.
train_index, val_index = list(group_kfold.split(X, y, user_ids.values))[-1]
# The indices are positional, so y (a pandas Series) is sliced with .iloc
# rather than by label.
X_train, y_train = X[train_index], y.iloc[train_index]
X_val, y_val = X[val_index], y.iloc[val_index]

m = LGBMClassifier(num_leaves=200, n_estimators=200, subsample=0.95, subsample_freq=5)
m.fit(X_train, y_train)
# NOTE(review): with cv='prefit' the calibrator is fitted on whatever is passed
# to fit(); here that is the same X_train the model was trained on, so the
# isotonic map is learned from in-sample scores. Calibrating on rows the model
# has not seen would be safer - confirm whether this is intended.
ccv = CalibratedClassifierCV(m, method='isotonic', cv='prefit')
ccv.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=200, nthread=-1,
        num_leaves=200, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=0, silent=True, subsample=0.95, subsample_for_bin=50000,
        subsample_freq=5),
            cv='prefit', method='isotonic')

In [14]:
# Take column 1 of predict_proba (presumably the label == 1 class - verify
# against ccv.classes_) for the training rows and the held-out fold.
# The training-row predictions are in-sample for both the model and the calibrator.
y_train_pred_prob = ccv.predict_proba(X_train)[:, 1]
y_val_pred_prob = ccv.predict_proba(X_val)[:, 1]

In [9]:
def get_best_products(r):
    """Select the products to predict for one user and score the selection.

    ``r`` is a list of ``(label, probability, product_id)`` tuples.  It is
    sorted in place by descending probability (the caller's list is
    reordered), ``choose`` picks how many leading products to keep and whether
    to add "None", and the choice is scored against the labels with
    ``f1_score``.

    Returns ``(best_products, true_f1)`` where ``best_products`` is an array of
    the chosen product ids, with the string "None" appended when selected.
    """
    r.sort(key=lambda rec: rec[1], reverse=True)
    labels = np.array([rec[0] for rec in r])
    probs = np.array([rec[1] for rec in r])
    product_ids = np.array([rec[2] for rec in r])

    k, predNone, predicted_f1 = choose(probs)

    top_products = product_ids[:k]
    if predNone:
        best_products = np.append(top_products, ["None"])
    else:
        best_products = top_products
    return best_products, f1_score(labels, k, predNone)

In [24]:
from __future__ import print_function
def evaluate(y, y_pred_proba, user_ids, product_ids, out=None, user_to_order=None):
    """Write each user's raw prediction records to a file, two lines per user.

    Despite its name this function does not compute a score (the F1
    aggregation via ``get_best_products`` is disabled); it only serialises the
    records and returns ``None``.

    For every user the output receives the user's order id on one line and,
    on the next, the JSON-encoded list of ``(label, probability, product_id)``
    records built by ``constuct_user``.

    Parameters
    ----------
    y, y_pred_proba, user_ids, product_ids :
        Row-aligned labels, predicted probabilities and ids.
    out : file-like, optional
        Destination for the records.  Defaults to the module-level ``f``,
        which is what the original implementation always used.
    user_to_order : mapping, optional
        Maps a user id to the order id to print.  Defaults to the module-level
        ``u2o``.

    Raises
    ------
    KeyError
        If a user id is missing from ``user_to_order``.
    """
    # Local import so the function does not depend on a later cell's import.
    import json

    if out is None:
        out = f
    if user_to_order is None:
        user_to_order = u2o

    user = constuct_user(y, y_pred_proba, user_ids, product_ids)
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for user_id, r in user.items():
        print(user_to_order[user_id], file=out)
        print(json.dumps(r), file=out)

In [25]:
# Build u2o: user_id -> order_id, restricted to orders whose eval_set is
# 'train'. If a user had several such orders the last one read would win.
orders_train = pd.read_csv('../csv/orders.csv')
orders_train = orders_train.loc[orders_train['eval_set'] == 'train']
o = orders_train['order_id']
u = orders_train['user_id']
u2o = dict(zip(u, o))

In [27]:
import json
# Dump the per-user records of the training rows, then of the held-out fold,
# into one file. evaluate() reads the module-level names `f` (this file handle)
# and `u2o`, so u2o must currently hold the 'train' user -> order mapping.
with open('../data/my_train_record', 'w') as f:
    evaluate(y_train, y_train_pred_prob, user_ids[train_index], product_ids[train_index])
    evaluate(y_val, y_val_pred_prob, user_ids[val_index], product_ids[val_index])
# Disabled: printing a score returned by evaluate() (it currently returns None).
#print('train\t%s' % evaluate(y_train, y_train_pred_prob, user_ids[train_index], product_ids[train_index]))
#print('test\t%s' % evaluate(y_val, y_val_pred_prob, user_ids[val_index], product_ids[val_index]))  

In [9]:
#for k, v in zip(columns, m.feature_importances_):
#    print("%s\t%s" % (k, v))

In [13]:
%%time
# Refit on every labelled row for the final test predictions. This rebinds
# `m` and `ccv`, replacing the fold-trained model from the earlier cell.
m = LGBMClassifier(num_leaves=200, n_estimators=200, subsample=0.95, subsample_freq=5)
m.fit(X, y)
# NOTE(review): the prefit calibrator is fitted on the same (X, y) the model
# was trained on, so the isotonic map is learned from in-sample scores -
# confirm this is intended.
ccv = CalibratedClassifierCV(m, method='isotonic', cv='prefit')
ccv.fit(X, y)

CPU times: user 28min 15s, sys: 4.33 s, total: 28min 19s
Wall time: 3min 46s


In [18]:
# Keep only the orders whose eval_set is 'test'.
orders_test = pd.read_csv('../csv/orders.csv')
orders_test = orders_test.loc[orders_test['eval_set'] == 'test']

In [28]:
# Rebind u2o to the 'test' orders: user_id -> order_id. This replaces the
# 'train' mapping built earlier under the same name.
o = orders_test['order_id']
u = orders_test['user_id']
u2o = dict(zip(u, o))

In [20]:
# Load the unlabelled test features; with label=False get_input returns only
# the feature matrix and the user / product id columns.
X_test, user_ids_test, product_ids_test = get_input('../data/test.data', False)

In [21]:
# Calibrated score for every test row: column 1 of predict_proba (presumably
# the label == 1 class - verify against ccv.classes_).
y_test_pred_proba = ccv.predict_proba(X_test)[:, 1]
# Disabled alternative that scored with a `bst` object not defined in this notebook view.
#y_test_pred_proba = bst.predict(X_test)

In [22]:
# Group the test predictions by user. The test rows have no labels, so a zero
# vector is passed as a placeholder for the label slot of each record.
user_test = constuct_user(np.zeros(X_test.shape[0]), y_test_pred_proba, user_ids_test, product_ids_test)

In [29]:
# Dump the raw test records, two lines per user: the user's order id, then the
# JSON-encoded list of (placeholder label, probability, product_id) records.
# u2o must hold the 'test' user -> order mapping at this point.
with open('../data/my_test_record', 'w') as f:
    # .items() exists on both Python 2 and 3 (.iteritems() is Python 2 only);
    # the unused enumerate() index was dropped.
    for user_id, r in user_test.items():
        print(u2o[user_id], file=f)
        print(json.dumps(r), file=f)

In [20]:
from __future__ import print_function
# Write the submission file: a header row, then one "order_id,products" row per
# test user with the selected product ids separated by spaces.
# Note that get_best_products sorts each user's record list in place.
with open('../data/result_20170807_second_refine_features.csv', 'w') as f:
    print('order_id,products', file=f)
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for user_id, r in user_test.items():
        order_id = u2o[user_id]
        v = [str(p) for p in get_best_products(r)[0]]
        # Never emit a blank products field: fall back to the literal 'None'
        # when nothing was selected, as the previously disabled guard did.
        print('%s,%s' % (order_id, ' '.join(v) or 'None'), file=f)