In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
import os
from scipy.sparse import hstack

In [2]:
# Input locations, all relative to the notebook's working directory.
# Each value keeps its trailing slash; the loaders below combine these with
# file names via os.path.join.

# Raw contest CSV files (read by `load`).
data_dir = "../data/raw/preliminary_contest_data/"

# Pickled count matrices, one file per ad feature name.
ad_cnt_dir = "../data/nlp_count/preliminary_contest_data/byAdFeatureName/"

# Pickled count matrices, one file per user feature name.
user_cnt_dir = "../data/nlp_count/preliminary_contest_data/byUserFeatureName/"

# Pickled TF-IDF matrices, one file per user feature name.
user_tfidf_dir = "../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/"

In [3]:
def load(filename, **kw):
    """Read a CSV file located under ``data_dir`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name (or relative path) joined onto ``data_dir``.
    **kw
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    Whatever ``pandas.read_csv`` returns for the given arguments.
    """
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
def load_pickle(filepath):
    """Unpickle and return the object stored in the file at ``filepath``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted pipeline.
    """
    with open(filepath, "rb") as handle:
        return pickle.load(handle)

In [5]:
def load_ad_cnt(feat_name):
    """Load the pickled count features for one ad feature, plus the ad ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``adFeature.[featureName='{}'].pkl``
        inside ``ad_cnt_dir``.

    Returns
    -------
    tuple
        ``(aid, (index, matrix))`` where ``index, matrix`` is the pair
        unpickled from the feature file and ``aid`` is the object unpickled
        from ``aid.pkl`` in the same directory.
    """
    feature_path = os.path.join(
        ad_cnt_dir, "adFeature.[featureName='{}'].pkl".format(feat_name))
    index, matrix = load_pickle(feature_path)

    # The id file name does not depend on feat_name.
    aid = load_pickle(os.path.join(ad_cnt_dir, "aid.pkl"))

    return aid, (index, matrix)

In [6]:
def load_user_cnt(feat_name):
    """Load the pickled count features for one user feature, plus the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``userFeature.[featureName='{}'].pkl``
        inside ``user_cnt_dir``.

    Returns
    -------
    tuple
        ``(uid, (index, matrix))`` where ``index, matrix`` is the pair
        unpickled from the feature file and ``uid`` is the object unpickled
        from ``uid.pkl`` in the same directory.
    """
    feature_path = os.path.join(
        user_cnt_dir, "userFeature.[featureName='{}'].pkl".format(feat_name))
    index, matrix = load_pickle(feature_path)

    # The id file name does not depend on feat_name.
    uid = load_pickle(os.path.join(user_cnt_dir, "uid.pkl"))

    return uid, (index, matrix)

In [7]:
def load_user_tfidf(feat_name):
    """Load the pickled TF-IDF features for one user feature, plus the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into ``userFeature.[featureName='{}'].pkl``
        inside ``user_tfidf_dir``.

    Returns
    -------
    tuple
        ``(uid, (index, idf, matrix))`` where ``index, idf, matrix`` is the
        triple unpickled from the feature file and ``uid`` is the object
        unpickled from ``uid.pkl`` in the same directory.
    """
    feature_path = os.path.join(
        user_tfidf_dir, "userFeature.[featureName='{}'].pkl".format(feat_name))
    index, idf, matrix = load_pickle(feature_path)

    # The id file name does not depend on feat_name.
    uid = load_pickle(os.path.join(user_tfidf_dir, "uid.pkl"))

    return uid, (index, idf, matrix)

In [8]:
# Training table read from data_dir; later cells use its 'aid', 'uid' and
# 'label' columns.
df_train = load("train.csv")

In [9]:
# Build the training / validation matrices from count features:
# the user's "interest1" field and the ad's own id.
uid, (ufeat_index, uvec) = load_user_cnt("interest1")
aid, (afeat_index, avec) = load_ad_cnt('aid')
uid_to_index = dict(zip(uid, range(len(uid))))  # uid -> row position used to index uvec
aid_to_index = dict(zip(aid, range(len(aid))))  # aid -> row position used to index avec

# Translate every training row's ids into matrix row positions.
aid_rows = df_train['aid'].map(aid_to_index)
uid_rows = df_train['uid'].map(uid_to_index)

# Series.map yields NaN for ids that are absent from the lookup, which would
# turn the index arrays into floats and fail obscurely during sparse indexing.
# Fail early with an explicit message instead.
n_unmapped_aid = int(aid_rows.isnull().sum())
n_unmapped_uid = int(uid_rows.isnull().sum())
if n_unmapped_aid or n_unmapped_uid:
    raise ValueError(
        "train.csv references ids missing from the feature files: "
        "{} unmapped aid rows, {} unmapped uid rows".format(
            n_unmapped_aid, n_unmapped_uid))

a_index = aid_rows.values  # row positions into avec, one per training row
u_index = uid_rows.values  # row positions into uvec, one per training row

# Ad columns first, then user columns; CSR so that row slicing below is cheap.
X = hstack((avec[a_index, :], uvec[u_index, :])).tocsr()
# (label + 1) / 2 maps -1 -> 0 and +1 -> 1.
# NOTE(review): presumably labels are encoded as -1/+1 - confirm against train.csv.
y = (df_train['label'].values + 1) / 2

# Positional split (no shuffling): the first n_train rows are used for
# training, everything after them for validation.
n_train = 6000000
X_train, y_train = X[:n_train], y[:n_train]
X_valid, y_valid = X[n_train:], y[n_train:]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

In [11]:
# Hyper-parameters: binary-classification GBDT evaluated with AUC.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    # NOTE(review): a depth-4 tree has at most 2**4 = 16 leaves, so
    # num_leaves=31 is presumably never the binding limit here - confirm.
    max_depth=4,
    num_leaves=31,
    learning_rate=0.15,
    feature_fraction=0.9,
    # NOTE(review): no bagging_freq is set; per the LightGBM parameter docs
    # bagging is only active when bagging_freq is non-zero, so this value
    # likely has no effect. Left unchanged to keep runs comparable.
    bagging_fraction=0.8,
    verbose=0,
)

# Datasets are rebuilt in this cell, so re-running it always starts from
# fresh Dataset objects.
lgb_train = lgb.Dataset(X_train.astype(np.float32), label=y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), label=y_valid)

# Train for 100 rounds, reporting AUC on both splits after every round.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid1'],
)

[1]	train's auc: 0.552581	valid1's auc: 0.552446
[2]	train's auc: 0.555142	valid1's auc: 0.55497
[3]	train's auc: 0.557755	valid1's auc: 0.557258
[4]	train's auc: 0.558614	valid1's auc: 0.557803
[5]	train's auc: 0.558499	valid1's auc: 0.557781
[6]	train's auc: 0.56004	valid1's auc: 0.559386
[7]	train's auc: 0.560327	valid1's auc: 0.559746
[8]	train's auc: 0.560874	valid1's auc: 0.560505
[9]	train's auc: 0.562135	valid1's auc: 0.561793
[10]	train's auc: 0.562448	valid1's auc: 0.562038
[11]	train's auc: 0.562599	valid1's auc: 0.562182
[12]	train's auc: 0.56346	valid1's auc: 0.563061
[13]	train's auc: 0.563408	valid1's auc: 0.562998
[14]	train's auc: 0.564594	valid1's auc: 0.564188
[15]	train's auc: 0.565841	valid1's auc: 0.565321
[16]	train's auc: 0.566676	valid1's auc: 0.566134
[17]	train's auc: 0.567635	valid1's auc: 0.567215
[18]	train's auc: 0.568158	valid1's auc: 0.567648
[19]	train's auc: 0.568941	valid1's auc: 0.568212
[20]	train's auc: 0.569756	valid1's auc: 0.569169
[21]	train's

In [12]:
# Rebuild the feature matrix with the TF-IDF variant of the user's
# "interest1" field. The ad-side matrix (avec), its row positions (a_index)
# and the labels (y_train / y_valid) are reused from the earlier cell.
uid, (ufeat_index, uidf, uvec) = load_user_tfidf("interest1")
uid_to_index = dict(zip(uid, range(len(uid))))  # uid -> row position used to index uvec

uid_rows = df_train['uid'].map(uid_to_index)

# Series.map yields NaN for uids that are absent from the lookup, which would
# turn the index array into floats and fail obscurely during sparse indexing.
# Fail early with an explicit message instead.
n_unmapped_uid = int(uid_rows.isnull().sum())
if n_unmapped_uid:
    raise ValueError(
        "train.csv references uids missing from the TF-IDF feature files: "
        "{} unmapped rows".format(n_unmapped_uid))

u_index = uid_rows.values  # row positions into uvec, one per training row

X = hstack((avec[a_index, :], uvec[u_index, :])).tocsr()

# Split at exactly the row where the reused labels were split, so features
# and labels cannot drift out of alignment.
n_train = len(y_train)
X_train = X[:n_train]
X_valid = X[n_train:]

lgb_train = lgb.Dataset(X_train.astype(np.float32), y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), y_valid)

In [13]:
# Hyper-parameters: binary-classification GBDT evaluated with AUC.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    # NOTE(review): a depth-4 tree has at most 2**4 = 16 leaves, so
    # num_leaves=31 is presumably never the binding limit here - confirm.
    max_depth=4,
    num_leaves=31,
    learning_rate=0.15,
    feature_fraction=0.9,
    # NOTE(review): no bagging_freq is set; per the LightGBM parameter docs
    # bagging is only active when bagging_freq is non-zero, so this value
    # likely has no effect. Left unchanged to keep runs comparable.
    bagging_fraction=0.8,
    verbose=0,
)

# Datasets are rebuilt in this cell, so re-running it always starts from
# fresh Dataset objects.
lgb_train = lgb.Dataset(X_train.astype(np.float32), label=y_train)
lgb_valid = lgb.Dataset(X_valid.astype(np.float32), label=y_valid)

# Train for 100 rounds, reporting AUC on both splits after every round.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train', 'valid1'],
)

[1]	train's auc: 0.549305	valid1's auc: 0.549279
[2]	train's auc: 0.550894	valid1's auc: 0.551084
[3]	train's auc: 0.552355	valid1's auc: 0.55197
[4]	train's auc: 0.552268	valid1's auc: 0.551556
[5]	train's auc: 0.552621	valid1's auc: 0.55196
[6]	train's auc: 0.552933	valid1's auc: 0.55253
[7]	train's auc: 0.552965	valid1's auc: 0.552619
[8]	train's auc: 0.553076	valid1's auc: 0.552511
[9]	train's auc: 0.554256	valid1's auc: 0.553837
[10]	train's auc: 0.555064	valid1's auc: 0.554635
[11]	train's auc: 0.555854	valid1's auc: 0.555335
[12]	train's auc: 0.557434	valid1's auc: 0.557021
[13]	train's auc: 0.557664	valid1's auc: 0.55721
[14]	train's auc: 0.558318	valid1's auc: 0.557913
[15]	train's auc: 0.558687	valid1's auc: 0.558316
[16]	train's auc: 0.559192	valid1's auc: 0.558748
[17]	train's auc: 0.559447	valid1's auc: 0.558944
[18]	train's auc: 0.559637	valid1's auc: 0.559122
[19]	train's auc: 0.560076	valid1's auc: 0.55929
[20]	train's auc: 0.560446	valid1's auc: 0.559711
[21]	train's a