In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import os
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [2]:
# Directory of the raw contest CSV files; `load` resolves file names against it.
data_dir = '../data/raw/preliminary_contest_data/'
# Directory of pickled per-feature count data for ads, read by `load_ad_cnt`.
ad_cnt_dir = '../data/nlp_count/preliminary_contest_data/byAdFeatureName/'
# Directory of pickled per-feature count data for users, read by `load_user_cnt`.
user_cnt_dir = '../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
# Directory of pickled per-feature TF-IDF data for users, read by `load_user_tfidf`.
user_tfidf_dir = '../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'

In [3]:
def load(filename, **kw):
    """Read a CSV file that lives under ``data_dir`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name (or relative path) joined onto ``data_dir``.
    **kw
        Extra keyword arguments passed straight through to ``pd.read_csv``.

    Returns
    -------
    Whatever ``pd.read_csv`` returns for the resolved path.
    """
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, **kw)

In [4]:
def load_pickle(filepath):
    """Unpickle and return the single object stored in ``filepath``.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only point
    this at files produced by trusted code.
    """
    with open(filepath, "rb") as handle:
        return pickle.load(handle)

In [5]:
def load_ad_cnt(feat_name):
    """Load the pickled count data for one ad feature, plus the ad ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into the
        ``adFeature.[featureName='<feat_name>'].pkl`` file name under
        ``ad_cnt_dir``.

    Returns
    -------
    tuple
        ``(aid, (index, matrix))`` where ``aid`` is the object unpickled from
        ``aid.pkl`` and ``index, matrix`` is the pair unpickled from the
        feature file.  Presumably ``matrix`` rows line up with ``aid`` and
        ``index`` describes its columns -- TODO confirm against the code that
        writes these pickles.
    """
    feat_file = "adFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(ad_cnt_dir, feat_file))

    # The id file name does not depend on the feature, so no formatting is
    # needed (the previous ``.format(feat_name)`` call was a no-op).
    aid = load_pickle(os.path.join(ad_cnt_dir, "aid.pkl"))

    return aid, (index, matrix)

In [6]:
def load_user_cnt(feat_name):
    """Load the pickled count data for one user feature, plus the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into the
        ``userFeature.[featureName='<feat_name>'].pkl`` file name under
        ``user_cnt_dir``.

    Returns
    -------
    tuple
        ``(uid, (index, matrix))`` where ``uid`` is the object unpickled from
        ``uid.pkl`` and ``index, matrix`` is the pair unpickled from the
        feature file.  Presumably ``matrix`` rows line up with ``uid`` --
        TODO confirm against the code that writes these pickles.
    """
    feat_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, matrix = load_pickle(os.path.join(user_cnt_dir, feat_file))

    # The id file name does not depend on the feature, so no formatting is
    # needed (the previous ``.format(feat_name)`` call was a no-op).
    uid = load_pickle(os.path.join(user_cnt_dir, "uid.pkl"))

    return uid, (index, matrix)

In [7]:
def load_user_tfidf(feat_name):
    """Load the pickled TF-IDF data for one user feature, plus the user ids.

    Parameters
    ----------
    feat_name : str
        Feature name substituted into the
        ``userFeature.[featureName='<feat_name>'].pkl`` file name under
        ``user_tfidf_dir``.

    Returns
    -------
    tuple
        ``(uid, (index, idf, matrix))`` where ``uid`` is the object unpickled
        from ``uid.pkl`` and ``index, idf, matrix`` is the triple unpickled
        from the feature file.  Presumably ``matrix`` rows line up with
        ``uid`` -- TODO confirm against the code that writes these pickles.
    """
    feat_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
    index, idf, matrix = load_pickle(os.path.join(user_tfidf_dir, feat_file))

    # The id file name does not depend on the feature, so no formatting is
    # needed (the previous ``.format(feat_name)`` call was a no-op).
    uid = load_pickle(os.path.join(user_tfidf_dir, "uid.pkl"))

    return uid, (index, idf, matrix)

In [8]:
def get_time_str():
    """Return the current UTC wall-clock time formatted as ``HH:MM:SS``."""
    utc_now = time.gmtime()
    return time.strftime("%H:%M:%S", utc_now)

In [9]:
# Training table read from train.csv under data_dir; the modelling cells
# below read its 'aid', 'uid' and 'label' columns.
df_train = load("train.csv")

In [10]:
# Ad-side count data for the 'aid' feature; only `aids` is used further down.
aids, (afeat_index, avec) = load_ad_cnt('aid')
# User-side count data for the "interest1" feature.
uids, (ufeat_index, uvec) = load_user_cnt("interest1")
# Position of every uid inside `uids`, used to pick that user's row of `uvec`.
uid_to_index = {uid: row for row, uid in enumerate(uids)}

In [11]:
print("====================")
print("LR on Count Features")
print("====================")

# Share of each ad's rows held out for validation.  The split is positional:
# the first (1 - valid_ratio) of the rows, in df_train order, are used for
# training and the remainder for validation.
valid_ratio = 0.3

for aid in aids:
    # Rows of the training table that belong to this ad.
    df_selected = df_train[df_train['aid']==aid]

    train_size = int(df_selected.shape[0] * (1 - valid_ratio))

    # Shift labels by +1 and halve them (maps -1/+1 onto 0/1 -- assumes the
    # raw labels are -1/+1, TODO confirm against train.csv).
    y = (df_selected['label'].values + 1) / 2
    y_train, y_valid = y[:train_size], y[train_size:]

    print("Training LR on [aid='{}']...".format(aid))

    # A slice with no validation rows, or with a single class in the training
    # part, cannot be fitted/scored; skip it instead of aborting the sweep.
    if y_valid.size == 0 or np.unique(y_train).size < 2:
        print("[{}] skipped: not enough data or only one class in the training split".format(get_time_str()))
        print()
        continue

    # Row positions in `uvec` for the users of the selected rows.
    u_index = df_selected['uid'].map(uid_to_index).values
    X = uvec[u_index,:]
    X_train, X_valid = X[:train_size], X[train_size:]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    proba_valid = lr.predict_proba(X_valid)

    # Passing the fitted classes keeps log_loss well defined even when the
    # validation slice happens to contain a single class.
    logloss = metrics.log_loss(y_valid, proba_valid, labels=lr.classes_)
    # ROC AUC is undefined with a single class in y_valid; report NaN then.
    if np.unique(y_valid).size < 2:
        auc = float('nan')
    else:
        auc = metrics.roc_auc_score(y_valid, proba_valid[:,1])
    print("[{}] AUC: {:.6f} logloss: {:.6f}".format(get_time_str(), auc, logloss))
    print()

LR on Count Features
Training LR on [aid='177']...
[04:58:49] AUC: 0.544920 logloss: 0.196567

Training LR on [aid='2050']...
[04:59:00] AUC: 0.634029 logloss: 0.179728

Training LR on [aid='1716']...
[04:59:12] AUC: 0.607228 logloss: 0.185848

Training LR on [aid='336']...
[04:59:24] AUC: 0.573658 logloss: 0.220311

Training LR on [aid='671']...
[04:59:35] AUC: 0.632321 logloss: 0.194332

Training LR on [aid='529']...
[04:59:47] AUC: 0.537244 logloss: 0.184646

Training LR on [aid='927']...
[04:59:59] AUC: 0.590404 logloss: 0.179378

Training LR on [aid='1714']...
[05:00:11] AUC: 0.601591 logloss: 0.198046

Training LR on [aid='977']...
[05:00:23] AUC: 0.498722 logloss: 0.194986

Training LR on [aid='450']...
[05:00:34] AUC: 0.676067 logloss: 0.197756

Training LR on [aid='1749']...
[05:00:47] AUC: 0.591429 logloss: 0.184581

Training LR on [aid='404']...
[05:00:59] AUC: 0.612785 logloss: 0.194488

Training LR on [aid='302']...
[05:01:11] AUC: 0.563072 logloss: 0.194805

Training LR o

Training LR on [aid='1119']...
[05:20:38] AUC: 0.574516 logloss: 0.192630

Training LR on [aid='1449']...
[05:20:49] AUC: 0.638877 logloss: 0.199388

Training LR on [aid='886']...
[05:21:00] AUC: 0.619390 logloss: 0.205170

Training LR on [aid='1580']...
[05:21:12] AUC: 0.522594 logloss: 0.208253

Training LR on [aid='975']...
[05:21:23] AUC: 0.572035 logloss: 0.193256

Training LR on [aid='74']...
[05:21:35] AUC: 0.533178 logloss: 0.181291

Training LR on [aid='136']...
[05:21:49] AUC: 0.577698 logloss: 0.187987

Training LR on [aid='311']...
[05:22:01] AUC: 0.681912 logloss: 0.178260

Training LR on [aid='1140']...
[05:22:13] AUC: 0.597353 logloss: 0.183828

Training LR on [aid='1910']...
[05:22:24] AUC: 0.612702 logloss: 0.188824

Training LR on [aid='7']...
[05:22:36] AUC: 0.589117 logloss: 0.190887

Training LR on [aid='1827']...
[05:22:48] AUC: 0.625602 logloss: 0.184897

Training LR on [aid='1483']...
[05:22:59] AUC: 0.710040 logloss: 0.175236

Training LR on [aid='613']...
[05:

In [13]:
# Switch the user features over to the TF-IDF variant of "interest1".
# This rebinds `uids`, `ufeat_index` and `uvec` from the count-feature cell.
uids, (ufeat_index, idf, uvec) = load_user_tfidf("interest1")
# Rebuild the uid -> row lookup from the uids that were just loaded, so the
# rows picked from the new `uvec` cannot be misaligned by a mapping built
# from a different uid file.
uid_to_index = {uid: row for row, uid in enumerate(uids)}

In [14]:
print("====================")
print("LR on TFIDF Features")
print("====================")

# Share of each ad's rows held out for validation.  The split is positional:
# the first (1 - valid_ratio) of the rows, in df_train order, are used for
# training and the remainder for validation.
valid_ratio = 0.3

for aid in aids:
    # Rows of the training table that belong to this ad.
    df_selected = df_train[df_train['aid']==aid]

    train_size = int(df_selected.shape[0] * (1 - valid_ratio))

    # Shift labels by +1 and halve them (maps -1/+1 onto 0/1 -- assumes the
    # raw labels are -1/+1, TODO confirm against train.csv).
    y = (df_selected['label'].values + 1) / 2
    y_train, y_valid = y[:train_size], y[train_size:]

    print("Training LR on [aid='{}']...".format(aid))

    # A slice with no validation rows, or with a single class in the training
    # part, cannot be fitted/scored; skip it instead of aborting the sweep.
    if y_valid.size == 0 or np.unique(y_train).size < 2:
        print("[{}] skipped: not enough data or only one class in the training split".format(get_time_str()))
        print()
        continue

    # Row positions in `uvec` for the users of the selected rows.
    u_index = df_selected['uid'].map(uid_to_index).values
    X = uvec[u_index,:]
    X_train, X_valid = X[:train_size], X[train_size:]

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    proba_valid = lr.predict_proba(X_valid)

    # Passing the fitted classes keeps log_loss well defined even when the
    # validation slice happens to contain a single class.
    logloss = metrics.log_loss(y_valid, proba_valid, labels=lr.classes_)
    # ROC AUC is undefined with a single class in y_valid; report NaN then.
    if np.unique(y_valid).size < 2:
        auc = float('nan')
    else:
        auc = metrics.roc_auc_score(y_valid, proba_valid[:,1])
    print("[{}] AUC: {:.6f} logloss: {:.6f}".format(get_time_str(), auc, logloss))
    print()

LR on TFIDF Features
Training LR on [aid='177']...
[06:21:42] AUC: 0.542810 logloss: 0.194485

Training LR on [aid='2050']...
[06:21:54] AUC: 0.638728 logloss: 0.178572

Training LR on [aid='1716']...
[06:22:06] AUC: 0.601042 logloss: 0.185841

Training LR on [aid='336']...
[06:22:18] AUC: 0.578140 logloss: 0.212666

Training LR on [aid='671']...
[06:22:30] AUC: 0.648403 logloss: 0.190905

Training LR on [aid='529']...
[06:22:42] AUC: 0.535082 logloss: 0.183888

Training LR on [aid='927']...
[06:22:54] AUC: 0.599216 logloss: 0.176448

Training LR on [aid='1714']...
[06:23:06] AUC: 0.599980 logloss: 0.197754

Training LR on [aid='977']...
[06:23:19] AUC: 0.491235 logloss: 0.191665

Training LR on [aid='450']...
[06:23:31] AUC: 0.646323 logloss: 0.199731

Training LR on [aid='1749']...
[06:23:43] AUC: 0.592385 logloss: 0.184306

Training LR on [aid='404']...
[06:23:56] AUC: 0.611377 logloss: 0.194242

Training LR on [aid='302']...
[06:24:08] AUC: 0.565456 logloss: 0.194760

Training LR o

Training LR on [aid='1119']...
[06:43:25] AUC: 0.575194 logloss: 0.192627

Training LR on [aid='1449']...
[06:43:36] AUC: 0.633926 logloss: 0.197741

Training LR on [aid='886']...
[06:43:47] AUC: 0.624838 logloss: 0.201808

Training LR on [aid='1580']...
[06:44:00] AUC: 0.546902 logloss: 0.202157

Training LR on [aid='975']...
[06:44:11] AUC: 0.585039 logloss: 0.188606

Training LR on [aid='74']...
[06:44:23] AUC: 0.536525 logloss: 0.180792

Training LR on [aid='136']...
[06:44:36] AUC: 0.575153 logloss: 0.188037

Training LR on [aid='311']...
[06:44:48] AUC: 0.675366 logloss: 0.178775

Training LR on [aid='1140']...
[06:45:00] AUC: 0.599787 logloss: 0.183348

Training LR on [aid='1910']...
[06:45:12] AUC: 0.601613 logloss: 0.187058

Training LR on [aid='7']...
[06:45:24] AUC: 0.584835 logloss: 0.190421

Training LR on [aid='1827']...
[06:45:36] AUC: 0.623409 logloss: 0.184748

Training LR on [aid='1483']...
[06:45:47] AUC: 0.707010 logloss: 0.176236

Training LR on [aid='613']...
[06: