In [126]:
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import sys
sys.path.append("../code/utils/")
sys.path.append("../code/analysis/")
import data_utils as du
from visualize_feature_cooccurrence import CooccurrenceVisualizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from glove import Glove

In [4]:
# The 16 feature names this notebook groups under "multi_feat_names".
# Numbered families (interest1..5, kw1..3, topic1..3) are generated rather
# than spelled out; the resulting order is unchanged.
multi_feat_names = (
    ['marriageStatus']
    + ['interest{}'.format(n) for n in range(1, 6)]
    + ['kw{}'.format(n) for n in range(1, 4)]
    + ['topic{}'.format(n) for n in range(1, 4)]
    + ['appIdInstall', 'appIdAction', 'ct', 'os']
)

In [100]:
# Load the "train" and "ad" raw datasets (in that order) through the project
# data utilities, then keep the ad table's `aid` column as a plain array.
df_train, df_ad = (du.load_raw_data(dataset) for dataset in ("train", "ad"))
aids = df_ad['aid'].values

In [137]:
# Feature to embed and the embedding width.
# NOTE(review): the earlier note "'marriageStatus': 5" presumably records the
# emb_dim used when embedding that feature -- confirm before relying on it.
feat_name = 'interest1'
emb_dim = 10

# `val_index` maps each feature value to its row in the co-occurrence matrix
# (and therefore to its row in `glove.word_vectors` after fitting).
val_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)

# Fit GloVe on the co-occurrence counts, passed as a float64 COO matrix.
cooc_coo = cooc.tocoo().astype(np.float64)
glove = Glove(no_components=emb_dim, learning_rate=0.05)
glove.fit(cooc_coo, epochs=1000, verbose=False)

# Optional sanity check: rebuild a co-occurrence-like matrix from the learned
# vectors (consumed by the commented-out visualisation cell below).
# word_vectors = glove.word_vectors
# cooc_glove = np.dot(word_vectors, word_vectors.T)
# cooc_glove = sparse.csr_matrix(cooc_glove)

In [None]:
# CooccurrenceVisualizer.plot_cooc(cooc_glove, val_index, feat_name)

In [138]:
# Map every feature value to its learned embedding row.
word_vecs = {
    feat_val: glove.word_vectors[row_idx]
    for feat_val, row_idx in val_index.items()
}

In [141]:
# ---------------------------------------------------------------------------
# Per-ad validation: for a random sample of ads, compare three encodings of
# `feat_name` as logistic-regression inputs -- the binary/count matrix from
# `du.load_user_cnt`, the mean-pooled GloVe embedding and the max-pooled GloVe
# embedding -- and log the validation AUC of each into `df_log`.
# ---------------------------------------------------------------------------


def _lookup_vectors(feat_str):
    """Return a (k, emb_dim) array holding the embeddings of one record's values.

    `feat_str` is a whitespace-separated string of feature values. Anything
    that is not a string, or that contains no tokens, is treated as the single
    placeholder value "[nan]" so that mean/max pooling always has a row.
    Raises KeyError if a value has no entry in `word_vecs`.
    """
    try:
        feat_vals = feat_str.split()
    except AttributeError:
        # Non-string entry: fall back to the placeholder token.
        feat_vals = []
    if not feat_vals:
        feat_vals = ["[nan]"]
    return np.array([word_vecs[val] for val in feat_vals], dtype=np.float64)


def _evaluate_lr(X, y, train_size, tag):
    """Fit plain and class-weighted LR on the head split, score on the tail.

    Parameters
    ----------
    X : array or sparse matrix with one row per record.
    y : 1-D label array aligned with the rows of `X`.
    train_size : number of leading rows used for training; the remaining rows
        form the validation set (no shuffling is done).
    tag : input description inserted into the printed report line.

    Returns
    -------
    (auc_plain, auc_weighted) : validation AUC of the default LR and of the LR
        fitted with ``class_weight='balanced'``.
    """
    X_train, y_train = X[:train_size], y[:train_size]
    X_valid, y_valid = X[train_size:], y[train_size:]

    aucs = []
    for suffix, lr_params in (("", {}), ("[Weighted]", {"class_weight": "balanced"})):
        lr = LogisticRegression(**lr_params)
        lr.fit(X_train, y_train)
        proba_pos = lr.predict_proba(X_valid)[:, 1]
        auc = metrics.roc_auc_score(y_valid, proba_pos)
        logloss = metrics.log_loss(y_valid, proba_pos)
        print("[On {} Input]{} AUC: {:.6f} logloss: {:.6f}".format(tag, suffix, auc, logloss))
        aucs.append(auc)
    return tuple(aucs)


# --- configuration ---------------------------------------------------------
num_aids = 20      # how many ads to sample for validation
valid_ratio = 0.3  # trailing fraction of each ad's records held out
seed = 0           # fixes the ad sample so reruns validate on the same ads

# --- data that does not depend on the ad (loaded once) ----------------------
df_feat = du.load_user_feature(feat_name).fillna("[nan]")
# One feature string per uid; the first occurrence wins if a uid is repeated,
# which matches taking `.values[0]` of a boolean-mask lookup.
feat_by_uid = df_feat.drop_duplicates("uid").set_index("uid")[feat_name]

uids, (_ufeat_index, uvec) = du.load_user_cnt(feat_name)
uid_to_index = dict(zip(uids, range(len(uids))))  # uid -> row of `uvec`

df_log = pd.DataFrame(columns=["model", "feature", "aid", "numRecords",
                               "binary_auc", "embedAvg_auc", "embedMax_auc"])
aids_selected = np.random.RandomState(seed).choice(aids, num_aids, replace=False)

for aid in aids_selected:
    # prepare common data
    print("Validating on [aid={}]...".format(aid))
    df_selected = df_train[df_train['aid'] == aid].reset_index()
    # Affine rescale of the labels; presumably maps -1/+1 to 0/1 -- confirm.
    y = (df_selected['label'].values + 1) / 2
    num_users = df_selected.shape[0]
    train_size = int(num_users * (1 - valid_ratio))
    print("{} records selected.".format(num_users))

    # LR cannot be fitted on a single class and AUC is undefined on one, so
    # skip such ads instead of aborting the whole run.
    if np.unique(y[:train_size]).size < 2 or np.unique(y[train_size:]).size < 2:
        print("Skipping [aid={}]: train or valid split has fewer than two classes.".format(aid))
        continue

    # prepare embedding
    print("preparing data...")
    selected_uids = df_selected['uid'].values
    feat_strs = feat_by_uid.reindex(selected_uids)
    # `df_feat` was NaN-filled above, so a NaN here can only mean the uid is
    # absent from the feature table; fail loudly rather than embed it as "[nan]".
    missing_mask = feat_strs.isnull().values
    if missing_mask.any():
        raise KeyError("{} uid(s) of aid={} have no '{}' entry, e.g. {}".format(
            int(missing_mask.sum()), aid, feat_name, selected_uids[missing_mask][0]))
    X_pool = [_lookup_vectors(feat_str) for feat_str in feat_strs.values]
    print("{} done.".format(num_users))

    # =====================
    # train on binary input
    # =====================
    u_index = df_selected['uid'].map(uid_to_index).values
    X_binary = uvec[u_index, :]
    auc_bin, auc_bin2 = _evaluate_lr(X_binary, y, train_size, "Binary")

    # =======================
    # average embedding input
    # =======================
    X_avg = np.array([vecs.mean(axis=0) for vecs in X_pool])
    auc_avg, auc_avg2 = _evaluate_lr(X_avg, y, train_size, "Avg Embedding")

    # ===================
    # max embedding input
    # ===================
    X_max = np.array([vecs.max(axis=0) for vecs in X_pool])
    auc_max, auc_max2 = _evaluate_lr(X_max, y, train_size, "Max Embedding")

    # ==========
    # update log
    # ==========
    # Appended per ad (two rows: plain / weighted) so partial results survive
    # an interrupted run.
    df_log.loc[df_log.shape[0]] = {"model": "LR",
                                   "feature": feat_name,
                                   "aid": aid,
                                   "numRecords": num_users,
                                   "binary_auc": auc_bin,
                                   "embedAvg_auc": auc_avg,
                                   "embedMax_auc": auc_max}
    df_log.loc[df_log.shape[0]] = {"model": "LR[weighted]",
                                   "feature": feat_name,
                                   "aid": aid,
                                   "numRecords": num_users,
                                   "binary_auc": auc_bin2,
                                   "embedAvg_auc": auc_avg2,
                                   "embedMax_auc": auc_max2}

Validating on [aid=688]...
10224 records selected.
preparing data...
10224 done.
[On Binary Input] AUC: 0.667895 logloss: 0.184944
[On Binary Input][Weighted] AUC: 0.655864 logloss: 0.622847
[On Avg Embedding Input] AUC: 0.644558 logloss: 0.187240
[On Avg Embedding Input][Weighted] AUC: 0.635217 logloss: 0.666936
[On Max Embedding Input] AUC: 0.630995 logloss: 0.187754
[On Avg Embedding Input][Weighted] AUC: 0.620564 logloss: 0.665548
Validating on [aid=86]...
11917 records selected.
preparing data...
11917 done.
[On Binary Input] AUC: 0.562806 logloss: 0.196433
[On Binary Input][Weighted] AUC: 0.565209 logloss: 0.644452
[On Avg Embedding Input] AUC: 0.559907 logloss: 0.192360
[On Avg Embedding Input][Weighted] AUC: 0.560754 logloss: 0.683189
[On Max Embedding Input] AUC: 0.592499 logloss: 0.191048
[On Avg Embedding Input][Weighted] AUC: 0.583355 logloss: 0.679888
Validating on [aid=1991]...
12885 records selected.
preparing data...
12885 done.
[On Binary Input] AUC: 0.607147 logloss: 

[On Max Embedding Input] AUC: 0.570533 logloss: 0.185346
[On Avg Embedding Input][Weighted] AUC: 0.570346 logloss: 0.684827
Validating on [aid=313]...
7031 records selected.
preparing data...
7031 done.
[On Binary Input] AUC: 0.666953 logloss: 0.167324
[On Binary Input][Weighted] AUC: 0.635020 logloss: 0.630986
[On Avg Embedding Input] AUC: 0.584637 logloss: 0.172309
[On Avg Embedding Input][Weighted] AUC: 0.573302 logloss: 0.677408
[On Max Embedding Input] AUC: 0.572776 logloss: 0.172063
[On Avg Embedding Input][Weighted] AUC: 0.567932 logloss: 0.678767


In [161]:
# Mean validation AUC of each input representation, per model variant.
auc_columns = ['binary_auc', 'embedAvg_auc', 'embedMax_auc']
df_log.groupby('model')[auc_columns].mean()

Unnamed: 0_level_0,binary_auc,embedAvg_auc,embedMax_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LR,0.619304,0.595998,0.594338
LR[weighted],0.614946,0.596816,0.593595


In [160]:
# Full validation log: two rows per validated aid (model "LR" and
# "LR[weighted]"), each holding the AUC for the three input encodings.
df_log

Unnamed: 0,model,feature,aid,numRecords,binary_auc,embedAvg_auc,embedMax_auc
0,LR,interest1,688,10224,0.667895,0.644558,0.630995
1,LR[weighted],interest1,688,10224,0.655864,0.635217,0.620564
2,LR,interest1,86,11917,0.562806,0.559907,0.592499
3,LR[weighted],interest1,86,11917,0.565209,0.560754,0.583355
4,LR,interest1,1991,12885,0.607147,0.606765,0.553344
5,LR[weighted],interest1,1991,12885,0.599473,0.603726,0.552661
6,LR,interest1,686,22720,0.564838,0.550763,0.553446
7,LR[weighted],interest1,686,22720,0.56142,0.54789,0.542876
8,LR,interest1,1566,211522,0.595894,0.587114,0.589892
9,LR[weighted],interest1,1566,211522,0.596239,0.587265,0.589522


In [136]:
# df_log[['binary_auc', 'embedAvg_auc', 'embedMax_auc']].mean()

binary_auc      0.553211
embedAvg_auc    0.550122
embedMax_auc    0.557878
dtype: float64