In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from glove import Glove
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append("../../../code/utils/")
sys.path.append("../../../code/analysis/")
sys.path.append('../../../code/pipeline/')
sys.path.append('../../../code')
import data_jointer as dj
import eval_utils as eu
import data_utils as du
import perf_utils as pu
import config

In [2]:
# Load the raw train and test frames and remember how many rows each has.
df_train = du.load_raw_data("train")
df_test = du.load_raw_data("test")
train_size, test_size = len(df_train), len(df_test)

# Stack train on top of test with a fresh 0..n-1 index, so the first
# `train_size` rows of `df_all` are the train rows.
df_all = pd.concat([df_train, df_test], ignore_index=True)

# Jointer configured with on="uid"; run_embedding uses it to attach the
# per-uid matrices to `df_all`.
user_jointer = dj.PandasMatrixJointer(on="uid")


def run_embedding(feat_name, emb_dim=10, learning_rate=0.01, epochs=300, version_no=1):
    """Fit a GloVe embedding for one user feature and save pooled embeddings.

    The function

    1. loads the feature's co-occurrence matrix and binary user matrix via ``du``;
    2. fits a ``Glove`` model on the co-occurrence matrix and pickles it to
       ``../../../model/glove/[featureName='<feat_name>']/v<version_no>.glove_model``;
    3. for every row of the binary matrix, gathers the word vectors selected by
       the row's stored column indices and reduces them per dimension with
       mean, max and min;
    4. joins each pooled matrix onto the notebook-level ``df_all`` through
       ``user_jointer``;
    5. pickles ``(col_names, rows)`` for the first ``train_size`` joined rows
       (``train.<pool>_v<version_no>.pkl``) and for the remaining rows
       (``test1.<pool>_v<version_no>.pkl``) under
       ``../../../data/embedding/[featureName='<feat_name>']/``.

    Rows of the binary matrix without any stored entry keep an all-zero pooled
    vector; reducing an empty selection would otherwise raise (max/min) or
    yield NaN (mean).

    The function relies on the notebook globals ``user_jointer``, ``df_all``
    and ``train_size``.

    Parameters
    ----------
    feat_name : str
        Feature name handed to the ``du`` loaders and used in output paths
        and column names.
    emb_dim : int
        Number of GloVe components, i.e. the width of every pooled matrix.
    learning_rate : float
        Learning rate passed to ``Glove``.
    epochs : int
        Number of epochs passed to ``Glove.fit``.
    version_no : int
        Version tag embedded in the model and embedding file names.

    Returns
    -------
    None
        All results are written to disk.
    """
    # =========
    # load data
    # =========
    # The word-to-index mapping returned by the loader is not needed here:
    # pooling indexes `glove.word_vectors` directly with the column indices of
    # the binary matrix.
    # NOTE(review): this assumes both loaders share the same column ordering -
    # confirm against `du`.
    _, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
    row_uids, (_, matrix_bin) = du.load_user_cnt(feat_name)
    num_users = matrix_bin.shape[0]
    print("Number of Users in Binary Matrix: {}".format(num_users))

    # ===============
    # train embedding
    # ===============
    with pu.profiler("fitting GloVe embedding"):
        glove = Glove(no_components=emb_dim,
                      learning_rate=learning_rate,
                      random_state=2018)  # fix random_state for reproducibility
        glove.fit(cooc.tocoo().astype(np.float64),
                  epochs=epochs,
                  verbose=False)
    col_names = ["{}_embedding_{}".format(feat_name, i) for i in range(emb_dim)]

    # ====================
    # save embedding model
    # ====================
    glove_folder = os.path.join("../../../model/glove",
                                "[featureName='{}']".format(feat_name))
    glove_path = os.path.join(glove_folder, "v{}.glove_model".format(version_no))
    os.makedirs(glove_folder, exist_ok=True)
    du.save_pickle(glove, glove_path)

    # ==========
    # do pooling
    # ==========
    # should use multiprocessing
    pool_names = ("avg", "max", "min")
    with pu.profiler("preparing avg/max/min pooling of embedding"):
        # Allocate float32 up front: the stored values are identical to
        # "compute in float64, then astype(float32)", but no float64 copies of
        # the three (num_users, emb_dim) matrices are ever held.
        pooled = {name: np.zeros((num_users, emb_dim), dtype=np.float32)
                  for name in pool_names}
        matrix_avg, matrix_max, matrix_min = (pooled[name] for name in pool_names)
        word_vectors = glove.word_vectors

        for i, row in tqdm.tqdm(enumerate(matrix_bin), total=num_users):
            if len(row.indices) == 0:
                # Nothing to pool for this row: keep the zero vector.
                continue
            row_pool = word_vectors[row.indices]
            matrix_avg[i] = row_pool.mean(axis=0)
            matrix_max[i] = row_pool.max(axis=0)
            matrix_min[i] = row_pool.min(axis=0)

        del matrix_avg, matrix_max, matrix_min

    # ===========
    # join matrix
    # ===========
    joined = {}
    for name in pool_names:
        with pu.profiler("joining {} pooling matrix".format(name)):
            matrix = pooled.pop(name)
            joined[name] = user_jointer.join(
                df_all, matrix, row_names=row_uids).astype(np.float32)
            # Drop the un-joined matrix as soon as it has been consumed.
            del matrix
            gc.collect()

    # =========
    # save data
    # =========
    emb_folder = os.path.join("../../../data/embedding",
                              "[featureName='{}']".format(feat_name))
    os.makedirs(emb_folder, exist_ok=True)

    # The first `train_size` joined rows are the train rows, the rest the test rows.
    splits = (("train", slice(None, train_size)),
              ("test1", slice(train_size, None)))

    with pu.profiler("saving data"):
        for name in pool_names:
            X = joined.pop(name)
            for split_name, rows in splits:
                emb_file = "{}.{}_v{}.pkl".format(split_name, name, version_no)
                emb_path = os.path.join(emb_folder, emb_file)
                du.save_pickle((col_names, X[rows]), emb_path)
            del X
        gc.collect()

In [3]:
# Value of `config.USER_MULTI_FEAT_NAMES`; presumably the names of the
# multi-valued user features embedded by the calls below - TODO confirm.
# No other cell visible in this notebook reads this variable.
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES

In [4]:
# GloVe embedding for the "marriageStatus" feature: 13 dimensions,
# learning rate 0.001, 1000 epochs; outputs are tagged v1.
run_embedding(
    "marriageStatus",
    emb_dim=13,
    learning_rate=0.001,
    epochs=1000,
    version_no=1,
)

  0%|          | 1451/9686953 [00:00<11:07, 14501.92it/s]

Number of Users in Binary Matrix: 9686953
[04:59:43] Finish fitting GloVe embedding. △M: +124.0KB. △T: 0.0 seconds.


100%|██████████| 9686953/9686953 [10:40<00:00, 15117.92it/s]


[05:10:25] Finish preparing avg/max/min pooling of embedding. △M: +1.41GB. △T: 10.7 minutes.
[05:10:47] Finish joining avg pooling matrix. △M: +86.66MB. △T: 22.2 seconds.
[05:11:10] Finish joining max pooling matrix. △M: +92.43MB. △T: 22.3 seconds.
[05:11:32] Finish joining min pooling matrix. △M: +68.33MB. △T: 22.3 seconds.
[05:11:33] Finish saving data. △M: -1.61GB. △T: 1.2 seconds.


In [5]:
# GloVe embedding for the "interest1" feature: 20 dimensions,
# learning rate 0.0002, 5000 epochs; outputs are tagged v1.
run_embedding(
    "interest1",
    emb_dim=20,
    learning_rate=0.0002,
    epochs=5000,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1147/9686953 [00:00<14:04, 11464.68it/s]

[05:11:55] Finish fitting GloVe embedding. △M: +8.0KB. △T: 21.2 seconds.


100%|██████████| 9686953/9686953 [13:15<00:00, 12172.96it/s]


[05:25:14] Finish preparing avg/max/min pooling of embedding. △M: +2.17GB. △T: 13.3 minutes.
[05:25:47] Finish joining avg pooling matrix. △M: +136.26MB. △T: 33.3 seconds.
[05:26:19] Finish joining max pooling matrix. △M: +105.12MB. △T: 32.0 seconds.
[05:26:52] Finish joining min pooling matrix. △M: +105.12MB. △T: 33.0 seconds.
[05:26:55] Finish saving data. △M: -2.47GB. △T: 2.3 seconds.


In [6]:
# GloVe embedding for the "interest2" feature: 14 dimensions,
# learning rate 0.0002, 5000 epochs; outputs are tagged v1.
run_embedding(
    "interest2",
    emb_dim=14,
    learning_rate=0.0002,
    epochs=5000,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1115/9686953 [00:00<14:29, 11144.09it/s]

[05:27:04] Finish fitting GloVe embedding. △M: +4.0KB. △T: 9.2 seconds.


100%|██████████| 9686953/9686953 [14:31<00:00, 11110.10it/s]


[05:41:38] Finish preparing avg/max/min pooling of embedding. △M: +1.52GB. △T: 14.6 minutes.
[05:42:07] Finish joining avg pooling matrix. △M: +115.46MB. △T: 29.2 seconds.
[05:42:37] Finish joining max pooling matrix. △M: +73.59MB. △T: 29.8 seconds.
[05:43:06] Finish joining min pooling matrix. △M: +73.59MB. △T: 28.9 seconds.
[05:43:08] Finish saving data. △M: -1.73GB. △T: 1.6 seconds.


In [7]:
# GloVe embedding for the "interest3" feature: 10 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "interest3",
    emb_dim=10,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

  0%|          | 0/9686953 [00:00<?, ?it/s]

Number of Users in Binary Matrix: 9686953
[05:43:08] Finish fitting GloVe embedding. △M: +0B. △T: 0.0 seconds.


100%|██████████| 9686953/9686953 [13:57<00:00, 11562.19it/s]


[05:57:07] Finish preparing avg/max/min pooling of embedding. △M: +1.08GB. △T: 14.0 minutes.
[05:57:36] Finish joining avg pooling matrix. △M: +95.13MB. △T: 28.9 seconds.
[05:58:05] Finish joining max pooling matrix. △M: +52.56MB. △T: 28.9 seconds.
[05:58:37] Finish joining min pooling matrix. △M: +52.56MB. △T: 31.5 seconds.
[05:58:38] Finish saving data. △M: -1.24GB. △T: 1.4 seconds.


In [8]:
# GloVe embedding for the "interest4" feature: 10 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "interest4",
    emb_dim=10,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953
[05:58:38] Finish fitting GloVe embedding. △M: +0B. △T: 0.0 seconds.


100%|██████████| 9686953/9686953 [12:54<00:00, 12514.62it/s]


[06:11:34] Finish preparing avg/max/min pooling of embedding. △M: +1.08GB. △T: 12.9 minutes.
[06:12:00] Finish joining avg pooling matrix. △M: +98.94MB. △T: 26.4 seconds.
[06:12:26] Finish joining max pooling matrix. △M: +52.56MB. △T: 25.6 seconds.
[06:12:51] Finish joining min pooling matrix. △M: +52.56MB. △T: 25.6 seconds.
[06:12:52] Finish saving data. △M: -1.24GB. △T: 0.9 seconds.


In [9]:
# GloVe embedding for the "interest5" feature: 10 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "interest5",
    emb_dim=10,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1240/9686953 [00:00<13:01, 12395.46it/s]

[06:12:54] Finish fitting GloVe embedding. △M: +0B. △T: 1.2 seconds.


100%|██████████| 9686953/9686953 [12:29<00:00, 12929.01it/s]


[06:25:25] Finish preparing avg/max/min pooling of embedding. △M: +1.08GB. △T: 12.5 minutes.
[06:25:47] Finish joining avg pooling matrix. △M: +85.01MB. △T: 22.5 seconds.
[06:26:07] Finish joining max pooling matrix. △M: +52.56MB. △T: 19.8 seconds.
[06:26:27] Finish joining min pooling matrix. △M: +52.56MB. △T: 19.9 seconds.
[06:26:28] Finish saving data. △M: -1.24GB. △T: 0.8 seconds.


In [10]:
# GloVe embedding for the "ct" feature: 5 dimensions,
# learning rate 0.001, 1000 epochs; outputs are tagged v1.
run_embedding(
    "ct",
    emb_dim=5,
    learning_rate=0.001,
    epochs=1000,
    version_no=1,
)

  0%|          | 976/9686953 [00:00<16:32, 9757.26it/s]

Number of Users in Binary Matrix: 9686953
[06:26:28] Finish fitting GloVe embedding. △M: +0B. △T: 0.1 seconds.


100%|██████████| 9686953/9686953 [10:30<00:00, 15353.99it/s]


[06:37:00] Finish preparing avg/max/min pooling of embedding. △M: +554.3MB. △T: 10.5 minutes.
[06:37:19] Finish joining avg pooling matrix. △M: +68.53MB. △T: 19.5 seconds.
[06:37:39] Finish joining max pooling matrix. △M: +26.28MB. △T: 19.6 seconds.
[06:37:59] Finish joining min pooling matrix. △M: +26.28MB. △T: 19.6 seconds.
[06:37:59] Finish saving data. △M: -633.14MB. △T: 0.4 seconds.


In [11]:
# GloVe embedding for the "os" feature: 2 dimensions,
# learning rate 0.001, 1000 epochs; outputs are tagged v1.
run_embedding(
    "os",
    emb_dim=2,
    learning_rate=0.001,
    epochs=1000,
    version_no=1,
)

  0%|          | 1558/9686953 [00:00<10:21, 15571.85it/s]

Number of Users in Binary Matrix: 9686953
[06:37:59] Finish fitting GloVe embedding. △M: +0B. △T: 0.0 seconds.


100%|██████████| 9686953/9686953 [10:09<00:00, 15899.06it/s]


[06:48:09] Finish preparing avg/max/min pooling of embedding. △M: +221.72MB. △T: 10.2 minutes.
[06:48:28] Finish joining avg pooling matrix. △M: +57.64MB. △T: 19.3 seconds.
[06:48:47] Finish joining max pooling matrix. △M: +10.51MB. △T: 19.3 seconds.
[06:49:07] Finish joining min pooling matrix. △M: +10.51MB. △T: 19.2 seconds.
[06:49:07] Finish saving data. △M: -253.25MB. △T: 0.2 seconds.


In [12]:
# GloVe embedding for the "kw1" feature: 50 dimensions,
# learning rate 0.01, 100 epochs; outputs are tagged v1.
run_embedding(
    "kw1",
    emb_dim=50,
    learning_rate=0.01,
    epochs=100,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 0/9686953 [00:00<?, ?it/s]

[08:11:58] Finish fitting GloVe embedding. △M: +198.38MB. △T: 1.4 hours.


100%|██████████| 9686953/9686953 [13:49<00:00, 11680.93it/s]


[08:25:54] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 13.9 minutes.
[08:26:25] Finish joining avg pooling matrix. △M: +287.73MB. △T: 31.2 seconds.
[08:26:58] Finish joining max pooling matrix. △M: +262.8MB. △T: 32.2 seconds.
[08:27:30] Finish joining min pooling matrix. △M: +262.8MB. △T: 32.3 seconds.
[08:27:44] Finish saving data. △M: -6.18GB. △T: 13.9 seconds.


In [13]:
# GloVe embedding for the "kw2" feature: 50 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "kw2",
    emb_dim=50,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1112/9686953 [00:00<14:31, 11110.50it/s]

[09:32:42] Finish fitting GloVe embedding. △M: +20.45MB. △T: 1.1 hours.


100%|██████████| 9686953/9686953 [14:09<00:00, 11405.59it/s]


[09:46:59] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 14.3 minutes.
[09:47:30] Finish joining avg pooling matrix. △M: +291.94MB. △T: 31.4 seconds.
[09:48:01] Finish joining max pooling matrix. △M: +262.8MB. △T: 31.2 seconds.
[09:48:32] Finish joining min pooling matrix. △M: +262.8MB. △T: 31.0 seconds.
[09:48:55] Finish saving data. △M: -6.18GB. △T: 22.7 seconds.


In [4]:
# GloVe embedding for the "kw3" feature: 50 dimensions,
# learning rate 0.001, 5000 epochs; outputs are tagged v1.
run_embedding(
    "kw3",
    emb_dim=50,
    learning_rate=0.001,
    epochs=5000,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1279/9686953 [00:00<12:37, 12784.13it/s]

[11:39:00] Finish fitting GloVe embedding. △M: +28.27MB. △T: 35.4 minutes.


100%|██████████| 9686953/9686953 [12:30<00:00, 12908.68it/s]


[11:51:38] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 12.6 minutes.
[11:52:09] Finish joining avg pooling matrix. △M: +271.76MB. △T: 31.2 seconds.
[11:52:38] Finish joining max pooling matrix. △M: +286.8MB. △T: 28.6 seconds.
[11:53:07] Finish joining min pooling matrix. △M: +263.05MB. △T: 28.9 seconds.
[11:53:14] Finish saving data. △M: -6.18GB. △T: 7.8 seconds.


In [5]:
# GloVe embedding for the "topic1" feature: 50 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "topic1",
    emb_dim=50,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1202/9686953 [00:00<13:26, 12010.19it/s]

[13:36:16] Finish fitting GloVe embedding. △M: +44.65MB. △T: 1.7 hours.


100%|██████████| 9686953/9686953 [13:44<00:00, 11750.13it/s]


[13:50:08] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 13.9 minutes.
[13:50:38] Finish joining avg pooling matrix. △M: +273.06MB. △T: 29.7 seconds.
[13:51:08] Finish joining max pooling matrix. △M: +262.8MB. △T: 29.8 seconds.
[13:51:38] Finish joining min pooling matrix. △M: +262.8MB. △T: 30.5 seconds.
[13:51:49] Finish saving data. △M: -6.18GB. △T: 10.7 seconds.


In [6]:
# GloVe embedding for the "topic2" feature: 50 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "topic2",
    emb_dim=50,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1370/9686953 [00:00<11:47, 13697.27it/s]

[15:00:01] Finish fitting GloVe embedding. △M: +22.29MB. △T: 1.1 hours.


100%|██████████| 9686953/9686953 [13:18<00:00, 12131.24it/s]


[15:13:27] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 13.4 minutes.
[15:13:58] Finish joining avg pooling matrix. △M: +291.47MB. △T: 30.2 seconds.
[15:14:28] Finish joining max pooling matrix. △M: +262.8MB. △T: 29.7 seconds.
[15:14:57] Finish joining min pooling matrix. △M: +263.05MB. △T: 29.9 seconds.
[15:15:08] Finish saving data. △M: -6.18GB. △T: 10.8 seconds.


In [7]:
# GloVe embedding for the "topic3" feature: 50 dimensions,
# learning rate 0.0005, 6000 epochs; outputs are tagged v1.
run_embedding(
    "topic3",
    emb_dim=50,
    learning_rate=0.0005,
    epochs=6000,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1445/9686953 [00:00<11:10, 14441.72it/s]

[16:26:19] Finish fitting GloVe embedding. △M: +38.76MB. △T: 1.2 hours.


100%|██████████| 9686953/9686953 [10:34<00:00, 15274.58it/s]


[16:36:59] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 10.7 minutes.
[16:37:23] Finish joining avg pooling matrix. △M: +276.07MB. △T: 23.2 seconds.
[16:37:47] Finish joining max pooling matrix. △M: +262.8MB. △T: 24.0 seconds.
[16:38:10] Finish joining min pooling matrix. △M: +262.8MB. △T: 23.5 seconds.
[16:38:15] Finish saving data. △M: -6.18GB. △T: 4.8 seconds.


In [8]:
# GloVe embedding for the "appIdInstall" feature: 30 dimensions,
# learning rate 0.1, a single epoch; outputs are tagged v1.
run_embedding(
    "appIdInstall",
    emb_dim=30,
    learning_rate=0.1,
    epochs=1,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1192/9686953 [00:00<13:33, 11912.80it/s]

[17:09:32] Finish fitting GloVe embedding. △M: +18.89MB. △T: 31.2 minutes.


100%|██████████| 9686953/9686953 [16:46<00:00, 9623.61it/s]


[17:26:22] Finish preparing avg/max/min pooling of embedding. △M: +3.25GB. △T: 16.8 minutes.
[17:26:44] Finish joining avg pooling matrix. △M: +188.05MB. △T: 21.8 seconds.
[17:27:06] Finish joining max pooling matrix. △M: +157.68MB. △T: 21.8 seconds.
[17:27:27] Finish joining min pooling matrix. △M: +157.68MB. △T: 21.8 seconds.
[17:27:30] Finish saving data. △M: -3.71GB. △T: 2.5 seconds.


In [9]:
# GloVe embedding for the "appIdAction" feature: 50 dimensions,
# learning rate 0.01, 300 epochs; outputs are tagged v1.
run_embedding(
    "appIdAction",
    emb_dim=50,
    learning_rate=0.01,
    epochs=300,
    version_no=1,
)

Number of Users in Binary Matrix: 9686953


  0%|          | 1432/9686953 [00:00<11:16, 14310.97it/s]

[17:38:42] Finish fitting GloVe embedding. △M: +55.2MB. △T: 11.2 minutes.


100%|██████████| 9686953/9686953 [10:33<00:00, 15299.61it/s]


[17:49:22] Finish preparing avg/max/min pooling of embedding. △M: +5.41GB. △T: 10.7 minutes.
[17:49:45] Finish joining avg pooling matrix. △M: +262.8MB. △T: 23.6 seconds.
[17:50:09] Finish joining max pooling matrix. △M: +262.8MB. △T: 23.8 seconds.
[17:50:33] Finish joining min pooling matrix. △M: +262.8MB. △T: 24.0 seconds.
[17:50:38] Finish saving data. △M: -6.18GB. △T: 4.9 seconds.
