In [1]:
from glove import Glove
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import tqdm
import gc
import os
import sys
sys.path.append("../../../code/utils/")
sys.path.append("../../../code/analysis/")
sys.path.append('../../../code')
import data_utils as du
import perf_utils as pu
import config
from visualize_feature_cooccurrence import CooccurrenceVisualizer

In [2]:
# Names of the user features analysed in this notebook, read from the project config
# (presumably the multi-valued user features, judging by the constant's name).
user_multi_feat_names = config.USER_MULTI_FEAT_NAMES

In [3]:
# Summary statistics used to pick an embedding size for each user feature.
# One line is printed per feature: min / max / mean of the per-row sums of `uvec`,
# plus the number of columns of `uvec` (printed as "nunique").
for feat_name in user_multi_feat_names:
    # load_user_cnt returns (uid, (ufeat_index, uvec)); presumably `uvec` is a
    # user-by-value count matrix and `ufeat_index` its column vocabulary -- TODO confirm.
    uid, (ufeat_index, uvec) = du.load_user_cnt(feat_name)
    # Row sums; presumably the number of values each user holds for this feature.
    uvec_sum = uvec.sum(axis=1).flatten()
    # When the vocabulary contains a "[nan]" entry, the minimum is printed as 0 instead
    # of the computed value (presumably "[nan]" stands for "no value" -- verify).
    has_nan = "[nan]" in ufeat_index
    print("{:<20}: min:{:>2} max:{:>3} mean: {:>4} nunique:{:>6}"
          .format(feat_name, uvec_sum.min() if not has_nan else 0, uvec_sum.max(), 
                  round(uvec_sum.mean(), 1), uvec.shape[1]))
    # Drop the per-feature data and force a collection before loading the next feature.
    del uvec
    del uvec_sum
    del uid
    del ufeat_index
    gc.collect()

marriageStatus      : min: 1 max:  3 mean:  1.2 nunique:    13
interest1           : min: 0 max: 38 mean: 13.2 nunique:   123
interest2           : min: 0 max: 32 mean:  4.0 nunique:    81
interest3           : min: 0 max: 10 mean:  1.2 nunique:    11
interest4           : min: 0 max: 10 mean:  1.1 nunique:    11
interest5           : min: 0 max: 86 mean: 15.1 nunique:   137
kw1                 : min: 0 max:  5 mean:  4.4 nunique:259909
kw2                 : min: 0 max:  5 mean:  4.8 nunique: 49197
kw3                 : min: 0 max:  5 mean:  1.2 nunique: 11922
topic1              : min: 0 max:  5 mean:  4.7 nunique: 10001
topic2              : min: 0 max:  5 mean:  4.8 nunique:  9980
topic3              : min: 0 max:  5 mean:  1.2 nunique:  5873
appIdInstall        : min: 0 max:920 mean:  3.3 nunique: 64856
appIdAction         : min: 0 max:537 mean:  1.1 nunique:  6215
ct                  : min: 1 max:  4 mean:  1.9 nunique:     5
os                  : min: 1 max:  2 mean:  1.1 nunique

In [3]:
# Root folder under which every figure produced by this notebook is saved;
# figpath() nests one sub-folder per feature name inside it.
cooc_fig_folder = "../../../figure/cooccurrence/preliminary_contest_data/byUserFeatureName[Reconstructed]/"
# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(cooc_fig_folder, exist_ok=True)


def figpath(feat_name, emb_dim=None, create=True):
    """Return the path of the figure file for one feature.

    Figures are grouped per feature in ``cooc_fig_folder/[featureName='<feat_name>']/``.

    Parameters
    ----------
    feat_name : name of the feature; becomes part of the sub-folder name.
    emb_dim : embedding size the figure belongs to. ``None`` selects the
        ``original.jpg`` file name, any other value ``[emb_dim=<emb_dim>].jpg``.
    create : when true, the per-feature sub-folder is created if it is missing.

    Returns
    -------
    The joined path (the file itself is never created here).
    """
    if emb_dim is None:
        filename = "original.jpg"
    else:
        filename = "[emb_dim={}].jpg".format(emb_dim)

    feature_dir = os.path.join(cooc_fig_folder, "[featureName='{}']".format(feat_name))
    if create:
        os.makedirs(feature_dir, exist_ok=True)
    return os.path.join(feature_dir, filename)

In [4]:
# Keyword arguments unpacked into every CooccurrenceVisualizer plotting call in this
# notebook, either directly or through the **plot_kwargs of the try_embedding_dim* helpers.
# Their exact meaning is defined by CooccurrenceVisualizer, which is not shown here.
default_plot_kwargs = {
    "figsize": (12, 9), 
    "titlesize": 12, 
    "ticksize": 6, 
    "tickweight": 400
}

def get_embedding(cooc, word_to_index, emb_dim=5, learning_rate=0.05, epochs=1000, verbose=False, ret_dict=True):
    """Fit a GloVe model on a co-occurrence matrix and return the word vectors.

    Parameters
    ----------
    cooc : sparse co-occurrence matrix; it is converted to COO format with
        float64 values before being handed to ``Glove.fit``.
    word_to_index : mapping from word to its row index in the fitted vectors
        (only read when `ret_dict` is true).
    emb_dim : number of embedding components (``no_components`` of the model).
    learning_rate, epochs, verbose : forwarded to the model / its ``fit`` call.
    ret_dict : selects the return shape, see below.

    Returns
    -------
    ``{word: vector}`` when `ret_dict` is true, otherwise the model's
    ``word_vectors`` array as-is.
    """
    # random_state is pinned so that repeated runs start from the same initialisation.
    model = Glove(no_components=emb_dim, learning_rate=learning_rate, random_state=2018)
    training_matrix = cooc.tocoo().astype(np.float64)
    model.fit(training_matrix, epochs=epochs, verbose=verbose)

    if not ret_dict:
        return model.word_vectors
    return dict((word, model.word_vectors[index]) for word, index in word_to_index.items())
    

def try_embedding_dim(cooc, word_to_index, feat_name, lift_min=True,
                      emb_dim=5, learning_rate=0.001, epochs=10000,
                      show=True, **plot_kwargs):
    """Train GloVe embeddings of one size and plot the co-occurrence they reconstruct.

    The plotted matrix is the product of the embedding matrix with its own
    transpose, so it can be compared visually with the original co-occurrence plot.

    Parameters
    ----------
    cooc, word_to_index : co-occurrence matrix and vocabulary, passed to get_embedding.
    feat_name : feature name; used in the plot title and in the figure path.
    lift_min : when true, the matrix minimum is subtracted so all entries are >= 0.
    emb_dim, learning_rate, epochs : training settings passed to get_embedding.
    show : forwarded to CooccurrenceVisualizer.plot_cooc.
    **plot_kwargs : extra keyword arguments forwarded to plot_cooc.

    The figure is saved to ``figpath(feat_name, emb_dim=emb_dim)``; nothing is returned.
    """
    profile_label = "training GloVe embeddings[Dim={}]".format(emb_dim)
    with pu.profiler(profile_label):
        vectors = get_embedding(cooc, word_to_index, emb_dim=emb_dim,
                                learning_rate=learning_rate, epochs=epochs, ret_dict=False)

    reconstructed = np.dot(vectors, vectors.T)
    if lift_min:
        # Shift so that the smallest entry becomes 0.
        reconstructed -= reconstructed.min()
    reconstructed = sparse.csr_matrix(reconstructed)

    title = "{}_embedding[{}]".format(feat_name, emb_dim)
    out_path = figpath(feat_name, emb_dim=emb_dim)
    CooccurrenceVisualizer.plot_cooc(reconstructed, word_to_index, title,
                                     savepath=out_path, show=show, **plot_kwargs)

In [5]:
def try_embedding_dim_downsample(cooc, word_to_index, feat_name, window=100, lift_min=True,
                                 emb_dim=5, learning_rate=0.001, epochs=10000,
                                 show=True, **plot_kwargs):
    """Train GloVe embeddings and plot a down-sampled view of the co-occurrence they reconstruct.

    The embeddings are trained on the full `cooc`; only the plot is down-sampled.
    The vocabulary is ordered by row index, every `window`-th entry is kept, and the
    plotted matrix is the product of the kept embedding rows with their own transpose.

    Parameters
    ----------
    cooc, word_to_index : co-occurrence matrix and vocabulary, passed to get_embedding.
    feat_name : feature name; used in the plot title and in the figure path.
    window : sampling stride over the index-ordered vocabulary (1 keeps everything).
    lift_min : when true, the matrix minimum is subtracted so all entries are >= 0.
    emb_dim, learning_rate, epochs : training settings passed to get_embedding.
    show : forwarded to CooccurrenceVisualizer.plot_cooc. This used to be ignored
        (``show=False`` was hard-coded); callers that relied on that must now pass
        ``show=False`` explicitly, as every call in this notebook already does.
    **plot_kwargs : extra keyword arguments forwarded to plot_cooc.

    The figure is saved to ``figpath(feat_name, emb_dim=emb_dim)``; nothing is returned.
    """
    with pu.profiler("training GloVe embeddings[Dim={}]".format(emb_dim)):
        embs = get_embedding(cooc, word_to_index, emb_dim=emb_dim,
                             learning_rate=learning_rate, epochs=epochs, ret_dict=False)

    # Every `window`-th (word, row index) pair, in row order.
    kept = sorted(word_to_index.items(), key=lambda item: item[1])[::window]
    # Labels are re-indexed 0..k-1 so they address the rows of the sampled matrix.
    word_to_index_sample = {word: position for position, (word, _) in enumerate(kept)}
    # Pick each kept word's own embedding row. Slicing `embs[::window]` only lines up
    # with the labels when the vocabulary indices are exactly 0..n-1 without gaps.
    kept_rows = np.asarray([row for _, row in kept], dtype=np.intp)
    embs_sample = embs[kept_rows]

    cooc_glove = np.dot(embs_sample, embs_sample.T)
    if lift_min:
        cooc_glove = cooc_glove - cooc_glove.min()
    cooc_glove = sparse.csr_matrix(cooc_glove)
    CooccurrenceVisualizer.plot_cooc(cooc_glove, word_to_index_sample,
                                     feat_name="{}_embedding[{}]".format(feat_name, emb_dim),
                                     show=show, savepath=figpath(feat_name, emb_dim=emb_dim),
                                     **plot_kwargs)

In [20]:
# Baseline for 'marriageStatus': load its co-occurrence data and save the plot under
# figpath(feat_name), i.e. ".../[featureName='marriageStatus']/original.jpg".
# feat_name, word_to_index and cooc stay in the kernel namespace and are read by the
# embedding sweep cells that follow.
feat_name = 'marriageStatus'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(cooc, word_to_index, feat_name, 
                                 show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [31]:
# emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# for emb_dim in emb_dims:
#     try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
#                       learning_rate=0.01, epochs=300,
#                       show=False, **default_plot_kwargs)

[17:08:05] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.0 seconds.
[17:08:05] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.0 seconds.
[17:08:06] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.0 seconds.
[17:08:06] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 0.0 seconds.
[17:08:07] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 0.0 seconds.
[17:08:07] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 0.0 seconds.
[17:08:07] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 0.0 seconds.
[17:08:08] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 0.1 seconds.
[17:08:09] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 0.0 seconds.
[17:08:09] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 0.0 seconds.


In [14]:
# emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# for emb_dim in emb_dims:
#     try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
#                       learning_rate=0.0001, epochs=100000,
#                       show=False, **default_plot_kwargs)

[16:11:39] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 5.8 seconds.
[16:11:49] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 9.4 seconds.
[16:12:00] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 10.6 seconds.
[16:12:07] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 6.8 seconds.
[16:12:17] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 10.0 seconds.
[16:12:28] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 10.0 seconds.
[16:12:35] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 6.4 seconds.
[16:12:47] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 11.7 seconds.
[16:12:56] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 7.8 seconds.
[16:13:08] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 12.3 seconds.


In [22]:
# Sweep embedding sizes 1..13 on whichever feat_name / cooc / word_to_index the kernel
# currently holds (run the baseline cell for the feature first). Each run saves
# "[emb_dim=N].jpg" next to the baseline figure (see try_embedding_dim and figpath).
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
for emb_dim in emb_dims:
    try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
                      learning_rate=0.001, epochs=1000,
                      show=False, **default_plot_kwargs)

[09:44:30] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.2 seconds.
[09:44:31] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.5 seconds.
[09:44:32] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.3 seconds.
[09:44:32] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 0.2 seconds.
[09:44:33] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 0.1 seconds.
[09:44:34] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 0.1 seconds.
[09:44:34] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 0.2 seconds.
[09:44:35] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 0.2 seconds.
[09:44:36] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 0.1 seconds.
[09:44:36] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 0.1 seconds.
[09:44:37] Finish training GloVe embeddings[Dim=11]. △M: +0B. △T: 0.2 seconds.
[09:44:38] Finish training GloVe embeddings[Dim=12]. △M: +0B. △T: 0.3 seconds.
[09:44:39] Finish training GloVe embeddings[Dim=13]. △M: +0B.

In [15]:
# Baseline for 'interest1' (the summary output above lists 123 distinct values for it):
# load the co-occurrence data and save the plot as "original.jpg" in the feature's folder.
# Rebinds the kernel-global feat_name / word_to_index / cooc used by the next sweep cell.
feat_name = 'interest1'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(cooc, word_to_index, feat_name,
                                 show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [16]:
# Sweep embedding sizes 1..20 for the feature currently held in feat_name / cooc /
# word_to_index; one "[emb_dim=N].jpg" figure is saved per size.
# The output recorded under this cell shows roughly 27-40 seconds of training per size.
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
for emb_dim in emb_dims:
    try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
                      learning_rate=0.0002, epochs=5000,
                      show=False, **default_plot_kwargs)

[16:17:04] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 28.8 seconds.
[16:17:34] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 27.2 seconds.
[16:18:09] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 30.4 seconds.
[16:18:45] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 32.5 seconds.
[16:19:20] Finish training GloVe embeddings[Dim=5]. △M: +4.0KB. △T: 29.8 seconds.
[16:19:55] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 31.8 seconds.
[16:20:34] Finish training GloVe embeddings[Dim=7]. △M: +4.0KB. △T: 34.3 seconds.
[16:21:17] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 39.0 seconds.
[16:21:55] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 32.1 seconds.
[16:22:32] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 34.3 seconds.
[16:23:16] Finish training GloVe embeddings[Dim=11]. △M: +0B. △T: 39.1 seconds.
[16:24:00] Finish training GloVe embeddings[Dim=12]. △M: +0B. △T: 40.1 seconds.
[16:24:45] Finish training GloVe embeddings

In [23]:
# Baseline for 'interest2' (81 distinct values according to the summary output above):
# load the co-occurrence data and save the plot as "original.jpg" in the feature's folder.
# Rebinds the kernel-global feat_name / word_to_index / cooc used by the next sweep cell.
feat_name = 'interest2'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(cooc, word_to_index, feat_name,
                                 show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [24]:
# Sweep embedding sizes 1..10 in steps of 1, then 12..20 in steps of 2, for the feature
# currently held in feat_name / cooc / word_to_index. One figure is saved per size.
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20]
for emb_dim in emb_dims:
    try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
                      learning_rate=0.0002, epochs=5000,
                      show=False, **default_plot_kwargs)

[11:19:27] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 4.1 seconds.
[11:19:32] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 4.4 seconds.
[11:19:39] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 5.1 seconds.
[11:19:45] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 4.9 seconds.
[11:19:51] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 5.5 seconds.
[11:19:58] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 5.6 seconds.
[11:20:06] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 6.1 seconds.
[11:20:13] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 6.1 seconds.
[11:20:20] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 6.3 seconds.
[11:20:28] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 6.5 seconds.
[11:20:36] Finish training GloVe embeddings[Dim=12]. △M: +0B. △T: 7.0 seconds.
[11:20:45] Finish training GloVe embeddings[Dim=14]. △M: +0B. △T: 7.6 seconds.
[11:20:55] Finish training GloVe embeddings[Dim=16]. △M: +0B.

In [19]:
# Baseline for 'interest3' (11 distinct values according to the summary output above):
# load the co-occurrence data and save the plot as "original.jpg" in the feature's folder.
# Rebinds the kernel-global feat_name / word_to_index / cooc used by the next sweep cell.
feat_name = 'interest3'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(cooc, word_to_index, feat_name,
                                 show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [24]:
# Sweep embedding sizes 1..10 for the feature currently held in feat_name / cooc /
# word_to_index. Training is near-instant here (about 0.1 s or less per size in the
# output recorded under this cell), so a larger learning rate with few epochs is used.
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for emb_dim in emb_dims:
    try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
                      learning_rate=0.01, epochs=300,
                      show=False, **default_plot_kwargs)

[16:46:53] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.0 seconds.
[16:46:53] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.0 seconds.
[16:46:54] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.0 seconds.
[16:46:54] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 0.0 seconds.
[16:46:55] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 0.1 seconds.
[16:46:55] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 0.0 seconds.
[16:46:56] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 0.0 seconds.
[16:46:56] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 0.1 seconds.
[16:46:57] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 0.1 seconds.
[16:46:57] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 0.0 seconds.


In [25]:
# Baseline for 'interest4' (11 distinct values according to the summary output above):
# load the co-occurrence data and save the plot as "original.jpg" in the feature's folder.
# Rebinds the kernel-global feat_name / word_to_index / cooc used by the next sweep cell.
feat_name = 'interest4'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(cooc, word_to_index, feat_name,
                                 show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [27]:
# Sweep embedding sizes 1..10 for the feature currently held in feat_name / cooc /
# word_to_index; one "[emb_dim=N].jpg" figure is saved per size.
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for emb_dim in emb_dims:
    try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
                      learning_rate=0.01, epochs=300,
                      show=False, **default_plot_kwargs)

[16:50:52] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.0 seconds.
[16:50:53] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.1 seconds.
[16:50:53] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.1 seconds.
[16:50:54] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 0.1 seconds.
[16:50:55] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 0.0 seconds.
[16:50:55] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 0.0 seconds.
[16:50:56] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 0.0 seconds.
[16:50:56] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 0.1 seconds.
[16:50:57] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 0.1 seconds.
[16:50:58] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 0.0 seconds.


In [28]:
# Baseline for 'interest5' (137 distinct values according to the summary output above):
# load the co-occurrence data and save the plot as "original.jpg" in the feature's folder.
# Rebinds the kernel-global feat_name / word_to_index / cooc used by the next sweep cell.
feat_name = 'interest5'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(cooc, word_to_index, feat_name,
                                 show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [29]:
# Sweep embedding sizes 1..20 for the feature currently held in feat_name / cooc /
# word_to_index; one "[emb_dim=N].jpg" figure is saved per size.
# The output recorded under this cell shows a few seconds of training per size.
emb_dims = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
for emb_dim in emb_dims:
    try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
                      learning_rate=0.01, epochs=300,
                      show=False, **default_plot_kwargs)

[16:57:22] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 1.5 seconds.
[16:57:27] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 1.4 seconds.
[16:57:35] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 2.3 seconds.
[16:57:42] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 1.7 seconds.
[16:57:47] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 1.7 seconds.
[16:57:55] Finish training GloVe embeddings[Dim=6]. △M: +0B. △T: 2.9 seconds.
[16:58:02] Finish training GloVe embeddings[Dim=7]. △M: +0B. △T: 1.9 seconds.
[16:58:07] Finish training GloVe embeddings[Dim=8]. △M: +0B. △T: 2.1 seconds.
[16:58:17] Finish training GloVe embeddings[Dim=9]. △M: +0B. △T: 3.0 seconds.
[16:58:23] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 2.0 seconds.
[16:58:29] Finish training GloVe embeddings[Dim=11]. △M: +0B. △T: 2.3 seconds.
[16:58:39] Finish training GloVe embeddings[Dim=12]. △M: +0B. △T: 3.4 seconds.
[16:58:47] Finish training GloVe embeddings[Dim=13]. △M: +0B.

In [25]:
# Baseline for 'kw1', plotted down-sampled because the vocabulary is large.
# `window` is the down-sampling stride: it is passed to plot_cooc_downsample here and
# reused by the sweep cell, where try_embedding_dim_downsample keeps every window-th
# vocabulary entry. With the 259909 distinct values listed in the summary output this
# leaves roughly 110 plotted entries (assuming the co-occurrence vocabulary has that size).
feat_name = 'kw1'
window = 2400
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(cooc, word_to_index, feat_name=feat_name, window=window, 
                                            show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [36]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.1, epochs=1,
#                                  show=False, **default_plot_kwargs)

[03:31:08] Finish training GloVe embeddings[Dim=5]. △M: +20.66MB. △T: 1.8 minutes.
[03:33:01] Finish training GloVe embeddings[Dim=10]. △M: +19.83MB. △T: 1.8 minutes.
[03:34:58] Finish training GloVe embeddings[Dim=15]. △M: +29.75MB. △T: 1.9 minutes.
[03:37:00] Finish training GloVe embeddings[Dim=20]. △M: +41.65MB. △T: 2.0 minutes.
[03:39:07] Finish training GloVe embeddings[Dim=25]. △M: +53.34MB. △T: 2.1 minutes.
[03:41:21] Finish training GloVe embeddings[Dim=30]. △M: +64.52MB. △T: 2.2 minutes.
[03:43:33] Finish training GloVe embeddings[Dim=35]. △M: +74.12MB. △T: 2.1 minutes.
[03:45:50] Finish training GloVe embeddings[Dim=40]. △M: +79.32MB. △T: 2.2 minutes.
[03:48:10] Finish training GloVe embeddings[Dim=45]. △M: +97.98MB. △T: 2.3 minutes.
[03:50:32] Finish training GloVe embeddings[Dim=50]. △M: +106.26MB. △T: 2.3 minutes.


In [37]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.1, epochs=5,
#                                  show=False, **default_plot_kwargs)

[03:57:32] Finish training GloVe embeddings[Dim=5]. △M: +14.54MB. △T: 3.5 minutes.
[04:01:26] Finish training GloVe embeddings[Dim=10]. △M: +12.57MB. △T: 3.9 minutes.
[04:05:37] Finish training GloVe embeddings[Dim=15]. △M: +7.78MB. △T: 4.1 minutes.
[04:10:04] Finish training GloVe embeddings[Dim=20]. △M: +39.66MB. △T: 4.4 minutes.
[04:14:44] Finish training GloVe embeddings[Dim=25]. △M: +49.57MB. △T: 4.6 minutes.
[04:19:48] Finish training GloVe embeddings[Dim=30]. △M: +59.49MB. △T: 5.0 minutes.
[04:25:06] Finish training GloVe embeddings[Dim=35]. △M: +69.41MB. △T: 5.3 minutes.
[04:30:42] Finish training GloVe embeddings[Dim=40]. △M: +84.96MB. △T: 5.5 minutes.
[04:36:36] Finish training GloVe embeddings[Dim=45]. △M: +92.46MB. △T: 5.8 minutes.
[04:41:48] Finish training GloVe embeddings[Dim=50]. △M: +105.61MB. △T: 5.1 minutes.


In [38]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.05, epochs=20,
#                                  show=False, **default_plot_kwargs)

[05:09:08] Finish training GloVe embeddings[Dim=5]. △M: +14.54MB. △T: 6.2 minutes.
[05:16:16] Finish training GloVe embeddings[Dim=10]. △M: +12.58MB. △T: 7.1 minutes.
[05:24:13] Finish training GloVe embeddings[Dim=15]. △M: +7.78MB. △T: 7.9 minutes.
[05:32:55] Finish training GloVe embeddings[Dim=20]. △M: +39.66MB. △T: 8.7 minutes.
[06:36:12] Finish training GloVe embeddings[Dim=25]. △M: +31.93MB. △T: 1.1 hours.
[06:46:24] Finish training GloVe embeddings[Dim=30]. △M: +59.71MB. △T: 10.2 minutes.
[06:57:26] Finish training GloVe embeddings[Dim=35]. △M: +69.41MB. △T: 11.0 minutes.
[07:10:48] Finish training GloVe embeddings[Dim=40]. △M: +77.5MB. △T: 13.3 minutes.
[07:23:51] Finish training GloVe embeddings[Dim=45]. △M: +87.64MB. △T: 13.0 minutes.
[07:43:08] Finish training GloVe embeddings[Dim=50]. △M: -82.66MB. △T: 19.2 minutes.


In [26]:
# Sweep embedding sizes 5..50 (step 5) for the feature currently held in feat_name /
# cooc / word_to_index, plotting every window-th vocabulary entry.
# Expensive: the output recorded under this cell shows between about 32 minutes and
# 1.4 hours of training per size.
emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for emb_dim in emb_dims:
    try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
                                 window=window, emb_dim=emb_dim, 
                                 learning_rate=0.01, epochs=100,
                                 show=False, **default_plot_kwargs)

[12:00:10] Finish training GloVe embeddings[Dim=5]. △M: +9.91MB. △T: 32.2 minutes.
[12:37:52] Finish training GloVe embeddings[Dim=10]. △M: +29.75MB. △T: 37.7 minutes.
[13:22:14] Finish training GloVe embeddings[Dim=15]. △M: +0B. △T: 44.3 minutes.
[14:09:57] Finish training GloVe embeddings[Dim=20]. △M: +0B. △T: 47.7 minutes.
[15:04:14] Finish training GloVe embeddings[Dim=25]. △M: +49.57MB. △T: 54.2 minutes.
[16:04:07] Finish training GloVe embeddings[Dim=30]. △M: +42.66MB. △T: 59.8 minutes.
[17:30:21] Finish training GloVe embeddings[Dim=35]. △M: +69.63MB. △T: 1.4 hours.
[18:54:29] Finish training GloVe embeddings[Dim=40]. △M: +79.32MB. △T: 1.4 hours.
[20:06:57] Finish training GloVe embeddings[Dim=45]. △M: +89.23MB. △T: 1.2 hours.
[21:22:59] Finish training GloVe embeddings[Dim=50]. △M: +99.15MB. △T: 1.3 hours.


In [40]:
# Baseline for 'kw2', plotted down-sampled. `window` is the stride shared with the
# sweep cell below; with the 49197 distinct values listed in the summary output it
# leaves roughly 100 plotted entries (assuming the co-occurrence vocabulary has that size).
feat_name = 'kw2'
window = 480
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(cooc, word_to_index, feat_name=feat_name, window=window, 
                                            show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [41]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.05, epochs=20,
#                                  show=False, **default_plot_kwargs)

[15:19:05] Finish training GloVe embeddings[Dim=5]. △M: +344.61MB. △T: 1.3 minutes.
[15:20:39] Finish training GloVe embeddings[Dim=10]. △M: +8.19MB. △T: 1.5 minutes.
[15:22:29] Finish training GloVe embeddings[Dim=15]. △M: +7.52MB. △T: 1.8 minutes.
[15:24:32] Finish training GloVe embeddings[Dim=20]. △M: +14.77MB. △T: 2.0 minutes.
[15:26:46] Finish training GloVe embeddings[Dim=25]. △M: +19.95MB. △T: 2.2 minutes.
[15:29:14] Finish training GloVe embeddings[Dim=30]. △M: +12.55MB. △T: 2.5 minutes.
[15:32:00] Finish training GloVe embeddings[Dim=35]. △M: -3.11MB. △T: 2.7 minutes.
[15:34:55] Finish training GloVe embeddings[Dim=40]. △M: +16.89MB. △T: 2.9 minutes.
[15:37:55] Finish training GloVe embeddings[Dim=45]. △M: +33.79MB. △T: 3.0 minutes.
[15:41:10] Finish training GloVe embeddings[Dim=50]. △M: +20.64MB. △T: 3.2 minutes.


In [42]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.01, epochs=100,
#                                  show=False, **default_plot_kwargs)

[16:02:55] Finish training GloVe embeddings[Dim=5]. △M: +1.19MB. △T: 5.8 minutes.
[16:09:41] Finish training GloVe embeddings[Dim=10]. △M: +3.75MB. △T: 6.7 minutes.
[16:17:12] Finish training GloVe embeddings[Dim=15]. △M: +5.84MB. △T: 7.5 minutes.
[16:25:51] Finish training GloVe embeddings[Dim=20]. △M: +584.0KB. △T: 8.6 minutes.
[16:35:22] Finish training GloVe embeddings[Dim=25]. △M: +0B. △T: 9.5 minutes.
[01:36:46] Finish training GloVe embeddings[Dim=30]. △M: -4.04MB. △T: 9.0 hours.
[01:49:12] Finish training GloVe embeddings[Dim=35]. △M: +228.0KB. △T: 12.4 minutes.
[02:02:28] Finish training GloVe embeddings[Dim=40]. △M: +0B. △T: 13.2 minutes.
[02:16:17] Finish training GloVe embeddings[Dim=45]. △M: +0B. △T: 13.8 minutes.
[03:15:02] Finish training GloVe embeddings[Dim=50]. △M: +496.0KB. △T: 58.7 minutes.


In [43]:
# Sweep embedding sizes 5..50 (step 5) for the feature currently held in feat_name /
# cooc / word_to_index, plotting every window-th vocabulary entry.
# Expensive: the output recorded under this cell shows between about 17 minutes and
# 1.8 hours of training per size.
emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for emb_dim in emb_dims:
    try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
                                 window=window, emb_dim=emb_dim, 
                                 learning_rate=0.01, epochs=300,
                                 show=False, **default_plot_kwargs)

[03:40:40] Finish training GloVe embeddings[Dim=5]. △M: -3.93MB. △T: 16.8 minutes.
[04:02:03] Finish training GloVe embeddings[Dim=10]. △M: +680.0KB. △T: 21.4 minutes.
[04:36:49] Finish training GloVe embeddings[Dim=15]. △M: -448.58MB. △T: 34.7 minutes.
[05:20:55] Finish training GloVe embeddings[Dim=20]. △M: +209.17MB. △T: 44.1 minutes.
[06:08:11] Finish training GloVe embeddings[Dim=25]. △M: +12.45MB. △T: 47.2 minutes.
[07:01:18] Finish training GloVe embeddings[Dim=30]. △M: +10.67MB. △T: 53.1 minutes.
[08:49:58] Finish training GloVe embeddings[Dim=35]. △M: +624.0KB. △T: 1.8 hours.
[09:51:01] Finish training GloVe embeddings[Dim=40]. △M: +1.91MB. △T: 1.0 hours.
[10:59:02] Finish training GloVe embeddings[Dim=45]. △M: +16.88MB. △T: 1.1 hours.
[12:10:06] Finish training GloVe embeddings[Dim=50]. △M: +2.09MB. △T: 1.2 hours.


In [44]:
# Baseline for 'kw3', plotted down-sampled. `window` is the stride shared with the
# sweep cell below; with the 11922 distinct values listed in the summary output it
# leaves roughly 120 plotted entries (assuming the co-occurrence vocabulary has that size).
feat_name = 'kw3'
window = 100
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(cooc, word_to_index, feat_name=feat_name, window=window, 
                                            show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [46]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.01, epochs=300,
#                                  show=False, **default_plot_kwargs)

[15:36:47] Finish training GloVe embeddings[Dim=5]. △M: -4.16MB. △T: 38.5 seconds.
[15:37:35] Finish training GloVe embeddings[Dim=10]. △M: +228.0KB. △T: 45.3 seconds.
[15:38:30] Finish training GloVe embeddings[Dim=15]. △M: +0B. △T: 53.3 seconds.
[15:39:34] Finish training GloVe embeddings[Dim=20]. △M: +0B. △T: 1.0 minutes.
[15:40:47] Finish training GloVe embeddings[Dim=25]. △M: +0B. △T: 1.2 minutes.
[15:42:09] Finish training GloVe embeddings[Dim=30]. △M: +0B. △T: 1.3 minutes.
[15:43:41] Finish training GloVe embeddings[Dim=35]. △M: +0B. △T: 1.5 minutes.
[15:45:23] Finish training GloVe embeddings[Dim=40]. △M: +0B. △T: 1.7 minutes.
[15:47:13] Finish training GloVe embeddings[Dim=45]. △M: +0B. △T: 1.8 minutes.
[15:49:11] Finish training GloVe embeddings[Dim=50]. △M: +0B. △T: 1.9 minutes.


In [47]:
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.002, epochs=1500,
#                                  show=False, **default_plot_kwargs)

[15:55:35] Finish training GloVe embeddings[Dim=5]. △M: +68.0KB. △T: 3.1 minutes.
[15:59:18] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 3.7 minutes.
[16:03:47] Finish training GloVe embeddings[Dim=15]. △M: +0B. △T: 4.4 minutes.
[16:09:06] Finish training GloVe embeddings[Dim=20]. △M: +0B. △T: 5.3 minutes.
[16:15:06] Finish training GloVe embeddings[Dim=25]. △M: +0B. △T: 6.0 minutes.
[16:21:36] Finish training GloVe embeddings[Dim=30]. △M: +0B. △T: 6.5 minutes.
[16:29:00] Finish training GloVe embeddings[Dim=35]. △M: +0B. △T: 7.4 minutes.
[16:37:13] Finish training GloVe embeddings[Dim=40]. △M: +0B. △T: 8.2 minutes.
[16:46:10] Finish training GloVe embeddings[Dim=45]. △M: +0B. △T: 8.9 minutes.
[16:55:39] Finish training GloVe embeddings[Dim=50]. △M: +0B. △T: 9.5 minutes.


In [48]:
# Sweep embedding sizes 5..50 (step 5) for the feature currently held in feat_name /
# cooc / word_to_index, plotting every window-th vocabulary entry.
# The output recorded under this cell shows about 10 to 32 minutes of training per size.
emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for emb_dim in emb_dims:
    try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
                                 window=window, emb_dim=emb_dim, 
                                 learning_rate=0.001, epochs=5000,
                                 show=False, **default_plot_kwargs)

[17:13:35] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 10.0 minutes.
[17:25:34] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 11.9 minutes.
[17:40:00] Finish training GloVe embeddings[Dim=15]. △M: +0B. △T: 14.4 minutes.
[17:56:56] Finish training GloVe embeddings[Dim=20]. △M: +0B. △T: 16.9 minutes.
[18:16:16] Finish training GloVe embeddings[Dim=25]. △M: +0B. △T: 19.3 minutes.
[18:37:49] Finish training GloVe embeddings[Dim=30]. △M: +0B. △T: 21.5 minutes.
[19:02:01] Finish training GloVe embeddings[Dim=35]. △M: -4.05MB. △T: 24.2 minutes.
[19:29:53] Finish training GloVe embeddings[Dim=40]. △M: +232.0KB. △T: 27.8 minutes.
[20:00:03] Finish training GloVe embeddings[Dim=45]. △M: +0B. △T: 30.1 minutes.
[20:32:14] Finish training GloVe embeddings[Dim=50]. △M: +0B. △T: 32.2 minutes.


In [11]:
# Baseline for 'topic1', plotted down-sampled. `window` is the stride shared with the
# sweep cell below; with the 10001 distinct values listed in the summary output it
# leaves roughly 100 plotted entries (assuming the co-occurrence vocabulary has that size).
feat_name = 'topic1'
window = 100
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(cooc, word_to_index, feat_name=feat_name, window=window, 
                                            show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [7]:
# Sweep embedding sizes 5..50 (step 5) for the feature currently held in feat_name /
# cooc / word_to_index, plotting every window-th vocabulary entry.
# Expensive: the output recorded under this cell shows between about 38 minutes and
# 1.2 hours of training per size.
emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for emb_dim in emb_dims:
    try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
                                 window=window, emb_dim=emb_dim, 
                                 learning_rate=0.01, epochs=300,
                                 show=False, **default_plot_kwargs)

[16:58:20] Finish training GloVe embeddings[Dim=5]. △M: +4.93MB. △T: 38.5 minutes.
[17:45:30] Finish training GloVe embeddings[Dim=10]. △M: -14.41MB. △T: 47.1 minutes.
[18:26:18] Finish training GloVe embeddings[Dim=15]. △M: -1.98MB. △T: 40.8 minutes.
[19:09:52] Finish training GloVe embeddings[Dim=20]. △M: -1.22MB. △T: 43.5 minutes.
[19:58:35] Finish training GloVe embeddings[Dim=25]. △M: -460.0KB. △T: 48.7 minutes.
[20:52:42] Finish training GloVe embeddings[Dim=30]. △M: -4.79MB. △T: 54.1 minutes.
[21:52:45] Finish training GloVe embeddings[Dim=35]. △M: -1.6MB. △T: 1.0 hours.
[22:58:37] Finish training GloVe embeddings[Dim=40]. △M: +1.83MB. △T: 1.1 hours.
[00:06:36] Finish training GloVe embeddings[Dim=45]. △M: -6.68MB. △T: 1.1 hours.
[01:19:19] Finish training GloVe embeddings[Dim=50]. △M: +96.0KB. △T: 1.2 hours.


In [8]:
# Baseline for 'topic2', plotted down-sampled. `window` is the stride shared with the
# sweep cell below; with the 9980 distinct values listed in the summary output it
# leaves roughly 125 plotted entries (assuming the co-occurrence vocabulary has that size).
feat_name = 'topic2'
window = 80
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(cooc, word_to_index, feat_name=feat_name, window=window, 
                                            show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [None]:
# Sweep embedding sizes 5..50 (step 5) for the feature currently held in feat_name /
# cooc / word_to_index, plotting every window-th vocabulary entry.
# NOTE(review): the recorded output stops after Dim=5 (about 20 minutes) and the cell has
# no execution count, so this sweep apparently had not finished when the notebook was saved.
emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
for emb_dim in emb_dims:
    try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
                                 window=window, emb_dim=emb_dim, 
                                 learning_rate=0.01, epochs=300,
                                 show=False, **default_plot_kwargs)

[04:06:30] Finish training GloVe embeddings[Dim=5]. △M: +15.48MB. △T: 19.8 minutes.


In [6]:
# Baseline for 'topic3', plotted down-sampled. `window` is the stride that the
# try_embedding_dim_downsample sweeps for this feature reuse; with the 5873 distinct
# values listed in the summary output it leaves roughly 150 plotted entries
# (assuming the co-occurrence vocabulary has that size).
feat_name = 'topic3'
window = 40
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(cooc, word_to_index, feat_name=feat_name, window=window, 
                                            show=False, savepath=figpath(feat_name), **default_plot_kwargs)

In [7]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.01, epochs=300.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.01, epochs=300,
#                                  show=False, **default_plot_kwargs)

[14:17:25] Finish training GloVe embeddings[Dim=5]. △M: +22.49MB. △T: 55.3 seconds.
[14:18:29] Finish training GloVe embeddings[Dim=10]. △M: +924.0KB. △T: 1.0 minutes.
[14:19:43] Finish training GloVe embeddings[Dim=15]. △M: -2.01MB. △T: 1.2 minutes.
[14:21:07] Finish training GloVe embeddings[Dim=20]. △M: -2.36MB. △T: 1.4 minutes.
[14:22:46] Finish training GloVe embeddings[Dim=25]. △M: -2.01MB. △T: 1.6 minutes.
[14:24:35] Finish training GloVe embeddings[Dim=30]. △M: -1.46MB. △T: 1.8 minutes.
[14:26:34] Finish training GloVe embeddings[Dim=35]. △M: -2.01MB. △T: 2.0 minutes.
[14:28:46] Finish training GloVe embeddings[Dim=40]. △M: -2.36MB. △T: 2.2 minutes.
[14:31:08] Finish training GloVe embeddings[Dim=45]. △M: +112.0KB. △T: 2.3 minutes.
[14:33:42] Finish training GloVe embeddings[Dim=50]. △M: -1.91MB. △T: 2.5 minutes.


In [8]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.002, epochs=1500.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.002, epochs=1500,
#                                  show=False, **default_plot_kwargs)

[15:06:10] Finish training GloVe embeddings[Dim=5]. △M: -6.39MB. △T: 4.2 minutes.
[15:11:19] Finish training GloVe embeddings[Dim=10]. △M: -18.31MB. △T: 5.1 minutes.
[15:17:23] Finish training GloVe embeddings[Dim=15]. △M: +228.0KB. △T: 6.0 minutes.
[15:24:21] Finish training GloVe embeddings[Dim=20]. △M: -4.0KB. △T: 6.9 minutes.
[15:32:23] Finish training GloVe embeddings[Dim=25]. △M: +0B. △T: 8.0 minutes.
[15:41:44] Finish training GloVe embeddings[Dim=30]. △M: -320.0KB. △T: 9.3 minutes.
[15:52:16] Finish training GloVe embeddings[Dim=35]. △M: -2.01MB. △T: 10.5 minutes.
[16:04:27] Finish training GloVe embeddings[Dim=40]. △M: -4.0KB. △T: 12.1 minutes.
[16:16:52] Finish training GloVe embeddings[Dim=45]. △M: -2.13MB. △T: 12.4 minutes.
[16:30:06] Finish training GloVe embeddings[Dim=50]. △M: +2.37MB. △T: 13.2 minutes.


In [9]:
# Embedding-size sweep (5, 10, ..., 50) for the currently loaded feature,
# using learning_rate=0.0005 and epochs=6000 for every size.
emb_dims = list(range(5, 51, 5))
for emb_dim in emb_dims:
    try_embedding_dim_downsample(
        cooc,
        word_to_index,
        feat_name=feat_name,
        window=window,
        emb_dim=emb_dim,
        learning_rate=0.0005,
        epochs=6000,
        show=False,
        **default_plot_kwargs
    )

[16:54:37] Finish training GloVe embeddings[Dim=5]. △M: -2.14MB. △T: 17.8 minutes.
[17:15:19] Finish training GloVe embeddings[Dim=10]. △M: -2.01MB. △T: 20.7 minutes.
[17:41:05] Finish training GloVe embeddings[Dim=15]. △M: -4.16MB. △T: 25.7 minutes.
[18:10:49] Finish training GloVe embeddings[Dim=20]. △M: -2.01MB. △T: 29.7 minutes.
[18:44:56] Finish training GloVe embeddings[Dim=25]. △M: -4.04MB. △T: 34.1 minutes.
[19:22:50] Finish training GloVe embeddings[Dim=30]. △M: -452.0KB. △T: 37.9 minutes.
[20:05:03] Finish training GloVe embeddings[Dim=35]. △M: -2.01MB. △T: 42.2 minutes.
[20:51:29] Finish training GloVe embeddings[Dim=40]. △M: -2.01MB. △T: 46.4 minutes.
[21:41:55] Finish training GloVe embeddings[Dim=45]. △M: +0B. △T: 50.4 minutes.
[22:35:34] Finish training GloVe embeddings[Dim=50]. △M: -1.91MB. △T: 53.6 minutes.


In [6]:
# Load the stored co-occurrence data for `appIdInstall` and call the down-sampled
# co-occurrence plot with window=600; the save path comes from figpath(feat_name).
feat_name, window = 'appIdInstall', 600
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(
    cooc,
    word_to_index,
    feat_name=feat_name,
    window=window,
    show=False,
    savepath=figpath(feat_name),
    **default_plot_kwargs
)

In [14]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.1, epochs=1.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.1, epochs=1,
#                                  show=False, **default_plot_kwargs)

[03:12:09] Finish training GloVe embeddings[Dim=5]. △M: -5.17GB. △T: 17.0 minutes.
[03:30:08] Finish training GloVe embeddings[Dim=10]. △M: -5.33GB. △T: 17.9 minutes.
[03:46:28] Finish training GloVe embeddings[Dim=15]. △M: +2.77GB. △T: 16.2 minutes.
[04:02:29] Finish training GloVe embeddings[Dim=20]. △M: +616.84MB. △T: 16.0 minutes.
[04:19:42] Finish training GloVe embeddings[Dim=25]. △M: +12.37MB. △T: 17.2 minutes.
[04:37:15] Finish training GloVe embeddings[Dim=30]. △M: +14.85MB. △T: 17.5 minutes.
[04:55:22] Finish training GloVe embeddings[Dim=35]. △M: +17.32MB. △T: 18.1 minutes.
[05:13:43] Finish training GloVe embeddings[Dim=40]. △M: +19.79MB. △T: 18.3 minutes.
[05:33:04] Finish training GloVe embeddings[Dim=45]. △M: +22.27MB. △T: 19.3 minutes.
[05:52:50] Finish training GloVe embeddings[Dim=50]. △M: +24.74MB. △T: 19.7 minutes.


In [8]:
# Large-dimension sweep for the currently loaded feature (learning_rate=0.1, epochs=1).
# The small-dimension grid [5, 10, 15, 20, 25, 30, 35, 40, 45, 50] is not used here.
emb_dims = [100, 200, 300, 500]
for emb_dim in emb_dims:
    try_embedding_dim_downsample(
        cooc,
        word_to_index,
        feat_name=feat_name,
        window=window,
        emb_dim=emb_dim,
        learning_rate=0.1,
        epochs=1,
        show=False,
        **default_plot_kwargs
    )

[13:15:13] Finish training GloVe embeddings[Dim=100]. △M: +49.48MB. △T: 26.8 minutes.
[13:48:57] Finish training GloVe embeddings[Dim=200]. △M: +98.96MB. △T: 33.7 minutes.
[14:48:39] Finish training GloVe embeddings[Dim=300]. △M: -11.5MB. △T: 59.7 minutes.
[15:56:32] Finish training GloVe embeddings[Dim=500]. △M: +247.41MB. △T: 1.1 hours.


In [6]:
# Load the stored co-occurrence data for `appIdAction` and call the down-sampled
# co-occurrence plot with window=50; the save path comes from figpath(feat_name).
feat_name, window = 'appIdAction', 50
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc_downsample(
    cooc,
    word_to_index,
    feat_name=feat_name,
    window=window,
    show=False,
    savepath=figpath(feat_name),
    **default_plot_kwargs
)

In [7]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.1, epochs=1.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.1, epochs=1,
#                                  show=False, **default_plot_kwargs)

[06:04:03] Finish training GloVe embeddings[Dim=5]. △M: +420.0KB. △T: 2.6 seconds.
[06:04:08] Finish training GloVe embeddings[Dim=10]. △M: -3.34MB. △T: 2.8 seconds.
[06:04:13] Finish training GloVe embeddings[Dim=15]. △M: -2.01MB. △T: 3.5 seconds.
[06:04:19] Finish training GloVe embeddings[Dim=20]. △M: -3.2MB. △T: 3.3 seconds.
[06:04:24] Finish training GloVe embeddings[Dim=25]. △M: -1.92MB. △T: 3.5 seconds.
[06:04:30] Finish training GloVe embeddings[Dim=30]. △M: -2.37MB. △T: 3.5 seconds.
[06:04:36] Finish training GloVe embeddings[Dim=35]. △M: -2.48MB. △T: 3.6 seconds.
[06:04:42] Finish training GloVe embeddings[Dim=40]. △M: -356.0KB. △T: 3.9 seconds.
[06:04:48] Finish training GloVe embeddings[Dim=45]. △M: -3.85MB. △T: 4.1 seconds.
[06:04:54] Finish training GloVe embeddings[Dim=50]. △M: +612.0KB. △T: 4.2 seconds.


In [8]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.01, epochs=10.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.01, epochs=10,
#                                  show=False, **default_plot_kwargs)

[06:06:01] Finish training GloVe embeddings[Dim=5]. △M: -244.0KB. △T: 9.9 seconds.
[06:06:15] Finish training GloVe embeddings[Dim=10]. △M: +0B. △T: 11.3 seconds.
[06:06:30] Finish training GloVe embeddings[Dim=15]. △M: -2.01MB. △T: 13.5 seconds.
[06:06:47] Finish training GloVe embeddings[Dim=20]. △M: -6.64MB. △T: 14.6 seconds.
[06:07:05] Finish training GloVe embeddings[Dim=25]. △M: -2.96MB. △T: 16.1 seconds.
[06:07:25] Finish training GloVe embeddings[Dim=30]. △M: -2.72MB. △T: 17.8 seconds.
[06:07:46] Finish training GloVe embeddings[Dim=35]. △M: -5.7MB. △T: 19.5 seconds.
[06:08:09] Finish training GloVe embeddings[Dim=40]. △M: -2.25MB. △T: 21.2 seconds.
[06:08:34] Finish training GloVe embeddings[Dim=45]. △M: +124.0KB. △T: 22.6 seconds.
[06:09:00] Finish training GloVe embeddings[Dim=50]. △M: -3.82MB. △T: 24.2 seconds.


In [9]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.01, epochs=30.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.01, epochs=30,
#                                  show=False, **default_plot_kwargs)

[06:12:30] Finish training GloVe embeddings[Dim=5]. △M: -4.15MB. △T: 27.9 seconds.
[06:13:04] Finish training GloVe embeddings[Dim=10]. △M: -2.18MB. △T: 31.9 seconds.
[06:13:42] Finish training GloVe embeddings[Dim=15]. △M: -2.01MB. △T: 35.5 seconds.
[06:14:25] Finish training GloVe embeddings[Dim=20]. △M: -4.15MB. △T: 41.7 seconds.
[06:15:14] Finish training GloVe embeddings[Dim=25]. △M: -52.0KB. △T: 46.2 seconds.
[06:16:07] Finish training GloVe embeddings[Dim=30]. △M: -2.72MB. △T: 50.9 seconds.
[06:17:08] Finish training GloVe embeddings[Dim=35]. △M: -848.0KB. △T: 59.3 seconds.
[06:18:14] Finish training GloVe embeddings[Dim=40]. △M: -5.47MB. △T: 1.1 minutes.
[06:19:24] Finish training GloVe embeddings[Dim=45]. △M: +124.0KB. △T: 1.1 minutes.
[06:20:38] Finish training GloVe embeddings[Dim=50]. △M: +612.0KB. △T: 1.2 minutes.


In [10]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.01, epochs=300.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# for emb_dim in emb_dims:
#     try_embedding_dim_downsample(cooc, word_to_index, feat_name=feat_name, 
#                                  window=window, emb_dim=emb_dim, 
#                                  learning_rate=0.01, epochs=300,
#                                  show=False, **default_plot_kwargs)

[06:28:19] Finish training GloVe embeddings[Dim=5]. △M: -9.15MB. △T: 3.9 minutes.
[06:32:59] Finish training GloVe embeddings[Dim=10]. △M: -4.15MB. △T: 4.6 minutes.
[06:38:55] Finish training GloVe embeddings[Dim=15]. △M: +128.0KB. △T: 5.9 minutes.
[06:45:43] Finish training GloVe embeddings[Dim=20]. △M: -2.01MB. △T: 6.8 minutes.
[06:53:18] Finish training GloVe embeddings[Dim=25]. △M: -4.14MB. △T: 7.5 minutes.
[07:01:20] Finish training GloVe embeddings[Dim=30]. △M: -2.14MB. △T: 8.0 minutes.
[07:10:20] Finish training GloVe embeddings[Dim=35]. △M: -2.48MB. △T: 9.0 minutes.
[07:20:11] Finish training GloVe embeddings[Dim=40]. △M: -360.0KB. △T: 9.8 minutes.
[07:30:48] Finish training GloVe embeddings[Dim=45]. △M: -5.96MB. △T: 10.6 minutes.
[07:42:10] Finish training GloVe embeddings[Dim=50]. △M: +608.0KB. △T: 11.3 minutes.


In [8]:
# Large-dimension sweep (100, 200, 300) for the currently loaded feature,
# using learning_rate=0.01 and epochs=300 for every size.
emb_dims = list(range(100, 301, 100))
for emb_dim in emb_dims:
    try_embedding_dim_downsample(
        cooc,
        word_to_index,
        feat_name=feat_name,
        window=window,
        emb_dim=emb_dim,
        learning_rate=0.01,
        epochs=300,
        show=False,
        **default_plot_kwargs
    )

[04:56:05] Finish training GloVe embeddings[Dim=100]. △M: +45.89MB. △T: 18.2 minutes.
[05:31:25] Finish training GloVe embeddings[Dim=200]. △M: -24.7MB. △T: 35.3 minutes.
[06:25:33] Finish training GloVe embeddings[Dim=300]. △M: +79.23MB. △T: 54.1 minutes.


In [11]:
# Load the stored co-occurrence data for `ct` and call the (non-down-sampled)
# co-occurrence plot; the save path comes from figpath(feat_name).
feat_name = 'ct'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(
    cooc,
    word_to_index,
    feat_name,
    show=False,
    savepath=figpath(feat_name),
    **default_plot_kwargs
)

In [12]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.0001, epochs=100000.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [1, 2, 3, 4, 5]
# for emb_dim in emb_dims:
#     try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
#                       learning_rate=0.0001, epochs=100000,
#                       show=False, **default_plot_kwargs)

[08:50:23] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 2.5 seconds.
[08:50:26] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 2.9 seconds.
[08:50:29] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 2.7 seconds.
[08:50:32] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 2.7 seconds.
[08:50:35] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 2.7 seconds.


In [13]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.00001, epochs=300000.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [1, 2, 3, 4, 5]
# for emb_dim in emb_dims:
#     try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
#                       learning_rate=0.00001, epochs=300000,
#                       show=False, **default_plot_kwargs)

[09:11:13] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 7.2 seconds.
[09:11:21] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 8.0 seconds.
[09:11:29] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 7.8 seconds.
[09:11:37] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 7.7 seconds.
[09:11:45] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 8.2 seconds.


In [14]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.00001, epochs=100000.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [1, 2, 3, 4, 5]
# for emb_dim in emb_dims:
#     try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
#                       learning_rate=0.00001, epochs=100000,
#                       show=False, **default_plot_kwargs)

[09:13:05] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 2.5 seconds.
[09:13:08] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 2.6 seconds.
[09:13:11] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 2.5 seconds.
[09:13:13] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 2.6 seconds.
[09:13:16] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 2.7 seconds.


In [15]:
# Disabled run kept for reference: embedding-size sweep with learning_rate=0.0001, epochs=10000.
# NOTE(review): the log under this cell presumably comes from this configuration — confirm before deleting.
# emb_dims = [1, 2, 3, 4, 5]
# for emb_dim in emb_dims:
#     try_embedding_dim(cooc, word_to_index, feat_name, emb_dim=emb_dim, 
#                       learning_rate=0.0001, epochs=10000,
#                       show=False, **default_plot_kwargs)

[09:14:07] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.3 seconds.
[09:14:08] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.3 seconds.
[09:14:08] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.3 seconds.
[09:14:08] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 0.3 seconds.
[09:14:09] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 0.3 seconds.


In [16]:
# Embedding-size sweep (1..5) for the currently loaded feature,
# using learning_rate=0.001 and epochs=1000 for every size.
emb_dims = list(range(1, 6))
for emb_dim in emb_dims:
    try_embedding_dim(
        cooc,
        word_to_index,
        feat_name,
        emb_dim=emb_dim,
        learning_rate=0.001,
        epochs=1000,
        show=False,
        **default_plot_kwargs
    )

[09:15:20] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.0 seconds.
[09:15:20] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.0 seconds.
[09:15:20] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.0 seconds.
[09:15:21] Finish training GloVe embeddings[Dim=4]. △M: +0B. △T: 0.0 seconds.
[09:15:21] Finish training GloVe embeddings[Dim=5]. △M: +0B. △T: 0.0 seconds.


In [17]:
# Load the stored co-occurrence data for `os` and call the (non-down-sampled)
# co-occurrence plot; the save path comes from figpath(feat_name).
feat_name = 'os'
word_to_index, cooc = du.load_preliminary_user_feature_coocurrence(feat_name)
CooccurrenceVisualizer.plot_cooc(
    cooc,
    word_to_index,
    feat_name,
    show=False,
    savepath=figpath(feat_name),
    **default_plot_kwargs
)

In [19]:
# Embedding-size sweep (1..3) for the currently loaded feature,
# using learning_rate=0.001 and epochs=1000 for every size.
emb_dims = list(range(1, 4))
for emb_dim in emb_dims:
    try_embedding_dim(
        cooc,
        word_to_index,
        feat_name,
        emb_dim=emb_dim,
        learning_rate=0.001,
        epochs=1000,
        show=False,
        **default_plot_kwargs
    )

[09:27:29] Finish training GloVe embeddings[Dim=1]. △M: +0B. △T: 0.0 seconds.
[09:27:29] Finish training GloVe embeddings[Dim=2]. △M: +0B. △T: 0.0 seconds.
[09:27:29] Finish training GloVe embeddings[Dim=3]. △M: +0B. △T: 0.0 seconds.


In [4]:
# Names of the user features whose embedding settings are being tracked.
# Every recorded run used random_state = 2018.
# NOTE(review): despite its name this is a set of feature names, not a
# name -> size mapping; the chosen hyper-parameters exist only in the comments
# below. Confirm how downstream code uses it before converting it to a dict.
emb_sizes = {
    # marriageStatus: learning_rate=0.0001, epochs=100000, emb_dim=10 (still needs fine-tuning)
    'marriageStatus',
    # interest1: learning_rate=0.0002, epochs=5000, emb_dim=15
    'interest1',
    # interest2: learning_rate=0.0002, epochs=5000, emb_dim=10
    'interest2',
    # interest3: learning_rate=0.01, epochs=300, emb_dim=10
    'interest3',
    # interest4: learning_rate=0.01, epochs=300, emb_dim=10
    'interest4',
    # interest5: learning_rate=0.01, epochs=300, emb_dim=16
    'interest5',
    # No settings recorded yet for the remaining features.
    'kw1', 'kw2', 'kw3',
    'topic1', 'topic2', 'topic3',
    'appIdInstall', 'appIdAction',
    'ct', 'os',
}