# User2Vec
Based on: https://ieeexplore.ieee.org/document/8875952/

How it works:
- Train a doc2vec model on the processed tweet texts
- Average the resulting document vectors over each user's tweets to obtain one vector per user


In [1]:
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np
import eland as ed
from sklearn.manifold import TSNE

from src.models import User2Vec, tokenize

# Uncomment the next line if the Elasticsearch database is not already running
# (and give ES a couple of minutes to finish starting up).
#!make database

# Open the 'twitter' index on localhost through eland. Nothing is converted to
# pandas here; the columns needed are pulled into memory in a later cell via
# `.to_pandas()`.
ed_df = ed.read_es('localhost', 'twitter')

In [43]:
# Tweet/user fields used by the rest of the notebook.
tweet_columns = [
    'tweet_id',
    'original_tweet_id_str',
    'user_id',
    'name',
    'full_text_processed',
    'sentiment',
    'followers_count',
]

# Materialise just those columns as an in-memory pandas frame and fill
# missing values with NaN.
df = ed_df[tweet_columns].to_pandas().fillna(np.nan)

In [3]:
# Each distinct tweet text appears once in the training corpus; its position
# in `unique_docs` is passed to `tokenize` as the integer tag.
unique_docs = df['full_text_processed'].unique()
train_corpus = [
    tokenize(text, idx) for idx, text in enumerate(unique_docs)
]

In [4]:
# Build the model, register the vocabulary from the tagged corpus, then train.
# NOTE(review): the build_vocab/train/corpus_count API looks like gensim's
# Doc2Vec -- confirm against src.models.User2Vec.
model = User2Vec(
    vector_size=10,
    min_count=2,
    epochs=40,
)
model.build_vocab(train_corpus)
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Compute the user-level vectors from every (user_id, tweet text) pair.
# NOTE(review): presumably returns one row of `user_vectors` per entry of
# `users_id`, in the same order -- confirm against User2Vec.infer_user_vectors.
users_id, user_vectors = model.infer_user_vectors(
    df['user_id'],
    df['full_text_processed'],
    track_progress=True,
)

# Name the embedding columns vec_0 ... vec_{d-1}. The width d is taken from
# the vectors themselves rather than hard-coded, so this cell keeps working
# when the model's vector_size is changed.
df_user_vecs = pd.DataFrame(user_vectors).add_prefix('vec_')
vec_cols = df_user_vecs.columns.tolist()
df_user_vecs['user_id'] = users_id

In [20]:
# Reduce the user vectors to two t-SNE coordinates for plotting. The seed is
# fixed for repeatability; verbose=2 prints progress while fitting.
tsne = TSNE(
    random_state=0,
    n_jobs=-1,
    verbose=2,
)
user_embeddings = tsne.fit_transform(user_vectors)

# Store the two embedding coordinates alongside the user vectors.
df_user_vecs['tsne_0'] = user_embeddings[:, 0]
df_user_vecs['tsne_1'] = user_embeddings[:, 1]

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 194183 samples in 0.447s...
[t-SNE] Computed neighbors for 194183 samples in 257.234s...
[t-SNE] Computed conditional probabilities for sample 1000 / 194183
[t-SNE] Computed conditional probabilities for sample 2000 / 194183
[t-SNE] Computed conditional probabilities for sample 3000 / 194183
[t-SNE] Computed conditional probabilities for sample 4000 / 194183
[t-SNE] Computed conditional probabilities for sample 5000 / 194183
[t-SNE] Computed conditional probabilities for sample 6000 / 194183
[t-SNE] Computed conditional probabilities for sample 7000 / 194183
[t-SNE] Computed conditional probabilities for sample 8000 / 194183
[t-SNE] Computed conditional probabilities for sample 9000 / 194183
[t-SNE] Computed conditional probabilities for sample 10000 / 194183
[t-SNE] Computed conditional probabilities for sample 11000 / 194183
[t-SNE] Computed conditional probabilities for sample 12000 / 194183
[t-SNE] Computed conditional proba

[t-SNE] Computed conditional probabilities for sample 120000 / 194183
[t-SNE] Computed conditional probabilities for sample 121000 / 194183
[t-SNE] Computed conditional probabilities for sample 122000 / 194183
[t-SNE] Computed conditional probabilities for sample 123000 / 194183
[t-SNE] Computed conditional probabilities for sample 124000 / 194183
[t-SNE] Computed conditional probabilities for sample 125000 / 194183
[t-SNE] Computed conditional probabilities for sample 126000 / 194183
[t-SNE] Computed conditional probabilities for sample 127000 / 194183
[t-SNE] Computed conditional probabilities for sample 128000 / 194183
[t-SNE] Computed conditional probabilities for sample 129000 / 194183
[t-SNE] Computed conditional probabilities for sample 130000 / 194183
[t-SNE] Computed conditional probabilities for sample 131000 / 194183
[t-SNE] Computed conditional probabilities for sample 132000 / 194183
[t-SNE] Computed conditional probabilities for sample 133000 / 194183
[t-SNE] Computed con

In [51]:
# Per-user summary statistics: number of tweets, mean sentiment and the
# largest follower count seen for that user.
per_user_aggregations = {
    'full_text_processed': 'count',
    'sentiment': 'mean',
    'followers_count': 'max',
}
display_names = {
    'full_text_processed': 'Tweet Count',
    'sentiment': 'Mean Sentiment',
    'followers_count': 'Followers',
}
df_meta = df.groupby('user_id').agg(per_user_aggregations)
df_meta = df_meta.rename(columns=display_names)

# Attach the summary to the user vectors, keyed on user_id.
# Note: this rebinds `df_user_vecs` with user_id as its index, so running this
# cell a second time fails (there is no 'user_id' column left to set_index on).
df_user_vecs = df_user_vecs.set_index('user_id').join(df_meta)

In [58]:
# Persist the user vectors, t-SNE coordinates and per-user metadata; the
# user_id index is written as the first CSV column.
df_user_vecs.to_csv(path_or_buf='iwmi_user2vec.csv', index=True)

In [39]:
from bokeh.io import output_file, show
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure

# Render the tabbed figure into a standalone HTML file.
# NOTE(review): `Panel` and `plot_width`/`plot_height` are the pre-3.0 bokeh
# names -- confirm the installed bokeh version before upgrading.
output_file("slider.html")

# Tab 1: scatter of the users in t-SNE space.
p1 = figure(plot_width=300, plot_height=300)
p1.circle(
    df_user_vecs.tsne_0,
    df_user_vecs.tsne_1,
    size=20,
    color="navy",
    alpha=0.5,
    # The per-user tweet count lives in the 'Tweet Count' column (renamed when
    # the metadata was joined onto df_user_vecs), so the old attribute access
    # `.tweet_count` raises AttributeError on a top-to-bottom run.
    tags=df_user_vecs['Tweet Count'].tolist(),
)
tab1 = Panel(child=p1, title="circle")

# Tab 2: fixed example line plot (not derived from the user data).
p2 = figure(plot_width=300, plot_height=300)
p2.line([1, 2, 3, 4, 5], [6, 7, 2, 4, 5], line_width=3, color="navy", alpha=0.5)
tab2 = Panel(child=p2, title="line")

tabs = Tabs(tabs=[tab1, tab2])

show(tabs)

Unnamed: 0_level_0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,tsne_0,tsne_1,tweet_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.272213e+18,-0.739419,-1.059035,0.163762,-0.076321,-0.260522,0.036073,0.056783,0.106598,-0.411936,1.303556,-32.281082,-9.000099,1
1.272109e+18,0.174209,0.401730,-0.145802,-0.292049,-0.154792,0.153740,0.176192,-0.055979,-0.208155,-0.347693,17.044088,-18.042822,1
1.272079e+18,-0.222627,-0.873474,-0.321188,-0.357290,-0.151350,0.512009,-0.852363,0.113741,0.050234,0.540833,26.452530,14.172265,1
1.272033e+18,-0.059179,-0.994990,0.402563,-0.530733,0.161350,-0.207294,0.273339,0.297918,-0.751576,0.538631,-21.502831,0.171080,1
1.271859e+18,0.059877,-0.344172,-0.565281,-0.909864,0.698891,0.997068,-0.152493,-0.357882,-0.401844,-0.574967,-15.121478,-31.133333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3.249000e+03,0.171680,-0.267652,0.267325,0.388046,-0.029781,0.440219,-0.925499,-0.146318,-0.240875,0.820181,9.986276,-1.036941,1
2.737000e+03,0.397655,-0.129630,0.021220,0.512400,-0.017296,0.589387,-0.444330,0.592112,-0.577651,0.316592,-12.566261,-25.574867,1
7.670000e+02,-0.084124,-0.779910,-0.086272,0.448289,-0.201329,-0.092580,-1.059678,-0.245275,0.146036,0.600696,18.401367,13.417652,2
5.730000e+02,1.092027,-0.189933,-0.400786,1.100439,0.710896,-0.249937,-0.194067,0.143176,-0.830164,1.078706,-9.268167,-23.789892,1
