## Word2Vec

In [13]:
# https://radimrehurek.com/gensim/models/word2vec.html

### Build own word2vec embedding

In [2]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [4]:
vector = model.wv['computer']

In [6]:
sims = model.wv.most_similar('computer', topn=10)
sims

[('system', 0.21617139875888824),
 ('survey', 0.04468922317028046),
 ('interface', 0.015203381888568401),
 ('time', 0.0019510635174810886),
 ('trees', -0.03284316882491112),
 ('human', -0.07424270361661911),
 ('response', -0.09317591041326523),
 ('graph', -0.09575342386960983),
 ('eps', -0.10513808578252792),
 ('user', -0.16911619901657104)]

In [8]:
vars(model.wv)

{'vector_size': 100,
 'index_to_key': ['system',
  'graph',
  'trees',
  'user',
  'minors',
  'eps',
  'time',
  'response',
  'survey',
  'computer',
  'interface',
  'human'],
 'next_index': 0,
 'key_to_index': {'system': 0,
  'graph': 1,
  'trees': 2,
  'user': 3,
  'minors': 4,
  'eps': 5,
  'time': 6,
  'response': 7,
  'survey': 8,
  'computer': 9,
  'interface': 10,
  'human': 11},
 'vectors': array([[-5.3622725e-04,  2.3643136e-04,  5.1033497e-03, ...,
         -7.0415605e-03,  9.0145587e-04,  6.3925339e-03],
        [-8.6196875e-03,  3.6657380e-03,  5.1898835e-03, ...,
         -2.3915148e-03, -9.5100943e-03,  4.5058788e-03],
        [ 9.4563962e-05,  3.0773198e-03, -6.8126451e-03, ...,
          5.1259040e-04,  8.2130842e-03, -7.0190406e-03],
        ...,
        [-5.1577436e-03, -6.6702785e-03, -7.7790986e-03, ...,
          5.8376994e-03,  9.3939463e-03,  3.5079459e-03],
        [ 7.0871473e-03, -1.5683770e-03,  7.9461383e-03, ...,
         -5.4809595e-03,  3.8159827e-03, 

### Download word2vec embedding

In [9]:
import gensim.downloader

In [10]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [11]:
glove_vectors = gensim.downloader.load('glove-twitter-25')



In [12]:
glove_vectors.most_similar('twitter')

[('facebook', 0.948005199432373),
 ('tweet', 0.9403423070907593),
 ('fb', 0.9342358708381653),
 ('instagram', 0.9104824066162109),
 ('chat', 0.8964964747428894),
 ('hashtag', 0.8885937333106995),
 ('tweets', 0.8878158330917358),
 ('tl', 0.8778461217880249),
 ('link', 0.8778210878372192),
 ('internet', 0.8753897547721863)]

## Doc2Vec

In [14]:
# https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec

In [15]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [16]:
vector = model.infer_vector(["system", "response"])

In [17]:
vector

array([-0.09129732,  0.01977961,  0.08609653,  0.07478146, -0.0532991 ],
      dtype=float32)

## Example with tensorflow

In [23]:
from gensim.models.doc2vec import TaggedDocument

documents = [
    "Esse é um exemplo de texto para o modelo.",
    "Outro exemplo de texto para treinar.",
    "Mais um documento para processar."
]

tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(documents)]

In [24]:
tagged_data

[TaggedDocument(words=['Esse', 'é', 'um', 'exemplo', 'de', 'texto', 'para', 'o', 'modelo.'], tags=['0']),
 TaggedDocument(words=['Outro', 'exemplo', 'de', 'texto', 'para', 'treinar.'], tags=['1']),
 TaggedDocument(words=['Mais', 'um', 'documento', 'para', 'processar.'], tags=['2'])]

In [25]:
from gensim.models import Doc2Vec

doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)

doc2vec_model.build_vocab(tagged_data)

doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [31]:
doc_vectors = [doc2vec_model.dv[str(i)] for i in range(len(documents))]

In [32]:
doc_vectors

[array([-0.00532877, -0.00607249, -0.00990345,  0.00859518,  0.00361673,
         0.00023243, -0.00989265, -0.00518328, -0.0097438 ,  0.00204311,
         0.00281175,  0.00467462, -0.00436783, -0.00320414, -0.00303782,
        -0.0087573 ,  0.00214723,  0.00926725, -0.0095561 , -0.00347284,
        -0.00381653,  0.00259803, -0.00569828,  0.00270216,  0.0057809 ,
        -0.00813431, -0.00841817, -0.00998776,  0.00493612, -0.00918214,
         0.00586194,  0.00680537, -0.00649284, -0.00457284, -0.00129231,
         0.00167649, -0.00152271, -0.00861853, -0.00365438,  0.0016958 ,
        -0.00200163, -0.00721857,  0.00424815, -0.00863997,  0.00268743,
        -0.00464412,  0.00065236, -0.00200517,  0.00540339, -0.00810504,
        -0.00219556, -0.00011445, -0.00669978, -0.00661449, -0.00196804,
         0.00890698, -0.00124255,  0.00361745, -0.00578501,  0.00888245,
         0.00297204,  0.00940106,  0.00444463, -0.00423215,  0.00224025,
        -0.00442523,  0.00583886,  0.00187582, -0.0

In [None]:
import tensorflow as tf

X = tf.constant(doc_vectors, dtype=tf.float32)
y = tf.constant([0, 1, 1])

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(100,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X, y, epochs=10, batch_size=2)