In [1]:
# One-time setup: uncomment and run to download the Portuguese spaCy model loaded below.
# !python -m spacy download pt_core_news_sm

In [2]:
import pandas as pd

# Read the training articles (treino.csv) from the Drive path below.
articles_train = pd.read_csv('/content/drive/MyDrive/curso_word2vec/data/treino.csv')
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(articles_train.shape)
articles_train.head()

(90000, 6)


Unnamed: 0,title,text,date,category,subcategory,link
0,"Após polêmica, Marine Le Pen diz que abomina n...",A candidata da direita nacionalista à Presidên...,2017-04-28,mundo,,http://www1.folha.uol.com.br/mundo/2017/04/187...
1,"Macron e Le Pen vão ao 2º turno na França, em ...",O centrista independente Emmanuel Macron e a d...,2017-04-23,mundo,,http://www1.folha.uol.com.br/mundo/2017/04/187...
2,"Apesar de larga vitória nas legislativas, Macr...",As eleições legislativas deste domingo (19) na...,2017-06-19,mundo,,http://www1.folha.uol.com.br/mundo/2017/06/189...
3,"Governo antecipa balanço, e Alckmin anuncia qu...",O número de ocorrências de homicídios dolosos ...,2015-07-24,cotidiano,,http://www1.folha.uol.com.br/cotidiano/2015/07...
4,"Após queda em maio, a atividade econômica sobe...","A economia cresceu 0,25% no segundo trimestre,...",2017-08-17,mercado,,http://www1.folha.uol.com.br/mercado/2017/08/1...


In [3]:
import spacy

# Load the small Portuguese spaCy pipeline; the model package must already be
# installed (e.g. via `python -m spacy download pt_core_news_sm`).
nlp = spacy.load("pt_core_news_sm")
nlp

<spacy.lang.pt.Portuguese at 0x7f8784d43dd0>

In [4]:
# Lower-cased article titles, materialized as a list so that the cell feeding
# them to nlp.pipe can be re-run: a generator would be exhausted after a single
# pass and silently produce no documents the second time.
# Missing titles (NaN) are dropped first because a float has no .lower().
titles = [title.lower() for title in articles_train.title.dropna()]

In [5]:
def preprocessing_pipeline(doc):
  """Reduce a parsed document to its meaningful words.

  Iterates over the tokens of ``doc`` and keeps the text of every token that
  is purely alphabetic (``is_alpha``) and not a stop word (``is_stop``).

  Args:
    doc: An iterable of token-like objects exposing ``text``, ``is_stop`` and
      ``is_alpha`` (e.g. a spaCy ``Doc``).

  Returns:
    The kept words joined by single spaces, or ``None`` when two or fewer
    words survive the filter.
  """
  kept_words = []
  for token in doc:
    # Skip stop words first, then anything containing non-letters.
    if token.is_stop or not token.is_alpha:
      continue
    kept_words.append(token.text)

  # Titles with at most two remaining words are discarded.
  if len(kept_words) <= 2:
    return None
  return ' '.join(kept_words)

In [6]:
# Smoke test for preprocessing_pipeline on a sentence mixing digits, symbols,
# repeated spaces and stop words; only the content words should remain.
text = 'Rio  132132 de $$$$) %%% Janeiro é uma cidade maravilhosa!!!'
doc = nlp(text)

preprocessing_pipeline(doc)

'Rio Janeiro cidade maravilhosa'

In [7]:
# Stream the titles through spaCy in batches of 1000 (n_process=-1 requests
# multiprocessing) and clean each parsed document; titles that end up too
# short are returned as None by preprocessing_pipeline.
preprocessed_titles = list(
    map(preprocessing_pipeline, nlp.pipe(titles, n_process=-1, batch_size=1000))
)

In [8]:
# Wrap the cleaned titles in a one-column frame, discarding the None entries
# (titles rejected by the preprocessing step) and exact duplicates.
titles_df = (
    pd.DataFrame({'title': preprocessed_titles})
    .dropna()
    .drop_duplicates()
)
# Print the resulting (rows, columns) shape, then preview the first rows.
print(titles_df.shape)
titles_df.head()

(84466, 1)


Unnamed: 0,title
0,polêmica marine le pen abomina negacionistas h...
1,macron le pen turno frança revés siglas tradic...
2,apesar larga vitória legislativas macron terá ...
3,governo antecipa balanço alckmin anuncia queda...
4,queda maio atividade econômica sobe junho bc


In [9]:
def get_tokens_generator():
  """Tokenize every cleaned title by splitting on single spaces.

  Reads the module-level ``titles_df`` and iterates over its ``title``
  attribute. Despite the name, the result is a fully built list rather than a
  generator, so it can be iterated more than once.

  Returns:
    A list with one entry per title, each entry being the list of strings
    produced by ``title.split(' ')``.
  """
  tokenized_titles = []
  for headline in titles_df.title:
    tokenized_titles.append(headline.split(' '))
  return tokenized_titles

In [10]:
import logging
from gensim.models import Word2Vec

# Route INFO-level log records (gensim's progress messages) to the cell output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : - %(message)s')

# Untrained CBOW model; vocabulary and weights are built in the following cells.
w2v_model = Word2Vec(
    sg=0,             # training algorithm: 0 = CBOW, 1 = skip-gram
    window=2,         # maximum distance between the target word and a context word
    size=300,         # embedding dimensionality (gensim 3.x keyword name)
    min_count=5,      # ignore words occurring fewer than 5 times
    alpha=0.03,       # initial learning rate
    min_alpha=0.007,  # learning rate decays to this value as training progresses
)

w2v_model

<gensim.models.word2vec.Word2Vec at 0x7f8781b15fd0>

In [11]:
# Build the tokenized corpus once; this same list is reused by the training cells.
tokens = get_tokens_generator()
# Scan the corpus to build the model vocabulary, logging progress every 5000 sentences.
w2v_model.build_vocab(tokens, progress_per=5000)

2022-07-13 12:58:33,524 : - collecting all words and their counts
2022-07-13 12:58:33,526 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-07-13 12:58:33,545 : - PROGRESS: at sentence #5000, processed 31930 words, keeping 10193 word types
2022-07-13 12:58:33,561 : - PROGRESS: at sentence #10000, processed 63848 words, keeping 14989 word types
2022-07-13 12:58:33,577 : - PROGRESS: at sentence #15000, processed 95753 words, keeping 18279 word types
2022-07-13 12:58:33,594 : - PROGRESS: at sentence #20000, processed 127689 words, keeping 21033 word types
2022-07-13 12:58:33,610 : - PROGRESS: at sentence #25000, processed 159589 words, keeping 23491 word types
2022-07-13 12:58:33,628 : - PROGRESS: at sentence #30000, processed 191554 words, keeping 25494 word types
2022-07-13 12:58:33,646 : - PROGRESS: at sentence #35000, processed 223412 words, keeping 27330 word types
2022-07-13 12:58:33,662 : - PROGRESS: at sentence #40000, processed 255282 words, keeping 29053

In [12]:
# Sanity check: build_vocab must have counted every tokenized title.
# A bare `w2v_model.corpus_count` on a non-final line is evaluated and thrown
# away (only the last expression of a cell is displayed), so the comparison is
# made explicit here; the displayed value is unchanged.
assert w2v_model.corpus_count == len(tokens), \
    "corpus_count does not match the number of tokenized titles"
len(tokens)

84466

In [13]:
# Train the CBOW model for 30 epochs over the tokenized titles.
# total_examples is the sentence count recorded by build_vocab. The value
# displayed for this cell is whatever train() returns.
w2v_model.train(tokens, total_examples=w2v_model.corpus_count,
                epochs=30)

2022-07-13 12:58:36,808 : - training model with 3 workers on 12924 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
2022-07-13 12:58:37,829 : - EPOCH 1 - PROGRESS: at 64.82% examples, 313820 words/s, in_qsize 5, out_qsize 0
2022-07-13 12:58:38,298 : - worker thread finished; awaiting finish of 2 more threads
2022-07-13 12:58:38,341 : - worker thread finished; awaiting finish of 1 more threads
2022-07-13 12:58:38,358 : - worker thread finished; awaiting finish of 0 more threads
2022-07-13 12:58:38,360 : - EPOCH - 1 : training on 540242 raw words (486091 effective words) took 1.5s, 316951 effective words/s
2022-07-13 12:58:39,424 : - EPOCH 2 - PROGRESS: at 66.65% examples, 308303 words/s, in_qsize 4, out_qsize 1
2022-07-13 12:58:39,815 : - worker thread finished; awaiting finish of 2 more threads
2022-07-13 12:58:39,851 : - worker thread finished; awaiting finish of 1 more threads
2022-07-13 12:58:39,857 : - worker thread finished; awaiting finish of 0 more t

(14584665, 16207260)

In [15]:
# Words closest to 'google' in the CBOW embeddings, as (word, similarity) pairs.
w2v_model.wv.most_similar('google')

[('apple', 0.6052509546279907),
 ('facebook', 0.5831785798072815),
 ('amazon', 0.5123606324195862),
 ('volkswagen', 0.5003378987312317),
 ('airbnb', 0.48825982213020325),
 ('uber', 0.46701881289482117),
 ('sony', 0.45689961314201355),
 ('snapchat', 0.4564026892185211),
 ('software', 0.4501359462738037),
 ('fbi', 0.44638028740882874)]

In [16]:
# Training with the skip-gram strategy (sg=1) and a wider context window (5);
# the remaining hyperparameters match the CBOW model.

w2v_model_skipgram = Word2Vec(sg = 1, window = 5, size = 300,
                              min_count = 5, alpha = 0.03, min_alpha = 0.007)

# Build the vocabulary from the same tokenized titles used for the CBOW model.
w2v_model_skipgram.build_vocab(tokens, progress_per=5000)

# Train for 30 epochs; the cell displays train()'s return value.
w2v_model_skipgram.train(tokens, total_examples=w2v_model_skipgram.corpus_count,
                         epochs=30)

2022-07-13 13:29:36,254 : - collecting all words and their counts
2022-07-13 13:29:36,258 : - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-07-13 13:29:36,282 : - PROGRESS: at sentence #5000, processed 31930 words, keeping 10193 word types
2022-07-13 13:29:36,306 : - PROGRESS: at sentence #10000, processed 63848 words, keeping 14989 word types
2022-07-13 13:29:36,329 : - PROGRESS: at sentence #15000, processed 95753 words, keeping 18279 word types
2022-07-13 13:29:36,351 : - PROGRESS: at sentence #20000, processed 127689 words, keeping 21033 word types
2022-07-13 13:29:36,376 : - PROGRESS: at sentence #25000, processed 159589 words, keeping 23491 word types
2022-07-13 13:29:36,402 : - PROGRESS: at sentence #30000, processed 191554 words, keeping 25494 word types
2022-07-13 13:29:36,426 : - PROGRESS: at sentence #35000, processed 223412 words, keeping 27330 word types
2022-07-13 13:29:36,463 : - PROGRESS: at sentence #40000, processed 255282 words, keeping 29053

(14584767, 16207260)

In [18]:
# Words closest to 'google' according to the skip-gram embeddings.
w2v_model_skipgram.wv.most_similar('google')

[('apple', 0.4207535982131958),
 ('reguladores', 0.4117451310157776),
 ('android', 0.4075790047645569),
 ('waze', 0.3910173773765564),
 ('buffett', 0.3839324712753296),
 ('yahoo', 0.37931469082832336),
 ('bmw', 0.37854450941085815),
 ('toshiba', 0.3748171329498291),
 ('concorda', 0.3693023920059204),
 ('verizon', 0.3638686537742615)]

In [19]:
# Same query against the CBOW embeddings, repeated here for side-by-side
# comparison with the skip-gram result.
w2v_model.wv.most_similar('google')

[('apple', 0.6052509546279907),
 ('facebook', 0.5831785798072815),
 ('amazon', 0.5123606324195862),
 ('volkswagen', 0.5003378987312317),
 ('airbnb', 0.48825982213020325),
 ('uber', 0.46701881289482117),
 ('sony', 0.45689961314201355),
 ('snapchat', 0.4564026892185211),
 ('software', 0.4501359462738037),
 ('fbi', 0.44638028740882874)]

In [20]:
# Words closest to 'gm' according to the skip-gram embeddings.
w2v_model_skipgram.wv.most_similar('gm')

[('metalúrgicos', 0.5786983966827393),
 ('motors', 0.5443923473358154),
 ('honda', 0.5299404859542847),
 ('audi', 0.49667346477508545),
 ('cubatão', 0.4913884401321411),
 ('airbag', 0.48963674902915955),
 ('airbags', 0.4774690270423889),
 ('montadora', 0.476866215467453),
 ('mitsubishi', 0.47355473041534424),
 ('autoguiados', 0.4677491784095764)]

In [21]:
# Words closest to 'gm' according to the CBOW embeddings.
w2v_model.wv.most_similar('gm')

[('chrysler', 0.6754248142242432),
 ('embraer', 0.6709064245223999),
 ('volks', 0.6607313752174377),
 ('braskem', 0.6370549201965332),
 ('honda', 0.6281957626342773),
 ('volkswagen', 0.6063649654388428),
 ('renault', 0.5965393781661987),
 ('mitsubishi', 0.5942554473876953),
 ('fiat', 0.5924808979034424),
 ('toyota', 0.5915037989616394)]

In [22]:
# Export only the word vectors (the .wv attribute, not the full trainable
# models) of both models in the plain-text word2vec format (binary=False).
w2v_model.wv.save_word2vec_format('/content/drive/MyDrive/curso_word2vec/data/cbow_model.txt', binary=False)
w2v_model_skipgram.wv.save_word2vec_format('/content/drive/MyDrive/curso_word2vec/data/skip_model.txt', binary=False)

2022-07-13 13:42:29,613 : - storing 12924x300 projection weights into /content/drive/MyDrive/curso_word2vec/data/cbow_model.txt
2022-07-13 13:42:33,170 : - storing 12924x300 projection weights into /content/drive/MyDrive/curso_word2vec/data/skip_model.txt
