In [1]:
import gensim
from gensim.utils import simple_preprocess # Converte um documento em uma lista de tokens
from gensim.corpora import Dictionary # Vai mapear uma palavra tokenizada a um ID unico
import pprint

# Bag of words with Gensim

In [2]:
# Toy corpus: each string is treated as one document.
doc_list = [
    "Start spreading the news",
    "You're leaving today (tell him friend)",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
]

In [3]:
# Create a single pretty-printer; later cells reuse `pp` to show nested results readably.
pp = pprint.PrettyPrinter(indent=1)
# Show the raw documents before any preprocessing.
pp.pprint(doc_list)

['Start spreading the news',
 "You're leaving today (tell him friend)",
 'I want to be a part of it, New York, New York',
 'Your vagabond shoes, they are longing to stray',
 'And steps around the heart of it, New York, New York']


In [4]:
# Tokenize every document with gensim's simple_preprocess (default settings).
doc_tokenized = list(map(simple_preprocess, doc_list))
doc_tokenized

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today', 'tell', 'him', 'friend'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york']]

In [5]:
# Start from an empty gensim Dictionary; it is filled in when the corpus is converted below.
dictionary = Dictionary()
dictionary

<gensim.corpora.dictionary.Dictionary at 0x79a7a2ff8450>

In [6]:
# Associate an integer id with every token and count how many times it occurs
# in each document. allow_update=True lets the dictionary (created empty above)
# register tokens it has not seen yet while converting.
BoW_corpus = [
    dictionary.doc2bow(tokens, allow_update=True)
    for tokens in doc_tokenized
]
BoW_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)],
 [(16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(3, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (18, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)]]

In [7]:
# Replace the id in each (id, count) pair with the token it stands for,
# so the bag-of-words is human readable.
id_words = [
    [(dictionary[token_id], freq) for token_id, freq in bow]
    for bow in BoW_corpus
]
pp.pprint(id_words)

[[('news', 1), ('spreading', 1), ('start', 1), ('the', 1)],
 [('friend', 1),
  ('him', 1),
  ('leaving', 1),
  ('re', 1),
  ('tell', 1),
  ('today', 1),
  ('you', 1)],
 [('be', 1),
  ('it', 1),
  ('new', 2),
  ('of', 1),
  ('part', 1),
  ('to', 1),
  ('want', 1),
  ('york', 2)],
 [('to', 1),
  ('are', 1),
  ('longing', 1),
  ('shoes', 1),
  ('stray', 1),
  ('they', 1),
  ('vagabond', 1),
  ('your', 1)],
 [('the', 1),
  ('it', 1),
  ('new', 2),
  ('of', 1),
  ('york', 2),
  ('and', 1),
  ('around', 1),
  ('heart', 1),
  ('steps', 1)]]


# Bag of Words with Sklearn

In [8]:
# Import CountVectorizer, which converts the documents into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Instantiate the vectorizer with its default settings
cv = CountVectorizer()

In [10]:
# Fit the vectorizer on the raw documents and transform them into a
# document-term count matrix in a single step.
cv_fit = cv.fit_transform(doc_list)

In [11]:
# Retrieve the vocabulary learned by the vectorizer (one entry per matrix column)
word_list = cv.get_feature_names_out()
word_list

array(['and', 'are', 'around', 'be', 'friend', 'heart', 'him', 'it',
       'leaving', 'longing', 'new', 'news', 'of', 'part', 're', 'shoes',
       'spreading', 'start', 'steps', 'stray', 'tell', 'the', 'they',
       'to', 'today', 'vagabond', 'want', 'york', 'you', 'your'],
      dtype=object)

In [12]:
# Convert the count matrix to a dense array for inspection:
# one row per document, one column per vocabulary word.
cv_fit.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 2, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 2, 0, 0]])

In [13]:
# Sum the counts over all documents (axis=0) to get one corpus-wide total per word.
count_list = cv_fit.toarray().sum(axis=0)

In [14]:
# Pair each vocabulary word with its corpus-wide total and pretty-print the mapping.
pp.pprint({word: total for word, total in zip(word_list, count_list)})

{'and': 1,
 'are': 1,
 'around': 1,
 'be': 1,
 'friend': 1,
 'heart': 1,
 'him': 1,
 'it': 2,
 'leaving': 1,
 'longing': 1,
 'new': 4,
 'news': 1,
 'of': 2,
 'part': 1,
 're': 1,
 'shoes': 1,
 'spreading': 1,
 'start': 1,
 'steps': 1,
 'stray': 1,
 'tell': 1,
 'the': 2,
 'they': 1,
 'to': 2,
 'today': 1,
 'vagabond': 1,
 'want': 1,
 'york': 4,
 'you': 1,
 'your': 1}


# Bag of N-Grams with Gensim

In [18]:
# Criando os bigrams com gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [19]:
# Corpus for the n-gram example: the earlier lyrics plus two extra sentences
# that share the word sequence "come and visit".
doc_list_Ngrams = [
    "Start spreading the news",
    "You're leaving today",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
    "Come and visit us",
    "Come and visit the city",
]

In [20]:
# Tokenize each document of the n-gram corpus
doc_tokenized_Ngrams = list(map(simple_preprocess, doc_list_Ngrams))
doc_tokenized_Ngrams

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york'],
 ['come', 'and', 'visit', 'us'],
 ['come', 'and', 'visit', 'the', 'city']]

In [35]:

# Learn which adjacent token pairs should be merged into a single bigram token.
# min_count=1 and threshold=2 are permissive values picked for this tiny corpus
# (see the gensim Phrases docs for the exact scoring semantics); delimiter=' '
# joins a detected pair with a space, e.g. 'new york'.
bigram = Phrases(
    doc_tokenized_Ngrams,
    min_count=1,
    threshold=2,
    delimiter=' ',
)
# Wrap the trained model in a Phraser, which is what gets applied to documents below.
bigram_phraser = Phraser(bigram)
# Apply the phraser to every tokenized document.
doc_bigrams = [bigram_phraser[tokens] for tokens in doc_tokenized_Ngrams]


In [36]:
# Print the bigram-merged tokens of every document. `doc_bigrams` (built in the
# previous cell) already holds the phraser output for each document, so reuse it
# instead of applying the phraser a second time.
for tokens_ in doc_bigrams:
    print(tokens_)

['start', 'spreading', 'the', 'news']
['you', 're', 'leaving', 'today']
['want', 'to', 'be', 'part', 'of it', 'new york', 'new york']
['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray']
['and', 'steps', 'around', 'the', 'heart', 'of it', 'new york', 'new york']
['come and', 'visit', 'us']
['come and', 'visit', 'the', 'city']
