In [59]:
import gensim
from gensim.utils import simple_preprocess # Converte um documento em uma lista de tokens
from gensim.corpora import Dictionary # Vai mapear uma palavra tokenizada a um ID unico
import pprint
import pandas as pd
import numpy as np

# Bag of Words with Gensim

In [24]:
# Toy corpus used by the bag-of-words and TF-IDF sections: one document per string.
doc_list = [
    "Start spreading the news",
    "You're leaving today (tell him friend)",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
]

In [25]:
# Pretty-printer reused by later cells to display nested lists and dicts.
pp = pprint.PrettyPrinter(indent=1)
pp.pprint(doc_list)

['Start spreading the news',
 "You're leaving today (tell him friend)",
 'I want to be a part of it, New York, New York',
 'Your vagabond shoes, they are longing to stray',
 'And steps around the heart of it, New York, New York']


In [26]:
# Tokenize every document with gensim's simple_preprocess. As the output shows,
# tokens come back lowercased, punctuation is dropped and one-letter words
# such as "I" and "a" are discarded.
doc_tokenized = list(map(simple_preprocess, doc_list))
doc_tokenized

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today', 'tell', 'him', 'friend'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york']]

In [27]:
# Start from an empty gensim Dictionary; it is filled in later, when the
# documents are converted with doc2bow(..., allow_update=True).
dictionary = Dictionary()
dictionary

<gensim.corpora.dictionary.Dictionary at 0x710459778390>

In [28]:
# Turn each tokenized document into a bag of words: a list of
# (token_id, count) pairs. allow_update=True lets the dictionary register
# tokens it has not seen yet, so ids are assigned as the documents are processed.
BoW_corpus = [
    dictionary.doc2bow(tokens, allow_update=True)
    for tokens in doc_tokenized
]
BoW_corpus

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)],
 [(11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 2)],
 [(16, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(3, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (18, 2),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1)]]

In [29]:
# Replace the numeric id in every (id, count) pair with the token itself,
# so each bag of words becomes human-readable.
id_words = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in BoW_corpus
]
pp.pprint(id_words)

[[('news', 1), ('spreading', 1), ('start', 1), ('the', 1)],
 [('friend', 1),
  ('him', 1),
  ('leaving', 1),
  ('re', 1),
  ('tell', 1),
  ('today', 1),
  ('you', 1)],
 [('be', 1),
  ('it', 1),
  ('new', 2),
  ('of', 1),
  ('part', 1),
  ('to', 1),
  ('want', 1),
  ('york', 2)],
 [('to', 1),
  ('are', 1),
  ('longing', 1),
  ('shoes', 1),
  ('stray', 1),
  ('they', 1),
  ('vagabond', 1),
  ('your', 1)],
 [('the', 1),
  ('it', 1),
  ('new', 2),
  ('of', 1),
  ('york', 2),
  ('and', 1),
  ('around', 1),
  ('heart', 1),
  ('steps', 1)]]


# Bag of Words with Sklearn

In [30]:
# Import CountVectorizer, which converts a collection of documents into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# Instantiate the vectorizer with its default settings
cv = CountVectorizer()

In [32]:
# Fit the vectorizer on doc_list and transform it into a document-term count matrix in one step
cv_fit = cv.fit_transform(doc_list)

In [33]:
# Vocabulary learned by the vectorizer; it is zipped with the per-column counts further down
word_list = cv.get_feature_names_out()
word_list

array(['and', 'are', 'around', 'be', 'friend', 'heart', 'him', 'it',
       'leaving', 'longing', 'new', 'news', 'of', 'part', 're', 'shoes',
       'spreading', 'start', 'steps', 'stray', 'tell', 'the', 'they',
       'to', 'today', 'vagabond', 'want', 'york', 'you', 'your'],
      dtype=object)

In [34]:
# Display the counts as a dense array: one row per document, one column per vocabulary word
cv_fit.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 2, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 2, 0, 0]])

In [35]:
# Sum over the documents (axis=0) to get the total count of each word in the corpus
count_list = cv_fit.toarray().sum(axis=0)

In [36]:
# Pair every vocabulary word with its corpus-wide count and display the mapping
pp.pprint({word: total for word, total in zip(word_list, count_list)})

{'and': 1,
 'are': 1,
 'around': 1,
 'be': 1,
 'friend': 1,
 'heart': 1,
 'him': 1,
 'it': 2,
 'leaving': 1,
 'longing': 1,
 'new': 4,
 'news': 1,
 'of': 2,
 'part': 1,
 're': 1,
 'shoes': 1,
 'spreading': 1,
 'start': 1,
 'steps': 1,
 'stray': 1,
 'tell': 1,
 'the': 2,
 'they': 1,
 'to': 2,
 'today': 1,
 'vagabond': 1,
 'want': 1,
 'york': 4,
 'you': 1,
 'your': 1}


# Bag of N-Grams with Gensim

In [37]:
# Criando os bigrams com gensim
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [38]:
# Corpus for the gensim n-gram section. The last two documents share the
# prefix "Come and visit", which gives the phrase models a repeated sequence.
doc_list_Ngrams = [
    "Start spreading the news",
    "You're leaving today",
    "I want to be a part of it, New York, New York",
    "Your vagabond shoes, they are longing to stray",
    "And steps around the heart of it, New York, New York",
    "Come and visit us",
    "Come and visit the city",
]

In [39]:
# Tokenize each document of the n-gram corpus
doc_tokenized_Ngrams = list(map(simple_preprocess, doc_list_Ngrams))
doc_tokenized_Ngrams

[['start', 'spreading', 'the', 'news'],
 ['you', 're', 'leaving', 'today'],
 ['want', 'to', 'be', 'part', 'of', 'it', 'new', 'york', 'new', 'york'],
 ['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray'],
 ['and',
  'steps',
  'around',
  'the',
  'heart',
  'of',
  'it',
  'new',
  'york',
  'new',
  'york'],
 ['come', 'and', 'visit', 'us'],
 ['come', 'and', 'visit', 'the', 'city']]

In [40]:
# Train a Phrases model to detect common bigrams and merge them into single tokens.
# delimiter=' ' joins the merged words with a space (e.g. 'new york' in the printed results).
# NOTE(review): min_count=1 and threshold=2 look deliberately permissive, presumably so that
# this tiny corpus yields some phrases at all — confirm against the gensim Phrases docs.
bigram = Phrases(doc_tokenized_Ngrams, min_count=1, threshold=2, delimiter=' ')
# Wrap the trained model in a Phraser, which is what gets applied to token lists
bigram_phraser = Phraser(bigram)
# Apply the phraser to every tokenized document
doc_bigrams = [bigram_phraser[doc] for doc in doc_tokenized_Ngrams]


In [41]:
# Print each document after the detected bigrams have been merged into single tokens
for sent in doc_tokenized_Ngrams:
    tokens_ = bigram_phraser[sent]
    print(tokens_)

['start', 'spreading', 'the', 'news']
['you', 're', 'leaving', 'today']
['want', 'to', 'be', 'part', 'of it', 'new york', 'new york']
['your', 'vagabond', 'shoes', 'they', 'are', 'longing', 'to', 'stray']
['and', 'steps', 'around', 'the', 'heart', 'of it', 'new york', 'new york']
['come and', 'visit', 'us']
['come and', 'visit', 'the', 'city']


In [42]:
# Second Phrases pass, trained on the bigram-merged documents, so that a merged
# bigram and a neighbouring token can be joined into a three-word phrase.
# No threshold is passed here, so the library default applies.
trigram = Phrases(bigram[doc_tokenized_Ngrams], min_count=1, delimiter=' ')

In [43]:
# For every document, print only the phrases made of exactly three words.
# Both models use delimiter=' ', so a three-word phrase contains exactly two spaces.
# The bigram-only list that used to be built here was never used and has been removed.
for sent in doc_tokenized_Ngrams:
    trigrams_ = [t for t in trigram[bigram[sent]] if t.count(' ') == 2]
    print(trigrams_)

[]
[]
[]
[]
[]
['come and visit']
['come and visit']


# Bag of N-Grams with Sklearn

In [44]:
# Instantiate and fit a vectorizer restricted to bigrams (ngram_range=(2, 2)).
# Note that it is fitted on doc_list, not on doc_list_Ngrams.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))
ngram_fit = ngram_vectorizer.fit_transform(doc_list)

In [45]:
# Array with the extracted bigrams (rebinds word_list, which previously held the single-word vocabulary)
word_list = ngram_vectorizer.get_feature_names_out()
word_list

array(['and steps', 'are longing', 'around the', 'be part', 'heart of',
       'him friend', 'it new', 'leaving today', 'longing to', 'new york',
       'of it', 'part of', 're leaving', 'shoes they', 'spreading the',
       'start spreading', 'steps around', 'tell him', 'the heart',
       'the news', 'they are', 'to be', 'to stray', 'today tell',
       'vagabond shoes', 'want to', 'york new', 'you re', 'your vagabond'],
      dtype=object)

In [46]:
# Array with the total count of each bigram across all documents (rebinds count_list)
count_list = ngram_fit.toarray().sum(axis=0)
count_list

array([1, 1, 1, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1])

In [47]:
# Display each bigram next to its total count
pp.pprint({ngram: total for ngram, total in zip(word_list, count_list)})

{'and steps': 1,
 'are longing': 1,
 'around the': 1,
 'be part': 1,
 'heart of': 1,
 'him friend': 1,
 'it new': 2,
 'leaving today': 1,
 'longing to': 1,
 'new york': 4,
 'of it': 2,
 'part of': 1,
 're leaving': 1,
 'shoes they': 1,
 'spreading the': 1,
 'start spreading': 1,
 'steps around': 1,
 'tell him': 1,
 'the heart': 1,
 'the news': 1,
 'they are': 1,
 'to be': 1,
 'to stray': 1,
 'today tell': 1,
 'vagabond shoes': 1,
 'want to': 1,
 'york new': 2,
 'you re': 1,
 'your vagabond': 1}


In [48]:
# Refit with ngram_range=(3, 3) to extract trigrams only; this rebinds ngram_vectorizer and ngram_fit
ngram_vectorizer = CountVectorizer(ngram_range=(3, 3))
ngram_fit = ngram_vectorizer.fit_transform(doc_list)

In [52]:
# Collect the trigrams and their corpus-wide totals, then display them as a mapping
word_list = ngram_vectorizer.get_feature_names_out()
count_list = ngram_fit.toarray().sum(axis=0)
pp.pprint({ngram: total for ngram, total in zip(word_list, count_list)})

{'and steps around': 1,
 'are longing to': 1,
 'around the heart': 1,
 'be part of': 1,
 'heart of it': 1,
 'it new york': 2,
 'leaving today tell': 1,
 'longing to stray': 1,
 'new york new': 2,
 'of it new': 2,
 'part of it': 1,
 're leaving today': 1,
 'shoes they are': 1,
 'spreading the news': 1,
 'start spreading the': 1,
 'steps around the': 1,
 'tell him friend': 1,
 'the heart of': 1,
 'they are longing': 1,
 'to be part': 1,
 'today tell him': 1,
 'vagabond shoes they': 1,
 'want to be': 1,
 'york new york': 2,
 'you re leaving': 1,
 'your vagabond shoes': 1}


# NLTK for N-grams

In [66]:
from nltk.util import bigrams, trigrams

In [68]:
# Flatten the tokenized documents into one list holding every token in order.
# Document boundaries are lost here, so n-grams built from flat_list can span
# the end of one document and the start of the next.
flat_list = [token for tokens in doc_tokenized for token in tokens]
flat_list

['start',
 'spreading',
 'the',
 'news',
 'you',
 're',
 'leaving',
 'today',
 'tell',
 'him',
 'friend',
 'want',
 'to',
 'be',
 'part',
 'of',
 'it',
 'new',
 'york',
 'new',
 'york',
 'your',
 'vagabond',
 'shoes',
 'they',
 'are',
 'longing',
 'to',
 'stray',
 'and',
 'steps',
 'around',
 'the',
 'heart',
 'of',
 'it',
 'new',
 'york',
 'new',
 'york']

In [69]:
# Build bigrams with NLTK from the flattened token list.
# Because flat_list joins all documents, some pairs cross a document boundary (e.g. ('news', 'you')).
nltk_bigrams = list(bigrams(flat_list))
nltk_bigrams

[('start', 'spreading'),
 ('spreading', 'the'),
 ('the', 'news'),
 ('news', 'you'),
 ('you', 're'),
 ('re', 'leaving'),
 ('leaving', 'today'),
 ('today', 'tell'),
 ('tell', 'him'),
 ('him', 'friend'),
 ('friend', 'want'),
 ('want', 'to'),
 ('to', 'be'),
 ('be', 'part'),
 ('part', 'of'),
 ('of', 'it'),
 ('it', 'new'),
 ('new', 'york'),
 ('york', 'new'),
 ('new', 'york'),
 ('york', 'your'),
 ('your', 'vagabond'),
 ('vagabond', 'shoes'),
 ('shoes', 'they'),
 ('they', 'are'),
 ('are', 'longing'),
 ('longing', 'to'),
 ('to', 'stray'),
 ('stray', 'and'),
 ('and', 'steps'),
 ('steps', 'around'),
 ('around', 'the'),
 ('the', 'heart'),
 ('heart', 'of'),
 ('of', 'it'),
 ('it', 'new'),
 ('new', 'york'),
 ('york', 'new'),
 ('new', 'york')]

In [70]:
# Build trigrams with NLTK from the flattened token list.
# As with the bigrams, some triples cross a document boundary (e.g. ('the', 'news', 'you')).
nltk_trigrams = list(trigrams(flat_list))
nltk_trigrams

[('start', 'spreading', 'the'),
 ('spreading', 'the', 'news'),
 ('the', 'news', 'you'),
 ('news', 'you', 're'),
 ('you', 're', 'leaving'),
 ('re', 'leaving', 'today'),
 ('leaving', 'today', 'tell'),
 ('today', 'tell', 'him'),
 ('tell', 'him', 'friend'),
 ('him', 'friend', 'want'),
 ('friend', 'want', 'to'),
 ('want', 'to', 'be'),
 ('to', 'be', 'part'),
 ('be', 'part', 'of'),
 ('part', 'of', 'it'),
 ('of', 'it', 'new'),
 ('it', 'new', 'york'),
 ('new', 'york', 'new'),
 ('york', 'new', 'york'),
 ('new', 'york', 'your'),
 ('york', 'your', 'vagabond'),
 ('your', 'vagabond', 'shoes'),
 ('vagabond', 'shoes', 'they'),
 ('shoes', 'they', 'are'),
 ('they', 'are', 'longing'),
 ('are', 'longing', 'to'),
 ('longing', 'to', 'stray'),
 ('to', 'stray', 'and'),
 ('stray', 'and', 'steps'),
 ('and', 'steps', 'around'),
 ('steps', 'around', 'the'),
 ('around', 'the', 'heart'),
 ('the', 'heart', 'of'),
 ('heart', 'of', 'it'),
 ('of', 'it', 'new'),
 ('it', 'new', 'york'),
 ('new', 'york', 'new'),
 ('york',

# Gensim for TF-IDF

In [74]:
from gensim.models import TfidfModel
# Fit a TF-IDF model on the gensim bag-of-words corpus built earlier.
# smartirs='ntc' is SMART notation; per the gensim docs: n = raw term frequency,
# t = idf weighting, c = cosine normalization of each document vector.
tfidf = TfidfModel(BoW_corpus, smartirs='ntc')

In [81]:
# Apply the model to the corpus: one list per document of (token_id, tf-idf weight) tuples
list(tfidf[BoW_corpus])

[[(0, 0.5442545024816783),
  (1, 0.5442545024816783),
  (2, 0.5442545024816783),
  (3, 0.33370812034660224)],
 [(4, 0.3779644730092272),
  (5, 0.3779644730092272),
  (6, 0.3779644730092272),
  (7, 0.3779644730092272),
  (8, 0.3779644730092272),
  (9, 0.3779644730092272),
  (10, 0.3779644730092272)],
 [(11, 0.3743600446812478),
  (12, 0.22953781047985866),
  (13, 0.4590756209597173),
  (14, 0.22953781047985866),
  (15, 0.3743600446812478),
  (16, 0.22953781047985866),
  (17, 0.3743600446812478),
  (18, 0.4590756209597173)],
 [(16, 0.22576456473655607),
  (19, 0.36820614593095874),
  (20, 0.36820614593095874),
  (21, 0.36820614593095874),
  (22, 0.36820614593095874),
  (23, 0.36820614593095874),
  (24, 0.36820614593095874),
  (25, 0.36820614593095874)],
 [(3, 0.21496814396163463),
  (12, 0.21496814396163463),
  (13, 0.42993628792326927),
  (14, 0.21496814396163463),
  (18, 0.42993628792326927),
  (26, 0.35059794205706235),
  (27, 0.35059794205706235),
  (28, 0.35059794205706235),
  (29, 

# scikit-learn for TF-IDF

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Instantiate the TF-IDF vectorizer with default settings and fit it on doc_list
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(doc_list)

In [94]:
# Print arrays with two decimal places (this changes NumPy's global print options)
np.set_printoptions(precision=2)
# Dense TF-IDF matrix: one row per document, one column per vocabulary word
print(tfidf_vectorizer.transform(doc_list).toarray())

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.52 0.   0.
  0.   0.   0.52 0.52 0.   0.   0.   0.42 0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.38 0.   0.38 0.   0.38 0.   0.   0.   0.   0.
  0.38 0.   0.   0.   0.   0.   0.38 0.   0.   0.   0.38 0.   0.   0.
  0.38 0.  ]
 [0.   0.   0.   0.31 0.   0.   0.   0.25 0.   0.   0.51 0.   0.25 0.31
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.31 0.51
  0.   0.  ]
 [0.   0.36 0.   0.   0.   0.   0.   0.   0.   0.36 0.   0.   0.   0.
  0.   0.36 0.   0.   0.   0.36 0.   0.   0.36 0.29 0.   0.36 0.   0.
  0.   0.36]
 [0.3  0.   0.3  0.   0.   0.3  0.   0.24 0.   0.   0.48 0.   0.24 0.
  0.   0.   0.   0.   0.3  0.   0.   0.24 0.   0.   0.   0.   0.   0.48
  0.   0.  ]]


In [84]:
# Vocabulary of the fitted TF-IDF vectorizer; used below as the DataFrame column labels
tfidf_vectorizer.get_feature_names_out()

array(['and', 'are', 'around', 'be', 'friend', 'heart', 'him', 'it',
       'leaving', 'longing', 'new', 'news', 'of', 'part', 're', 'shoes',
       'spreading', 'start', 'steps', 'stray', 'tell', 'the', 'they',
       'to', 'today', 'vagabond', 'want', 'york', 'you', 'your'],
      dtype=object)

In [93]:
# Tabular view of the TF-IDF matrix: one row per document, columns labelled with the vocabulary words
pd.DataFrame(
    data=tfidf_vectorizer.transform(doc_list).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

Unnamed: 0,and,are,around,be,friend,heart,him,it,leaving,longing,...,tell,the,they,to,today,vagabond,want,york,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.422242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.377964,0.0,0.377964,0.0,0.377964,0.0,...,0.377964,0.0,0.0,0.0,0.377964,0.0,0.0,0.0,0.377964,0.0
2,0.0,0.0,0.0,0.313727,0.0,0.0,0.0,0.253113,0.0,0.0,...,0.0,0.0,0.0,0.253113,0.0,0.0,0.313727,0.506225,0.0,0.0
3,0.0,0.361529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.361529,...,0.0,0.0,0.361529,0.291679,0.0,0.361529,0.0,0.0,0.0,0.361529
4,0.299341,0.0,0.299341,0.0,0.0,0.299341,0.0,0.241507,0.0,0.0,...,0.0,0.241507,0.0,0.0,0.0,0.0,0.0,0.483013,0.0,0.0
