###  Representações Vetoriais para Palavras, Sentenças e Documentos

In [None]:
import numpy as np

# Three 4-dimensional float vectors; vetor1 and vetor3 are reused by the
# arithmetic cells that follow.
vetor1 = np.asarray([1.0, 2.0, 1.0, 4.0])
vetor2 = np.zeros_like(vetor1)
vetor3 = np.ones_like(vetor1)

# Show each vector next to its label
for rotulo, vetor in (('Vetor 1:', vetor1), ('Vetor 2:', vetor2), ('Vetor 3:', vetor3)):
    print(rotulo, vetor)

Vetor 1: [1. 2. 1. 4.]
Vetor 2: [0. 0. 0. 0.]
Vetor 3: [1. 1. 1. 1.]


In [None]:
# Plain Python list: `*` repeats the list instead of scaling its elements
[1,2] * 2

[1, 2, 1, 2]

In [None]:
# NumPy array: `*` with a scalar multiplies every element
vetor1 * 2

array([2., 4., 2., 8.])

In [None]:
# element-wise sum
vetor1 + vetor3

array([2., 3., 2., 5.])

In [None]:
# element-wise difference
vetor1 - vetor3

array([0., 1., 0., 3.])

In [None]:
vetor1 * vetor3 # element-wise product

array([1., 2., 1., 4.])

In [None]:
np.dot(vetor1, vetor3) # dot (scalar) product

np.float64(8.0)

In [None]:
vetor1 @ vetor3 # dot (scalar) product via the @ operator

np.float64(8.0)

In [None]:
vetor3 / vetor1 # element-wise division

array([1.  , 0.5 , 1.  , 0.25])

### Similaridade Cosseno

<img src="https://miro.medium.com/v2/resize:fit:720/format:webp/1*LfW66-WsYkFqWc4XYJbEJg.png" alt="Similaridade Cosseno" width="300">

In [None]:
from numpy.linalg import norm

# Two 3-d integer vectors holding the same components in reverse order
A, B = np.array([1, 2, 3]), np.array([3, 2, 1])

# Cosine similarity: dot product divided by the product of the Euclidean norms
np.dot(A, B) / (norm(A) * norm(B))

np.float64(0.7142857142857143)

In [None]:
# Same comparison with B almost equal to A (only the middle component differs: 2 vs 2.1)
A = np.array([1, 2, 3])
B = np.array([1, 2.1, 3])

# cosine similarity: dot product over the product of the Euclidean norms
A @ B / ( norm(A) * norm(B) )

np.float64(0.9997521255198267)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Same computation through scikit-learn; each vector is wrapped in a list because
# the function takes collections of samples and returns a 2-D similarity matrix
cosine_similarity([A], [B])

array([[0.99975213]])

## Modelo de vetorização de palavras (Embeddings)

In [None]:
!pip install safetensors

Collecting safetensors
  Downloading safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Downloading safetensors-0.6.2-cp38-abi3-macosx_11_0_arm64.whl (432 kB)
Installing collected packages: safetensors
Successfully installed safetensors-0.6.2


In [None]:
# Embedding weights source: https://huggingface.co/nilc-nlp/word2vec-skip-gram-50d/tree/main
# NOTE(review): "embeddings.safetensors" is read from the working directory —
# presumably downloaded manually from the repository above; confirm.

from safetensors.torch import load_file
# load_file gives access to the stored tensors by name
tensors = load_file("embeddings.safetensors")
vectors = tensors["embeddings"]  # torch.Tensor holding the embedding matrix


In [None]:
# shape of the embedding matrix: (rows, embedding dimension) — presumably one row per vocabulary entry
vectors.shape

torch.Size([929606, 50])

In [None]:
# NOTE(review): gensim is only installed by the following cell; on a fresh
# environment that cell has to run before this one.
from gensim.models import KeyedVectors
# Load pre-trained word vectors from a local file in word2vec format.
# NOTE(review): 'cbow_s50.txt' must exist in the working directory — its origin is not shown here.
word2vec = KeyedVectors.load_word2vec_format('cbow_s50.txt')

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-macosx_11_0_arm64.whl (24.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.5/24.5 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import gensim.downloader
# Print the name of every pre-trained model published in gensim-data
print([nome for nome in gensim.downloader.info()['models']])

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
# Load the 'glove-twitter-50' vectors (fetched by gensim's downloader when needed).
# A single call is enough: a second load() would only repeat the loading work
# and throw the first result away.
word2vec = gensim.downloader.load('glove-twitter-50')
word2vec['computer']  # numpy vector of a word



array([ 0.65572  ,  0.50502  , -0.64935  ,  0.14142  , -0.73858  ,
       -0.060631 ,  0.5229   , -1.4541   ,  0.79533  , -0.05805  ,
        0.6463   ,  0.57013  , -3.7753   , -0.2414   , -0.16999  ,
       -0.57056  ,  0.45857  ,  0.53634  , -0.45337  , -0.13149  ,
       -0.94989  ,  0.063493 ,  0.47575  ,  0.34153  , -1.2699   ,
       -0.0061569, -0.47168  , -0.68425  , -0.31045  ,  0.5198   ,
       -0.3573   , -0.97569  , -0.37267  , -1.3218   ,  0.27268  ,
       -0.72485  ,  0.46634  ,  0.40591  ,  0.17259  , -0.61372  ,
       -0.39455  ,  1.406    , -1.3402   ,  0.21337  ,  0.30572  ,
       -0.043278 , -0.019297 ,  0.62462  , -0.78825  ,  0.29806  ],
      dtype=float32)

In [None]:
# Alternative pre-trained model from the list printed earlier, left disabled;
# uncomment to use it instead of 'glove-twitter-50'.
#gensim.downloader.load('word2vec-google-news-300')
#word2vec = gensim.downloader.load('word2vec-google-news-300')
#word2vec['computer']  # numpy vector of a word

In [None]:
# Cosine similarity between two word vectors.
# NOTE(review): 'men' is the plural form — possibly 'man' was intended to pair with 'boy'; confirm.
cosine_similarity([word2vec['men']], [word2vec['boy']])

array([[0.6701945]], dtype=float32)

In [None]:
# Closely related words: the similarity is expected to be high
cosine_similarity([word2vec['boy']], [word2vec['girl']])

array([[0.9283966]], dtype=float32)

In [None]:
# Unrelated words, for contrast with the previous pair
cosine_similarity([word2vec['boy']], [word2vec['chair']])

array([[0.4562521]], dtype=float32)

In [None]:
# Five nearest neighbours of 'computer', as (word, similarity) pairs
word2vec.most_similar('computer', topn=5)

[('cell', 0.8157469630241394),
 ('laptop', 0.8116585612297058),
 ('desktop', 0.8046691417694092),
 ('phone', 0.8026933670043945),
 ('computers', 0.7978082895278931)]

In [None]:
# Analogy by vector arithmetic: king + woman - man; returns the single closest word
word2vec.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)

[('prince', 0.759779691696167)]

### Modelo mais avançado de Embeddings

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (4.9 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl (2.9 MB)
[2K   

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Run everything on the CPU
device = 'cpu'

# Tokenizer and encoder are loaded from the same pre-trained checkpoint
CHECKPOINT = 'neuralmind/bert-large-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, do_lower_case=False)
bert = AutoModel.from_pretrained(CHECKPOINT).to(device)

In [None]:
texto = 'Eu vou ao banco pagar a conta hoje.'

# Tokenize the sentence and recover the readable word pieces for each id
input_ids = tokenizer.encode(texto, return_tensors='pt')
wordpieces = tokenizer.convert_ids_to_tokens(input_ids[0])

# Position of the first sub-token of each word. WordPiece marks continuation
# pieces with a leading '##', so the marker is tested as a prefix (a substring
# test would also drop any token that merely contains '##').
# The slice skips the first and last positions (BERT's [CLS] / [SEP] tokens).
subwords_idx = [
    i
    for i, wordpiece in enumerate(wordpieces[1:-1], start=1)
    if not wordpiece.startswith('##')
]

# Forward pass without gradient tracking (inference only)
input_ids = input_ids.to(device)
with torch.no_grad():
    outs = bert(input_ids)
    # first model output, first (and only) sentence of the batch
    vetores1 = outs[0][0, :]

# One contextual vector per word, taken at its first sub-token
vetores1[subwords_idx]

In [None]:
texto2 = 'Vou sentar no banco da praça.'

# Tokenize the sentence and recover the readable word pieces for each id
input_ids = tokenizer.encode(texto2, return_tensors='pt')
wordpieces = tokenizer.convert_ids_to_tokens(input_ids[0])

# Position of the first sub-token of each word. WordPiece marks continuation
# pieces with a leading '##', so the marker is tested as a prefix (a substring
# test would also drop any token that merely contains '##').
# The slice skips the first and last positions (BERT's [CLS] / [SEP] tokens).
subwords_idx = [
    i
    for i, wordpiece in enumerate(wordpieces[1:-1], start=1)
    if not wordpiece.startswith('##')
]

# Forward pass without gradient tracking (inference only)
input_ids = input_ids.to(device)
with torch.no_grad():
    outs = bert(input_ids)
    # first model output, first (and only) sentence of the batch
    vetores2 = outs[0][0, :]

# One contextual vector per word, taken at its first sub-token
vetores2[subwords_idx]

## Exemplo

In [None]:
import pandas as pd

# Load the labelled tweets; the CSV is expected in the working directory
df = pd.read_csv('Tweets_Mg.csv')
df

Unnamed: 0.1,Unnamed: 0,Created At,Text,Geo Coordinates.latitude,Geo Coordinates.longitude,User Location,Username,User Screen Name,Retweet Count,Classificacao,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,0,Sun Jan 08 01:22:05 +0000 2017,���⛪ @ Catedral de Santo Antônio - Governador ...,,,Brasil,Leonardo C Schneider,LeoCSchneider,0,Neutro,...,,,,,,,,,,
1,1,Sun Jan 08 01:49:01 +0000 2017,"� @ Governador Valadares, Minas Gerais https:/...",-41.9333,-18.85,,Wândell,klefnews,0,Neutro,...,,,,,,,,,,
2,2,Sun Jan 08 01:01:46 +0000 2017,"�� @ Governador Valadares, Minas Gerais https:...",-41.9333,-18.85,,Wândell,klefnews,0,Neutro,...,,,,,,,,,,
3,3,Wed Jan 04 21:43:51 +0000 2017,��� https://t.co/BnDsO34qK0,,,,Ana estudando,estudandoconcur,0,Neutro,...,,,,,,,,,,
4,4,Mon Jan 09 15:08:21 +0000 2017,��� PSOL vai questionar aumento de vereadores ...,,,,Emily,Milly777,0,Negativo,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8194,8194,Thu Feb 09 11:48:07 +0000 2017,"Trio é preso suspeito de roubo, tráfico e abus...",,,,Ana Lúcia,lapiseirapentel,0,Positivo,...,,,,,,,,,,
8195,8195,Thu Feb 09 12:10:19 +0000 2017,"Trio é preso suspeito de roubo, tráfico e abus...",,,Belo Horizonte - Minas Gerais,Marcelo Rezende,Televans,0,Positivo,...,,,,,,,,,,
8196,8196,Thu Feb 09 12:04:17 +0000 2017,"Trio é preso suspeito de roubo, tráfico e abus...",,,Guarulhos - SP,Leonardo Nascimento,leonardogru,0,Positivo,...,,,,,,,,,,
8197,8197,Thu Feb 09 12:10:04 +0000 2017,"Trio é preso suspeito de roubo, tráfico e abus...",,,Brasil Natal/RN,Lucas Medeiros �©™,parabolicalucas,0,Positivo,...,,,,,,,,,,


In [None]:
# Column names; the file carries several 'Unnamed: N' columns besides the tweet fields
df.columns

Index(['Unnamed: 0', 'Created At', 'Text', 'Geo Coordinates.latitude',
       'Geo Coordinates.longitude', 'User Location', 'Username',
       'User Screen Name', 'Retweet Count', 'Classificacao', 'Observação',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24'],
      dtype='object')

In [None]:
# Number of tweets per class label
df['Classificacao'].value_counts()

Classificacao
Positivo    3300
Neutro      2453
Negativo    2446
Name: count, dtype: int64

In [None]:
# Raw tweet text — the column that gets vectorized further down
df['Text']

0       ���⛪ @ Catedral de Santo Antônio - Governador ...
1       � @ Governador Valadares, Minas Gerais https:/...
2       �� @ Governador Valadares, Minas Gerais https:...
3                             ��� https://t.co/BnDsO34qK0
4       ��� PSOL vai questionar aumento de vereadores ...
                              ...                        
8194    Trio é preso suspeito de roubo, tráfico e abus...
8195    Trio é preso suspeito de roubo, tráfico e abus...
8196    Trio é preso suspeito de roubo, tráfico e abus...
8197    Trio é preso suspeito de roubo, tráfico e abus...
8198    Trio suspeito de roubo de cargas é preso em Sa...
Name: Text, Length: 8199, dtype: object

In [None]:
import nltk
# Fetch the NLTK stop-word corpus into the local nltk_data folder
nltk.download('stopwords')
# Portuguese stop-word list, handed to the vectorizer in the next step
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rooneycoelho/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder that ignores the Portuguese stop words
vectorizer = CountVectorizer(stop_words=stopwords)

# Learn the vocabulary and build the document-term count matrix in one step.
# NOTE(review): the vocabulary is fitted on the whole dataset, before the
# train/test split performed later, so test-set words shape the feature space.
vetores = vectorizer.fit_transform(df['Text'])
# Feature (token) name for each column of `vetores`
vocab = vectorizer.get_feature_names_out()



In [None]:
# Export the first 5 document vectors for inspection. The sparse matrix is
# sliced BEFORE densifying, so only 5 rows are materialized instead of a dense
# copy of the entire corpus.
pd.DataFrame(vetores[:5].toarray(), columns=vocab).to_excel('encoding.xlsx')

In [None]:
# Features: the document-term count matrix
X = vetores
# Target: integer codes for the classes; `label` holds the class name for each code
y, label = pd.factorize(df['Classificacao'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Create and train the classifier (multinomial Naive Bayes on the word counts)
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
# Predict the held-out samples
y_pred = clf.predict(X_test)

# Fraction of held-out tweets classified correctly.
# NOTE(review): the dataset preview shows several tweets with the same visible
# text; if copies land in both train and test this score would be inflated — confirm.
accuracy = accuracy_score(y_test, y_pred)
print("Acurácia:", accuracy)

Acurácia: 0.9567073170731707


## Persistindo o modelo

In [None]:
import pickle

# Serialize the trained classifier to disk.
# NOTE(review): only `clf` is saved; the fitted `vectorizer` needed to encode
# new text is not persisted here.
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)

Como carregar?


In [None]:
# Reload the classifier from disk; the context manager closes the file handle
# (a bare open() inside pickle.load() leaves it open).
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('model.pkl', 'rb') as f:
    model_persistido = pickle.load(f)

In [None]:
# Predict with the reloaded model on the same held-out samples
y_pred2 = model_persistido.predict(X_test)

# Accuracy of the reloaded model, for comparison with the in-memory classifier
accuracy = accuracy_score(y_test, y_pred2)
print("Acurácia:", accuracy)

Acurácia: 0.9567073170731707


*Testando nosso modelo em uma nova entrada:*

In [None]:
# Read a sentence interactively (this cell waits for keyboard input)
novo_texto = input("Digite um texto: ")
# Encode with the already-fitted vectorizer (transform only, no re-fit)
novo_texto_vetorizado = vectorizer.transform([novo_texto])
# NOTE(review): uses the in-memory `clf`, not the reloaded `model_persistido`
previsao = clf.predict(novo_texto_vetorizado)
# Map the predicted integer code back to its class name
label[previsao[0]]

'Negativo'

In [None]:
# Text of every tweet labelled 'Negativo' (single .loc lookup: row mask + column)
df.loc[df['Classificacao'] == 'Negativo', 'Text']

4       ��� PSOL vai questionar aumento de vereadores ...
8       "BB e governo de Minas travam disputa sobre de...
12      "erro desconhecido" é mato! Aliás, é da secret...
16      "Mesmo sem muito dinheiro no caixa o governo d...
25      "uso de aeronave pertencente ao  Estado de MG,...
                              ...                        
4974    Via @estadao: Governo de Minas Gerais compra m...
4978    Vídeo mostra Pimentel em  helicóptero buscando...
4981    Virou Noticia -  Banco do Brasil avisa Justiça...
4982    Virou Noticia -  Mesmo em calamidade financeir...
4989    Vooooaaaaa Minas! Voa nos helicópteros do Pime...
Name: Text, Length: 2446, dtype: object

In [None]:
# Text of every tweet labelled 'Positivo' (single .loc lookup: row mask + column)
df.loc[df['Classificacao'] == 'Positivo', 'Text']

65      #DefesaAgropecuária "Governo de Minas Gerais a...
66      #EBC Governo de Minas investiga casos suspeito...
87      #Minas vai dobrar oferta de trabalho e estudo ...
88      #Minas vai dobrar oferta de trabalho e estudo ...
108     #SUS Minas Gerais recebe recurso máximo dispon...
                              ...                        
8194    Trio é preso suspeito de roubo, tráfico e abus...
8195    Trio é preso suspeito de roubo, tráfico e abus...
8196    Trio é preso suspeito de roubo, tráfico e abus...
8197    Trio é preso suspeito de roubo, tráfico e abus...
8198    Trio suspeito de roubo de cargas é preso em Sa...
Name: Text, Length: 3300, dtype: object

---