In [1]:
import pandas as pd 
import numpy as np
import torch
import tensorflow as tf
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import os    
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import matplotlib.pyplot as plt

In [2]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [3]:
# Pretrained BERT encoder. output_hidden_states=True asks the model to also
# return the hidden states (i.e. the embedding of every token).
model = BertModel.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    output_hidden_states=True,
)

# Token ids collected by make_sentence_embedding, in the order they were embedded.
tokenz = list()
# Not populated by any of the visible cells.
emb_to_word = {}

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def make_sentence_embedding(text: str) -> torch.Tensor:
    """Embed every token of ``text`` with the global BERT ``model``.

    The text is split with ``tokenizer.tokenize`` and the resulting ids are fed
    to the model as a single-sentence batch; no extra tokens are inserted by
    this function, so row ``k`` of the result belongs to the k-th token of
    ``text``.

    Side effect: the token ids are appended to the global ``tokenz`` list.
    Callers that stack the returned rows rely on this to map a row index back
    to a token id, so the append happens on every call (queries included).

    Args:
        text: Text to embed.

    Returns:
        The last entry of the model's hidden states for the single batch item:
        one row per token. For text that yields no tokens an empty tensor with
        ``model.config.hidden_size`` columns is returned instead of crashing.
        The result is detached from autograd (computed under ``no_grad``).
    """
    # Token ids of the text.
    tokenized_query = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_query)
    tokenz.extend(indexed_tokens)

    # An empty id list would become an empty float tensor that the embedding
    # lookup cannot consume, so answer with "zero rows" directly.
    if not indexed_tokens:
        return torch.empty((0, model.config.hidden_size))

    # Batch of one sentence; every position is a real token, so the mask is all ones.
    tokens_tensor = torch.tensor([indexed_tokens])
    attention_mask = torch.ones_like(tokens_tensor)

    # Inference only: skip building the autograd graph so stored embeddings
    # do not keep the whole forward pass alive in memory.
    with torch.no_grad():
        # The second positional argument of the model is the attention mask,
        # so it is passed by keyword to make that explicit.
        outputs = model(tokens_tensor, attention_mask=attention_mask)

    # outputs[2] holds the per-layer hidden states (the model is loaded with
    # output_hidden_states=True); take the last layer of the only batch item.
    embeddings = outputs[2]
    return embeddings[-1][0]

In [5]:
# Read the tweet dataset from the CSV file into a DataFrame.
clean_data = pd.read_csv(filepath_or_buffer='data/Corona_NLP_train.csv')

In [6]:
# Preview the first five rows.
clean_data.head(n=5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [7]:
comms = clean_data['OriginalTweet']
words_embs = []
# make_sentence_embedding appends token ids to the global `tokenz`. Reset it
# together with `words_embs`, otherwise re-running this cell (or running it
# after a query cell) leaves stale ids in front and tokenz[i] no longer
# describes words_embs[i].
tokenz.clear()

# Only the first 300 tweets are embedded.
for text in tqdm(comms[:300]):
    # One entry per token: iterating the returned tensor yields its rows.
    words_embs.extend(make_sentence_embedding(text))

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:48<00:00,  2.75it/s]


In [8]:
# Pack the collected per-token vectors into one tensor, one row per token.
texts_embs = torch.stack(tuple(words_embs), dim=0)

texts_embs.shape

torch.Size([17150, 768])

In [9]:
def similarity_word(word_emb, top_k=10):
    """Return the corpus tokens most similar to ``word_emb``.

    Cosine similarity is computed between ``word_emb`` and every row of the
    global ``texts_embs``; row indices are mapped back to tokens through the
    global ``tokenz`` id list and ``tokenizer``.

    Args:
        word_emb: Query embedding (a single vector with as many elements as
            ``texts_embs`` has columns).
        top_k: Maximum number of results; capped at the number of rows in
            ``texts_embs`` so small corpora do not raise.

    Returns:
        List of ``(token, similarity)`` pairs ordered from most to least
        similar; ``similarity`` is a 0-dim tensor.
    """
    # Use the argument (not a global query) and make it a (1, dim) row so it
    # broadcasts against every corpus row.
    similarities = torch.nn.functional.cosine_similarity(
        word_emb.reshape(1, -1), texts_embs, dim=1
    )
    k = max(0, min(top_k, similarities.numel()))
    best = torch.topk(similarities, k)

    return [
        (tokenizer.convert_ids_to_tokens(tokenz[row]), similarities[row])
        for row in best.indices.tolist()
    ]

def non_similarity_word(word_emb, top_k=10):
    """Return the corpus tokens least similar to ``word_emb``.

    Cosine similarity is computed between ``word_emb`` and every row of the
    global ``texts_embs``; row indices are mapped back to tokens through the
    global ``tokenz`` id list and ``tokenizer``.

    Args:
        word_emb: Query embedding (a single vector with as many elements as
            ``texts_embs`` has columns).
        top_k: Maximum number of results; capped at the number of rows in
            ``texts_embs`` so small corpora do not raise.

    Returns:
        List of ``(token, similarity)`` pairs ordered from least to most
        similar; ``similarity`` is a 0-dim tensor.
    """
    # Use the argument (not a global query) and make it a (1, dim) row so it
    # broadcasts against every corpus row.
    similarities = torch.nn.functional.cosine_similarity(
        word_emb.reshape(1, -1), texts_embs, dim=1
    )
    k = max(0, min(top_k, similarities.numel()))
    worst = torch.topk(similarities, k, largest=False)

    return [
        (tokenizer.convert_ids_to_tokens(tokenz[row]), similarities[row])
        for row in worst.indices.tolist()
    ]

In [10]:
# Query: the last row of the embedding returned for "corona".
corona_token_embs = make_sentence_embedding("corona")
query_emb = corona_token_embs[-1]

similarity_word(word_emb=query_emb)

[('stop', tensor(0.4631, grad_fn=<SelectBackward0>)),
 ('##virus', tensor(0.4522, grad_fn=<SelectBackward0>)),
 ('#', tensor(0.4497, grad_fn=<SelectBackward0>)),
 ('corona', tensor(0.4490, grad_fn=<SelectBackward0>)),
 ('corona', tensor(0.4473, grad_fn=<SelectBackward0>)),
 ('rate', tensor(0.4470, grad_fn=<SelectBackward0>)),
 ('corona', tensor(0.4467, grad_fn=<SelectBackward0>)),
 ('#', tensor(0.4464, grad_fn=<SelectBackward0>)),
 ('##virus', tensor(0.4428, grad_fn=<SelectBackward0>)),
 ('##virus', tensor(0.4425, grad_fn=<SelectBackward0>))]

In [11]:
# Opposite end of the ranking for the same query.
non_similarity_word(word_emb=query_emb)

[('with', tensor(0.0276, grad_fn=<SelectBackward0>)),
 ("'", tensor(0.0382, grad_fn=<SelectBackward0>)),
 ('58', tensor(0.0401, grad_fn=<SelectBackward0>)),
 (',', tensor(0.0445, grad_fn=<SelectBackward0>)),
 ('given', tensor(0.0467, grad_fn=<SelectBackward0>)),
 ("'", tensor(0.0498, grad_fn=<SelectBackward0>)),
 (',', tensor(0.0506, grad_fn=<SelectBackward0>)),
 ('clean', tensor(0.0520, grad_fn=<SelectBackward0>)),
 ('.', tensor(0.0564, grad_fn=<SelectBackward0>)),
 (',', tensor(0.0579, grad_fn=<SelectBackward0>))]

In [12]:
# Query: sum of the last-row embeddings of three separately embedded terms.
# NOTE(review): "peopl" looks like a truncated "people" — confirm it is intentional.
virus_emb, kill_emb, peopl_emb = (
    make_sentence_embedding(term)[-1] for term in ("virus", "kill", "peopl")
)
query_emb = virus_emb + kill_emb + peopl_emb

similarity_word(word_emb=query_emb)

[('#', tensor(0.5198, grad_fn=<SelectBackward0>)),
 ('read', tensor(0.5190, grad_fn=<SelectBackward0>)),
 ('#', tensor(0.5077, grad_fn=<SelectBackward0>)),
 ('#', tensor(0.5072, grad_fn=<SelectBackward0>)),
 ('please', tensor(0.5037, grad_fn=<SelectBackward0>)),
 ('this', tensor(0.5033, grad_fn=<SelectBackward0>)),
 ('thread', tensor(0.5011, grad_fn=<SelectBackward0>)),
 ('##out', tensor(0.4985, grad_fn=<SelectBackward0>)),
 ('#', tensor(0.4983, grad_fn=<SelectBackward0>)),
 ('#', tensor(0.4961, grad_fn=<SelectBackward0>))]

In [13]:
# Opposite end of the ranking for the summed query.
non_similarity_word(word_emb=query_emb)

[('.', tensor(0.0263, grad_fn=<SelectBackward0>)),
 (':', tensor(0.0399, grad_fn=<SelectBackward0>)),
 ("'", tensor(0.0456, grad_fn=<SelectBackward0>)),
 ('corona', tensor(0.0461, grad_fn=<SelectBackward0>)),
 (',', tensor(0.0472, grad_fn=<SelectBackward0>)),
 ("'", tensor(0.0518, grad_fn=<SelectBackward0>)),
 (':', tensor(0.0527, grad_fn=<SelectBackward0>)),
 ("'", tensor(0.0536, grad_fn=<SelectBackward0>)),
 (',', tensor(0.0554, grad_fn=<SelectBackward0>)),
 ("'", tensor(0.0555, grad_fn=<SelectBackward0>))]

In [14]:
# Query: embedding of "stop" minus embedding of "corona" (last row of each).
stop_emb = make_sentence_embedding("stop")[-1]
corona_emb = make_sentence_embedding("corona")[-1]
query_emb = stop_emb - corona_emb

similarity_word(word_emb=query_emb)

[('clean', tensor(0.2183, grad_fn=<SelectBackward0>)),
 ('@', tensor(0.2135, grad_fn=<SelectBackward0>)),
 ('my', tensor(0.1977, grad_fn=<SelectBackward0>)),
 ('i', tensor(0.1913, grad_fn=<SelectBackward0>)),
 ('19', tensor(0.1913, grad_fn=<SelectBackward0>)),
 ('.', tensor(0.1891, grad_fn=<SelectBackward0>)),
 ('bing', tensor(0.1824, grad_fn=<SelectBackward0>)),
 ('korea', tensor(0.1802, grad_fn=<SelectBackward0>)),
 ('i', tensor(0.1793, grad_fn=<SelectBackward0>)),
 ('##sh', tensor(0.1767, grad_fn=<SelectBackward0>))]

In [15]:
# Opposite end of the ranking for the difference query.
non_similarity_word(word_emb=query_emb)

[('/', tensor(-0.3043, grad_fn=<SelectBackward0>)),
 (':', tensor(-0.2963, grad_fn=<SelectBackward0>)),
 ('/', tensor(-0.2959, grad_fn=<SelectBackward0>)),
 ('/', tensor(-0.2861, grad_fn=<SelectBackward0>)),
 ('/', tensor(-0.2841, grad_fn=<SelectBackward0>)),
 (':', tensor(-0.2826, grad_fn=<SelectBackward0>)),
 (':', tensor(-0.2819, grad_fn=<SelectBackward0>)),
 ('/', tensor(-0.2798, grad_fn=<SelectBackward0>)),
 (':', tensor(-0.2784, grad_fn=<SelectBackward0>)),
 ('/', tensor(-0.2765, grad_fn=<SelectBackward0>))]