# Натянуть сову на линейное пространство

Создайте эмбеддинги слов и визуализируйте векторные операции над ними: сложение, вычитание, взятие ближайшего, дальнейшего и прочее. Сравните качество представлений gensim и BERT с точки зрения операций над словами, докажите примерами.

Для создания эмбеддингов с gensim обучите модель на нормализованных текстовых данных. Данные найдите на kaggle или выберите один из предложенных датасетов. Для создания эмбеддингов с BERT используйте предобученные модели.

Предлагаемые датасеты:
 - [sentiment твитов про ковид](https://www.kaggle.com/datatattle/covid-19-nlp-text-classification)
 - [Amazon product reviews](https://www.kaggle.com/kashnitsky/hierarchical-text-classification) - этот
 - [Отзывы интернет-магазина](https://www.kaggle.com/shymammoth/shopee-reviews)
 - [Тексты статей конференции NIPS](https://www.kaggle.com/rowhitswami/nips-papers-1987-2019-updated?select=papers.csv)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

df = pd.read_csv('unlabeled_150k.csv')
df.head(5)

Unnamed: 0,Title,Text
0,Proraso Proraso Liquid Cream After-Shave 3.4oz,I do not know what they were thinking with all...
1,"PondCare 169J Algae Fix, 2-1/2 Gallon",I have used Algae Fix once a week in my pond a...
2,Philips Norelco Bodygroom Shaver,Norelco got it right with their new Bodygroom....
3,CHAMPION Neoprene Knee Support with Open Patella,"very comfortable support, at least compared to..."
4,Melissa & Doug Baby Zoo Animals Stamp Set,The stamps are a little smaller than I thought...


## Нормализация

In [2]:
from nltk.corpus import stopwords
import nltk

In [3]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet

nltk.download('omw-1.4')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package omw-1.4 to /home/jovvik/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovvik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
nltk.download('stopwords')
eng_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/jovvik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def preprocess_text(text: str) -> str:
    text = text.lower()
    def is_allowed_char(c): return c.isalpha() or c == ' '
    text = ''.join(list(filter(is_allowed_char, text)))

    def is_stopword(word): return word not in eng_stopwords
    text = ' '.join(list(filter(is_stopword, text.split())))

    text = ' '.join(list(map(lemmatizer.lemmatize, text.split())))
    return text

from tqdm import tqdm
tqdm.pandas()
df = df.Text.progress_apply(preprocess_text)
df.head()

100%|██████████| 150000/150000 [00:18<00:00, 8067.31it/s]


0    know thinking reformulations excellent product...
1    used algae fix week pond kept algae check unli...
2    norelco got right new bodygroom shaver promise...
3    comfortable support least compared elastic one...
4    stamp little smaller thought would others stat...
Name: Text, dtype: object

## Gensim

In [6]:
from gensim.models import Word2Vec



In [7]:
reviews = [i.split() for i in df]
word_count = {}
for sentence in reviews:
    for word in sentence:
        word_count[word] = (word_count.get(word) or 0) + 1
words_by_freq = [i[0] for i in sorted(word_count.items(), key=lambda x: x[1])][::-1]

In [8]:
gmodel = Word2Vec(sentences=reviews,
                  sg=False,  # cbow model
                  vector_size=100,
                  window=5,
                  seed=0,
                  epochs=15,
                  min_count=1,
                  workers=8)
print('размерность векторов слов в модели:', gmodel.wv.vectors.shape)

размерность векторов слов в модели: (154349, 100)


In [9]:
sample_words = ["ship", "man", "order"]
first = lambda l : [x[0] for x in l]
print("Самые похожие")
for word in sample_words:
    print(word, first(gmodel.wv.similar_by_word(word)))
print("Самые непохожие")
for word in sample_words:
    print(word, first(gmodel.wv.similar_by_vector(-gmodel.wv[word])))
print("Сумма/произведение")
from itertools import product
for w1, w2 in product(sample_words, sample_words):
    if w1 == w2:
        continue
    print(f"{w1} + {w2}",
          first(gmodel.wv.similar_by_vector(gmodel.wv[w1] + gmodel.wv[w2], topn=1)))
    print(f"{w1} * {w2}",
          first(gmodel.wv.similar_by_vector(gmodel.wv[w1] * gmodel.wv[w2], topn=1)))
print(first(gmodel.wv.similar_by_vector(
    gmodel.wv["child"] + gmodel.wv["man"])))
print(first(gmodel.wv.similar_by_vector(
    gmodel.wv["son"] - gmodel.wv["man"])))
print(first(gmodel.wv.similar_by_word("awesome")))


Самые похожие
ship ['shipped', 'shipping', 'shipper', 'send', 'arrive', 'fee', 'mailed', 'sent', 'sh', 'sending']
man ['guy', 'lady', 'woman', 'gentleman', 'gay', 'dude', 'men', 'ya', 'supergirl', 'bos']
order ['reorder', 'ordering', 'purchase', 'buy', 'ordered', 'shipment', 'return', 'iparcel', 'cancel', 'stock']
Самые непохожие
ship ['commandreward', 'detailfirst', 'jawpropping', 'exfoliating', 'struggled', 'someithing', 'breastmilkformula', 'oilbased', 'washcloth', 'productsfelt']
man ['visco', 'woil', 'eyesbut', 'achiveve', 'ruinedmy', 'provided', 'bridgetunnel', 'factmy', 'blanketeven', 'minimal']
order ['withstands', 'kidspros', 'ribbit', 'realisticlooking', 'musclesin', 'deetz', 'prmotes', 'sratchng', 'resilience', 'glorifies']
Сумма/произведение
ship + man ['ship']
ship * man ['oldests']
ship + order ['order']
ship * order ['ethic']
man + ship ['ship']
man * ship ['oldests']
man + order ['order']
man * order ['releaved']
order + ship ['order']
order * ship ['ethic']
order + man

In [10]:
top_words = words_by_freq[:3000]
from tensorboardX import SummaryWriter
with SummaryWriter() as writer:
    writer.add_embedding(gmodel.wv[top_words], metadata=top_words)

In [12]:
gmodel.wv[top_words].shape

(3000, 100)

## Bert
https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/

In [6]:
import torch
from transformers import BertTokenizer, BertModel

In [7]:
bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
bert.eval()
bertTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def mostSimilar(vec):
    similarities = [(i, torch.cosine_similarity(vec, i[1], 0).item())
                    for i in precomp_top]
    res = sorted(similarities, key=lambda x: x[1])[::-1]
    return [i[0][0] for i in res[:10]]

def toVec(word):
    toks = bertTokenizer.tokenize("[CLS] " + word + " [SEP]")
    itoks = bertTokenizer.convert_tokens_to_ids(toks)
    toks_tensor = torch.tensor([itoks])
    segments_ids = [1] * len(itoks)
    segments_tensor = torch.tensor([segments_ids])
    with torch.no_grad():
        outputs = bert(toks_tensor, segments_tensor)
        embedding = outputs[2]
        tvecs = embedding[-2][0]
        return torch.mean(tvecs, dim=0)

def mostSimilarByWord(word):
    return mostSimilar(toVec(word))

In [11]:
TOP_SIZE = 5000
top = words_by_freq[:TOP_SIZE]
precomp_top = list(tqdm(zip(top, map(toVec, top)), total=TOP_SIZE))

100%|██████████| 5000/5000 [01:41<00:00, 49.42it/s]


In [17]:
sample_words = ["ship", "man", "order"]
def first(l): return [x[0] for x in l]
def second(l): return [x[1] for x in l]
from itertools import product

print("Самые похожие")
for word in sample_words:
    print(word, mostSimilarByWord(word))
print("Самые непохожие")
for word in sample_words:
    print(word, mostSimilar(toVec(word) * (-1)))
print("Сумма/произведение")
for w1, w2 in product(sample_words, sample_words):
    if w1 == w2:
        continue
    print(f"{w1} + {w2}", mostSimilar(toVec(w1) + toVec(w2))[0])
    print(f"{w1} * {w2}", mostSimilar(toVec(w1) * toVec(w2))[0])
print(mostSimilar(toVec("child") + toVec("man")))
print(mostSimilar(toVec("son") - toVec("man")))
print(mostSimilarByWord("awesome"))

Самые похожие
ship ['ship', 'planet', 'vehicle', 'gift', 'glass', 'movement', 'crate', 'pirate', 'elephant', 'spirit']
man ['man', 'woman', 'guy', 'monster', 'alpha', 'scratch', 'suit', 'collar', 'earl', 'thicker']
order ['order', 'ordering', 'ordered', 'pattern', 'count', 'charge', 'suggestion', 'demand', 'movement', 'shipment']
Самые непохожие
ship ['articulation', 'underarms', 'assist', 'moisturizer', 'hadnt', 'oatmeal', 'draw', 'shouldnt', 'darn', 'ovulation']
man ['articulation', 'oatmeal', 'chamomile', 'moisturizer', 'melatonin', 'cholesterol', 'assist', 'peppermint', 'underarms', 'ovulation']
order ['articulation', 'chamomile', 'oatmeal', 'moisturizer', 'peppermint', 'underarms', 'diabetic', 'antioxidant', 'assist', 'dishwasher']
Сумма/произведение
ship + man man
ship * man articulation
ship + order ship
ship * order articulation
man + ship man
man * ship articulation
man + order man
man * order articulation
order + ship ship
order * ship articulation
order + man man
order * man

In [20]:
from tensorboardX import SummaryWriter
import numpy as np

with SummaryWriter() as writer:
    writer.add_embedding(
        np.stack(second(precomp_top)),
        metadata=np.stack(first(precomp_top))
    )