# Семинар 3: Представления слов: продолжение

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%%writefile requirements.txt
gensim
pandas
razdel
sklearn
allennlp
torch==1.4

Writing requirements.txt


In [None]:
!pip install -r requirements.txt

Collecting razdel
  Downloading https://files.pythonhosted.org/packages/15/2c/664223a3924aa6e70479f7d37220b3a658765b9cfe760b4af7ffdc50d38f/razdel-0.5.0-py3-none-any.whl
Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/2c/49/bf0ec241496a82c9dd2f0b6ff6f8156b6b2b72b849df8c00a4f2bcf61485/allennlp-1.0.0-py3-none-any.whl (473kB)
[K     |████████████████████████████████| 481kB 11.1MB/s 
[?25hCollecting torch==1.4
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 20kB/s 
Collecting transformers<2.12,>=2.9
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 52.5MB/s 
[?25hCollecting jsonnet>=0.10.0; sys_platform != 

In [None]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
!gzip -d lenta-ru-news.csv.gz
!head -n 2 lenta-ru-news.csv

--2020-08-12 14:24:11--  https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
Resolving github.com (github.com)... 52.74.223.119
Connecting to github.com (github.com)|52.74.223.119|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200812%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200812T142411Z&X-Amz-Expires=300&X-Amz-Signature=dc3fe392879bcbd391e3b6dcdeec6bd1ca62b55b2fe739cf18b40e770c626578&X-Amz-SignedHeaders=host&actor_id=0&repo_id=87156914&response-content-disposition=attachment%3B%20filename%3Dlenta-ru-news.csv.gz&response-content-type=application%2Foctet-stream [following]
--2020-08-12 14:24:12--  https://github-production-release-asset-2e65be.s3.amazonaws.com/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMA

In [None]:
import pandas as pd
import torch
import re
import datetime as dt
from razdel import tokenize, sentenize
from string import punctuation

def get_date(url):
    dates = re.findall(r"\d\d\d\d\/\d\d\/\d\d", url)
    return next(iter(dates), None)

dataset = pd.read_csv("lenta-ru-news.csv", sep=',', quotechar='\"', escapechar='\\', encoding='utf-8', header=0)
dataset["date"] = dataset["url"].apply(lambda x: dt.datetime.strptime(get_date(x), "%Y/%m/%d"))
dataset = dataset[dataset["date"] > "2017-01-01"]
dataset["text"] = dataset["text"].apply(lambda x: x.replace("\xa0", " "))
dataset["title"] = dataset["title"].apply(lambda x: x.replace("\xa0", " "))
train_dataset = dataset[dataset["date"] < "2018-04-01"]
test_dataset = dataset[dataset["date"] > "2018-04-01"]

texts = []
for text in train_dataset["text"]:
    for sentence in sentenize(text):
        texts.append([token.text.lower() for token in tokenize(sentence.text) if token.text not in punctuation])
    
for title in train_dataset["title"]:
    texts.append([token.text.lower() for token in tokenize(title) if token.text not in punctuation])

assert len(texts) == 827217
assert len(texts[0]) > 0
assert texts[0][0].islower()
print(texts[0])

['возобновление', 'нормального', 'сотрудничества', 'между', 'россией', 'и', 'нато', 'невозможно', 'пока', 'москва', 'не', 'будет', 'соблюдать', 'нормы', 'международного', 'права']


## Предобработка и батчинг

In [None]:
from collections import Counter

class Vocabulary:
    def __init__(self):
        self.word2index = {
            "<unk>": 0
        }
        self.index2word = ["<unk>"]

    def build(self, texts, min_count=5):
        words_counter = Counter(token for tokens in texts for token in tokens)
        #self.vocab = self.word2index.items()
        for word, count in words_counter.most_common():
            if count >= min_count:
                self.word2index[word] = len(self.word2index)
        self.index2word = [word for word, _ in sorted(self.word2index.items(), key=lambda x: x[1])]
        self.vocab = self.word2index.keys()
    
    @property
    def size(self):
        return len(self.index2word)
    
    def top(self, n=100):
        return self.index2word[1:n+1]
    
    def get_index(self, word):
        return self.word2index.get(word, 0)
    
    def get_word(self, index):
        return self.index2word[index]

    def vocab(self):
        return self.vocab

vocabulary = Vocabulary()
vocabulary.build(texts)
assert vocabulary.word2index[vocabulary.index2word[10]] == 10
print(vocabulary.size)
print(vocabulary.top(100))

112084
['в', 'и', 'на', '«', '»', 'что', 'с', 'по', '—', 'не', 'из', 'этом', 'об', 'о', 'он', 'за', 'года', 'россии', 'к', 'его', 'для', 'как', 'также', 'от', 'а', 'это', 'сообщает', 'до', 'году', 'после', 'сша', 'у', 'во', 'время', 'был', 'при', 'заявил', 'со', 'словам', 'рублей', 'будет', 'ее', 'она', 'но', 'ранее', 'их', 'они', 'было', 'тысяч', 'более', 'того', 'том', 'мы', 'были', 'я', 'которые', 'все', 'который', 'человек', 'под', '2016', 'из-за', 'лет', '2017', 'украины', 'марта', 'процентов', 'чтобы', 'долларов', 'глава', 'президент', 'этого', 'отметил', 'же', 'сказал', 'так', 'января', 'или', 'страны', 'ру', 'то', 'еще', 'области', 'данным', 'была', 'президента', 'около', 'сообщил', 'февраля', 'однако', 'компании', 'может', 'уже', 'один', 'рассказал', 'только', 'процента', '1', '10', 'июня']


In [None]:
def build_contexts(tokenized_texts, vocabulary, window_size):
    contexts = []
    for tokens in tokenized_texts:
        for i in range(len(tokens)):
            central_word = vocabulary.get_index(tokens[i])
            context = [vocabulary.get_index(tokens[i + delta]) for delta in range(-window_size, window_size + 1) 
                       if delta != 0 and i + delta >= 0 and i + delta < len(tokens)]
            if len(context) != 2 * window_size:
                continue

            contexts.append((central_word, context))
            
    return contexts

contexts = build_contexts(texts, vocabulary, window_size=2)
print(contexts[:5])
print(vocabulary.get_word(contexts[0][0]), [vocabulary.get_word(index) for index in contexts[0][1]])

[(1568, [17232, 26343, 135, 371]), (135, [26343, 1568, 371, 2]), (371, [1568, 135, 2, 695]), (2, [135, 371, 695, 2140]), (695, [371, 2, 2140, 216])]
сотрудничества ['возобновление', 'нормального', 'между', 'россией']


In [None]:
import random
import numpy as np

def get_next_batch(contexts, window_size, batch_size, epochs_count):
    assert batch_size % (window_size * 2) == 0
    central_words, contexts = zip(*contexts)
    batch_size //= (window_size * 2)
    
    for epoch in range(epochs_count):
        indices = np.arange(len(contexts))
        np.random.shuffle(indices)
        batch_begin = 0
        while batch_begin < len(contexts):
            batch_indices = indices[batch_begin: batch_begin + batch_size]
            batch_contexts, batch_centrals = [], []
            for data_ind in batch_indices:
                central_word, context = central_words[data_ind], contexts[data_ind]
                batch_contexts.extend(context)
                batch_centrals.extend([central_word] * len(context))
                
            batch_begin += batch_size
            # torch? что за torch?
            yield torch.cuda.LongTensor(batch_contexts), torch.cuda.LongTensor(batch_centrals)

## Модель и обучение

In [None]:
import torch.nn as nn
import torch.optim as optim 
import time

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32):
        super().__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        projections = self.embeddings.forward(inputs)
        output = self.out_layer.forward(projections)
        return output
      

model = SkipGramModel(vocabulary.size, 32)

device = torch.device("cuda")
model = model.to(device)

loss_every_nsteps = 1000
total_loss = 0
start_time = time.time()
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss().cuda()

for step, (batch_contexts, batch_centrals) in enumerate(get_next_batch(contexts, window_size=2, batch_size=512, epochs_count=5)):
    logits = model(batch_centrals) # Прямой проход
    loss = loss_function(logits, batch_contexts) # Подсчёт ошибки
    loss.backward() # Подсчёт градиентов dL/dw
    optimizer.step() # Градиентный спуск или его модификации (в данном случае Adam)
    optimizer.zero_grad() # Зануление градиентов, чтобы их спокойно менять на следующей итерации

    total_loss += loss.item()
    if step != 0 and step % loss_every_nsteps == 0:
        print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / loss_every_nsteps, time.time() - start_time))
        total_loss = 0
        start_time = time.time()


embeddings = model.embeddings.weight.cpu().data.numpy()

In [None]:
with open('embs1.npy', 'ab') as f:
  np.save(f, embeddings)

## Базовые проверки

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def most_similar(embeddings, vocabulary, word):
    word_emb = embeddings[vocabulary.get_index(word)]
    
    similarities = cosine_similarity([word_emb], embeddings)[0]
    top10 = np.argsort(similarities)[-10:]
    
    return [vocabulary.get_word(index) for index in reversed(top10)]

most_similar(embeddings, vocabulary, 'путин')

['путин',
 'мединский',
 'сафронов',
 'аристархов',
 'колокольцев',
 'семашко',
 'колычев',
 'гройсман',
 'президент',
 'брынзак']

In [None]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    output_notebook()
    
    if isinstance(color, str): 
        color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: 
        pl.show(fig)
    return fig


def get_tsne_projection(word_vectors):
    tsne = TSNE(n_components=2, verbose=100, n_iter=500)
    return scale(tsne.fit_transform(word_vectors))

def get_pca_projection(word_vectors):
    pca = PCA(n_components=2)
    return scale(pca.fit_transform(word_vectors))
    
    
def visualize_embeddings(embeddings, vocabulary, word_count, method="pca"):
    word_vectors = embeddings[1: word_count + 1]
    words = vocabulary.index2word[1: word_count + 1]
    get_projections = get_pca_projection if method == "pca" else get_tsne_projection
    projections = get_projections(word_vectors)
    draw_vectors(projections[:, 0], projections[:, 1], color='green', token=words)
    
    
visualize_embeddings(embeddings, vocabulary, 1000)



### Задание 1: Рубрикация: самописный word2vec

Проверьте, как модель выше работает в задаче рубрикации

In [None]:
target_labels = set(train_dataset["topic"].dropna().tolist())
target_labels -= {'Бизнес', 'Крым', 'Культпросвет'}
target_labels = list(target_labels)

In [None]:
pattern = r'(\b{}\b)'.format('|'.join(target_labels))

train_with_topics = train_dataset[train_dataset["topic"].str.contains(pattern, case=False, na=False)]
train_with_topics = train_with_topics
test_with_topics = test_dataset[test_dataset["topic"].str.contains(pattern, case=False, na=False)]

  return func(self, *args, **kwargs)


In [None]:
from razdel import tokenize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def get_text_embedding(model, embeddings, vocabulary, phrase):
    embeddings = np.array([embeddings[vocabulary.get_index(word.text.lower())]
                           if word.text.lower() in vocabulary.vocab
                           else np.zeros((model.embeddings.embedding_dim,))
                           for word in tokenize(phrase)])
    return np.mean(embeddings, axis=0)

In [None]:
y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], model.embeddings.embedding_dim))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(model, embeddings, vocabulary, embedding)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], model.embeddings.embedding_dim))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(model, embeddings, vocabulary, embedding)

In [None]:
%%time

cls = LogisticRegression(multi_class='ovr', penalty='l2', class_weight='balanced', verbose=1)
cls.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 11.4 s, sys: 5.84 s, total: 17.3 s
Wall time: 8.82 s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    8.8s finished


In [None]:
y_pred = cls.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.67      0.54      1663
           1       0.57      0.60      0.58      2447
           2       0.64      0.43      0.51      4324
           3       0.63      0.65      0.64      1182
           4       0.35      0.45      0.39       847
           5       0.17      0.67      0.27       258
           6       0.74      0.71      0.73      3185
           7       0.72      0.68      0.70      1995
           8       0.77      0.64      0.70      4291
           9       0.69      0.62      0.65      2156
          11       0.79      0.74      0.76      2119
          12       0.91      0.91      0.91      3429
          13       0.54      0.82      0.65      2191
          14       0.83      0.62      0.71      1177

    accuracy                           0.66     31264
   macro avg       0.63      0.66      0.62     31264
weighted avg       0.69      0.66      0.67     31264



In [None]:
from razdel import tokenize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def get_text_embedding(model, embeddings, vocabulary, phrase):
    embeddings = np.array([embeddings[vocabulary.get_index(word.text.lower())]
                           if word.text.lower() in vocabulary.vocab
                           else np.zeros((model.embeddings.embedding_dim,))
                           for word in tokenize(phrase)])
    return np.mean(embeddings, axis=0)

In [None]:
y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], model.embeddings.embedding_dim))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(model, embeddings, vocabulary, embedding)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], model.embeddings.embedding_dim))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(model, embeddings, vocabulary, embedding)

In [None]:
%%time

cls = LogisticRegression(multi_class='ovr', penalty='l2', class_weight='balanced', verbose=1)
cls.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 11.4 s, sys: 5.84 s, total: 17.3 s
Wall time: 8.82 s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    8.8s finished


In [None]:
y_pred = cls.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.67      0.54      1663
           1       0.57      0.60      0.58      2447
           2       0.64      0.43      0.51      4324
           3       0.63      0.65      0.64      1182
           4       0.35      0.45      0.39       847
           5       0.17      0.67      0.27       258
           6       0.74      0.71      0.73      3185
           7       0.72      0.68      0.70      1995
           8       0.77      0.64      0.70      4291
           9       0.69      0.62      0.65      2156
          11       0.79      0.74      0.76      2119
          12       0.91      0.91      0.91      3429
          13       0.54      0.82      0.65      2191
          14       0.83      0.62      0.71      1177

    accuracy                           0.66     31264
   macro avg       0.63      0.66      0.62     31264
weighted avg       0.69      0.66      0.67     31264



### Задание 2: Самописный CBoW

Сделайте аналогичную модель, но в архитектуре CBoW

In [None]:
import torch.nn as nn
import torch.optim as optim 
import time

class CBoWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=32):
        super().__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        projections = self.embeddings.forward(inputs)
        output = self.out_layer.forward(projections)
        return output
      

cbow = CBoWModel(vocabulary.size, 32)

device = torch.device("cuda")
cbow = cbow.to(device)

loss_every_nsteps = 1000
total_loss = 0
start_time = time.time()
optimizer = optim.Adam(cbow.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss().cuda()

for step, (batch_contexts, batch_centrals) in enumerate(get_next_batch(contexts, window_size=2, batch_size=512, epochs_count=5)):
    logits = cbow(batch_contexts) # Прямой проход
    loss = loss_function(logits, batch_centrals) # Подсчёт ошибки
    loss.backward() # Подсчёт градиентов dL/dw
    optimizer.step() # Градиентный спуск или его модификации (в данном случае Adam)
    optimizer.zero_grad() # Зануление градиентов, чтобы их спокойно менять на следующей итерации

    total_loss += loss.item()
    if step != 0 and step % loss_every_nsteps == 0:
        print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / loss_every_nsteps, time.time() - start_time))
        total_loss = 0
        start_time = time.time()

Step = 348000, Avg Loss = 7.9585, Time = 10.48s
Step = 349000, Avg Loss = 7.9414, Time = 10.48s
Step = 350000, Avg Loss = 7.9569, Time = 10.48s
Step = 351000, Avg Loss = 7.9220, Time = 10.48s
Step = 352000, Avg Loss = 7.9376, Time = 10.49s
Step = 353000, Avg Loss = 7.9542, Time = 10.49s
Step = 354000, Avg Loss = 7.9314, Time = 10.49s
Step = 355000, Avg Loss = 7.9428, Time = 10.49s
Step = 356000, Avg Loss = 7.9330, Time = 10.48s
Step = 357000, Avg Loss = 7.9404, Time = 10.48s
Step = 358000, Avg Loss = 7.9651, Time = 10.49s
Step = 359000, Avg Loss = 7.9418, Time = 10.49s
Step = 360000, Avg Loss = 7.9287, Time = 10.48s
Step = 361000, Avg Loss = 7.9502, Time = 10.49s
Step = 362000, Avg Loss = 7.9466, Time = 10.49s


In [None]:
embs = cbow.embeddings.weight.cpu().data.numpy()

In [None]:
with open('embs2.npy', 'ab') as f:
  np.save(f, embeddings)

In [None]:
most_similar(embs, vocabulary, 'путин')

['путин',
 'мединский',
 'гройсман',
 'президент',
 'колокольцев',
 'сафронков',
 'жириновский',
 'лавров',
 'брынзак',
 'бухаров']

### Задание 3*: Negative Sampling

Реализуйте negative sampling вместо полного softmax'а

In [None]:
%%time
import numpy as np
def build_contexts(tokenized_texts, vocabulary, window_size):

    contexts = []
    for tokens in tokenized_texts:
        for i in range(len(tokens)):
            central_word = vocabulary.get_index(tokens[i])
            context = [vocabulary.get_index(tokens[i + delta]) for delta in range(-window_size, window_size + 1) 
                       if delta != 0 and i + delta >= 0 and i + delta < len(tokens)]
            if len(context) != 2 * window_size:
                continue

            neg_samples = np.random.randint(0, vocabulary.size-1, size=window_size*2).tolist()

            contexts.append((central_word, context, [neg_samples]))
            
    return contexts

contexts = build_contexts(texts, vocabulary, window_size=2)

CPU times: user 3min 44s, sys: 5.55 s, total: 3min 50s
Wall time: 3min 49s


In [None]:
print(contexts[:2])

[(1568, [17232, 26343, 135, 371], [[71024, 24798, 102218, 83042]]), (135, [26343, 1568, 371, 2], [[54581, 37165, 93694, 33600]])]


In [None]:
def get_batch(context, window_size, batch_size, epochs_count):
    assert batch_size % (window_size * 2) == 0
    central_words, pos, neg = zip(*context)
    batch_size //= (window_size * 2)
    
    for epoch in range(epochs_count):
        indices = np.arange(len(pos))
        np.random.shuffle(indices)
        batch_begin = 0
        while batch_begin < len(pos):
            batch_indices = indices[batch_begin: batch_begin + batch_size]
            batch_centrals, batch_pos, batch_neg = [], [], []
            for data_ind in batch_indices:
                central_word, pos_words, neg_words = central_words[data_ind], pos[data_ind], neg[data_ind]
                batch_pos.extend(pos_words)
                batch_centrals.extend([central_word] * len(pos_words))
                batch_neg.extend(neg_words * len(pos_words))
                
            batch_begin += batch_size
            yield torch.cuda.LongTensor(batch_centrals), torch.cuda.LongTensor(batch_pos), torch.cuda.LongTensor(batch_neg)

In [None]:
import torch.nn as nn
import torch.optim as optim 
import time
import torch.nn.functional as F

class SkipGramNeg(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(SkipGramNeg, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        score = torch.mul(emb_target, emb_context).squeeze()
        score = torch.sum(score, dim=1)
        out = F.logsigmoid(score)
        
        emb_negative = self.embeddings_context(negative_example)
        neg_score = torch.bmm(emb_negative, emb_target.unsqueeze(2)).squeeze(2)
        neg_score = torch.sum(neg_score, dim=1)
        neg_out = F.logsigmoid(-1 * neg_score)
        return -1 * torch.mean(out + neg_out)

In [None]:
model = SkipGramNeg(32, vocabulary.size)

optimizer = optim.Adam(model.parameters())

device = torch.device("cuda")
model = model.to(device)

loss_every_nsteps = 1000
total_loss = 0
start_time = time.time()

for step, (batch_centrals, batch_pos, batch_neg) in enumerate(get_batch(contexts, window_size=2, batch_size=512, epochs_count=10)):

    loss = model(batch_centrals, batch_pos, batch_neg) 
    loss.backward() 
    optimizer.step()
    optimizer.zero_grad()

    total_loss += loss
    if step != 0 and step % loss_every_nsteps == 0:
        print("Step = {}, Avg Loss = {:.4f}, Time = {:.2f}s".format(step, total_loss / loss_every_nsteps, time.time() - start_time))
        total_loss = 0
        start_time = time.time()

In [None]:
embs = model.embeddings_context.weight.cpu().data.numpy()

In [None]:
most_similar(embs, vocabulary, 'путин')

['путин',
 'правительство',
 'неоднократно',
 'также',
 'активисты',
 'считает',
 'сергей',
 'его',
 'сроки',
 'был']

In [None]:
from razdel import tokenize
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def get_text_embedding(model, embeddings, vocabulary, phrase):
    embeddings = np.array([embeddings[vocabulary.get_index(word.text.lower())]
                           if word.text.lower() in vocabulary.vocab
                           else np.zeros((model.embeddings_context.embedding_dim,))
                           for word in tokenize(phrase)])
    return np.mean(embeddings, axis=0)

In [None]:
y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], model.embeddings_context.embedding_dim))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = get_text_embedding(model, embs, vocabulary, embedding)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], model.embeddings_context.embedding_dim))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = get_text_embedding(model, embs, vocabulary, embedding)

In [None]:
%%time

cls = LogisticRegression(multi_class='ovr', penalty='l2', class_weight='balanced', verbose=1)
cls.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


CPU times: user 17 s, sys: 9.6 s, total: 26.6 s
Wall time: 13.7 s


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:   13.7s finished


In [None]:
y_pred = cls.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.64      0.65      2119
           1       0.87      0.84      0.85      3429
           2       0.39      0.46      0.42      2447
           3       0.61      0.47      0.53      4291
           4       0.50      0.49      0.50      2156
           5       0.40      0.55      0.46      1663
           6       0.17      0.26      0.21       847
           7       0.85      0.58      0.69      1177
           9       0.44      0.23      0.30      4324
          10       0.49      0.75      0.60      2191
          11       0.65      0.53      0.58      1995
          12       0.52      0.49      0.51      1182
          13       0.14      0.53      0.22       258
          14       0.01      0.50      0.01         8
          15       0.64      0.63      0.64      3185

    accuracy                           0.53     31272
   macro avg       0.49      0.53      0.48     31272
weighted avg       0.57   

# Unsupervised targets
У пословных моделей есть ряд проблем. Основная - в разных контекстах у одинаковых токенов будут одинаковые представления. Кроме того, наивные Skip-gram и CBoW не учитывают порядок токенов в контексте. 

Как извлечь информацию из сырых текстов? Чему должны учиться модели, из которых мы получим наши представления?

1.   **Skip-gram**
2.   **CBoW**
3.   LM: language modeling (ELMo, ULMFiT)
4.   NSP: next sentence prediction (BERT, в модификациях иногда убирается)
5.   MLM: masked language modeling (BERT, основной таргет)




# Языковые модели



Языковое моделирование - довольно древняя и понятная задача. Статистичская языковая модель (statistical language model) - вероятностное распределение над последовательностями слов $$P(w_1,...,w_n)$$

Другая постановка:
$$P(w_n | w_1,...,w_{n-1}) = P(w_n|w_1^{n-1})$$

N-граммные модели:

$$P(w_n|w_1^{n-1}) \approx P(w_n|w_{n-N+1}^{n-1})$$

## Пример N-граммной модели

In [None]:
class NGramModel:
    def __init__(self, vocabulary, n=4):
        self.n = n
        self.n_grams = [Counter() for _ in range(n+1)]
        self.vocabulary = vocabulary
    
    def collect_n_grams(self, tokens):
        indices = [vocabulary.get_index(token) for token in tokens]
        count = len(indices)
        for n in range(self.n + 1):
            for i in range(min(count - n + 1, count)):
                n_gram = indices[i:i+n]
                self.n_grams[n][tuple(n_gram)] += 1
                
    def normalize(self):
        for n in range(self.n, 0, -1):
            current_n_grams = self.n_grams[n]
            for words, count in current_n_grams.items():
                prev_order_n_gram_count = self.n_grams[n-1][words[:-1]]
                current_n_grams[words] = count / prev_order_n_gram_count
        self.n_grams[0][tuple()] = 1.0
    
    def predict(self, context):
        indices = [vocabulary.get_index(token) for token in context]
        context = tuple(indices[-self.n + 1:])
        step_probabilities = np.zeros((self.vocabulary.size, ), dtype=np.float64)
        for shift in range(self.n):
            current_n = self.n - shift
            wanted_context_length = current_n - 1
            if wanted_context_length > len(context):
                continue
            start_index = len(context) - wanted_context_length
            wanted_context = context[start_index:]
            
            s = 0.0
            for index in range(self.vocabulary.size):
                n_gram = wanted_context + (index,)
                p = self.n_grams[current_n].get(n_gram, 0)
                step_probabilities[index] = p
                s += p
            if s != 0.0:
                break
        return step_probabilities

vocabulary.word2index["<eos>"] = vocabulary.size
vocabulary.index2word.append("<eos>")
n_gram_model = NGramModel(vocabulary)
for text in texts[:1000]:
    n_gram_model.collect_n_grams(text + ["<eos>"])
n_gram_model.normalize()

In [None]:
seed = ["путин"]
while seed[-1] != "<eos>":
    proba = n_gram_model.predict(seed)
    seed.append(np.random.choice(vocabulary.index2word, size=1, p=proba)[0])
    print(seed)

['путин', 'не']
['путин', 'не', 'вышел']
['путин', 'не', 'вышел', 'к']
['путин', 'не', 'вышел', 'к', 'митингующим']
['путин', 'не', 'вышел', 'к', 'митингующим', 'после']
['путин', 'не', 'вышел', 'к', 'митингующим', 'после', 'пожара']
['путин', 'не', 'вышел', 'к', 'митингующим', 'после', 'пожара', 'в']
['путин', 'не', 'вышел', 'к', 'митингующим', 'после', 'пожара', 'в', 'кемерове']
['путин', 'не', 'вышел', 'к', 'митингующим', 'после', 'пожара', 'в', 'кемерове', '<eos>']


## ELMo (Embeddings from Language Models)

Оригинальная статья: https://arxiv.org/pdf/1802.05365.pdf

The Illustrated BERT, ELMo and co.: http://jalammar.github.io/illustrated-bert/

Как применить?

In [None]:
!wget http://vectors.nlpl.eu/repository/11/195.zip
!mkdir elmo && mv 195.zip elmo/195.zip && cd elmo && unzip 195.zip && rm 195.zip && cd ..
!ls elmo

--2020-08-12 14:27:43--  http://vectors.nlpl.eu/repository/11/195.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.225
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.225|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 206977021 (197M) [application/zip]
Saving to: ‘195.zip’


2020-08-12 14:28:15 (6.55 MB/s) - ‘195.zip’ saved [206977021/206977021]

Archive:  195.zip
  inflating: meta.json               
  inflating: model.hdf5              
  inflating: options.json            
  inflating: README                  
  inflating: vocab.txt               
meta.json  model.hdf5  options.json  README  vocab.txt


In [32]:
from allennlp.modules.elmo import Elmo, batch_to_ids
import gc
import numpy as np

def create_embeddings(text, alone_sequence = False):
  elmo = Elmo(options_file="elmo/options.json", weight_file="elmo/model.hdf5", num_output_representations=2).cuda()
  if alone_sequence == True:
    character_ids = batch_to_ids([text]).cuda()
  else:
    character_ids = batch_to_ids(text).cuda()
  embeddings = elmo(character_ids)['elmo_representations'][0].detach().cpu().numpy()
  del character_ids
  del elmo
  gc.collect()
  return np.mean(embeddings, axis=1)

In [None]:
data_batches = [texts[i:i+40] for i in range(0,len(texts[:80]), 40)]

In [None]:
%%time
embeddings = [create_embeddings(batch) for batch in data_batches]

### Задание 4: Рубрикация: ELMo

Проверьте, как ELMo работает в задаче рубрикации

In [None]:
y_train = train_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_train = np.zeros((train_with_topics.shape[0], embeddings[0].shape[1]))
for i, embedding in enumerate(train_with_topics["text"]):
    X_train[i, :] = create_embeddings(embedding, alone_sequence = True)

y_test = test_with_topics["topic"].apply(lambda x: target_labels.index(x)).to_numpy()
X_test = np.zeros((test_with_topics.shape[0], embeddings[0].shape[1]))
for i, embedding in enumerate(test_with_topics["text"]):
    X_test[i, :] = create_embeddings(embedding, alone_sequence = True)

In [None]:
%%time

cls = LogisticRegression(multi_class='ovr', penalty='l2', class_weight='balanced', verbose=1)
cls.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 11.4 s, sys: 5.84 s, total: 17.3 s
Wall time: 8.82 s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    8.8s finished


In [None]:
y_pred = cls.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.67      0.54      1663
           1       0.57      0.60      0.58      2447
           2       0.64      0.43      0.51      4324
           3       0.63      0.65      0.64      1182
           4       0.35      0.45      0.39       847
           5       0.17      0.67      0.27       258
           6       0.74      0.71      0.73      3185
           7       0.72      0.68      0.70      1995
           8       0.77      0.64      0.70      4291
           9       0.69      0.62      0.65      2156
          11       0.79      0.74      0.76      2119
          12       0.91      0.91      0.91      3429
          13       0.54      0.82      0.65      2191
          14       0.83      0.62      0.71      1177

    accuracy                           0.66     31264
   macro avg       0.63      0.66      0.62     31264
weighted avg       0.69      0.66      0.67     31264

