In [None]:
!pip install fasttext pyonmttok

In [None]:
!pip install --upgrade keras tensorflow-gpu

In [None]:
# Fetch the ru_tg_train archive and unpack it. Any archive left over from an
# earlier run is deleted first so the cell can be re-run from a clean state.
!rm -f ru_tg_train.tar.gz
!wget https://www.dropbox.com/s/1ecl9orr2tagcgi/ru_tg_train.tar.gz
# Presumably the archive contains ru_tg_train.json (the file opened later in this
# notebook); the stale copy is removed before extraction -- confirm archive layout.
!rm -f ru_tg_train.json
!tar -xzvf ru_tg_train.tar.gz
# The archive itself is not needed once it has been extracted.
!rm ru_tg_train.tar.gz

In [None]:
# Fetch the gzip-compressed Lenta.Ru news CSV (release v1.0) and decompress it.
# A leftover archive from an earlier run is deleted before downloading.
!rm -f lenta-ru-news.csv.gz
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
# Remove a previously decompressed CSV, presumably so that gzip does not stop on an
# already existing output file when the cell is re-run.
!rm -f lenta-ru-news.csv
!gzip -d lenta-ru-news.csv.gz

In [None]:
# Download the ru_vectors_v2.bin file that is loaded as the fastText model in the next cell.
# NOTE(review): unlike the dataset cells, no existing copy is removed first, so re-running
# this cell downloads the file again next to the old one -- confirm this is intended.
!wget https://www.dropbox.com/s/2nx97d8nzbzusee/ru_vectors_v2.bin

In [None]:
import fasttext

# Load the fastText model from ru_vectors_v2.bin; `model` is used as a notebook-wide
# global by get_samples() when it builds word-vector embeddings.
model = fasttext.load_model('ru_vectors_v2.bin')

In [None]:
import json

# Load the ru_tg_train records. The encoding is given explicitly so that reading
# does not depend on the platform's default locale (JSON interchange is UTF-8).
with open("ru_tg_train.json", "r", encoding="utf-8") as json_file:
    tg_data = json.load(json_file)
# Order the records by timestamp: later cells pair each document with the previous
# one from the same site and hold out the tail of the list for testing.
tg_data.sort(key=lambda record: record['timestamp'])

In [None]:
import csv
import re

def get_date(url):
    """Return the first ``dddd/dd/dd`` digit group found in ``url``.

    Returns ``None`` when the URL contains no such fragment.
    """
    match = re.search(r"\d\d\d\d\/\d\d\/\d\d", url)
    if match is None:
        return None
    return match.group(0)

# Parse the Lenta.Ru CSV into records with the keys get_samples() reads
# ("text", "site_name") plus the publication date extracted from the URL.
# The encoding is explicit so reading does not depend on the platform default.
with open("lenta-ru-news.csv", "r", encoding="utf-8") as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    # Skip the header through the reader itself (tolerating an empty file).
    next(reader, None)
    lenta_data = []
    for row in reader:
        # Five columns are expected; only the first (URL) and third (text) are used.
        url, _, text, _, _ = row
        date = get_date(url)
        lenta_data.append({"date": date, "text": text, "site_name": "lenta"})

# get_date() returns None for URLs without a date; comparing None with str raises
# TypeError during sorting, so undated rows are keyed as "" and end up first.
lenta_data.sort(key=lambda record: record["date"] or "")

In [None]:
# Sanity check: number of records parsed from the Lenta CSV.
len(lenta_data)

In [None]:
def words_to_embed(model, words):
    """Pool the word vectors of ``words`` into one fixed-size embedding.

    Every word vector is scaled to unit L2 norm; the element-wise mean, maximum
    and minimum over all words are then concatenated in that order.

    Args:
        model: object exposing ``get_word_vector(word)`` that returns a 1-D
            numpy array (here: the loaded fastText model).
        words: non-empty iterable of words.

    Returns:
        1-D numpy array three times as long as a single word vector.

    Raises:
        ValueError: if ``words`` is empty.
    """
    vectors = [model.get_word_vector(w) for w in words]
    if not vectors:
        # The max/min reductions below have no identity; fail with a clear message.
        raise ValueError("words_to_embed() requires at least one word")

    norm_vectors = []
    for vector in vectors:
        norm = np.linalg.norm(vector)
        # An all-zero vector has norm 0; dividing by it would yield NaNs that
        # poison every pooled component, so such a vector is kept as zeros.
        norm_vectors.append(vector / norm if norm > 0 else vector)

    avg_wv = np.mean(norm_vectors, axis=0)
    max_wv = np.max(norm_vectors, axis=0)
    min_wv = np.min(norm_vectors, axis=0)
    return np.concatenate((avg_wv, max_wv, min_wv))

In [None]:
import pyonmttok
# Shared tokenizer instance used by preprocess(): "conservative" mode with
# joiner annotation switched off.
tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=False)

def preprocess(text):
    """Normalise ``text`` and return its tokens joined by single spaces.

    The input is coerced to ``str``, stripped, has newlines and non-breaking
    spaces replaced by plain spaces, and is lower-cased before being passed
    to the notebook-level ``tokenizer``.
    """
    cleaned = str(text).strip()
    cleaned = cleaned.replace("\n", " ")
    cleaned = cleaned.replace("\xa0", " ")
    cleaned = cleaned.lower()
    pieces, _ = tokenizer.tokenize(cleaned)
    return " ".join(pieces)

In [None]:
import numpy as np

def get_samples(data, count, min_words=4, max_words=300):
    """Build labelled (first half, second half) pairs from the first ``count`` records.

    Each record's text is preprocessed, truncated to ``max_words`` tokens and split
    in the middle. Two kinds of samples are produced:

    * positive (label 1): both halves of the same document;
    * negative (label 0): the first half of the current document paired with the
      second half of the previous kept document from the same ``site_name``.

    Uses the notebook-level ``model``, ``preprocess`` and ``words_to_embed``.

    Args:
        data: sequence of dicts with at least ``'site_name'`` and ``'text'`` keys.
        count: number of leading records of ``data`` to process.
        min_words: documents with fewer tokens are skipped. Must be at least 2 so
            that both halves are non-empty.
        max_words: tokens beyond this limit are dropped before splitting.

    Returns:
        List of tuples ``(left_vector, right_vector, left_text, right_text, label)``.
    """
    last_host_end = {}
    samples = []
    # A separate loop index is used so the `count` parameter is not overwritten.
    for index, row in enumerate(data[:count]):
        if index % 10000 == 0:
            print(index)

        host = row['site_name']
        words = preprocess(row['text']).split(" ")
        if len(words) < min_words:
            continue
        words = words[:max_words]

        border = len(words) // 2
        begin_words = words[:border]
        end_words = words[border:]

        left_vector = words_to_embed(model, begin_words)
        left_text = " ".join(begin_words)
        right_vector = words_to_embed(model, end_words)
        right_text = " ".join(end_words)

        samples.append((left_vector, right_vector, left_text, right_text, 1))
        if host in last_host_end:
            prev_vector, prev_text = last_host_end[host]
            samples.append((left_vector, prev_vector, left_text, prev_text, 0))
        # Remember this document's ending as the negative partner for the next
        # document from the same site.
        last_host_end[host] = (right_vector, right_text)
    return samples

# Build pair samples from at most the first 100000 records of each corpus.
# Both lists were sorted by time/date above, so samples keep that order.
tg_samples = get_samples(tg_data, 100000)
lenta_samples = get_samples(lenta_data, 100000)

In [None]:
# Hold out the last tenth of each corpus' samples for testing; the rest is training data.
tg_test_size = len(tg_samples) // 10
lenta_test_size = len(lenta_samples) // 10
# Split on explicit indices: with a hold-out size of 0, `x[-0:]` is the whole list
# and `x[:-0]` is empty, which would silently swap the train and test sets.
tg_split = len(tg_samples) - tg_test_size
lenta_split = len(lenta_samples) - lenta_test_size
tg_test_samples = tg_samples[tg_split:]
train_samples = tg_samples[:tg_split] + lenta_samples[:lenta_split]
test_samples = tg_test_samples + lenta_samples[lenta_split:]

In [None]:
from sklearn import metrics
from scipy import spatial

# Baseline without any training: score each test pair by the negated cosine
# distance between its two pooled embeddings and report ROC AUC.
test_y = [label for _, _, _, _, label in test_samples]
scores = [
    -spatial.distance.cosine(left, right)
    for left, right, _, _, _ in test_samples
]
metrics.roc_auc_score(test_y, scores)

In [None]:
from keras.layers import Input, Dense, Dot
from keras.models import Model

# Siamese scorer: both inputs go through the same linear projection, the
# normalised dot product of the two projections is computed, and a single
# sigmoid unit rescales it into a probability.
# The input size must equal the length of the vectors built by words_to_embed()
# (mean, max and min pooling concatenated) -- presumably 3 x 50 here; confirm
# against the loaded fastText model.
left_input = Input(shape=(150,), dtype='float32')
right_input = Input(shape=(150,), dtype='float32')
# One Dense instance applied to both sides, so the weights are shared.
dense = Dense(50, activation='linear')
left_dense = dense(left_input)
right_dense = dense(right_input)
dot_layer = Dense(1, activation='sigmoid')(Dot(axes=1, normalize=True)([left_dense, right_dense]))
# `outputs=` is the keyword of the Keras 2+ functional API; the legacy singular
# `output=` spelling is rejected by current releases.
nn_model = Model(inputs=[left_input, right_input], outputs=dot_layer)

In [None]:
from keras import optimizers
# `learning_rate` is the current name of the Adam step-size argument; the legacy
# `lr` alias is deprecated and rejected by recent Keras releases.
# NOTE(review): 0.3 is far above Adam's usual default -- confirm it is intentional.
nn_model.compile(optimizer=optimizers.Adam(learning_rate=0.3), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping
# Stop training when the validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)

# Each sample is (left_vector, right_vector, left_text, right_text, label);
# only the vectors and the label are needed for training.
train_left = [sample[0] for sample in train_samples]
train_right = [sample[1] for sample in train_samples]
train_y = [sample[4] for sample in train_samples]

test_left = [sample[0] for sample in test_samples]
test_right = [sample[1] for sample in test_samples]
test_y = [sample[4] for sample in test_samples]

# The test split doubles as the validation set that drives early stopping.
train_inputs = [np.array(train_left), np.array(train_right)]
validation = ([np.array(test_left), np.array(test_right)], np.array(test_y))

nn_model.fit(
    train_inputs,
    np.array(train_y),
    batch_size=64,
    epochs=100,
    callbacks=[es],
    validation_data=validation,
    verbose=2,
)

In [None]:
# Evaluate the learned projection alone on the held-out tg pairs.
# The embedder exposes the shared Dense projection; since `dense` is applied to
# both inputs with the same weights, it can embed left and right vectors alike.
# `outputs=` is the Keras 2+ keyword; the legacy `output=` is rejected by current releases.
embedder = Model(inputs=[left_input, ], outputs=left_dense)
tg_test_left = []
tg_test_right = []
test_y = []
for sample in tg_test_samples:
    tg_left, tg_right, _, _, y = sample
    tg_test_left.append(tg_left)
    tg_test_right.append(tg_right)
    test_y.append(y)
pred_left = embedder.predict([np.array(tg_test_left)])
pred_right = embedder.predict([np.array(tg_test_right)])
scores = []
for left, right in zip(pred_left, pred_right):
    # Cosine similarity of the two projected vectors ...
    left = left / np.linalg.norm(left)
    right = right / np.linalg.norm(right)
    # ... shifted from [-1, 1] to [-1, 0]; the mapping is monotonic, so the
    # ROC AUC is the same as for the raw cosine.
    score = (left.dot(right) + 1.0) / 2.0 - 1.0
    scores.append(score)
metrics.roc_auc_score(test_y, scores)

In [None]:
# The shared projection layer holds two weight arrays: the kernel matrix
# followed by the bias vector.
matrix, bias = dense.get_weights()

In [None]:
# Write the projection as text. matrix.txt gets the transposed kernel: one line
# per column of `matrix`, values comma-separated. bias.txt gets one value per line.
with open("matrix.txt", "w") as w:
    for unit_weights in matrix.T:
        w.write(",".join(str(float(weight)) for weight in unit_weights) + "\n")

with open("bias.txt", "w") as w:
    w.writelines("{}\n".format(value) for value in bias)