In [None]:
import fasttext
from scipy import spatial
from sklearn import metrics

import os
import sys
import csv

from keras.layers import Input, Embedding, LSTM, Dense, Dot, Reshape, Flatten, Concatenate, Dropout
from keras.models import Model
import numpy as np

import tensorflow as tf
from sklearn.metrics import roc_auc_score
from collections import defaultdict

In [None]:
# Load the fastText binary model that supplies every word vector used below.
MODEL_PATH = '../models/ru_tg_lenta_vector_model.bin'
model = fasttext.load_model(MODEL_PATH)

In [None]:
# Sanity check: print the cosine distance between two word vectors.
tiger_vec = np.array(model.get_word_vector('tiger'))
elephant_vec = np.array(model.get_word_vector('elephant'))
print(spatial.distance.cosine(tiger_vec, elephant_vec))

In [None]:
import json
# Read the whole corpus into memory.  The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection,
# and the explicit encoding makes decoding independent of the machine's locale.
# NOTE(review): absolute, user-specific path -- consider a configurable data dir.
with open('/Users/ilya-gusev/data/ru_tg_texts.json', encoding='utf-8') as corpus_file:
    all_data = json.load(corpus_file)

In [None]:
# Order the records in ascending order of their 'date' field.
all_data = sorted(
    all_data,
    key=lambda record: record['date'],
)

In [None]:
# Peek at every 2000th record: its date, then its text flattened to one line.
for sample in all_data[::2000]:
    print(sample['date'])
    one_line_text = sample['text'].replace('\n', ' ')
    print(one_line_text)

In [None]:
# host -> (embedding, text) of the second half of the latest text seen for it
last_host_end = dict()

# Parallel lists: sample k is the pair (left[k], right[k]) with label y[k];
# texts_left[k] / texts_right[k] hold the matching raw text halves.
left, right = [], []
texts_left, texts_right = [], []
y = list()

def words_to_embed(model, words):
    """Embed a sequence of words as a single fixed-size vector.

    Every word vector is L2-normalised; the element-wise mean, maximum and
    minimum over all words are then concatenated in that order, so the result
    is three times as long as one word vector.

    Args:
        model: object exposing ``get_word_vector(word)`` that returns a 1-D
            numpy vector (here: the loaded fastText model).
        words: non-empty iterable of tokens.

    Returns:
        1-D numpy array laid out as ``[mean | max | min]``.

    Raises:
        ValueError: if ``words`` contains no tokens.
    """
    norm_vectors = []
    for word in words:
        vector = model.get_word_vector(word)
        norm = np.linalg.norm(vector)
        # An all-zero vector has no direction; dividing by its zero norm would
        # turn it into NaNs that poison the mean/max/min, so keep it as zeros.
        norm_vectors.append(vector / norm if norm > 0 else vector)

    if not norm_vectors:
        # Fail with a clear message instead of an opaque NumPy reduction error.
        raise ValueError("words_to_embed() requires at least one word")

    avg_wv = np.mean(norm_vectors, axis=0)
    max_wv = np.max(norm_vectors, axis=0)
    min_wv = np.min(norm_vectors, axis=0)
    return np.concatenate((avg_wv, max_wv, min_wv))

# Build the pair dataset, one text at a time, in the order of all_data:
#   * positive (y=1): first half of a text vs. second half of the same text;
#   * negative (y=0): first half of a text vs. second half of the previous
#     usable text from the same site, when there is one.
MIN_WORDS = 4    # texts with fewer words are skipped
MAX_WORDS = 300  # longer texts are cut to their first MAX_WORDS words

for count, row in enumerate(all_data):
    if count % 1000 == 0:
        print(count)  # coarse progress indicator

    host = row['site_name']
    # str.split() without a separator splits on any run of whitespace
    # (newlines included) and never yields empty strings, so no stripping,
    # newline replacement or empty-token filtering is needed beforehand.
    words = row['text'].split()
    if len(words) < MIN_WORDS:
        continue
    words = words[:MAX_WORDS]

    border = len(words) // 2
    begin_words = words[:border]
    end_words = words[border:]
    left_sample = words_to_embed(model, begin_words)
    right_sample = words_to_embed(model, end_words)
    begin_text = " ".join(begin_words)
    end_text = " ".join(end_words)

    left.append(left_sample)
    texts_left.append(begin_text)
    right.append(right_sample)
    texts_right.append(end_text)
    y.append(1)

    if host in last_host_end:
        prev_right_sample, prev_end_text = last_host_end[host]
        left.append(left_sample)
        texts_left.append(begin_text)
        right.append(prev_right_sample)
        texts_right.append(prev_end_text)
        y.append(0)

    # Remember this text's second half as the negative for the next text
    # from the same site.
    last_host_end[host] = (right_sample, end_text)

In [None]:
# Hold out the last `test_size` samples for evaluation; everything before
# them is used for training.
test_size = 30000
train_part = slice(None, -test_size)
test_part = slice(-test_size, None)

train_left, test_left = left[train_part], left[test_part]
train_right, test_right = right[train_part], right[test_part]
train_y, test_y = y[train_part], y[test_part]

texts_test_left = texts_left[test_part]
texts_test_right = texts_right[test_part]

In [None]:
# Baseline: score every test pair by the negated cosine distance between its
# two embeddings, so that closer pairs receive higher scores.
scores = [
    -spatial.distance.cosine(left_vec, right_vec)
    for left_vec, right_vec in zip(test_left, test_right)
]

In [None]:
# ROC AUC of the cosine-distance baseline on the held-out pairs.
metrics.roc_auc_score(y_true=test_y, y_score=scores)

In [None]:
# Siamese network: both inputs go through the SAME linear projection (the
# shared `dense` layer), the two projections are compared with a normalised
# dot product (Dot(normalize=True), i.e. cosine similarity), and a one-unit
# sigmoid layer rescales that similarity into a probability.
# The input width (150) has to equal the length of the vectors returned by
# words_to_embed.
left_input = Input(shape=(150,), dtype='float32')
right_input = Input(shape=(150,), dtype='float32')
dense = Dense(50, activation='linear')
left_dense = dense(left_input)
right_dense = dense(right_input)
similarity = Dot(axes=1, normalize=True)([left_dense, right_dense])
dot_layer = Dense(1, activation='sigmoid')(similarity)
# `outputs=` is the Keras 2 keyword; the singular `output=` is a legacy
# spelling that newer Keras releases no longer accept.
nn_model = Model(inputs=[left_input, right_input], outputs=dot_layer)

In [None]:
# Binary-classification set-up: Adam optimiser, cross-entropy loss, and
# accuracy reported during training.
nn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Import from the public `keras.callbacks` package; the nested
# `keras.callbacks.callbacks` module is an internal path and not part of the
# documented API.
from keras.callbacks import EarlyStopping

# Stop training once val_loss has failed to improve for 10 epochs in a row.
# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping is tuned on the same data the final metrics are computed on.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
nn_model.fit([np.array(train_left), np.array(train_right)],
             np.array(train_y),
             batch_size=256,
             epochs=100,
             callbacks=[es,],
             validation_data=([np.array(test_left), np.array(test_right)], np.array(test_y)),
             verbose=2)

In [None]:
# Run the trained model on every held-out pair.
test_inputs = [np.array(test_left), np.array(test_right)]
pred = nn_model.predict(test_inputs)

In [None]:
# Flatten the model output into plain Python floats, one score per test pair.
# Each row of `pred` holds a single value (the network ends in Dense(1)), so
# pick that element explicitly: calling float() on a 1-element array rather
# than on a scalar is deprecated in recent NumPy releases.
nn_scores = [float(row[0]) for row in pred]

In [None]:
# Quick length check, then eyeball up to the first 500 scored test pairs.
# NOTE(review): the last two numbers are the lengths of the full texts_left /
# texts_right lists, not of the test slices -- confirm that is intended.
print(len(nn_scores), len(test_y), len(texts_left), len(texts_right))

# Slicing (instead of indexing up to a fixed 500) keeps this cell from raising
# IndexError when fewer than 500 test pairs exist.
preview = zip(nn_scores[:500], test_y[:500], texts_test_left[:500], texts_test_right[:500])
for score, label, left_text, right_text in preview:
    print("===============")
    print(score, label)
    print(left_text)
    print("@@@")
    print(right_text)
    print("")

In [None]:
# ROC AUC of the trained network on the held-out pairs.
metrics.roc_auc_score(y_true=test_y, y_score=nn_scores)

In [None]:
import random

# Score random (left, right) pairings drawn from the test split and print the
# ones the model rates above 0.9.
n_trials = 10000
left_ids = [random.randrange(len(texts_test_left)) for _ in range(n_trials)]
right_ids = [random.randrange(len(texts_test_right)) for _ in range(n_trials)]

# One batched predict() call instead of one call per pair.  The result lives
# under its own name so the `pred` array from the full test run is not
# overwritten by this exploratory cell.
random_pair_pred = nn_model.predict([
    np.array([test_left[i] for i in left_ids]),
    np.array([test_right[j] for j in right_ids]),
])

for k, (i, j) in enumerate(zip(left_ids, right_ids)):
    # Keep a one-row slice so it prints like a single-pair prediction.
    pair_pred = random_pair_pred[k:k + 1]
    # Index the scalar explicitly; float() on an array is deprecated in NumPy.
    if float(pair_pred[0][0]) > 0.9:
        print('======')
        print(pair_pred)
        print(texts_test_left[i])
        print("@@@")
        print(texts_test_right[j])

In [None]:
# Stand-alone encoder: maps one input vector to its projection through the
# shared `dense` layer.  `outputs=` is the Keras 2 keyword; the singular
# `output=` is a legacy spelling that newer Keras releases no longer accept.
embedder = Model(inputs=[left_input, ], outputs=left_dense)

In [None]:
# Show one test example end to end: its text, its input vector, and the
# projection the encoder produces for it.
first_left = test_left[:1]
print(texts_test_left[0])
print(first_left)
print(embedder.predict(np.array(first_left)))

In [None]:
# Pull the trained parameters out of the shared projection layer: the first
# entry of get_weights() is kept as `matrix`, the second as `bias`.
matrix, bias = dense.get_weights()

In [None]:
# Dump the projection as plain text.  matrix.txt gets one comma-separated line
# per column of `matrix` (i.e. the matrix is written transposed); bias.txt
# gets one value per line.
with open("matrix.txt", "w") as matrix_file:
    for column in matrix.T:
        line = ",".join(str(float(weight)) for weight in column)
        matrix_file.write(line + "\n")

with open("bias.txt", "w") as bias_file:
    bias_file.writelines("{}\n".format(value) for value in bias)

In [None]:
# Display the weight matrix extracted from the shared `dense` layer.
matrix