In [None]:
# Fetch the ru_tg_train dataset: drop any archive left over from a previous run
# (-f keeps rm quiet when there is none), then download a fresh copy.
!rm -f ru_tg_train.tar.gz
!wget https://www.dropbox.com/s/1ecl9orr2tagcgi/ru_tg_train.tar.gz
# Remove a previously extracted ru_tg_train.json before unpacking
# (presumably the file this archive contains -- the notebook opens it later).
!rm -f ru_tg_train.json
!tar -xzvf ru_tg_train.tar.gz
# The archive is not needed once it has been extracted.
!rm ru_tg_train.tar.gz

In [None]:
# Fetch the en_tg_train dataset: drop any archive left over from a previous run
# (-f keeps rm quiet when there is none), then download a fresh copy.
!rm -f en_tg_train.tar.gz
!wget https://www.dropbox.com/s/umd8tyx4wz1wquq/en_tg_train.tar.gz
# Remove a previously extracted en_tg_train.json before unpacking
# (presumably the file this archive contains -- the notebook opens it later).
!rm -f en_tg_train.json
!tar -xzvf en_tg_train.tar.gz
# The archive is not needed once it has been extracted.
!rm en_tg_train.tar.gz

In [None]:
!pip install url-normalize

In [None]:
import sys
import json
from url_normalize import url_normalize
from urllib.parse import urlparse   


def normalize_url(url):
    """Return a scheme-less, ``www.``-less form of ``url`` together with its host.

    The URL is first passed through ``url_normalize`` and then split with
    ``urlparse``.

    Args:
        url: The URL string to normalise.

    Returns:
        A ``(res, host)`` tuple where ``res`` is the normalised URL with the
        ``"<scheme>://"`` prefix and a leading ``"www."`` removed, and ``host``
        is the parsed ``netloc`` with a leading ``"www."`` removed.
    """
    www_prefix = 'www.'
    url_n = url_normalize(url)
    parsed = urlparse(url_n)
    scheme = "%s://" % parsed.scheme
    res = parsed.geturl().replace(scheme, '', 1)
    host = parsed.netloc
    if res.startswith(www_prefix):
        # Strip only the leading "www.". A blanket str.replace would also
        # remove "www." occurring later in the host, path or query
        # (e.g. "www.awww.com" would collapse to "acom").
        res = res[len(www_prefix):]
        if host.startswith(www_prefix):
            host = host[len(www_prefix):]
    return res, host

def _load_json(path):
    """Parse the JSON file at ``path`` and close the handle right away."""
    # Explicit UTF-8 so loading does not depend on the platform's locale
    # encoding; the context manager guarantees the file is closed.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# Both datasets are kept in one list so later cells can iterate over them alike.
data_files = [_load_json("en_tg_train.json"), _load_json("ru_tg_train.json")]

# Normalised URLs and hosts of every document in the datasets; used later to
# decide whether an outgoing link points at a known document / known host.
url_set = set()
host_set = set()
for data in data_files:
    for row in data:
        url = row["url"]
        url_n, url_host = normalize_url(url)
        url_set.add(url_n)
        host_set.add(url_host)

In [None]:
from collections import Counter, defaultdict

# count: how often each outgoing link (keyed by its raw text) points at a
#        document of another host whose normalised URL is in url_set.
# host_to_host: for each source host, how many links go to each other host
#        that is present in host_set.
# NOTE(review): membership is tested on the normalised link but `count` is
# keyed by the raw link string -- confirm this is intended.
count = Counter()
host_to_host = defaultdict(Counter)
for dataset in data_files:
    for record in dataset:
        _, src_host = normalize_url(record["url"])
        for target in record.get("out_links", ()):
            target_n, target_host = normalize_url(target)
            if src_host == target_host:
                # Links within the same host are ignored.
                continue
            if target_n in url_set:
                count[target] += 1
            if target_host in host_set:
                host_to_host[src_host][target_host] += 1

# Give every source host (a key of host_to_host) a dense integer index:
# i2h maps index -> host, h2i maps host -> index.
i2h = list(host_to_host)
h2i = {name: idx for idx, name in enumerate(i2h)}

In [None]:
import numpy as np

# Number of indexed hosts; it fixes the dimension of every matrix below.
size = len(i2h)
# Uniform matrix: start from all ones and divide each row by its sum,
# so every row adds up to 1.
E = np.full((size, size), 1.0)
E /= E.sum(axis=1, keepdims=True)
print(E)

In [None]:
# matrix[i, j] holds the number of links from host i to host j.
matrix = np.zeros((size, size))

for src_host, targets in host_to_host.items():
    for dst_host, n_links in targets.items():
        dst_idx = h2i.get(dst_host)
        if dst_idx is None:
            # Target hosts without an index of their own are left out.
            continue
        matrix[h2i[src_host], dst_idx] = n_links
print(matrix)

In [None]:
# Rows whose entries sum to zero are replaced by a row of ones, so the
# row-wise normalisation that follows never divides by zero.
zero_rows = matrix.sum(axis=1) == 0
matrix[zero_rows] = 1.0
print(matrix)

In [None]:
# Scale every row to sum to 1, then blend with the uniform matrix E
# (weights 0.7 for the link structure and 0.3 for E).
# Note: `matrix` is rebound from itself, so running this cell a second time
# applies the blend to the already blended matrix.
row_totals = matrix.sum(axis=1, keepdims=True)
matrix = np.divide(matrix, row_totals)
matrix = matrix * 0.7 + E * 0.3
print(matrix)

In [None]:
# Starting row vector: equal weight on every host, entries summing to 1.
x = np.full((1, size), 1.0)
x /= x.sum(axis=1, keepdims=True)
print(x)

In [None]:
# Number of power-iteration steps applied to the starting vector.
N_ITERATIONS = 20

# Power iteration: multiply the row vector by the matrix N_ITERATIONS times.
# This is mathematically the same as x @ matrix**N_ITERATIONS but needs only
# vector-matrix products instead of materialising a dense matrix power.
y = x
for _ in range(N_ITERATIONS):
    y = np.dot(y, matrix)
print(y)

In [None]:
# One (index, score, host) entry per host, listed from highest to lowest score.
rating = [(idx, y[0][idx], name) for idx, name in enumerate(i2h)]
rating.sort(key=lambda entry: -entry[1])
for _, score, name in rating:
    print(score, name)