In [2]:
import sys
import json
from collections import defaultdict
from url_normalize import url_normalize
from urlparse import urlparse
import numpy as np

def norm_url(url):
    """Normalise a URL and return it without scheme, together with its host.

    The URL is passed through ``url_normalize`` and parsed with ``urlparse``.
    The leading ``<scheme>://`` is dropped, and a single leading ``www.`` is
    stripped from both the URL and the host.

    Args:
        url: URL string to normalise.

    Returns:
        A ``(url, host)`` tuple: the normalised URL without its scheme and
        leading ``www.``, and the network location without leading ``www.``.
    """
    www_prefix = 'www.'
    parsed = urlparse(url_normalize(url))
    res = parsed.geturl()
    host = parsed.netloc

    # Strip the scheme only when it is really the prefix; a plain
    # str.replace could remove a "<scheme>://" that occurs later in the URL.
    scheme = "%s://" % parsed.scheme
    if res.startswith(scheme):
        res = res[len(scheme):]

    # Remove only the leading "www."; replacing every occurrence would also
    # mangle paths, queries and sub-domains that merely contain "www.".
    if res.startswith(www_prefix):
        res = res[len(www_prefix):]
    if host.startswith(www_prefix):
        host = host[len(www_prefix):]
    return res, host

# Counter of cross-host links pointing at known URLs.
count = defaultdict(int)

# Simple out_links that were parsed from JSON format on 1-3 archives.
data_files = []
for data_path in ('../pr_data/out_link_ru.json', '../pr_data/out_link_en.json'):
    # Context manager so each file handle is closed once its JSON is loaded.
    with open(data_path) as data_file:
        data_files.append(json.load(data_file))

# Normalised URLs and hosts of every crawled page (the 'url' field of a row).
url_set = set()
host_set = set()

# host_to_host[src][dst] -> number of links from host src to host dst.
host_to_host = defaultdict(lambda: defaultdict(int))
for data in data_files:
    for row in data:
        url = row['url']
        # The helper is named norm_url; the previous call to `norm` raised
        # NameError on a fresh kernel.
        url_n, url_host = norm_url(url)
        url_set.add(url_n)
        host_set.add(url_host)

In [3]:
# Second pass over the crawl: count cross-host links between known pages
# and accumulate the host-to-host link weights.
for dataset in data_files:
    for page in dataset:
        src_host = norm_url(page['url'])[1]
        for link in page['out_links']:
            link_n, link_host = norm_url(link)

            # Links that stay on the same host are ignored entirely.
            if src_host == link_host:
                continue
            if link_n in url_set:
                # NOTE(review): keyed by the raw link, not by link_n --
                # confirm this is intended.
                count[link] += 1
            if link_host in host_set:
                host_to_host[src_host][link_host] += 1

# Give every host that has outgoing links a dense integer index
# (h2i: host -> index, i2h: index -> host); size is the number of such hosts.
h2i = {}
i2h = {}
for idx, host in enumerate(host_to_host):
    h2i[host] = idx
    i2h[idx] = host
size = len(h2i)

In [4]:
# Uniform matrix: all ones, then every row scaled so that it sums to 1.
E = np.ones((size, size))
E /= E.sum(axis=1, keepdims=True)

In [5]:
# Dense link matrix: entry [src, dst] holds the link count from host src
# to host dst.
matrix = np.zeros((size, size))

for src_host, targets in host_to_host.items():
    for dst_host, n_links in targets.items():
        # Targets without an index (absent from h2i) are skipped.
        dst_idx = h2i.get(dst_host)
        if dst_idx is not None:
            matrix[h2i[src_host], dst_idx] = n_links

In [6]:
# Rows whose entries sum to zero are replaced by all ones, so that the
# row normalisation that follows never divides by zero.
dangling_rows = matrix.sum(axis=1) == 0
matrix[dangling_rows] = 1.0

In [7]:
# Scale every row to sum to 1, then blend with the uniform matrix E
# (weights 0.7 for the link structure and 0.3 for E).
# NOTE: this cell overwrites `matrix`, so it is not idempotent -- re-running
# it applies the blend a second time.
matrix = 0.7 * (matrix / matrix.sum(axis=1, keepdims=True)) + 0.3 * E

In [8]:
# Start vector: a single row of ones scaled so that it sums to 1.
x = np.ones((1, size))
x /= x.sum(axis=1, keepdims=True)

In [9]:
# Power iteration: apply the transition matrix to the start vector
# n_iterations times. Mathematically this equals
# x . matrix_power(matrix, n_iterations), but each step is a vector-matrix
# product (O(size^2)) instead of a dense matrix-matrix product (O(size^3)).
n_iterations = 20
y = x
for step in range(n_iterations):
    y = np.dot(y, matrix)

In [10]:
# One (index, score, host) triple per indexed host, ordered by descending
# score; the sort is stable, so equal scores keep their index order.
rating = [(i, y[0][i], i2h[i]) for i in range(size)]
rating.sort(key=lambda entry: -entry[1])

In [None]:
# Show score and host, one per line, in the same tab-separated format that
# is written to the rating file. A single formatted string prints the same
# under Python 2 and 3, whereas print(pr, host) prints a tuple repr under
# Python 2.
for i, pr, host in rating:
    print('%f\t%s' % (pr, host))

In [12]:
# Write the rating as tab-separated "score<TAB>host" lines. The context
# manager guarantees the file is flushed and closed when the cell finishes.
with open('../ratings/rating_merged.txt', 'w') as out:
    for i, pr, host in rating:
        out.write('%f\t%s\n' % (pr, host))