In [2]:
import sys
import json
from collections import defaultdict
from url_normalize import url_normalize
from urlparse import urlparse
import numpy as np

def norm(url):
    """Normalize ``url`` and return ``(url_without_scheme, host)``.

    The URL is passed through ``url_normalize``, then the leading
    ``<scheme>://`` is dropped.  When what remains starts with ``www.``,
    that prefix is stripped from it and from the host.

    Only the *leading* ``www.`` is removed.  A ``www.`` that appears later
    (in a sub-domain, the path or the query string) is part of the address
    and is kept, so distinct URLs are not merged by accident.

    :param url: URL string to normalize.
    :return: tuple ``(res, host)`` -- the normalized URL without its scheme,
        and the ``netloc`` component of the normalized URL.
    """
    prefix = 'www.'
    url_n = url_normalize(url)
    parsed = urlparse(url_n)
    scheme = "%s://" % parsed.scheme
    # Remove just the first "<scheme>://"; later occurrences (e.g. a URL
    # embedded in the query string) must stay.
    res = parsed.geturl().replace(scheme, '', 1)
    host = parsed.netloc
    if res.startswith(prefix):
        # Slice the prefix off instead of str.replace(), which would delete
        # every "www." found anywhere in the string.
        res = res[len(prefix):]
        if host.startswith(prefix):
            host = host[len(prefix):]
    return res, host

# Per-link counter; it is incremented later, while the out-links are scanned.
count = defaultdict(int)

# Out-link records that were previously extracted to JSON (archives 1-3).
# Every row is read for its 'url' key here and its 'out_links' key later on.
data_files = []
for path in ('../pr_data/out_link_ru.json', '../pr_data/out_link_en.json'):
    # The with-block closes each file right after parsing instead of
    # leaving the handle open for the lifetime of the kernel.
    with open(path) as json_file:
        data_files.append(json.load(json_file))

# Normalized URLs and hosts of every crawled page (the "known" universe).
url_set = set()
host_set = set()

# host_to_host[src][dst] -> number of links from host src to host dst.
host_to_host = defaultdict(lambda: defaultdict(int))
for data in data_files:
    for row in data:
        url = row['url']
        url_n, url_host = norm(url)
        url_set.add(url_n)
        host_set.add(url_host)

In [3]:
# Scan every page's out-links and tally the cross-host ones.
for data in data_files:
    for row in data:
        url_n, url_host = norm(row['url'])
        for link in row['out_links']:
            link_n, link_host = norm(link)

            # Links that stay on the same host are ignored entirely.
            if link_host == url_host:
                continue

            # Target page is one of the crawled URLs.
            # NOTE(review): the counter is keyed by the raw link while the
            # membership test uses the normalized one -- confirm intended.
            if link_n in url_set:
                count[link] += 1

            # Target host is one of the crawled hosts: add a host-level edge.
            if link_host in host_set:
                host_to_host[url_host][link_host] += 1

# Give every host that has outgoing cross-host links a dense integer index,
# and keep the mapping in both directions.
h2i = {}
i2h = {}
for index, host in enumerate(host_to_host):
    h2i[host] = index
    i2h[index] = host

# Number of indexed hosts, i.e. the dimension of the matrices built below.
size = len(h2i)

In [4]:
# Uniform matrix: start from all ones, then scale each row to sum to 1.
E = np.ones((size, size))
E = E / E.sum(axis=1, keepdims=True)

In [5]:
# Dense link-count matrix: matrix[src, dst] = number of src -> dst links.
matrix = np.zeros((size, size))

for host, targets in host_to_host.items():
    for link_host, weight in targets.items():
        # Targets without an index of their own are skipped.
        if link_host not in h2i:
            continue
        matrix[h2i[host], h2i[link_host]] = weight



In [6]:
# Rows whose entries sum to zero are overwritten with ones, so that the
# row normalization that follows never divides by zero.
empty_rows = matrix.sum(axis=1) == 0
matrix[empty_rows] = 1.0

In [7]:
# Scale each row to sum to 1, then blend with the uniform matrix E
# (weights 0.7 for the link structure and 0.3 for E).
row_totals = matrix.sum(axis=1, keepdims=True)
normalized = matrix / row_totals
matrix = 0.7 * normalized + 0.3 * E

In [8]:
# Start vector: a single row of ones scaled so that its entries sum to 1.
x = np.ones((1, size))
x = x / x.sum(axis=1, keepdims=True)

In [9]:
# Apply the matrix to the start vector 20 times (y = x * matrix^20).
# Repeated vector-matrix products give the same result as
# np.linalg.matrix_power up to floating-point rounding, but each step costs
# O(size^2) instead of the O(size^3) dense matrix-matrix products needed to
# form matrix^20 explicitly.
y = x
for step in range(20):
    y = np.dot(y, matrix)

In [10]:
# One (index, score, host) tuple per host, ordered by descending score.
rating = [(i, score, i2h[i]) for i, score in enumerate(y[0])]
# Stable sort on the negated score: ties keep their index order.
rating.sort(key=lambda entry: -entry[1])

In [11]:
# Print "<score> <host>" for every entry, in the order stored in `rating`.
for entry in rating:
    print('%s %s' % (entry[1], entry[2]))

0.0439734189461805 nytimes.com
0.03422756812396966 bloomberg.com
0.030917367641824546 ria.ru
0.030494445118127867 reuters.com
0.02821962081185469 theguardian.com
0.018876997923221096 forbes.com
0.016195295107512456 bbc.co.uk
0.015411228361898122 mirror.co.uk
0.01366090862416822 kommersant.ru
0.011733054633376214 espn.com
0.011295388109338915 iz.ru
0.01064541028182151 nature.com
0.010576077716753457 thesun.co.uk
0.009389306150592804 rbc.ru
0.009244313568642644 theverge.com
0.008977491936831417 interfax.ru
0.008952379344328574 genius.com
0.008490955190924526 hollywoodreporter.com
0.008373240406942714 telegraph.co.uk
0.007972720064507558 businessinsider.com
0.007438507262550445 politico.com
0.007378119496150317 russian.rt.com
0.006528785573586648 ft.com
0.006445619733318664 rg.ru
0.006334112725478704 nypost.com
0.0059883458667069536 newyorker.com
0.005791950758287045 theconversation.com
0.004910975149658063 independent.co.uk
0.00484816080489715 wired.com
0.004808894591832174 techcrunch.co

0.000339671264353566 snob.ru
0.00033951156922401376 ulpressa.ru
0.00033921598186409304 bigthink.com
0.0003390311521495324 hackaday.com
0.0003386444537645913 nakanune.ru
0.00033844083616859673 bnkomi.ru
0.0003383389259238246 newsroom.co.nz
0.0003375195257868307 zik.ua
0.0003372686209461231 people.onliner.by
0.0003370818096161907 wonderzine.com
0.0003368717207344479 uk.pcmag.com
0.0003367945967749991 news.slashdot.org
0.0003367267031640765 petapixel.com
0.0003365839740635513 wvlt.tv
0.0003357391464562923 itc.ua
0.0003349661047090441 sportinglife.com
0.00033437746945202794 coconuts.co
0.000334228803915349 theedgemarkets.com
0.00033386751903932753 kiev.vgorode.ua
0.00033314040473589513 forces.net
0.00033272276729813245 blog.ethereum.org
0.00033247860945983664 gazeta.uz
0.0003324074309601514 fakty.com.ua
0.0003321257106226403 musicradar.com
0.0003319376410665724 ccn.com
0.0003317541152267303 brightside.me
0.000331567407856073 kotaku.co.uk
0.0003312356085273035 asiatimes.com
0.00033096482753

In [12]:
# Save the ranking as "<score>\t<host>" lines, in the order of `rating`.
# The with-block flushes and closes the file even if a write fails; the
# bare open() used before never closed it, so buffered lines could be lost.
with open('../ratings/rating_merged.txt', 'w') as out:
    for i, pr, host in rating:
        out.write('%f\t%s\n' % (pr, host))