In [1]:
from html.parser import HTMLParser
from urllib.request import urlopen

In [2]:
def normalize(url, ref):
    if ref.startswith('http'):
        with urlopen(ref) as connection:
            return connection.geturl()
    elif ref.startswith('./'):
        with urlopen(url + ref[2:]) as connection:
            return connection.geturl()
    #else url starts with '/'
    url_splitten = url.split('/')[:3]
    url_splitten.append(ref[1:])
    ref = '/'.join(url_splitten)
    with urlopen(ref) as connection:
            return connection.geturl()

In [3]:
class PageParser(HTMLParser):
    def __init__(self, url, graph):
        HTMLParser.__init__(self)
        self.refs = graph
        self.url = url
        graph[url] = set()
        
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a' and 'href' in attrs:
            try:
                ref = attrs['href']
                if ref.startswith(self.url) or ref.startswith('/') or ref.startswith('./'):
                    ref = normalize(self.url, ref)
                    self.refs[self.url].add(ref)
            except Exception as e:
                print (type(e).__name__  + ' occured while parsing refference %s' % attrs['href'])
                raise
            
    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        pass

In [4]:
def parse_url(url, graph, encoding):
    with urlopen(url) as connection:
        page = connection.read().decode(encoding)
        parser = PageParser(connection.geturl(), graph)
        parser.feed(page)

In [5]:
def parse_site(url, graph, parsed, encoding, depth=100):
    if depth <= 0:
        return
    parsed.add(url)
    print (url)
    %%time parse_url(url, graph, encoding)
    for ref in graph[url]:
        if not ref in parsed:
            parse_site(ref, graph, parsed, encoding, depth=depth - 1)

In [6]:
chess = dict()
parsed = set()
parse_site('http://www.64chess.com/', chess, parsed, 'koi8-r')
print ('==================================')
anytask = dict()
parsed = set()
parse_site('http://anytask.org/', anytask, parsed, 'utf-8')

http://www.64chess.com/
CPU times: user 33.3 ms, sys: 6.67 ms, total: 40 ms
Wall time: 275 ms
http://www.64chess.com/GuestBook/index.php
CPU times: user 26.7 ms, sys: 3.33 ms, total: 30 ms
Wall time: 257 ms
http://www.64chess.com/Humor.html
CPU times: user 50 ms, sys: 6.67 ms, total: 56.7 ms
Wall time: 644 ms
http://www.64chess.com/Humor/Tartokover.html
CPU times: user 13.3 ms, sys: 0 ns, total: 13.3 ms
Wall time: 36 ms
http://www.64chess.com/Links.html
CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 33.4 ms
http://www.64chess.com/Games.html
CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 41 ms
http://www.64chess.com/Theory.html
CPU times: user 16.7 ms, sys: 0 ns, total: 16.7 ms
Wall time: 36.8 ms
http://www.64chess.com/index.html
CPU times: user 30 ms, sys: 0 ns, total: 30 ms
Wall time: 65.3 ms
http://www.64chess.com/Autor.html
CPU times: user 6.67 ms, sys: 0 ns, total: 6.67 ms
Wall time: 24.9 ms
http://www.64chess.com/Humor/History.html
CPU times: user 10 ms, sys:

In [16]:
def PR(graph, eps=0.0001):
    pr = dict()
    n_urls = len(graph)
    for url in graph:
        pr[url] = 1. / n_urls
    diff = eps * 10
    while diff > eps:
        new_pr = dict()
        for url in graph:
            new_pr[url] = 0.
        for url in graph:
            n_children = len(graph[url]) + 1
            for child in graph[url]:
                new_pr[child] += pr[url] / n_children
            new_pr[url] += pr[url] / n_children
        diff = 0
        for url in graph:
            diff += abs(new_pr[url] - pr[url])
        pr = new_pr.copy()
    return pr

In [18]:
chess_pr = PR(chess)
anytask_pr = PR(anytask)

In [28]:
for url in chess_pr:
    print ('url: %s PR: %.2f%%' % (url, 100 * chess_pr[url]))

url: http://www.64chess.com/GuestBook/index.php PR: 0.00%
url: http://www.64chess.com/Humor/Tartokover.html PR: 7.89%
url: http://www.64chess.com/Autor.html PR: 9.77%
url: http://www.64chess.com/Humor.html PR: 0.00%
url: http://www.64chess.com/Games.html PR: 9.77%
url: http://www.64chess.com/index.html PR: 19.55%
url: http://www.64chess.com/Humor/Averbax.html PR: 7.89%
url: http://www.64chess.com/Champions.html PR: 0.00%
url: http://www.64chess.com/Compozite.html PR: 9.77%
url: http://www.64chess.com/Humor/Cases.html PR: 7.89%
url: http://www.64chess.com/Humor/History.html PR: 7.89%
url: http://www.64chess.com/Links.html PR: 9.77%
url: http://www.64chess.com/Theory.html PR: 9.77%
url: http://www.64chess.com/ PR: 0.00%


In [29]:
for url in anytask_pr:
    print ('url: %s PR: %.2f%%' % (url, 100 * anytask_pr[url]))

url: http://anytask.org/accounts/login/?next=/school/shpya PR: 2.44%
url: http://anytask.org/accounts/login/?next=/school/urfu PR: 2.44%
url: http://anytask.org/accounts/login/?next=/school/mipt PR: 2.44%
url: http://anytask.org/accounts/login/ PR: 24.39%
url: http://anytask.org/accounts/login/?next=/school/shad PR: 2.44%
url: http://anytask.org/accounts/login/?next=/school/msu PR: 2.44%
url: http://anytask.org/accounts/login/?next=/school/bsu PR: 2.44%
url: http://anytask.org/ PR: 21.46%
url: http://anytask.org/accounts/password/reset/ PR: 11.06%
url: http://anytask.org/accounts/login/?next=/school/hse PR: 2.44%
url: http://anytask.org/accounts/register/ PR: 26.02%
