In [1]:
import logging
import scrapy
import json
import re
import operator
    
from scrapy.crawler import CrawlerProcess

<b>Задание 1</b>

In [2]:
class TopLinksSpider(scrapy.Spider):
    """Crawl Wikipedia articles outward from a handful of seed pages.

    For every downloaded page the spider emits one item holding the page URL,
    its title, a snippet cut to 255 characters and the set of article links
    found on the page that are part of the crawl. The counter of scheduled
    URLs (seeds included) never grows beyond ``limit``.
    """

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'FEED_FORMAT':'json',
        'FEED_URI': 'wikipedia.json'
    }

    name = 'all_links'
    start_urls = [
        'https://en.wikipedia.org/wiki/Information_retrieval',
        'https://en.wikipedia.org/wiki/Principality_of_Sealand',
        'https://en.wikipedia.org/wiki/Belarus',
        'https://en.wikipedia.org/wiki/Penguin',
        'https://en.wikipedia.org/wiki/Robert_Rodriguez'
    ]

    # CSS selector for the text nodes of the page heading.
    header_selector = 'h1#firstHeading.firstHeading *::text'
    # XPath string value of the first matching <p> inside the content div.
    snippet_selector = 'string(//div[@id="mw-content-text"]//p[position() = 1])'
    # The first 99 href attributes found inside the content div.
    body_link_selector = '(//div[@id="mw-content-text"]//*//a/@href)[position() < 100]'
    # Accepts https URLs on any wikipedia.org subdomain under /wiki/, except
    # the listed "Prefix:" namespaces and titles starting with Main_Page.
    # Raw strings keep `\.` from being an invalid string escape; the compiled
    # pattern is unchanged.
    allowed_re = re.compile(r'https://.+\.wikipedia\.org/wiki/'
                            r'(?!((File|Talk|Category|Portal|Special|Wikipedia'
                            r'|Help|Draft):|Main_Page)).+')

    # NOTE: class-level set, mutated in place and therefore shared by every
    # instance of the spider. It is pre-seeded with the start URLs so that it
    # agrees with `visited_urls_count`; otherwise a link pointing at a seed
    # page would be scheduled and counted a second time.
    visited_urls = set(start_urls)
    visited_urls_count = len(start_urls)
    limit = 20000

    def parse(self, response):
        """Schedule unseen article links and emit an item for this page.

        Yields a ``scrapy.Request`` for every accepted link that has not been
        seen yet (while the URL budget lasts), and finally one dict with the
        keys ``url``, ``title``, ``snippet`` and ``links``.
        """
        # The response URL is recorded as well as the requested one.
        self.visited_urls.add(response.url)

        links = response.xpath(self.body_link_selector).extract()

        # Links of this page that belong to the crawl: either already known
        # or scheduled right now. Unseen links found after the budget is
        # exhausted are dropped.
        valid_page_urls = set()
        for url in self.filter_invalid_urls(response, links):
            if url in valid_page_urls:
                continue

            if url in self.visited_urls:
                valid_page_urls.add(url)
            elif self.visited_urls_count < self.limit:
                self.visited_urls_count += 1
                self.visited_urls.add(url)
                valid_page_urls.add(url)

                yield scrapy.Request(url, callback=self.parse)

        yield {
            'url': response.url,
            'title': response.css(self.header_selector).extract_first(),
            'snippet': response.xpath(self.snippet_selector).extract_first()[:255],
            'links': valid_page_urls
        }

    def filter_invalid_urls(self, response, links):
        """Yield absolute URLs for the hrefs in ``links`` accepted by ``allowed_re``.

        Empty hrefs and in-page anchors (starting with ``#``) are skipped;
        every other href is resolved against ``response`` before matching.
        """
        for link in links:
            # `startswith` instead of `link[0]`: an empty href must not raise
            # IndexError and abort parsing of the whole page.
            if not link or link.startswith('#'):
                continue

            url = response.urljoin(link)

            if self.allowed_re.match(url):
                yield url
                
# Run the spider in-process with a fixed User-Agent header for all requests.
# NOTE(review): per the Scrapy docs `process.start()` blocks until the crawl
# finishes — confirm before re-running this cell in a live kernel.
process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})

process.crawl(TopLinksSpider)
process.start()

2018-04-12 21:22:26 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: scrapybot)
2018-04-12 21:22:26 [scrapy.utils.log] INFO: Versions: lxml 3.7.3.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.9.0, Python 3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:25:24) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.0.0 (OpenSSL 1.0.2m  2 Nov 2017), cryptography 1.8.1, Platform Windows-10-10.0.16299-SP0
2018-04-12 21:22:26 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'wikipedia.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


<b>Задание 2</b>

In [3]:
def load_data(file_name):
    """Read the JSON document stored in ``file_name`` and return the parsed object.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the crawled pages from 'wikipedia.json', the file name the spider
# sets as its FEED_URI.
wikipedia_pages = load_data('wikipedia.json')

In [40]:
import networkx as nx
    
def build_graph(pages):
    """Turn crawled page records into a directed link graph.

    ``pages`` is iterated twice; every record must provide the keys
    ``url``, ``title``, ``snippet`` and ``links``. Each page becomes a node
    keyed by its URL with ``title`` and ``snippet`` attributes, and each
    entry of ``links`` becomes an edge from the page to that link.
    """
    link_graph = nx.DiGraph()

    # Pass 1: register every page as a node before any edge is inserted.
    for entry in pages:
        link_graph.add_node(entry["url"], title=entry["title"], snippet=entry["snippet"])

    # Pass 2: connect each page to the URLs it links to.
    for entry in pages:
        source = entry["url"]
        link_graph.add_edges_from((source, target) for target in entry["links"])

    return link_graph

In [41]:
# Build the directed link graph from the loaded pages
wikipedia_graph = build_graph(wikipedia_pages)

<b>Задание 3</b>

In [82]:
# Template for one ranked page. The positional fields are filled by
# print_top_10 in this order: URL, score, in-degree, out-degree, title, snippet.
pagerank_output_format = """
Link: {}
Rank: {}
Input edges: {}
Output edges: {}
Title: {}
Snippet: {}
"""

def print_top_10(graph, ranked_pages, output_format):
    """Print the ten highest-scoring pages, best first.

    graph: directed graph whose nodes are page URLs (networkx >= 2.0 API).
    ranked_pages: mapping of URL -> numeric score.
    output_format: ``str.format`` template receiving, in order, the URL,
        the score, the in-degree, the out-degree, the title and the snippet.

    Nodes that carry no ``title``/``snippet`` attribute (for example nodes
    that exist only as an edge target) are printed with ``None`` in those
    fields instead of raising ``KeyError``.
    """
    top_10 = sorted(ranked_pages.items(), key=operator.itemgetter(1), reverse=True)[:10]
    for url, rank in top_10:
        # `graph.nodes[...]` rather than `graph.node[...]`: the latter alias
        # was removed in networkx 2.4.
        node = graph.nodes[url]
        result = output_format.format(
            url,
            rank,
            graph.in_degree(url),
            graph.out_degree(url),
            node.get("title"),
            node.get("snippet"),
        )
        print(result)

In [78]:
# PageRank with networkx's default parameters (no alpha passed); show the
# ten best-ranked pages.
ranked_pages = nx.pagerank(wikipedia_graph)
print_top_10(wikipedia_graph, ranked_pages, pagerank_output_format)


Link: https://en.wikipedia.org/wiki/United_States
Rank: 0.0040688629071372485
Input edges: 1464
Output edges: 44
Title: United States
Snippet: Coordinates: 40°N 100°W﻿ / ﻿40°N 100°W﻿ / 40; -100


Link: https://en.wikipedia.org/wiki/Geographic_coordinate_system
Rank: 0.003627851368243023
Input edges: 1593
Output edges: 17
Title: Geographic coordinate system
Snippet: A geographic coordinate system is a coordinate system used in geography that enables every location on Earth to be specified by a set of numbers, letters or symbols.[n 1] The coordinates are often chosen such that one of the numbers represents a vertical 


Link: https://en.wikipedia.org/wiki/Demonym
Rank: 0.003194231553964571
Input edges: 590
Output edges: 15
Title: Demonym
Snippet: A demonym (/ˈdɛmənɪm/; δῆμος dẽmos "people, tribe", ὄόνομα ónoma "name") is a word that identifies residents or natives of a particular place, which is derived from the name of that particular place.[1]


Link: https://en.wikipedia.org/wiki/Uni

<b>Задание 4</b>

In [72]:
# Same ranking with alpha=0.3 (alpha is the damping parameter of nx.pagerank).
# Rebinds the notebook-global `ranked_pages`.
ranked_pages = nx.pagerank(wikipedia_graph, alpha=0.3)
print_top_10(wikipedia_graph, ranked_pages, pagerank_output_format)


Link: https://en.wikipedia.org/wiki/International_Standard_Book_Number
Rank: 0.00193526399884272
Input edges: 1515
Output edges: 6
Title: International Standard Book Number
Snippet: The International Standard Book Number (ISBN) is a unique[a][b] numeric commercial book identifier. Publishers purchase ISBNs from an affiliate of the International ISBN Agency.[1]


Link: https://en.wikipedia.org/wiki/Geographic_coordinate_system
Rank: 0.0017207252295194128
Input edges: 1593
Output edges: 17
Title: Geographic coordinate system
Snippet: A geographic coordinate system is a coordinate system used in geography that enables every location on Earth to be specified by a set of numbers, letters or symbols.[n 1] The coordinates are often chosen such that one of the numbers represents a vertical 


Link: https://en.wikipedia.org/wiki/United_States
Rank: 0.0016914009128999896
Input edges: 1464
Output edges: 44
Title: United States
Snippet: Coordinates: 40°N 100°W﻿ / ﻿40°N 100°W﻿ / 40; -100


Link: h

In [73]:
# Same ranking with alpha=0.5 (alpha is the damping parameter of nx.pagerank).
# Rebinds the notebook-global `ranked_pages`.
ranked_pages = nx.pagerank(wikipedia_graph, alpha=0.5)
print_top_10(wikipedia_graph, ranked_pages, pagerank_output_format)


Link: https://en.wikipedia.org/wiki/United_States
Rank: 0.002716785748840709
Input edges: 1464
Output edges: 44
Title: United States
Snippet: Coordinates: 40°N 100°W﻿ / ﻿40°N 100°W﻿ / 40; -100


Link: https://en.wikipedia.org/wiki/International_Standard_Book_Number
Rank: 0.0026787629355163513
Input edges: 1515
Output edges: 6
Title: International Standard Book Number
Snippet: The International Standard Book Number (ISBN) is a unique[a][b] numeric commercial book identifier. Publishers purchase ISBNs from an affiliate of the International ISBN Agency.[1]


Link: https://en.wikipedia.org/wiki/Geographic_coordinate_system
Rank: 0.0026662080504995236
Input edges: 1593
Output edges: 17
Title: Geographic coordinate system
Snippet: A geographic coordinate system is a coordinate system used in geography that enables every location on Earth to be specified by a set of numbers, letters or symbols.[n 1] The coordinates are often chosen such that one of the numbers represents a vertical 


Link: 

In [74]:
# Same ranking with alpha=0.95 (alpha is the damping parameter of nx.pagerank).
# Rebinds the notebook-global `ranked_pages`.
ranked_pages = nx.pagerank(wikipedia_graph, alpha=0.95)
print_top_10(wikipedia_graph, ranked_pages, pagerank_output_format)


Link: https://en.wikipedia.org/wiki/United_States
Rank: 0.004092637606690076
Input edges: 1464
Output edges: 44
Title: United States
Snippet: Coordinates: 40°N 100°W﻿ / ﻿40°N 100°W﻿ / 40; -100


Link: https://en.wikipedia.org/wiki/Demonym
Rank: 0.0037635516841919383
Input edges: 590
Output edges: 15
Title: Demonym
Snippet: A demonym (/ˈdɛmənɪm/; δῆμος dẽmos "people, tribe", ὄόνομα ónoma "name") is a word that identifies residents or natives of a particular place, which is derived from the name of that particular place.[1]


Link: https://en.wikipedia.org/wiki/Geographic_coordinate_system
Rank: 0.0035503263988385277
Input edges: 1593
Output edges: 17
Title: Geographic coordinate system
Snippet: A geographic coordinate system is a coordinate system used in geography that enables every location on Earth to be specified by a set of numbers, letters or symbols.[n 1] The coordinates are often chosen such that one of the numbers represents a vertical 


Link: https://en.wikipedia.org/wiki/Un

<b>Задание 5</b>

In [54]:
# Run HITS on the link graph; the result is unpacked into the hub scores and
# the authority scores, which the following cells rank separately.
hubs, authorities = nx.hits(wikipedia_graph)

In [83]:
# Template for one hub result; same six positional fields as the PageRank
# template, with the score labelled "Hub value".
hubs_output_format = """
Link: {}
Hub value: {}
Input edges: {}
Output edges: {}
Title: {}
Snippet: {}
"""

# Show the ten pages with the highest hub score.
print_top_10(wikipedia_graph, hubs, hubs_output_format)


Link: https://en.wikipedia.org/wiki/Geshe
Hub value: 0.004284646406113005
Input edges: 81
Output edges: 97
Title: Geshe
Snippet: Geshe (Tib. dge bshes, short for dge-ba'i bshes-gnyen, "virtuous friend"; translation of Skt. kalyāņamitra) or geshema is a Tibetan Buddhist academic degree for monks and nuns. The degree is emphasized primarily by the Gelug lineage, but is also awarded i


Link: https://en.wikipedia.org/wiki/Rinpoche
Hub value: 0.004283169543521295
Input edges: 83
Output edges: 97
Title: Rinpoche
Snippet: Rinpoche, also spelled Rimboche and Rinboqê (Tibetan: རིན་པོ་ཆེ་, Wylie: rin po che, THL: Rinpoché, ZYPY: Rinboqê), is an honorific term used in the Tibetan language. It literally means "precious one", and may be used to refer to a person, place, or thing


Link: https://en.wikipedia.org/wiki/Narthang_Monastery
Hub value: 0.0042782260136843275
Input edges: 93
Output edges: 97
Title: Narthang Monastery
Snippet: Narthang Monastery (Tibetan: སྣར་ཐང་; Chin: 纳塘寺) is a monastery

In [84]:
# Template for one authority result; same six positional fields as the
# PageRank template, with the score labelled "Authority value".
authorities_output_format = """
Link: {}
Authority value: {}
Input edges: {}
Output edges: {}
Title: {}
Snippet: {}
"""

# Show the ten pages with the highest authority score.
print_top_10(wikipedia_graph, authorities, authorities_output_format)


Link: https://en.wikipedia.org/wiki/Bodhisattva
Authority value: 0.011541879057895135
Input edges: 349
Output edges: 77
Title: Bodhisattva
Snippet: In Buddhism, Bodhisattva (/ˌboʊdiːˈsʌtvə/ BOH-dee-SUT-və)[1] is the Sanskrit term for anyone who has generated Bodhicitta, a spontaneous wish and compassionate mind to attain Buddhahood for the benefit of all sentient beings.[2] Bodhisattvas are a popular


Link: https://en.wikipedia.org/wiki/Tibetan_Buddhism
Authority value: 0.0112095769094572
Input edges: 341
Output edges: 95
Title: Tibetan Buddhism
Snippet: New branches:


Link: https://en.wikipedia.org/wiki/Vajrayana
Authority value: 0.011207394588328796
Input edges: 318
Output edges: 86
Title: Vajrayana
Snippet: New branches:


Link: https://en.wikipedia.org/wiki/Glossary_of_Buddhism
Authority value: 0.010701092194605036
Input edges: 246
Output edges: 22
Title: Glossary of Buddhism
Snippet: Some Buddhist terms and concepts lack direct translations into English that cover the breadth o