# Scraping current and old versions of Wikipedia

1. For each Wikipedia page related to astronomy,
2. we would like to find its newest archived version from 2014 or older.
3. For both the current page and its archived version, we would like to get the base text, i.e. all text paragraphs.

In [1]:
import requests

In [2]:
# Example article whose URL is used to develop the scraping steps one at a time
webpage_url = 'https://en.wikipedia.org/wiki/New_Horizons'

In [3]:
# The article title is the last path segment of the URL
article_name = webpage_url.rsplit('/', 1)[-1]
article_name

'New_Horizons'

## Finding the newest archive version from 2014 or older

In [4]:
# After a bit of exploring we find that the revision history of the article we are
# interested in is served at a URL of the following form:
# https://en.wikipedia.org/w/index.php?title=New_Horizons&action=history&year=2014&month=12
history_url_template = 'https://en.wikipedia.org/w/index.php?title={}&action=history&year=2014&month=12'
revision_history = history_url_template.format(article_name)
revision_history

'https://en.wikipedia.org/w/index.php?title=New_Horizons&action=history&year=2014&month=12'

In [5]:
# Download the revision-history page.
# The timeout (in seconds) keeps a stalled connection from blocking the kernel forever,
# and raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be searched for revision ids in the following cells.
revision_history_response = requests.get(revision_history, timeout=30)
revision_history_response.raise_for_status()
revision_history_webpage = revision_history_response.text
revision_history_webpage

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>New Horizons: Revision history - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"New_Horizons","wgTitle":"New Horizons","wgCurRevisionId":876122668,"wgRevisionId":0,"wgArticleId":390905,"wgIsArticle":false,"wgIsRedirect":false,"wgAction":"history","wgUserName":null,"wgUserGroups":["*"],"wgCategories":[],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"

In [6]:
import re

In [7]:
# We want to find all ids referring to old versions of the article; the newest one
# is then taken to be the largest id.
# `+` (one or more digits) is used instead of `*` so that a bare `oldid=` without any
# digits is never captured - it would make the later int() conversion fail.
results = re.findall(r'oldid=[0-9]+', revision_history_webpage)
results[0]

'oldid=640339957'

In [8]:
# Take the first match and convert the part after '=' to an integer id
result = results[0]
id_digits = result.split('=')[1]
int(id_digits)

640339957

In [9]:
# Convert every match to its integer id and keep the largest one
revision_ids = [int(match.split('=')[1]) for match in results]
newest_id = max(revision_ids)
newest_id

640339957

In [10]:
# An old version of a page is always addressed by its title and revision id:
# https://en.wikipedia.org/w/index.php?title=New_Horizons&oldid=640339957
archive_url_template = 'https://en.wikipedia.org/w/index.php?title={}&oldid={}'
archive_url = archive_url_template.format(article_name, newest_id)
archive_url

'https://en.wikipedia.org/w/index.php?title=New_Horizons&oldid=640339957'

In [11]:
# Download the archived and the current version of the article.
# The timeout (in seconds) prevents a stalled connection from hanging the kernel and
# raise_for_status() makes an HTTP error fail here rather than being parsed as an article.
archive_response = requests.get(archive_url, timeout=30)
archive_response.raise_for_status()
archive_text = archive_response.text

webpage_response = requests.get(webpage_url, timeout=30)
webpage_response.raise_for_status()
webpage_text = webpage_response.text

## Extracting all paragraphs from a web page

In [12]:
# In order to effectively find all paragraphs we will be using lxml - a library for processing XML and HTML
from lxml import etree
from lxml.html import tostring, html5parser

In [13]:
# Parse the downloaded HTML of the current article into an element tree
html_tree = html5parser.fromstring(webpage_text)

In [14]:
# The result of parsing is a tree.
# The children of each node can be accessed through indexing
html_tree[0], html_tree[1]

(<Element {http://www.w3.org/1999/xhtml}head at 0x110a8a3c8>,
 <Element {http://www.w3.org/1999/xhtml}body at 0x110d0d188>)

In [15]:
# Indexing can be chained: this is the second child of the tree's first child
html_tree[0][1]

<Element {http://www.w3.org/1999/xhtml}title at 0x110cf88c8>

In [16]:
# Every HTML node has a tag; here the tag name is prefixed with the XHTML namespace in curly braces
html_tree[0][1].tag

'{http://www.w3.org/1999/xhtml}title'

In [17]:
# and can carry attributes, which items() returns as (name, value) pairs
html_tree[1][0].items()

[('id', 'mw-page-base'), ('class', 'noprint')]

In [18]:
# A single element can be serialized back to its markup (returned as bytes)
tostring(html_tree[0][1])

b'<html:title xmlns:html="http://www.w3.org/1999/xhtml">New Horizons - Wikipedia</html:title>\n'

In [19]:
# With method='text' only the text content of the element is returned, without any markup
tostring(html_tree[0][1], encoding='utf-8', method='text')

b'New Horizons - Wikipedia\n'

In [20]:
# The id of the content container was found by inspecting the page source.
# xpath() returns a list of matches; we keep the first child of the first match,
# whose own children are examined in the next cells.
element = html_tree.xpath('//*[@id="mw-content-text"]')[0][0]

In [21]:
# Look at the tag of one of the container's direct children (the one at index 5)
element[5].tag

'{http://www.w3.org/1999/xhtml}p'

In [22]:
def is_tag_ok(tag):
    """Tell whether `tag` names an HTML paragraph (`<p>`) element.

    Parameters
    ----------
    tag : object
        The `.tag` attribute of a parsed node. It is either a plain tag name
        (`'p'`) or a namespace-qualified one (`'{namespace}p'`). Values that
        are not strings are rejected.

    Returns
    -------
    bool
        True only for a paragraph tag, with or without a namespace prefix.
    """
    # Not every node has a string tag, so anything else cannot be a paragraph
    if not isinstance(tag, str):
        return False
    # Compare the whole local name: a plain `endswith('p')` would also accept
    # unrelated tags that merely end with the letter "p" (sup, map, colgroup, ...)
    return tag == 'p' or tag.endswith('}p')

In [23]:
# Keep only the paragraph children of the content container and turn each one
# into plain text in a single pass
paragraphs = [
    tostring(node, encoding='utf-8', method='text').decode('utf-8')
    for node in element
    if is_tag_ok(node.tag)
]
# One paragraph per line block
text = '\n'.join(paragraphs)
text

'New Horizons is an interplanetary space probe that was launched as a part of NASA\'s New Frontiers program.[3] Engineered by the Johns Hopkins University Applied Physics Laboratory (APL) and the Southwest Research Institute (SwRI), with a team led by S. Alan Stern,[4] the spacecraft was launched in 2006 with the primary mission to perform a flyby study of the Pluto system in 2015, and a secondary mission to fly by and study one or more other Kuiper belt objects (KBOs) in the decade to follow.[5][6][7][8][9] It is the fifth space probe to achieve the escape velocity needed to leave the Solar System.\n\nOn January 19, 2006, New Horizons was launched from Cape Canaveral Air Force Station by an Atlas V rocket directly into an Earth-and-solar escape trajectory with a speed of about 16.26 kilometers per second (10.10\xa0mi/s; 58,500\xa0km/h; 36,400\xa0mph). At launch, it was the fastest probe ever launched from Earth.[10] After a brief encounter with asteroid 132524 APL, New Horizons procee

In [24]:
# Wrapping everything together

def wikipedia_main_text_from_webpage_text(webpage_text):
    """Extract the paragraph text of a Wikipedia article from its HTML source.

    The HTML is parsed, the element with id ``mw-content-text`` is located and
    the first child of that element is taken as the content container. Every
    direct child of the container accepted by ``is_tag_ok`` is converted to
    plain text.

    Parameters
    ----------
    webpage_text : str
        HTML source of the page.

    Returns
    -------
    str
        The text of the accepted children, joined with newline characters.
    """
    document = html5parser.fromstring(webpage_text)
    # First child of the first node carrying the content id
    content_root = document.xpath('//*[@id="mw-content-text"]')[0][0]

    chunks = []
    for node in content_root:
        if is_tag_ok(node.tag):
            raw_text = tostring(node, encoding='utf-8', method='text')
            chunks.append(raw_text.decode('utf-8'))
    return '\n'.join(chunks)

In [25]:
# Articles to process; the last path segment of each URL is the article title
webpages_to_scrape = [
    'https://en.wikipedia.org/wiki/New_Horizons',
    'https://en.wikipedia.org/wiki/Planet',
    'https://en.wikipedia.org/wiki/Dwarf_planet',
    'https://en.wikipedia.org/wiki/Ceres_(dwarf_planet)',
    'https://en.wikipedia.org/wiki/Pluto',
    'https://en.wikipedia.org/wiki/Eris_(dwarf_planet)',
    'https://en.wikipedia.org/wiki/Makemake'
]

In [26]:
# For every article: find the largest revision id listed on its December 2014
# history page, download the current and that archived version, extract the
# paragraph text of both and save each one to its own text file.
for webpage_url in webpages_to_scrape:
    article_name = webpage_url.split('/')[-1]
    print(article_name)

    revision_history = 'https://en.wikipedia.org/w/index.php?title={}&action=history&year=2014&month=12'.format(article_name)
    # The timeout (seconds) stops a stalled connection from hanging the loop;
    # raise_for_status() fails loudly instead of parsing an HTTP error page.
    history_response = requests.get(revision_history, timeout=30)
    history_response.raise_for_status()
    revision_history_webpage = history_response.text

    # `+` instead of `*`: a bare `oldid=` without digits must not be captured,
    # otherwise int('') below would raise.
    results = re.findall(r'oldid=[0-9]+', revision_history_webpage)
    if not results:
        # max() of an empty sequence raises ValueError as well, but without
        # saying which article was affected.
        raise ValueError('No archived revision id found for {}'.format(article_name))
    newest_id = max(int(result.split('=')[1]) for result in results)
    archive_url = 'https://en.wikipedia.org/w/index.php?title={}&oldid={}'.format(article_name, newest_id)

    webpage_response = requests.get(webpage_url, timeout=30)
    webpage_response.raise_for_status()
    webpage_text = webpage_response.text

    archive_response = requests.get(archive_url, timeout=30)
    archive_response.raise_for_status()
    archive_text = archive_response.text

    webpage_main_text = wikipedia_main_text_from_webpage_text(webpage_text)
    archive_main_text = wikipedia_main_text_from_webpage_text(archive_text)

    # Write with an explicit UTF-8 encoding: the article text contains non-ASCII
    # characters and the platform default encoding may be unable to represent them.
    with open('{}_new.txt'.format(article_name), 'w', encoding='utf-8') as output_file:
        output_file.write(webpage_main_text)

    with open('{}_old.txt'.format(article_name), 'w', encoding='utf-8') as output_file:
        output_file.write(archive_main_text)

New_Horizons
Planet
Dwarf_planet
Ceres_(dwarf_planet)
Pluto
Eris_(dwarf_planet)
Makemake
