# Functions for scraping the wiki

In [55]:
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import numpy as np
import bs4
import typing



def get_site_list(root:ET.Element, sites=[]):
    sites.extend(list(filter(lambda x : "https" in x, root.text.split("\n"))))
    for child in root:
        get_site_list(child, sites)
    return sites

def open_site(url:str)->bs4.BeautifulSoup:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup

def get_overview_section(soup:bs4.BeautifulSoup):
    ret_string = ""
    ele = soup.find(id="toc").previous_sibling
    go_on = True
    while go_on:
        ret_string = ele.get_text().strip() + "\n" + ret_string
        ele = ele.previous_sibling
        try:
            go_on = (len(ele.findAll("aside") )==0) and (ele.name != "aside" and ele is not None)
        except:
            pass
    return ret_string.strip()

def find_sidebar(asides):
    h2_text = [item.get_text().strip() for item in asides[0].find_all("h2")]
    for sidebar in asides: # Looking through the <aside>-tags and stopping the search when we found the right one
        h2_text = [item.get_text().strip() for item in sidebar.find_all("h2")]
        if title in h2_text:
            break
    return sidebar

def get_section_content(section: bs4.element.Tag)-> dict: 
    ret_dict = dict()
    for item in section.find_all("h3"):    
        val = item.nextSibling.nextSibling.getText()
        ret_dict[item.getText()] = val
    return section.find("h2").getText() ,ret_dict

def get_sidebar_content(sidebar: bs4.element.Tag):
    sections = sidebar.find_all("section")
    ret_dict = dict()
    for section in sections:
        key, value = get_section_content(section)
        ret_dict[key] = value
    return ret_dict

def get_overview_section(soup:bs4.BeautifulSoup):
    ret_string = ""
    ele = soup.find(id="toc").previous_sibling
    go_on = True
    while go_on:
        ret_string = ele.get_text().strip() + "\n" + ret_string
        ele = ele.previous_sibling
        try:
            go_on = (len(ele.findAll("aside") )==0) and (ele.name != "aside" and ele is not None)
        except:
            pass
    return ret_string.strip()

def scrape(soup:bs4.BeautifulSoup, categories:list):
    title = soup.find(id="firstHeading").get_text().strip()
    sidebar = find_sidebar(soup.find_all("aside"))
    sidebar_content = get_sidebar_content(sidebar)
    overview_section = get_overview_section(soup)
    return (title, sidebar_content, overview_section)

def write_to_DB(title:str, sidebar:dict, overview:str, db)->None:
    raise NotImplementedError("Sorry, this is not yet implemented!")
    

# Set up the sqlite database

In [64]:
import sqlite3
raise NotImplementedError

NotImplementedError: 

# Main Program

In [62]:
#Main program

import time

sitemaps = """https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-2-43291.xml
https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-43291-55198.xml
https://lotr.fandom.com/sitemap-newsitemapxml-NS_14-id-21-55165.xml""".split("\n")

sites = []
for sitemap in sitemaps:
    r = requests.get(sitemap)
    root = ET.fromstring(r.text)
    sites.extend(get_site_list(root))

counter, limit = 0, 2

for site in sites[1:]: # zeroth site is the main page 
    # __________ Some code to help slow down and limit the traffic for LOTR wiki at least while we are still testing.___________________
    counter +=1
    if counter >= limit:
        break
    time.sleep(1)
    soup = open_site(site)
    categories = [item["data-name"] for item in soup.find_all("li", {"class":"category normal"})]
    if "Real world" in categories:
        continue # Skip this one, we only want to know about fictional characters, places, items etc.
    site_info = scrape(soup, categories)
    write_to_DB(*site_info, connection)



# History

This section of the notebook records and decribes the process I followed to get to the resulting code above.

## Mapping out the website

Let's use the site map instead of crawling classically :)

In [1]:
import requests
import xml.etree.ElementTree as ET

sitemaps = """https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-2-43291.xml
https://lotr.fandom.com/sitemap-newsitemapxml-NS_0-id-43291-55198.xml
https://lotr.fandom.com/sitemap-newsitemapxml-NS_14-id-21-55165.xml""".split("\n")

r = requests.get(sitemaps[0])
root = ET.fromstring(r.text)
print(root.tag, root.tail, root.attrib, f"Has {len([0 for child in root])} children" )


{http://www.sitemaps.org/schemas/sitemap/0.9}urlset None {} Has 5001 children


In [2]:
def pre_order(root:ET.Element, indents=0):
    print("\t"*indents, root.text)
    for child in root:
        pre_order(child, indents+1)

pre_order(root)


 


	 

		 https://lotr.fandom.com/wiki/Main_Page
		 2023-04-18T00:14:38Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Wellington
		 2023-01-27T23:07:57Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Fellowship_of_the_Ring_(group)
		 2023-04-15T02:03:25Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Beorn
		 2023-01-03T10:01:41Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Melkor
		 2023-04-08T17:25:33Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Amroth
		 2023-04-10T13:17:07Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Aul%C3%AB
		 2023-02-06T01:16:29Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Middle-earth
		 2023-04-06T01:23:26Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Peter_Jackson
		 2023-01-03T10:01:14Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/Frodo_Baggins
		 2023-04-10T11:30:14Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/The_Fellowship_of_the_Ring_(novel)
		 2023-04-21T22:24:38Z
		 1.0
	 

		 https://lotr.fandom.com/wiki/The_Lord_of_the_Rings
		 2023-04-21T23:03:07Z
		 1.0
	 

		 https

Alright, so a bunch of stuff but site links are separated from everything else by newlines. let's use that fact to get a list of all sites and count how many there are.

In [3]:
sites = []
def fill_sites(root:ET.Element):
    sites.extend(list(filter(lambda x : "https" in x, root.text.split("\n"))))
    for child in root:
        fill_sites(child)

for sitemap in sitemaps:
    r = requests.get(sitemap)
    root = ET.fromstring(r.text)
    fill_sites(root)
len(sites), sites[:10]


(6978,
 ['https://lotr.fandom.com/wiki/Main_Page',
  'https://lotr.fandom.com/wiki/Wellington',
  'https://lotr.fandom.com/wiki/Fellowship_of_the_Ring_(group)',
  'https://lotr.fandom.com/wiki/Beorn',
  'https://lotr.fandom.com/wiki/Melkor',
  'https://lotr.fandom.com/wiki/Amroth',
  'https://lotr.fandom.com/wiki/Aul%C3%AB',
  'https://lotr.fandom.com/wiki/Middle-earth',
  'https://lotr.fandom.com/wiki/Peter_Jackson',
  'https://lotr.fandom.com/wiki/Frodo_Baggins'])

## extracting content

In [4]:
from bs4 import BeautifulSoup
import numpy as np

r = requests.get(sites[9])

soup = BeautifulSoup(r.content, "html.parser")
#print(soup.prettify()) #Better look at this page witht he dev-tools of your browser.

Having had a look on the page we want to get the page title first: 

In [5]:
title = soup.find(id="firstHeading").get_text().strip()
title

'Frodo Baggins'

We can identify articles that definitely do not relate to Middle Earth by the category "Real world":

In [6]:
AAA =[]
for link in soup.find_all("a"):
    try:
        AAA.append(link['title' ])
    except: pass

AAA = np.array(AAA)
(AAA=="Category:Real world").sum()

0

The categories seem like super useful information, let's extract them more systematically:

In [7]:
categories = [item["data-name"] for item in soup.find_all("li", {"class":"category normal"})]
categories

['Featured articles',
 'Hobbits',
 'Cleanup',
 'Baggins',
 'Elf friends',
 'Fellowship members',
 'Major characters (The Lord of the Rings)',
 'The Lord of the Rings characters',
 'The Hobbit: An Unexpected Journey Characters',
 'Bearers of the One Ring',
 'The Fellowship of the Ring (film) characters',
 'The Two Towers (film) characters',
 'The Return of the King (film) characters']

Let's also get the sidebar information:

In [8]:
asides = soup.find_all("aside")
len(asides)

2

Okay, so multiple asides, inspecting the page in browser we see that we can filter them by seeing if they have an h2-tag containing the page's title.

In [9]:
h2_text = [item.get_text().strip() for item in asides[0].find_all("h2")]
h2_text, [item.get_text().strip() for item in asides[1].find_all("h2")]

(['Frodo Baggins',
  'Biographical information',
  'Physical description',
  'Media portrayal'],
 [])

In [10]:
h2_text = [item.get_text().strip() for item in asides[0].find_all("h2")]

for sidebar in asides: # Looking through the <aside>-tags and stopping the search when we found the right one
    h2_text = [item.get_text().strip() for item in sidebar.find_all("h2")]
    if title in h2_text:
        break

Looking at the website, we see that the sidebar is subdivided into sections, each section holds the category names as h3's and the values in a div that comes after. Putting this to use:

In [11]:
sections = sidebar.find_all("section")

In [12]:
section = sections[0]


import typing
import bs4
def get_section_content(section: bs4.element.Tag)-> dict: 
    ret_dict = dict()
    for item in section.find_all("h3"):    
        val = item.nextSibling.nextSibling.getText()
        ret_dict[item.getText()] = val
    return section.find("h2").getText() ,ret_dict
get_section_content(section)

('Biographical information',
 {'Other names': 'Frodo of the Nine Fingers, Nine-Fingered Frodo, Maura Labingi(Westron name),Mr. Underhill,Sneaky hobbit (by Gollum)',
  'Titles': 'Bearer of the One Ring, Elf-friend, Deputy Mayor of Michel Delving',
  'Birth': 'September 22, TA 2968 (SR 1368)',
  'Rule': "November, TA 3019 to Mid-year's Day, TA 3020",
  'Death': 'Unknown (Last sighting September 29, 3021) (SR 1421)',
  'Weapon': 'Sting and Barrow-blade'})

In [13]:
def get_sidebar(sidebar: bs4.element.Tag):
    sections = sidebar.find_all("section")
    ret_dict = dict()
    for section in sections:
        key, value = get_section_content(section)
        ret_dict[key] = value
    return ret_dict
get_sidebar(sidebar)

{'Biographical information': {'Other names': 'Frodo of the Nine Fingers, Nine-Fingered Frodo, Maura Labingi(Westron name),Mr. Underhill,Sneaky hobbit (by Gollum)',
  'Titles': 'Bearer of the One Ring, Elf-friend, Deputy Mayor of Michel Delving',
  'Birth': 'September 22, TA 2968 (SR 1368)',
  'Rule': "November, TA 3019 to Mid-year's Day, TA 3020",
  'Death': 'Unknown (Last sighting September 29, 3021) (SR 1421)',
  'Weapon': 'Sting and Barrow-blade'},
 'Physical description': {'Gender': 'Male',
  'Height': "4'1” (1.24 m)",
  'Hair': 'Brown',
  'Eyes': 'Blue',
  'Culture': 'Shire-hobbit (Brandybuck family/Baggins family)'},
 'Media portrayal': {'Actor': 'Elijah Wood (LOTR Trilogy)Valery Dyachenko (Khraniteli)',
  'Voice': 'Elijah Wood (video games)'}}

Let's also get the first paragraph, the one before the Table of Contents

In [14]:
ele = soup.find(id="toc")


In [24]:
ele = ele.previous_sibling
ele

<p>
<aside class="portable-infobox pi-background pi-border-color pi-theme-Hobbits pi-layout-default" role="region">
<figure class="pi-item pi-image" data-source="image">
<a class="image image-thumbnail" href="https://static.wikia.nocookie.net/lotr/images/3/32/Frodo_%28FotR%29.png/revision/latest?cb=20221006065757" title="">
<img alt="" class="pi-image-thumbnail" data-image-key="Frodo_%28FotR%29.png" data-image-name="Frodo (FotR).png" height="225" src="https://static.wikia.nocookie.net/lotr/images/3/32/Frodo_%28FotR%29.png/revision/latest/scale-to-width-down/350?cb=20221006065757" srcset="https://static.wikia.nocookie.net/lotr/images/3/32/Frodo_%28FotR%29.png/revision/latest/scale-to-width-down/350?cb=20221006065757 1x, https://static.wikia.nocookie.net/lotr/images/3/32/Frodo_%28FotR%29.png/revision/latest/scale-to-width-down/700?cb=20221006065757 2x" width="270">
</img></a>
<figcaption class="pi-item-spacing pi-caption">Frodo, as portrayed by Elijah Wood</figcaption>
</figure>
<h2 clas

In [25]:
len(ele.find_all("aside"))

1

In [26]:
def get_overview_section(soup:bs4.BeautifulSoup):
    ret_string = ""
    ele = soup.find(id="toc").previous_sibling
    go_on = True
    while go_on:
        ret_string = ele.get_text().strip() + "\n" + ret_string
        ele = ele.previous_sibling
        try:
            go_on = (len(ele.findAll("aside") )==0) and (ele.name != "aside" and ele is not None)
        except:
            pass
    return ret_string.strip()
print(get_overview_section(soup))

"I will take the Ring, though I do not know the way."
—Frodo, at the Council of Elrond, in The Fellowship of the Ring

Frodo Baggins, son of Drogo Baggins, was a hobbit of the Shire in the late Third Age. He is commonly considered Tolkien's most renowned character for his leading role in the Quest of the Ring, in which he bore the One Ring to Mount Doom, where it was destroyed. He was a Ring-bearer, best friend to his gardener, Samwise Gamgee, and was one of three hobbits who sailed from Middle-earth to the Uttermost West at the end of the Third Age.


Alright this has to be good enough for now. We get the sidebar information and the text before the table of contents.

# Set up and fill the RDB

In [None]:
import sqlite3 as sql

conn = sql.connect('LOTR.db')