# Import the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import json
import re
import pymongo
from pymongo import MongoClient

# Setup the required inputs

In [2]:
# Wikipedia category page to scrape
website_url = 'https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings'

# Port of the MongoDB server on localhost
port = 27017

# CSS selectors passed to BeautifulSoup, keyed by the json_data list each one fills
queries = {
    "subCategories": "div.CategoryTreeItem a",
    "pages": "div#mw-pages ul a",
    "files": "div#mw-category-media ul a",
}

# Skeleton of the document that is filled with scraped entries and stored
json_data = {'name': "", 'subCategories': [], 'pages': [], 'files': []}

# Define the beautifulsoup setup function

In [3]:
# Get the html from the url and return the beautifulsoup object to use in parsing
def setupSoup(website_url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        website_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The BeautifulSoup object built from the response body, or ``None``
        when the request failed (the error is printed).
    """
    try:
        response = requests.get(website_url, timeout=timeout)
        # An HTTP error status (4xx/5xx) means the body is an error page, not
        # the requested page, so treat it as a failed request.
        response.raise_for_status()
    # Generic handling: connection errors, timeouts and HTTP error statuses
    # all derive from RequestException and are only reported to the user.
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    return bs(response.text, 'html.parser')

# Define the JSON Builder function

In [4]:
# Build and return the parsed JSON data
def buildJson(queries, soup, json_data, website_url):
    """Build the category document from the parsed page.

    Args:
        queries: Mapping of result key -> CSS selector.
        soup: Parsed page; must provide ``select(selector)`` returning elements
            that expose ``get("href")`` and ``text``.
        json_data: Template dictionary. It is copied, not modified, so calling
            this function again with the same template does not accumulate
            duplicate entries.
        website_url: URL the page was fetched from; used to derive the name.

    Returns:
        A new dictionary with the template's contents, ``name`` set when
        ``website_url`` is an English Wikipedia category URL, and one
        ``{'name': ..., 'url': ...}`` entry appended per matched link.
    """
    # Copy the template (including its lists) so the caller's object stays clean.
    result = {
        key: list(value) if isinstance(value, list) else value
        for key, value in json_data.items()
    }

    # Set the category name if the URL is a valid wikipedia category URL.
    # maxsplit=1 keeps names that themselves contain "Category:" intact.
    if re.search(r"https://en\.wikipedia\.org/wiki/Category:", website_url):
        result["name"] = website_url.split("Category:", 1)[1]

    # Parse the data for each query
    for query, selector in queries.items():
        entries = result.setdefault(query, [])
        for item in soup.select(selector):
            href = item.get("href")
            if href is None:
                # Anchor without a target: there is no URL to record.
                continue
            entries.append({'name': item.text, 'url': "https://en.wikipedia.org" + href})
    return result

# Define the database setup function

In [5]:
def setupDB(port,json_data):
    """Store ``json_data`` in the local MongoDB and return the client.

    Connects to the MongoDB server on localhost at ``port``, clears the
    ``parsed_categories`` collection of ``categories_db`` and inserts
    ``json_data`` into it.

    Returns:
        The MongoClient used for the insert, so the caller can inspect the data.
    """
    mongo = MongoClient(host='localhost', port=port)
    target = mongo['categories_db']['parsed_categories']
    # Start from an empty collection on every run. This is for testing purposes
    target.delete_many({})
    target.insert_one(json_data)
    return mongo

# Run the program

In [6]:
# Fetch and parse the category page; setupSoup returns None if the request failed
soup = setupSoup(website_url)
# Fill the template with the scraped entries; json_data is rebound to the result
json_data = buildJson(queries,soup,json_data,website_url)
# Store the result in MongoDB and keep the client for inspecting the collection
client = setupDB(port,json_data)

In [7]:
# Show one stored document to confirm the insert worked
client['categories_db']['parsed_categories'].find_one()

{'_id': ObjectId('5e58eb1d69fd6882c51135a9'),
 'name': 'The_Lord_of_the_Rings',
 'subCategories': [{'name': '174567 Varda',
   'url': 'https://en.wikipedia.org/wiki/Category:174567_Varda'},
  {'name': 'The Lord of the Rings book cover images',
   'url': 'https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings_book_cover_images'},
  {'name': 'The Lord of the Rings characters',
   'url': 'https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings_characters'},
  {'name': 'The Lord of the Rings (film series)',
   'url': 'https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings_(film_series)'},
  {'name': 'Middle-earth locations',
   'url': 'https://en.wikipedia.org/wiki/Category:Middle-earth_locations'},
  {'name': 'Translations of The Lord of the Rings',
   'url': 'https://en.wikipedia.org/wiki/Category:Translations_of_The_Lord_of_the_Rings'}],
 'pages': [{'name': 'The Lord of the Rings',
   'url': 'https://en.wikipedia.org/wiki/The_Lord_of_the_Rings'},
  {'name': '378214 Saur

# Some functions to explore the database
## collection.find() 
Returns a MongoDB cursor over every document in the collection. Loop through the cursor to see the contents, or pass in a filter document to search for specific documents.
## collection.find_one()
Returns the first document in the collection (or `None` if the collection is empty).
## collection.delete_many({})
Deletes every document in the collection.
## print(client.list_database_names())
Prints the names of the databases on the MongoDB server.