In [351]:
import requests
from bs4 import BeautifulSoup as bs
import json
import re

In [352]:
# Wikipedia category page that the rest of the notebook scrapes.
website_url = 'https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings'
# Get the html from the url and parse it to a beautifulsoup object
def setupSoup(website_url, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        website_url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The parsed BeautifulSoup object, or None when the request fails
        (the error is printed rather than raised).
    """
    try:
        response = requests.get(website_url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    # A very generic exception handling, something more sophisticated could be written to better guide the user.
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    # The import cell binds the class as `bs`, not `BeautifulSoup`.
    return bs(response.text, 'html.parser')

# On a request failure setupSoup prints the error and returns None, so `soup` may be None here.
soup = setupSoup(website_url)

In [353]:
# Empty result skeleton: the category name plus one list per kind of link collected.
json_data = dict(name="", subCategories=[], pages=[], files=[])

In [354]:
# CSS selectors passed to BeautifulSoup's select(), keyed by the json_data list each one fills.
queries = {
    "subCategories": "div.CategoryTreeItem a",
    "pages": "div#mw-pages ul a",
    "files": "div#mw-category-media ul a",
}

In [355]:
def buildJson(queries,soup,json_data,website_url):
    """Fill ``json_data`` in place with the links found on a category page.

    Args:
        queries: Mapping of ``json_data`` key -> CSS selector. Every element
            matched by a selector is recorded under that key.
        soup: Parsed page; only its ``select(selector)`` method is used, and
            each matched element must offer ``get("href")`` and ``text``.
        json_data: Dictionary holding a ``"name"`` entry and one list per key
            in ``queries``. It is mutated; calling this twice with the same
            dictionary appends the entries twice.
        website_url: Address the page was loaded from. When it is an English
            Wikipedia category URL, the part after ``Category:`` becomes
            ``json_data["name"]``; otherwise the name is left untouched.

    Returns:
        None. Results are written into ``json_data``.
    """
    from urllib.parse import urljoin

    base_url = "https://en.wikipedia.org"
    category_prefix = base_url + "/wiki/Category:"

    # Set the category name if the URL is a valid wikipedia category URL.
    # A literal prefix check avoids the regex pitfalls of unescaped dots and
    # keeps names that themselves contain "Category:" intact.
    if website_url.startswith(category_prefix):
        json_data["name"] = website_url[len(category_prefix):]

    # Parse the data for each query
    for key, selector in queries.items():
        for item in soup.select(selector):
            href = item.get("href")
            if not href:
                # Anchors without a target cannot be turned into a URL.
                continue
            # urljoin resolves site-relative paths against the wiki host and
            # leaves already-absolute links unchanged.
            json_data[key].append({'name': item.text, 'url': urljoin(base_url, href)})

In [356]:
# Fills json_data in place; re-running this cell appends the same entries to its lists again.
buildJson(queries,soup,json_data,website_url)

In [357]:
# Pretty-print the collected data as JSON with two-space indentation.
print(json.dumps(json_data, indent=2))

{
  "name": "The_Lord_of_the_Rings",
  "subCategories": [
    {
      "name": "174567 Varda",
      "url": "https://en.wikipedia.org/wiki/Category:174567_Varda"
    },
    {
      "name": "The Lord of the Rings book cover images",
      "url": "https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings_book_cover_images"
    },
    {
      "name": "The Lord of the Rings characters",
      "url": "https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings_characters"
    },
    {
      "name": "The Lord of the Rings (film series)",
      "url": "https://en.wikipedia.org/wiki/Category:The_Lord_of_the_Rings_(film_series)"
    },
    {
      "name": "Middle-earth locations",
      "url": "https://en.wikipedia.org/wiki/Category:Middle-earth_locations"
    },
    {
      "name": "Translations of The Lord of the Rings",
      "url": "https://en.wikipedia.org/wiki/Category:Translations_of_The_Lord_of_the_Rings"
    }
  ],
  "pages": [
    {
      "name": "The Lord of the Rings",
      "url

In [358]:
import pymongo
from pymongo import MongoClient

In [359]:
# Client for a MongoDB server at localhost:27017 (the address is hardcoded).
client = MongoClient('mongodb://localhost:27017')

In [360]:
# Work against the `pymongo_test` database of that client.
db = client.pymongo_test

In [361]:
# Grab the `posts` collection and store one sample document in it.
posts = db.posts
post_data = dict(
    title='Python and MongoDB',
    content='PyMongo is fun, you guys',
    author='Scott',
)
result = posts.insert_one(post_data)
# Report the id of the document that was just inserted.
print('One post: {0}'.format(result.inserted_id))

One post: 5e57adc499e898741b4fee9b


In [362]:
# Three more sample documents, written to the collection in a single insert_many call.
post_1 = dict(
    title='Python and MongoDB',
    content='PyMongo is fun, you guys',
    author='Scott',
)
post_2 = dict(
    title='Virtual Environments',
    content='Use virtual environments, you guys',
    author='Scott',
)
post_3 = dict(
    title='Learning Python',
    content='Learn Python, it is easy',
    author='Bill',
)
new_result = posts.insert_many([post_1, post_2, post_3])
# Report the ids of all inserted documents.
print('Multiple posts: {0}'.format(new_result.inserted_ids))

Multiple posts: [ObjectId('5e57adc499e898741b4fee9c'), ObjectId('5e57adc499e898741b4fee9d'), ObjectId('5e57adc499e898741b4fee9e')]


In [363]:
# Fetch a single document whose author is 'Bill' and print it.
bills_post = posts.find_one({'author': 'Bill'})
print(bills_post)

{'_id': ObjectId('5e57ac2599e898741b4fee8a'), 'title': 'Learning Python', 'content': 'Learn Python, it is easy', 'author': 'Bill'}


In [364]:
# find() hands back a cursor, so this print shows the Cursor object itself
# rather than the matching documents (see the output below).
scotts_posts = posts.find({'author': 'Scott'})
print(scotts_posts)

<pymongo.cursor.Cursor object at 0x0000013E4E245CC8>


In [365]:
# Iterate the cursor created in the previous cell and print each matching document.
# NOTE(review): the cursor is presumably exhausted after one pass, so re-running
# only this cell would print nothing — confirm and re-create the cursor if needed.
for post in scotts_posts:
    print(post)

{'_id': ObjectId('5e57ac1d99e898741b4fee87'), 'title': 'Python and MongoDB', 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57ac2599e898741b4fee88'), 'title': 'Python and MongoDB', 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57ac2599e898741b4fee89'), 'title': 'Virtual Environments', 'content': 'Use virtual environments, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57ad9d99e898741b4fee8c'), 'title': 'Python and MongoDB', 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57ad9d99e898741b4fee8d'), 'title': 'Python and MongoDB', 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57ad9d99e898741b4fee8e'), 'title': 'Virtual Environments', 'content': 'Use virtual environments, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57adb199e898741b4fee91'), 'title': 'Python and MongoDB', 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
{'_id': ObjectId('5e57adb199e89