In [1]:
# Dependencies
from pprint import pprint
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pymongo
import requests

In [2]:
# Open a PyMongo client against the MongoDB server at the connection string below
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [3]:
# Pick the database and the collection the scraped tree will be stored in
db = client['L3_D3_db']
collection = db['list_of_lists_of_lists']

In [4]:
# URL of page to be scraped
url = 'https://en.wikipedia.org/wiki/List_of_lists_of_lists'

# Retrieve page with the requests module.
# The timeout keeps this cell from hanging on a stalled connection
# (requests waits indefinitely when no timeout is given).
response = requests.get(url, timeout=30)

# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page below
response.raise_for_status()

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
# Root node of the tree: named after the page's h1, with no children yet
root = dict(name=soup.find('h1').text, children=[])

# Alias for the root's children list; h2-level nodes are appended through it
current_h1 = root["children"]

In [6]:
# Generate main ResultSet of all generations after root h1 node.
# Do not take divs. recursive=False to only parse 1 level.
# NOTE(review): this relies on the h2/h3/h4 headings being direct children of the
# content div -- confirm against the page's current markup.
content_div = soup.find('div', class_='mw-parser-output')
if content_div is None:
    # Without this guard the failure is an opaque "'NoneType' has no attribute 'find_all'"
    raise RuntimeError("Could not find <div class='mw-parser-output'> in the parsed page")

all_siblings = content_div.find_all(['h2','h3','h4','ul'], recursive=False)

In [7]:
### FUNCTION DEFINITIONS

# Get href link
def get_link(sibling, base_url="https://en.wikipedia.org"):
    """Return the absolute URL of the first link found inside ``sibling``.

    Args:
        sibling: Parsed node (heading or li) that is searched for an ``a`` element.
        base_url: Site root that relative hrefs are resolved against.

    Returns:
        The absolute URL as a string, or None when the node contains no ``a``
        element or that element carries no ``href`` attribute.
    """
    anchor = sibling.find('a')
    href = anchor.get('href') if anchor is not None else None

    if href is None:
        # No link to report; concatenating the prefix with None would raise TypeError
        return None

    # urljoin resolves site-relative hrefs against the root and leaves
    # already-absolute hrefs untouched
    return urljoin(base_url, href)

# Remove [edit] text from end of strings
def remove_edit(string):
    """Strip a trailing "[edit]" marker from a heading's text.

    Args:
        string: Text of a heading node.

    Returns:
        The text without the marker (and without any whitespace that followed
        the marker) when the text ends with "[edit]"; otherwise the input unchanged.
    """
    marker = "[edit]"

    # Ignore trailing whitespace when looking for the marker, so a newline after
    # "[edit]" does not shift the cut
    trimmed = string.rstrip()

    # Only cut when the marker really is the suffix; a marker elsewhere in the
    # text must not cost the string its last characters
    if trimmed.endswith(marker):
        return trimmed[:-len(marker)]
    return string

# Function for processing nested ul's / li's. Takes ResultSet of li's as arg
def process_ul(li_results, parent):
    """Append one node per li to ``parent``, recursing into nested ul's.

    Args:
        li_results: Iterable of li nodes that belong to the same list level.
        parent: List of child nodes to append to (e.g. a heading's 'children'
            list, or the 'children' list of an enclosing li). May already
            contain nodes.

    Returns:
        None. ``parent`` is modified in place.
    """
    for li in li_results:

        # Node name is the first line of the li's text; an li with no text gets ''
        text_lines = li.get_text().splitlines()
        node = {'name': text_lines[0] if text_lines else '',
                'url': get_link(li),
                'children': []
               }

        # Always add each li to the current parent
        parent.append(node)

        # If this li holds a nested ul, its items become children of the node
        # just created. The node is referenced directly rather than by position,
        # because `parent` may have had entries before this call.
        possible_ul = li.find('ul')
        if possible_ul:
            # Direct li children only: deeper levels are reached by the recursion,
            # so a recursive search here would add them twice
            nested_li_results = possible_ul.find_all('li', recursive=False)
            process_ul(nested_li_results, node['children'])


In [9]:
# Walk the headings and lists in document order and nest them under `root`.
#
# current_h2 / current_h3 / current_h4 each hold the 'children' list that the next
# h2-child / h3-child / h4-child should be appended to. They reference the list of
# the node just created instead of looking it up by a positional counter, so the
# nesting stays correct no matter how many nodes a list already holds.

# Start from an empty tree so re-running this cell does not append duplicates
current_h1.clear()

# Until a heading of a given level has been seen, fall back to the root's children.
# This also gives a ul that appears before the first h2 somewhere to go.
current_h2 = current_h3 = current_h4 = ul_parent = current_h1

# Iterate over each result in ResultSet
for sibling in all_siblings:

    if sibling.name == 'ul':
        # Take only the ul's direct li's; process_ul recurses into nested ul's itself,
        # so a recursive search here would add nested items a second time at this level
        li_results = sibling.find_all('li', recursive=False)
        process_ul(li_results, ul_parent)
        continue

    if sibling.name not in ('h2', 'h3', 'h4'):
        # Not a heading level handled below; leave the tree untouched
        continue

    # Heading node: name without the trailing "[edit]", link (href) if one exists,
    # and an empty list of children (even if it won't be populated)
    node = {'name': remove_edit(sibling.text),
            'url': get_link(sibling),
            'children': []
           }

    if sibling.name == 'h2':
        current_h1.append(node)
        # A new h2 closes any open h3/h4: deeper headings now nest under this h2
        current_h2 = current_h3 = current_h4 = node['children']

    elif sibling.name == 'h3':
        current_h2.append(node)
        # A new h3 closes any open h4
        current_h3 = current_h4 = node['children']

    else:
        # h4
        current_h3.append(node)
        current_h4 = node['children']

    # A ul that comes next belongs to the heading just added
    ul_parent = node['children']

In [10]:
# Display the assembled tree to inspect the nesting
root

{'name': 'List of lists of lists',
 'children': [{'name': 'General reference',
   'url': '/w/index.php?title=List_of_lists_of_lists&action=edit&section=1',
   'children': [{'name': 'Lists of academic journals',
     'url': '/wiki/Lists_of_academic_journals',
     'children': []},
    {'name': 'Lists of important publications in science',
     'url': '/wiki/Lists_of_important_publications_in_science',
     'children': []},
    {'name': 'Lists of unsolved problems',
     'url': '/wiki/Lists_of_unsolved_problems',
     'children': []}]},
  {'name': 'Culture and the arts',
   'url': '/w/index.php?title=List_of_lists_of_lists&action=edit&section=2',
   'children': [{'name': 'Literature',
     'url': '/w/index.php?title=List_of_lists_of_lists&action=edit&section=3',
     'children': [{'name': 'Lists of books',
       'url': '/wiki/Lists_of_books',
       'children': [{'name': 'Lists of 100 best books',
         'url': '/wiki/Lists_of_100_best_books',
         'children': []},
        {'name'

In [11]:
# Store the whole tree in MongoDB as a single document
collection.insert_one(document=root)

<pymongo.results.InsertOneResult at 0x2a41fc555c8>