In [1]:
import requests 
from bs4 import BeautifulSoup
import pymongo
import asyncio
from dotenv import dotenv_values
import logging
import aiohttp
import re
from time import sleep
from random import choice

## Parsing approach
To implement the Wiki parser I decided to use the requests + BeautifulSoup libraries, since Wikipedia pages are not generated dynamically and have no parsing protection (they just kindly ask not to send too many requests in a short period of time).

### Parser implementation
I implemented and used this code in a .py file (because it was much simpler to profile and to run in automatic mode); here is just a copy of it. 

The core parser idea is: 
1) Extract minimum information of films and put it with urls in db
2) Traverse the db, check whether each film has all required fields (title, director, box office...)
    and only if it has not we parse it by its url. 

This approach has saved me a lot of time: it is quite common for the parser to hit some "strange" unparsable page and crash with an error, and without the intermediate state stored in the db all progress would be lost. 

#### Methods: 
1) Class constructor
Opens a config file, to get mongo-db login & password that stored in .env file: 
    ```
    LOGIN=some_login
    PASSWORD=some_password
    ```

    Creates an async connection to the db (the whole parser is written in async mode to speed up parsing; in my tests it was about 2.5 times faster)

2) extract_film_urls method
    - retrieves the page
    - searches for the target table
    - extracts only the url and box_office fields (box_office was simpler to parse from here than from the film page)
    - stores all of them in DB

3) parse_films_data method
    - checks whether db has unparsed films (title field missing)
    - if there are films to parse: 
    -  parses data of each missing film using Parser.parse_film_data
    - puts data in db

4) parse_film_data method <br>
    Core method of the parser: 
    - wait a random amount of time (if we do not, Wikipedia may ignore our page requests); 0-0.7 seconds is enough. 
    - retrieve whole page
    - get table (each film page has special table with all data needed for assignment)
    - extract all fields (except image) from that table in dictionary   
    - **Title** - can be extracted easily, nothing to say. 
    - **Release date** - Just asked llm to write me a regex pattern to match (as date-format differs from page to page)
    - **Country** - was the hardest field to parse, as it has many variations from page to page (even the field name may differ). To handle it properly, I first wrote the simplest code that parsed everything it could, collected the urls of pages that were parsed badly (raising an error on them), looked at what could be done for those pages... and repeated this until there were no unparsable pages left. 
    - **Director** - there were also problems with this field. I hardcoded the extraction of all strings from the field and observed that "Animation director:" and similar sub-headings always contain ":", so I dropped every entry that has ":".  


5) fetch_page method <br>
    As requests does not support async requests, I found this async implementation. It plays the same role as requests.get, but can work in async mode. 

In [4]:
class Parser:
    """Scrape film data from Wikipedia into the ``films.films`` MongoDB collection.

    The work is split in two resumable stages:

    1. ``extract_film_urls`` stores one ``{'url', 'box_office'}`` document
       per film found in the listing table.
    2. ``parse_films_data`` visits every stored film that has no ``title``
       yet and fills in ``title``, ``release_year``, ``director`` and
       ``country`` from the film page's infobox.

    MongoDB credentials are read from the ``LOGIN`` and ``PASSWORD`` keys of
    a ``.env`` file in the working directory.
    """

    # Upper bound on film pages processed at the same time.
    MAX_CONCURRENT_REQUESTS = 5
    # Pool of delays (seconds); one is picked at random before each film request.
    REQUEST_DELAYS = (0, 0.15, 0.4, 0.5, 0.7)
    # Timeout (seconds) for the synchronous listing-page download.
    REQUEST_TIMEOUT = 30
    # Four-digit year in the range 1700-2199.
    YEAR_PATTERN = re.compile(r'\b(1[7-9]\d{2}|2[0-1]\d{2})\b')

    def __init__(self):
        """Configure file logging, load ``.env`` and create the Mongo client.

        Raises:
            KeyError: if ``LOGIN`` or ``PASSWORD`` is missing from ``.env``.
        """
        import os
        from urllib.parse import quote_plus

        # basicConfig cannot open the log file if its directory is missing.
        os.makedirs('logs', exist_ok=True)
        logging.basicConfig(filename='logs/parser.log', level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        self.logger.info('Parser started.')
        self.config = dotenv_values(".env")

        self.listing_url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"
        # Credentials embedded in a MongoDB URI must be percent-escaped,
        # otherwise characters such as '@', ':' or '/' corrupt the URI.
        login = quote_plus(self.config['LOGIN'])
        password = quote_plus(self.config['PASSWORD'])
        self.mongo_connection = pymongo.AsyncMongoClient(f"mongodb://{login}:{password}@127.0.0.1")

    async def extract_film_urls(self, url: str):
        """Store the URL and box office of every new film listed at ``url``.

        The largest ``wikitable plainrowheaders`` table on the page (by row
        count) is taken as the film table. Films whose URL is already in
        the collection are skipped. Returns ``None``; failures to fetch or
        locate the table are logged and end the job early.

        Args:
            url: address of the listing page.
        """
        self.logger.info('JOB: extract_film_urls')

        collection = self.mongo_connection['films'].films

        # NOTE: requests is synchronous, so the event loop is blocked while
        # the listing page downloads; the timeout keeps that bounded.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)

        if response.status_code != 200:
            self.logger.error('Could not get page.')
            return

        content = BeautifulSoup(response.content, 'html.parser')
        tables = content.find_all('table', class_='wikitable plainrowheaders')
        if not tables:
            self.logger.error('Could not find the films table.')
            return
        table = max(tables, key=lambda candidate: len(candidate.find_all('tr')))

        new_films = []
        seen_urls = set()
        # The first row is the table header.
        for row in table.find_all('tr')[1:]:
            title_cell = row.find('i')
            link = title_cell.find('a') if title_cell is not None else None
            cells = row.find_all('td')
            if link is None or link.get('href') is None or len(cells) < 2:
                self.logger.warning('Skipping a table row without a film link or box office cell.')
                continue

            film_url = 'https://en.wikipedia.org' + link.get('href')
            box_office = self._parse_box_office(cells[1].text)
            if box_office is None:
                self.logger.warning(f'Skipping {film_url}: no box office value found.')
                continue

            # Do not insert the same film twice within a single run.
            if film_url in seen_urls:
                continue
            seen_urls.add(film_url)

            if await collection.find_one({"url": film_url}) is None:
                new_films.append({'url': film_url, 'box_office': box_office})

        self.logger.info(f'Found {len(new_films)} new films.')
        if not new_films:
            return

        self.logger.info('Inserting new films to database...')
        await collection.insert_many(new_films)
        self.logger.info('Successfully inserted.')

    async def parse_films_data(self):
        """Parse every stored film that has a ``url`` but no ``title`` yet.

        Each film is written to the database as soon as its page has been
        parsed. A film whose page cannot be fetched or parsed is logged and
        left untouched, so it does not discard the results of the other
        films and is picked up again on the next run. Returns ``None``.
        """
        self.logger.info('JOB: parse_films_data')

        collection = self.mongo_connection['films'].films

        films = await collection.find({"url": {"$exists": True}, "title": {"$exists": False}}).to_list(length=None)
        if not films:
            return

        # Caps how many film pages are in flight at once.
        limiter = asyncio.Semaphore(self.MAX_CONCURRENT_REQUESTS)

        async def parse_and_store(film):
            async with limiter:
                data = await self.parse_film_data(film['url'])
            await collection.update_one({"_id": film['_id']}, {"$set": data})

        # return_exceptions=True: one bad page must not abort the others.
        outcomes = await asyncio.gather(*(parse_and_store(film) for film in films),
                                        return_exceptions=True)

        failed = 0
        for film, outcome in zip(films, outcomes):
            if isinstance(outcome, BaseException):
                failed += 1
                self.logger.error('Could not parse %s: %r', film['url'], outcome, exc_info=outcome)
        self.logger.info(f'Parsed {len(films) - failed} films, {failed} failed.')

    async def parse_film_data(self, url: str):
        """Fetch one film page and extract its data from the infobox.

        Args:
            url: address of the film page.

        Returns:
            dict with ``title`` (str), ``release_year`` (int), ``director``
            (list of str) and ``country`` (str).

        Raises:
            ValueError: if the page has no usable infobox or no year can be
                found in the release date field.
            KeyError: if a required infobox field is missing.
        """
        self.logger.info(f'JOB:      parse_film_data {url}')
        # Non-blocking pause: other coroutines keep running meanwhile.
        await asyncio.sleep(choice(self.REQUEST_DELAYS))

        page_content = await self.fetch_page(url)
        content = BeautifulSoup(page_content, 'html.parser')

        # title, release_year, director, country
        infobox = content.find('table', class_='infobox vevent')
        if infobox is None:
            raise ValueError(f'{url}: film infobox not found')
        body = infobox.find('tbody')
        rows = (infobox if body is None else body).find_all('tr')
        if not rows or rows[0].find('th') is None:
            raise ValueError(f'{url}: film infobox has no title row')

        # Map each row header (e.g. 'Directed by') to its value cell.
        fields = {}
        for row in rows[1:]:
            header = row.find('th')
            if header is not None:
                fields[header.text] = row.find('td')

        film = dict()
        # title processing
        film['title'] = rows[0].find('th').text

        # release_year processing
        release_cell = self._require_field(fields, url, 'Release dates', 'Release date')
        release_year = self._extract_release_year(release_cell.text)
        if release_year is None:
            raise ValueError(f'{url}: no year found in release date field')
        film['release_year'] = release_year

        # director processing
        director_cell = self._require_field(fields, url, 'Directed by')
        if director_cell.find('li') is not None:
            # Entries containing ':' are sub-headings, not director names.
            film['director'] = [self._strip_parenthetical(item.text)
                                for item in director_cell.find_all('li') if ':' not in item.text]
        else:
            link = director_cell.find('a')
            # The director is not always a link; fall back to the cell text.
            film['director'] = [link.text if link is not None else director_cell.get_text(strip=True)]

        # country processing
        country_cell = fields.get('Country')
        if country_cell is not None:
            country = country_cell.text
        else:
            countries_cell = self._require_field(fields, url, 'Countries')
            first_item = countries_cell.find('li')
            if first_item is not None:
                country = first_item.text
            else:
                country = self._leading_text(str(countries_cell))
        film['country'] = self._strip_footnote(country)

        self.logger.info(f'JOB DONE: parse_film_data {url}')
        return film

    async def fetch_page(self, url):
        """Return the body of ``url`` as text, downloaded without blocking.

        Raises:
            aiohttp.ClientResponseError: if the server answers with an HTTP
                error status (400 or above).
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                # Do not hand an error page to the HTML parser.
                response.raise_for_status()
                return await response.text()

    @staticmethod
    def _parse_box_office(text):
        """Return the digits of the amount in ``text``, or ``None`` if absent.

        Thousands separators are removed and the first run of digits after
        the '$' sign is returned, so markers placed before the '$' are
        ignored. Without a '$' the first character is skipped instead.
        """
        cleaned = text.replace(',', '')
        _, dollar, after_dollar = cleaned.partition('$')
        candidate = after_dollar if dollar else cleaned[1:]
        found = re.search(r'\d+', candidate)
        return found.group() if found else None

    @classmethod
    def _extract_release_year(cls, text):
        """Return the first year (1700-2199) in ``text`` as int, or ``None``."""
        found = cls.YEAR_PATTERN.search(text)
        return int(found.group(1)) if found else None

    @staticmethod
    def _strip_parenthetical(name):
        """Return ``name`` without a trailing '(...)' remark."""
        if '(' not in name:
            return name
        return name.split('(', 1)[0].rstrip()

    @staticmethod
    def _strip_footnote(text):
        """Return ``text`` cut just before the first '[' (footnote marker)."""
        return text.split('[', 1)[0]

    @staticmethod
    def _leading_text(markup):
        """Return the text between the opening tag of ``markup`` and the next tag."""
        start = markup.find('>') + 1
        end = markup.find('<', 1)
        if end == -1:
            end = 0
        return markup[start:end]

    @staticmethod
    def _require_field(fields, url, *names):
        """Return the first non-``None`` value of ``fields`` among ``names``.

        Raises:
            KeyError: if none of ``names`` has a value in ``fields``.
        """
        for name in names:
            cell = fields.get(name)
            if cell is not None:
                return cell
        raise KeyError(f'{url}: infobox has none of the fields {names}')