## Part #1: Parsing and saving data

Using the requests library, I fetch the HTML of the Wikipedia page.
Then, using BeautifulSoup, the table of highest-grossing films is located, parsed, and preprocessed.

In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import pandas as pd

# Download the Wikipedia list of highest-grossing films and turn its first
# "wikitable" into a DataFrame with one row per film.
url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(f"No table with class 'wikitable' found at {url}")

rows = table.find_all('tr')[1:]  # Skip header
films = []

for row in rows:
    cols = row.find_all('td')

    # The film title is read from the `title` attribute of the link inside
    # the row's italic element.
    italic = row.find('i')
    link = italic.find('a') if italic is not None else None
    title = link.get('title') if link is not None else None

    # Rows that do not look like a film entry (no title link, or fewer than
    # the four <td> cells indexed below) are skipped rather than crashing.
    if title is None or len(cols) < 4:
        continue

    title = str(title).replace('[', '').replace(']', '')
    year = cols[3].text.strip()
    director = '-'  # TODO: later

    # Keep only the text after the last '$' (dropping any prefix in the cell)
    # and remove the thousands separators.
    revenue = cols[2].text.strip().split('$')[-1].replace(',', '')

    country_of_origin = director  # TODO: later

    films.append({
        'title': title,
        'release_year': year,
        'director': director,
        'box_office': revenue,
        'country': country_of_origin,
    })

# Naming the columns explicitly keeps the frame's layout stable even when no
# rows were parsed.
df = pd.DataFrame(
    films,
    columns=['title', 'release_year', 'director', 'box_office', 'country'],
)

In [70]:
# Store the scraped films in the SQLite database at ../data/films.db.
conn = sqlite3.connect('../data/films.db')
try:
    cursor = conn.cursor()

    # Recreate the table on every run so that re-running this cell replaces
    # the previous contents instead of duplicating rows, and so that a table
    # left behind with a different layout does not linger.
    cursor.execute('DROP TABLE IF EXISTS films')

    # Create table
    cursor.execute('''
        CREATE TABLE films (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            release_year INT,
            director TEXT,
            box_office REAL,
            country TEXT
        )
    ''')

    # Insert data
    # 'append' inserts into the table defined above. 'replace' would drop it
    # and let pandas rebuild it from the DataFrame, discarding the `id` key
    # and the declared column types.
    df.to_sql('films', conn, if_exists='append', index=False)
    conn.commit()
finally:
    # Always release the database file handle, even if a statement failed.
    conn.close()

In [71]:
# Export the same DataFrame as JSON: a list holding one object per row,
# pretty-printed with a two-space indent.
df.to_json(
    path_or_buf='../data/films.json',
    orient='records',
    indent=2,
)