In [54]:
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find('table', {'class': 'wikitable'})

films_data = []

for row in table.find_all('tr')[1:]:  
    columns = row.find_all('td')
    
    if len(columns) > 4:  
        title_tag = row.find('th').find('a')  
        title = title_tag.get_text(strip=True) if title_tag else None
        film_url = "https://en.wikipedia.org" + title_tag['href'] if title_tag else None
        
        release_year = columns[3].get_text(strip=True)
        
        box_office_revenue = columns[2].get_text(strip=True)
        
        country_of_origin = columns[4].get_text(strip=True)
        
        if film_url:
            film_response = requests.get(film_url)
            film_soup = BeautifulSoup(film_response.content, "html.parser")
            
            director_tag = film_soup.find('th', string='Directed by')
            directors = None
            if director_tag:
                # print(director_tag.find_next('td').get_text())
                directors_raw = director_tag.find_next('td').get_text(strip=True)
                directors_cleaned = re.sub(r'\[\d+\]', '', directors_raw)
                directors_split = re.sub(r'([a-z])([A-Z])', r'\1, \2', directors_cleaned)
                directors = directors_split.split(',')[0].strip()

                
                # print(directors)
            
            country_tag = film_soup.find('th', string='Country')
            if country_tag is None:
                country_tag = film_soup.find('th', string='Countries')
            film_country = None
            if country_tag:
                directors_raw = country_tag.find_next('td').get_text(strip=True)
                directors_cleaned = re.sub(r'\[\d+\]', '', directors_raw)
                directors_split = re.sub(r'([a-z])([A-Z])', r'\1, \2', directors_cleaned)
                film_country = directors_split.split(',')[0].strip()

        film_data = {
            "Title": title,
            "Release Year": int(release_year) if release_year.isdigit() else None,
            "Directors": directors,
            "Box Office Revenue": box_office_revenue,
            "Country of Origin": film_country or country_of_origin  
        }
        # print(film_data)
        films_data.append(film_data)

for film in films_data:
    print(film)


{'Title': 'Avatar', 'Release Year': 2009, 'Directors': 'James Cameron', 'Box Office Revenue': '$2,923,706,026', 'Country of Origin': 'United Kingdom'}
{'Title': 'Avengers: Endgame', 'Release Year': 2019, 'Directors': 'Anthony Russo', 'Box Office Revenue': '$2,797,501,328', 'Country of Origin': 'United States'}
{'Title': 'Avatar: The Way of Water', 'Release Year': 2022, 'Directors': 'James Cameron', 'Box Office Revenue': '$2,320,250,281', 'Country of Origin': 'United States'}
{'Title': 'Titanic', 'Release Year': 1997, 'Directors': 'James Cameron', 'Box Office Revenue': 'T$2,257,844,554', 'Country of Origin': 'United States'}
{'Title': 'Star Wars: The Force Awakens', 'Release Year': 2015, 'Directors': 'J. J. Abrams', 'Box Office Revenue': '$2,068,223,624', 'Country of Origin': 'United States'}
{'Title': 'Avengers: Infinity War', 'Release Year': 2018, 'Directors': 'Anthony Russo', 'Box Office Revenue': '$2,048,359,754', 'Country of Origin': 'United States'}
{'Title': 'Ne Zha 2', 'Release 

In [55]:
import sqlite3
import requests
from bs4 import BeautifulSoup
import re

# Initialize SQLite database
conn = sqlite3.connect('films.db')
cursor = conn.cursor()

# Create films table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS films (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT NOT NULL,
        release_year INTEGER,
        director TEXT,
        box_office TEXT,
        country TEXT
    )
''')

for film in films_data:
    cursor.execute('''
            INSERT INTO films (title, release_year, director, box_office, country)
            VALUES (?, ?, ?, ?, ?)
        ''', (film['Title'], int(film['Release Year']) if release_year.isdigit() else None, film['Directors'], film['Box Office Revenue'], film['Country of Origin']))


conn.commit()
conn.close()

print("Data inserted into the films table successfully.")


Data inserted into the films table successfully.


In [59]:
import sqlite3
import json

conn = sqlite3.connect('films.db')
cursor = conn.cursor()

cursor.execute("SELECT * FROM films")

films = cursor.fetchall()
films_data  = []
for film in films:
    film_data = {
            "Title": film[1],
            "Release Year": film[2],
            "Directors": film[3],
            "Box Office Revenue": film[4],
            "Country of Origin": film[5] 
        }
    films_data.append(film_data)
    
with open('films_data.json', 'w') as json_file:
    json.dump(films_data, json_file, indent=4)
conn.close()
