# Part 1
Data Extraction (Web Scraping)

In [3]:
# %pip install requests
# %pip install beautifulsoup4
# %pip install lxml

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_movie_data():
    """Scrape Wikipedia's highest-grossing films table into ``movies_raw.csv``.

    Downloads the article, takes the first ``wikitable`` on the page, uses the
    cells of the table's first row as column names and every later non-empty
    row as data, then writes the result to ``movies_raw.csv`` without an index
    column. All values are kept as stripped cell text; no cleaning is done here.

    Returns:
        pandas.DataFrame: The scraped table, exactly as written to the CSV.

    Raises:
        requests.exceptions.RequestException: If the request times out, fails,
            or the server answers with an HTTP error status.
        ValueError: If the page has no ``wikitable``, or the table has no
            header cells or no data rows.
    """
    url = "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"
    # Use a real browser header to avoid 403 blocks
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Edge/91.0'}

    # Bound the wait and fail loudly on 4xx/5xx instead of parsing an error page.
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    # Find the specific table for "Highest-grossing films"
    # Usually, it is the first 'wikitable' on this specific page
    movie_table = soup.find('table', class_='wikitable')
    if movie_table is None:
        raise ValueError("No table with class 'wikitable' found at " + url)

    table_rows = movie_table.find_all('tr')
    if not table_rows:
        raise ValueError("The 'wikitable' at " + url + " contains no rows")

    # Column names come from the first row only: data rows may also contain
    # <th> cells (row headers), which must not be mistaken for column names.
    column_names = [cell.text.strip() for cell in table_rows[0].find_all(['th', 'td'])]
    if not column_names:
        raise ValueError("The 'wikitable' at " + url + " has no header cells")

    rows = []
    # Loop through rows, skipping the header
    for tr in table_rows[1:]:
        row = [cell.text.strip() for cell in tr.find_all(['td', 'th'])]
        if row:
            rows.append(row)
    if not rows:
        raise ValueError("The 'wikitable' at " + url + " has no data rows")

    # Rows can come out shorter or longer than the header (e.g. merged cells);
    # pad with None / truncate so DataFrame construction cannot fail on width.
    width = len(column_names)
    rows = [row[:width] + [None] * (width - len(row)) for row in rows]

    # Create DataFrame
    # Note: Wikipedia table columns: Rank, Peak, Title, Worldwide gross, Year, Ref
    df = pd.DataFrame(rows, columns=column_names)

    df.to_csv("movies_raw.csv", index=False)
    print("Success! Data saved to movies_raw.csv")
    return df

# Run the scraper when this code is executed directly (the recorded output
# below shows it ran in this notebook); skipped if the code is imported.
if __name__ == "__main__":
    scrape_movie_data()

Success! Data saved to movies_raw.csv


# Part 2
Data Wrangling

In [6]:
import pandas as pd
import re

# Load the raw scrape that Part 1 wrote to disk; the steps below clean its
# columns by reassigning them on this same frame.
df = pd.read_csv("movies_raw.csv")

# 1. Clean Title: Remove any footnotes like [nb 1] or [1]
def clean_text(text):
    """Return ``text`` as a string with bracketed footnotes removed.

    The value is converted with ``str()``, every ``[...]`` span (matched
    non-greedily, e.g. ``[nb 1]`` or ``[1]``) is deleted, and leading and
    trailing whitespace is stripped from the result.
    """
    footnote_pattern = re.compile(r'\[.*?\]')
    without_footnotes = footnote_pattern.sub('', str(text))
    return without_footnotes.strip()

# Strip footnote markers from every title, one value at a time.
df['Title'] = df['Title'].map(clean_text)

# 2. Clean Gross Revenue: Remove $, commas, and citations
def clean_currency(value):
    """Convert a scraped gross figure such as ``"$2,923,706,026[# 1]"`` to an int.

    Bracketed citations/footnotes are removed first so that digits inside them
    (the ``1`` in ``[# 1]``) are not appended to the amount. After that, every
    remaining non-digit character ($, commas, letter prefixes, whitespace) is
    dropped.

    Args:
        value: Cell content; converted with ``str()`` before cleaning.

    Returns:
        int: The amount as an integer, or 0 when no digits remain (for example
        for an empty cell, ``None`` or NaN).
    """
    # Drop "[...]" spans before extracting digits; otherwise a citation number
    # would be glued onto the end of the revenue figure.
    without_citations = re.sub(r'\[.*?\]', '', str(value))
    # Keep only digits
    digits = re.sub(r'[^\d]', '', without_citations)
    return int(digits) if digits else 0

# The scraper's column notes spell this header "Worldwide gross", so match the
# scraped name case-insensitively and store it under the 'Worldwide Gross'
# name used here instead of failing with a KeyError on the capitalisation.
df = df.rename(columns={
    col: 'Worldwide Gross'
    for col in df.columns
    if str(col).strip().lower() == 'worldwide gross'
})
df['Worldwide Gross'] = df['Worldwide Gross'].apply(clean_currency)

# 3. Clean Year: Ensure it is a 4-digit integer
# Take the first run of four digits from each value; values without one become 0.
df['Year'] = (
    df['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .fillna('0')
    .astype('int64')
)

# 4. Feature Engineering: Create a "Decade" column
# Floor each year to the first year of its decade (e.g. 2009 -> 2000).
df['Decade'] = (df['Year'] // 10) * 10

df.to_csv("movies_cleaned.csv", index=False)