In [None]:
from IPython.display import Javascript

def keep_alive():
    """Display a JavaScript snippet that logs to the browser console once a minute.

    The script defines ``keepAlive``, which writes "Keeping Colab alive..." to
    the console and re-schedules itself with ``setTimeout`` every 60000 ms.

    NOTE(review): the script only logs; whether that is enough to stop a Colab
    session from being disconnected is not established by this code - confirm.
    NOTE(review): ``display`` is not imported here; presumably it is the one
    IPython provides in notebook cells.
    """
    script = '''
        function keepAlive() {
            console.log("Keeping Colab alive...");
            setTimeout(keepAlive, 60000);
        }
        keepAlive();
    '''
    display(Javascript(script))

# Start the self-rescheduling console logger as soon as this cell runs.
keep_alive()

In [None]:
!pip install selenium beautifulsoup4 pandas webdriver-manager fake-useragent
!apt-get update
!apt-get install -y chromium-browser
!apt install chromium-chromedriver
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb
!apt-get -f install -y

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from fake_useragent import UserAgent

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#import csv
#from concurrent.futures import ThreadPoolExecutor, as_completed
#from tenacity import retry, stop_after_attempt, wait_exponential

In [None]:
# Function to get a random user-agent
def get_random_user_agent():
    """Return the ``random`` attribute of a freshly created ``UserAgent``.

    A new ``UserAgent`` object is built on every call.
    """
    return UserAgent().random

# Build the Chrome options: command-line flags first, then experimental options
chrome_options = Options()
for _chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    f"user-agent={get_random_user_agent()}",
):
    chrome_options.add_argument(_chrome_flag)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Start the browser with a driver binary resolved by webdriver-manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)
driver.set_page_load_timeout(60)  # give each page up to 60 seconds to load

# Site root and the budget listing that is paginated below
base_url = "https://www.the-numbers.com"
budget_url = base_url + "/movie/budgets/all"
page_numbers = [*range(1, 6601, 100)]  # listing offsets 1, 101, ..., 6501

# Start from an empty DataFrame
df = pd.DataFrame()

# Function to extract table data
def extract_table_data(table):
    """Turn the two-cell rows of a table into a ``{label: value}`` dict.

    Args:
        table: An object exposing ``find_all("tr")`` whose rows expose
            ``find_all("td")`` (e.g. a BeautifulSoup tag), or a falsy value.

    Returns:
        A dict mapping the stripped text of the first cell to the stripped
        text of the second cell for every row that has exactly two ``td``
        cells. Rows with any other cell count are ignored, a repeated label
        keeps the last value seen, and a falsy ``table`` yields ``{}``.
    """
    if not table:
        return {}

    cell_groups = (tr.find_all("td") for tr in table.find_all("tr"))
    return {
        cells[0].text.strip(): cells[1].text.strip()
        for cells in cell_groups
        if len(cells) == 2
    }

# Function to scrape extra movie details
# CSS selectors tried, in this order, to find the movie-details table; the
# first selector that matches anything wins.
_DETAILS_TABLE_SELECTORS = (
    "#summary > table:nth-of-type(3)",
    "#summary_mobile > div > table:nth-child(11)",
    "#summary > table:nth-child(11)",
    "#summary > table:nth-child(13)",
    "#summary > table:nth-child(9)",
    "#summary > table:nth-child(6)",
    "#summary > table:nth-child(8)",
    "#summary > table:nth-child(4)",
    "#summary > table:nth-child(10)",
    "#summary > table:nth-child(12)",
    "#summary > table:nth-child(7)",
    "#summary > table:nth-child(5)",
)


def _table_to_dict(table, allow_extra_columns=False):
    """Map first-cell text to second-cell text for the key/value rows of ``table``.

    Rows with exactly two ``td`` cells are always used; rows with more than two
    are used only when ``allow_extra_columns`` is true. ``None`` yields ``{}``.
    """
    pairs = {}
    if table is None:
        return pairs
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) == 2 or (allow_extra_columns and len(cols) > 2):
            pairs[cols[0].get_text(strip=True)] = cols[1].get_text(strip=True)
    return pairs


def scrape_movie_details(movie_url, timeout=20):
    """Load a movie page in the shared ``driver`` and collect its key/value tables.

    Args:
        movie_url: Address of the movie page to open.
        timeout: Seconds to wait for the ``#movie_finances`` element to be
            present before giving up. Defaults to 20.

    Returns:
        A dict that starts with ``"Movie URL"`` and is then filled, in order,
        from the ``movie_finances`` table (rows with two or more cells), the
        first table under ``#summary`` and the first table matched by
        ``_DETAILS_TABLE_SELECTORS`` (rows with exactly two cells). Later
        tables overwrite earlier values stored under the same label.
        Any exception is printed and turned into an empty dict, so callers
        never see an exception from this function.
    """
    try:
        driver.get(movie_url)
        # Wait for the page to load
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#movie_finances"))
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")

        details = {"Movie URL": movie_url}

        # Financial information: the only table where wider rows are accepted
        details.update(
            _table_to_dict(soup.find("table", id="movie_finances"), allow_extra_columns=True)
        )

        # Metrics
        details.update(_table_to_dict(soup.select_one("#summary > table:nth-of-type(1)")))

        # Movie details: the generator is lazy, so selectors after the first
        # match are never evaluated
        candidates = (soup.select_one(selector) for selector in _DETAILS_TABLE_SELECTORS)
        details_table = next((found for found in candidates if found is not None), None)
        details.update(_table_to_dict(details_table))

        return details
    except Exception as e:
        print(f"Error scraping {movie_url}: {e}")
        return {}

# Loop through all budget pages
# Collect one dict per movie and build the DataFrame once at the end instead of
# concatenating a one-row frame per movie (which re-copies the frame each time).
# If the run is interrupted, `records` still holds everything scraped so far.
records = []
headers = None  # listing-table column names, captured from the first page that loads

try:
    for page in page_numbers:
        url = f"{budget_url}/{page}" if page != 1 else budget_url
        print(f"Scraping: {url}")

        try:
            driver.get(url)
            # Wait for the table to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find("table")

            # Extract headers (only once)
            if headers is None:
                headers = [header.text.strip() for header in table.find_all("th")] + ["Movie URL"]

            # Extract table rows
            for row in table.find_all("tr")[1:]:  # Skip header row
                row_data = [cell.text.strip() for cell in row.find_all("td")]
                if not row_data:
                    continue

                # Find movie link inside <b> <a> tag
                bold_tag = row.find("b")
                movie_link_tag = bold_tag.find("a") if bold_tag is not None else None
                movie_link = f"{base_url}{movie_link_tag['href']}" if movie_link_tag is not None else "N/A"
                row_data.append(movie_link)

                # A row whose cell count does not match the header cannot be
                # labelled reliably; skip it rather than abandon the whole page.
                if len(row_data) != len(headers):
                    print(f"Skipping row with {len(row_data)} values (expected {len(headers)}) on {url}")
                    continue
                record = dict(zip(headers, row_data))

                # Details are fetched for this row's own link only, so a row
                # without a link never reuses the previous movie's details.
                if movie_link != "N/A":
                    extra_details = scrape_movie_details(movie_link)
                    for key, value in extra_details.items():
                        # Listing columns (including "Movie URL") take precedence
                        # over a detail label with the same name.
                        if key != "Movie URL":
                            record.setdefault(key, value)

                records.append(record)
        except Exception as e:
            # Best effort: report the failure and move on to the next page
            print(f"Error scraping {url}: {e}")
finally:
    # Close browser even when the loop is interrupted
    driver.quit()

# Columns are the union of all record keys, in order of first appearance
df = pd.DataFrame(records)

# Save to CSV
csv_path = "combined_movie_budgets.csv"
df.to_csv(csv_path, index=False)

print(f"Scraping completed! CSV file saved: {csv_path}")

In [None]:
# Rescraping the missing values

# Function to get a random user-agent
def get_random_user_agent():
    """Create a ``UserAgent`` and hand back its ``random`` attribute.

    NOTE(review): this repeats the definition from the earlier scraping cell.
    """
    generator = UserAgent()
    user_agent_string = generator.random
    return user_agent_string
# Build the Chrome options: command-line flags first, then experimental options
chrome_options = Options()
for _chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    f"user-agent={get_random_user_agent()}",
):
    chrome_options.add_argument(_chrome_flag)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)

# Start the browser with a driver binary resolved by webdriver-manager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)
driver.set_page_load_timeout(120)  # give each page up to 120 seconds to load

# Function to scrape extra movie details
# CSS selectors tried, in this order, to find the movie-details table; the
# first selector that matches anything wins.
_DETAILS_TABLE_SELECTORS = (
    "#summary > table:nth-of-type(3)",
    "#summary_mobile > div > table:nth-child(11)",
    "#summary > table:nth-child(11)",
    "#summary > table:nth-child(13)",
    "#summary > table:nth-child(9)",
    "#summary > table:nth-child(6)",
    "#summary > table:nth-child(8)",
    "#summary > table:nth-child(4)",
    "#summary > table:nth-child(10)",
    "#summary > table:nth-child(12)",
    "#summary > table:nth-child(7)",
    "#summary > table:nth-child(5)",
)


def _table_to_dict(table, allow_extra_columns=False):
    """Map first-cell text to second-cell text for the key/value rows of ``table``.

    Rows with exactly two ``td`` cells are always used; rows with more than two
    are used only when ``allow_extra_columns`` is true. ``None`` yields ``{}``.
    """
    pairs = {}
    if table is None:
        return pairs
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) == 2 or (allow_extra_columns and len(cols) > 2):
            pairs[cols[0].get_text(strip=True)] = cols[1].get_text(strip=True)
    return pairs


def scrape_movie_details(movie_url, timeout=120):
    """Load a movie page in the shared ``driver`` and collect its key/value tables.

    Args:
        movie_url: Address of the movie page to open.
        timeout: Seconds to wait for the ``#movie_finances`` element to be
            present before giving up. Defaults to 120 for this rescrape pass.

    Returns:
        A dict that starts with ``"Movie URL"`` and is then filled, in order,
        from the ``movie_finances`` table (rows with two or more cells), the
        first table under ``#summary`` and the first table matched by
        ``_DETAILS_TABLE_SELECTORS`` (rows with exactly two cells). Later
        tables overwrite earlier values stored under the same label.
        Any exception is printed and turned into an empty dict, so callers
        never see an exception from this function.
    """
    try:
        driver.get(movie_url)
        # Wait for the page to load
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#movie_finances"))
        )
        soup = BeautifulSoup(driver.page_source, "html.parser")

        details = {"Movie URL": movie_url}

        # Financial information: the only table where wider rows are accepted
        details.update(
            _table_to_dict(soup.find("table", id="movie_finances"), allow_extra_columns=True)
        )

        # Metrics
        details.update(_table_to_dict(soup.select_one("#summary > table:nth-of-type(1)")))

        # Movie details: the generator is lazy, so selectors after the first
        # match are never evaluated
        candidates = (soup.select_one(selector) for selector in _DETAILS_TABLE_SELECTORS)
        details_table = next((found for found in candidates if found is not None), None)
        details.update(_table_to_dict(details_table))

        return details
    except Exception as e:
        print(f"Error scraping {movie_url}: {e}")
        return {}

# Load the existing CSV file. Every column is read as text so that scraped
# strings can be written back into any column without clashing with an
# inferred numeric dtype, and so untouched values are saved exactly as read.
csv_path = "combined_movie_budgets.csv"
df = pd.read_csv(csv_path, dtype=str)

try:
    # Identify rows with missing 'Genre' values
    missing_genre_rows = df[df['Genre:'].isna()]

    # Loop through rows with missing 'Genre' and rescrape
    for index, row in missing_genre_rows.iterrows():
        movie_url = row['Movie URL']
        # read_csv parses the "N/A" placeholder as NaN by default, so a plain
        # comparison with "N/A" never filters anything; require a real string.
        if not isinstance(movie_url, str) or movie_url == "N/A":
            continue

        print(f"Rescraping: {movie_url}")
        extra_details = scrape_movie_details(movie_url)

        # Update only columns that already exist in the DataFrame
        for column, value in extra_details.items():
            if column in df.columns:
                df.at[index, column] = value
finally:
    # Close browser even when the loop is interrupted
    driver.quit()

# Save the updated DataFrame back to the CSV file
df.to_csv(csv_path, index=False)

print(f"Rescraping completed! CSV file updated: {csv_path}")