In [1]:
import pandas as pd
import numpy as np
import mysql.connector
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup


In [2]:
conn = mysql.connector.connect(
    host="127.0.0.1",
    user="Himanshu",
    password="12345",
    database="movies_db"
)
cursor = conn.cursor(buffered=True)

In [20]:
# SQL query to count null values for each column in movies_data table
sql_query = """
SELECT 
    COUNT(*) as total_rows,
    SUM(CASE WHEN title IS NULL THEN 1 ELSE 0 END) as title_nulls,
    SUM(CASE WHEN type IS NULL THEN 1 ELSE 0 END) as type_nulls,
    SUM(CASE WHEN director IS NULL THEN 1 ELSE 0 END) as director_nulls,
    SUM(CASE WHEN cast IS NULL THEN 1 ELSE 0 END) as cast_nulls,
    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) as country_nulls,
    SUM(CASE WHEN date_added IS NULL THEN 1 ELSE 0 END) as date_added_nulls,
    SUM(CASE WHEN release_year IS NULL THEN 1 ELSE 0 END) as release_year_nulls,
    SUM(CASE WHEN rating IS NULL THEN 1 ELSE 0 END) as rating_nulls,
    SUM(CASE WHEN duration IS NULL THEN 1 ELSE 0 END) as duration_nulls,
    SUM(CASE WHEN listed_in IS NULL THEN 1 ELSE 0 END) as listed_in_nulls,
    SUM(CASE WHEN description IS NULL THEN 1 ELSE 0 END) as description_nulls
FROM movies_data;
"""

# Execute the query using your existing cursor
cursor.execute(sql_query)
results = cursor.fetchall()

# Convert results to a more readable format
column_names = [desc[0] for desc in cursor.description]
missing_values = dict(zip(column_names, results[0]))

# Print the results in a formatted way
print("\nMissing Values Analysis:")
print("-" * 50)
print(f"Total Rows: {missing_values['total_rows']}")
print("-" * 50)
for column, nulls in missing_values.items():
    if column != 'total_rows':
        percentage = (nulls / missing_values['total_rows']) * 100
        print(f"{column.replace('_nulls', '')}: {nulls} nulls ({percentage:.2f}%)")


Missing Values Analysis:
--------------------------------------------------
Total Rows: 22551
--------------------------------------------------
title: 0 nulls (0.00%)
type: 0 nulls (0.00%)
director: 8019 nulls (35.56%)
cast: 5107 nulls (22.65%)
country: 11390 nulls (50.51%)
date_added: 9362 nulls (41.51%)
release_year: 0 nulls (0.00%)
rating: 831 nulls (3.68%)
duration: 449 nulls (1.99%)
listed_in: 0 nulls (0.00%)
description: 4 nulls (0.02%)


As we can see that there are many null values in the columns director, cast, date_added, rating and durations.
Most of them can be derived from the internet.
So now what I will be doing is web scrapping using BeautifulSoup and requests package.

In [None]:
# SQL query to get 10 rows with NULL values in specified columns
sql_query = """
SELECT *
FROM movies_data
WHERE director IS NULL 
   OR cast IS NULL 
   OR country IS NULL 
   OR date_added IS NULL 
   OR rating IS NULL 
   OR duration IS NULL
LIMIT 10;
"""

# You can execute this query using your existing MySQL connection:
cursor.execute(sql_query)
results = cursor.fetchall()

# Convert results to a pandas DataFrame for better visualization
columns = [desc[0] for desc in cursor.description]
df_null_analysis = pd.DataFrame(results, columns=columns)

# Display the results
display(df_null_analysis)

In [None]:


# SQL query to fetch rows where 'type' is 'Movie' and any of the specified columns are NULL
query = """
    SELECT * FROM movies_data 
    WHERE type = 'Movie' 
    AND (director IS NULL OR cast IS NULL OR country IS NULL OR date_added IS NULL OR rating IS NULL)
"""

# Execute query
cursor.execute(query)

# Fetch all results
missing_data_rows = cursor.fetchall()

# Get column names from the table
column_names = [desc[0] for desc in cursor.description]

# Convert to DataFrame
df_missing_values_movies = pd.DataFrame(missing_data_rows, columns=column_names)

# Display the DataFrame
display(df_missing_values_movies)


In [3]:
# Delete after completion

# Set up WebDriver with options
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    # Open IMDb website
    driver.get("https://www.imdb.com/")

    # Locate search bar and type the movie name
    movie_title = "#blackAF"  # Change this for any other movie
    print(f"\n[INFO] Searching for '{movie_title}' on IMDb...")

    search_box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#suggestion-search"))
    )
    search_box.send_keys(movie_title)
    search_box.send_keys(Keys.RETURN)

    # Wait for search results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list li a"))
    )

    # Click the first result dynamically
    first_result = driver.find_element(By.CSS_SELECTOR, "ul.ipc-metadata-list li a")
    first_result.click()

    # Wait for the movie page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "li[data-testid='title-pc-principal-credit']"))
    )

    # Extract all directors (handles both single & multiple directors)
    director_elements = driver.find_elements(By.CSS_SELECTOR, "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) > div > ul > li a")
    
    # Extract text and clean up data
    directors = [element.text.strip() for element in director_elements] 
    print(directors)

    # if directors:
    #     print(f"[INFO] Director(s) of '{movie_title}': ", end="")
    #     for i, director in enumerate(directors):
    #         if i < len(directors) - 1:
    #             print(f"{director}, ", end="")
    #         else:
    #             print(director)
    # else:
    #     print(f"[WARNING] No director found for '{movie_title}'.")

except Exception as e:
    print(f"[ERROR] Unexpected issue: {str(e)}")

finally:
    # Close the browser
    driver.quit()


[INFO] Searching for '#blackAF' on IMDb...
['Kenya Barris']


### For getting director

In [None]:
# Delete after completion.

import requests
from bs4 import BeautifulSoup
import time

def get_movie_directors(movie_title):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }

    # Step 1: Construct the IMDb search URL
    search_url = f"https://www.imdb.com/find/?q={movie_title.replace(' ', '+')}&s=tt"
    print(f"[INFO] Searching for '{movie_title}' on IMDb...")
    
    time.sleep(2)  # Simulating delay

    # Step 2: Fetch search results page
    search_response = requests.get(search_url, headers=headers)
    search_soup = BeautifulSoup(search_response.text, "html.parser")

    # Step 3: Find the div containing the search results
    results_div = search_soup.select_one("div.sc-b03627f1-2.gWHDBT > ul")
    if not results_div:
        print("[WARNING] No search results found!")
        return

    # Step 4: Iterate over each result to find the exact match
    movie_url = None
    for result in results_div.find_all("li"):
        movie_tag = result.select_one("div.ipc-metadata-list-summary-item__c div a")
        if movie_tag:
            movie_name = movie_tag.text.strip()
            if movie_name.lower() == movie_title.lower():  # Case insensitive comparison
                movie_url = "https://www.imdb.com" + movie_tag["href"]
                break

    if not movie_url:
        print(f"[WARNING] No exact match found for '{movie_title}'.")
        return

    print(f"[INFO] Found exact match: {movie_url}")

    time.sleep(2)  # Simulating delay

    # Step 5: Open the movie page and extract director(s)
    movie_response = requests.get(movie_url, headers=headers)
    movie_soup = BeautifulSoup(movie_response.text, "html.parser")

    director_elements = movie_soup.select("#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) > div > ul > li a")
    directors = [director.text.strip() for director in director_elements]

    if directors:
        print(f"[INFO] Director(s) of '{movie_title}': {', '.join(directors)}")
    else:
        print(f"[WARNING] No director found for '{movie_title}'.")

# Run the function
get_movie_directors("Dhoom")


[INFO] Searching for '#blackAF' on IMDb...


In [None]:
#Approved for getting the director's name

import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def get_movie_directors_fallback(movie_title):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }

    # Try BeautifulSoup First
    try:
        print(f"[INFO] Trying BeautifulSoup for '{movie_title}'...")
        search_url = f"https://www.imdb.com/find/?q={movie_title.replace(' ', '+')}&s=tt"
        time.sleep(2)

        search_response = requests.get(search_url, headers=headers)
        search_soup = BeautifulSoup(search_response.text, "html.parser")

        results_div = search_soup.select_one("div.sc-b03627f1-2.gWHDBT > ul")
        if not results_div:
            raise Exception("No results found with BeautifulSoup.")

        movie_url = None
        for result in results_div.find_all("li"):
            movie_tag = result.select_one("div.ipc-metadata-list-summary-item__c div a")
            if movie_tag and movie_tag.text.strip().lower() == movie_title.lower():
                movie_url = "https://www.imdb.com" + movie_tag["href"]
                break

        if not movie_url:
            raise Exception("Exact title not found using BeautifulSoup.")

        time.sleep(2)
        movie_response = requests.get(movie_url, headers=headers)
        movie_soup = BeautifulSoup(movie_response.text, "html.parser")

        selector = "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) > div > ul > li a"
        director_elements = movie_soup.select(selector)
        directors = [d.text.strip() for d in director_elements]

        if directors:
            print(f"[INFO] Found with BeautifulSoup: {', '.join(directors)}")
            return directors
        else:
            raise Exception("No director info with BeautifulSoup.")

    except Exception as bs_error:
        print(f"[WARNING] BeautifulSoup failed: {bs_error}")
        print("[INFO] Switching to Selenium...")

    # Fallback to Selenium with exact title match
    try:
        options = webdriver.ChromeOptions()
        # options.add_argument("--headless")
        options.add_argument("user-agent=" + headers["User-Agent"])
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        driver.get("https://www.imdb.com/")

        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#suggestion-search"))
        )
        search_box.send_keys(movie_title)
        search_box.send_keys(Keys.RETURN)

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list li"))
        )

        search_results = driver.find_elements(By.CSS_SELECTOR, "ul.ipc-metadata-list li")
        exact_match_element = None

        for result in search_results:
            try:
                title_element = result.find_element(By.CSS_SELECTOR, "a")
                if title_element.text.strip().lower() == movie_title.lower():
                    exact_match_element = title_element
                    break
            except:
                continue

        if not exact_match_element:
            raise Exception("Exact match not found in Selenium search results.")

        exact_match_element.click()

        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li[data-testid='title-pc-principal-credit']"))
        )

        selector = "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) > div > ul > li a"
        director_elements = driver.find_elements(By.CSS_SELECTOR, selector)
        directors = [el.text.strip() for el in director_elements]

        driver.quit()

        if directors:
            print(f"[INFO] Found with Selenium: {', '.join(directors)}")
            return directors
        else:
            print("[WARNING] No directors found with Selenium.")
            return []

    except Exception as se_error:
        print(f"[ERROR] Selenium also failed: {se_error}")
        return []

# Example test
get_movie_directors_fallback("Dhoom")


[INFO] Trying BeautifulSoup for 'Dhoom'...
[INFO] Switching to Selenium...
[INFO] Found with Selenium: Sanjay Gadhvi


['Sanjay Gadhvi']

### For getting cast

In [33]:
import requests
from bs4 import BeautifulSoup
import time

def get_movie_cast(movie_title):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }

    # Step 1: Construct the IMDb search URL
    search_url = f"https://www.imdb.com/find/?q={movie_title.replace(' ', '+')}&s=tt"
    print(f"[INFO] Searching for '{movie_title}' on IMDb...")

    time.sleep(2)  # Simulating delay

    # Step 2: Fetch search results page
    search_response = requests.get(search_url, headers=headers)
    search_soup = BeautifulSoup(search_response.text, "html.parser")

    # Step 3: Find the div containing the search results
    results_div = search_soup.select_one("div.sc-b03627f1-2.gWHDBT > ul")
    if not results_div:
        print("[WARNING] No search results found!")
        return

    # Step 4: Iterate over each result to find the exact match
    movie_url = None
    for result in results_div.find_all("li"):
        movie_tag = result.select_one("div.ipc-metadata-list-summary-item__c div a")
        if movie_tag:
            movie_name = movie_tag.text.strip()
            if movie_name.lower() == movie_title.lower():  # Case insensitive comparison
                movie_url = "https://www.imdb.com" + movie_tag["href"]
                break

    if not movie_url:
        print(f"[WARNING] No exact match found for '{movie_title}'.")
        return

    print(f"[INFO] Found exact match: {movie_url}")

    time.sleep(2)  # Simulating delay

    # Step 5: Open the movie page and extract cast list
    movie_response = requests.get(movie_url, headers=headers)
    movie_soup = BeautifulSoup(movie_response.text, "html.parser")

    cast_elements = movie_soup.select("#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eZLbYw > div.sc-9a2a0028-6.iMFgzR > div.sc-9a2a0028-10.dFokEJ > section > div.sc-70a366cc-3.iwmAOx > div > ul > li.ipc-metadata-list__item.ipc-metadata-list__item--align-end.ipc-metadata-list-item--link > div > ul > li a")
    cast_list = [cast.text.strip() for cast in cast_elements]

    if cast_list:
        print(f"[INFO] Cast of '{movie_title}': {', '.join(cast_list)}")
    else:
        print(f"[WARNING] No cast found for '{movie_title}'.")

# Run the function
get_movie_cast("(T)ERROR")


[INFO] Searching for '(T)ERROR' on IMDb...
[INFO] Found exact match: https://www.imdb.com/title/tt4370922/?ref_=fn_ttl_ttl_1
[INFO] Cast of '(T)ERROR': Khalifah Ali Al-Akili, Mike Healey, Ali Kareem


In [None]:
# Approved for getting the cast data.

def get_movie_cast(movie_title):
    def try_with_beautifulsoup():
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }

        search_url = f"https://www.imdb.com/find/?q={movie_title.replace(' ', '+')}&s=tt"
        print(f"[INFO] [BS] Searching for '{movie_title}' on IMDb...")
        time.sleep(2)

        response = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        results_div = soup.select_one("div.sc-b03627f1-2.gWHDBT > ul")
        if not results_div:
            return None

        movie_url = None
        for result in results_div.find_all("li"):
            movie_tag = result.select_one("div.ipc-metadata-list-summary-item__c div a")
            if movie_tag and movie_tag.text.strip().lower() == movie_title.lower():
                movie_url = "https://www.imdb.com" + movie_tag["href"]
                break
        if not movie_url:
            return None

        print(f"[INFO] [BS] Found exact match: {movie_url}")
        time.sleep(2)

        movie_response = requests.get(movie_url, headers=headers)
        movie_soup = BeautifulSoup(movie_response.text, "html.parser")

        cast_elements = movie_soup.select(
            "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(3) > div > ul > li > a"
        )

        return [c.text.strip() for c in cast_elements if c.text.strip()]

    def try_with_selenium():
        print(f"[INFO] [SELENIUM] Trying Selenium for '{movie_title}'...")
        options = webdriver.ChromeOptions()
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
        # options.add_argument("--headless")  # Uncomment to run headless

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        try:
            driver.get("https://www.imdb.com/")
            search_box = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#suggestion-search")))
            search_box.send_keys(movie_title)
            search_box.send_keys(Keys.RETURN)

            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list li")))

            search_results = driver.find_elements(By.CSS_SELECTOR, "ul.ipc-metadata-list li")
            exact_match_element = None

            for result in search_results:
                try:
                    title_element = result.find_element(By.CSS_SELECTOR, "a")
                    if title_element.text.strip().lower() == movie_title.lower():
                        exact_match_element = title_element
                        break
                except:
                    continue

            if not exact_match_element:
                print("[WARNING] [SELENIUM] No exact match found.")
                return None

            exact_match_element.click()

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "section div.sc-70a366cc-3.iwmAOx"))
            )

            cast_elements = driver.find_elements(By.CSS_SELECTOR,
                "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(3) > div > ul > li > a"
            )
            return [c.text.strip() for c in cast_elements if c.text.strip()]

        except Exception as e:
            print(f"[ERROR] Selenium failed: {e}")
            return None
        finally:
            driver.quit()

    # Try BS4 first
    cast_list = try_with_beautifulsoup()

    # If fails, fallback to Selenium
    if not cast_list:
        cast_list = try_with_selenium()

    if cast_list:
        print(f"[INFO] Cast of '{movie_title}': {', '.join(cast_list)}")
        return cast_list
    else:
        print(f"[WARNING] No cast found for '{movie_title}'.")
        return []

# Example usage
get_movie_cast("Dhoom")


[INFO] [BS] Searching for 'Dhoom' on IMDb...
[INFO] [BS] Found exact match: https://www.imdb.com/title/tt0422091/?ref_=fn_ttl_ttl_2
[INFO] Cast of 'Dhoom': Abhishek Bachchan, John Abraham, Uday Chopra


['Abhishek Bachchan', 'John Abraham', 'Uday Chopra']

### For getting country

In [41]:
import requests
from bs4 import BeautifulSoup
import time

def get_movie_country(movie_title):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }

    # Step 1: Construct the IMDb search URL
    search_url = f"https://www.imdb.com/find/?q={movie_title.replace(' ', '+')}&s=tt"
    print(f"[INFO] Searching for '{movie_title}' on IMDb...")

    time.sleep(2)  # Simulating delay

    # Step 2: Fetch search results page
    search_response = requests.get(search_url, headers=headers)
    search_soup = BeautifulSoup(search_response.text, "html.parser")

    # Step 3: Find the div containing the search results
    results_div = search_soup.select_one("div.sc-b03627f1-2.gWHDBT > ul")
    if not results_div:
        print("[WARNING] No search results found!")
        return None

    # Step 4: Iterate over each result to find the exact match
    movie_url = None
    for result in results_div.find_all("li"):
        movie_tag = result.select_one("div.ipc-metadata-list-summary-item__c div a")
        if movie_tag:
            movie_name = movie_tag.text.strip()
            if movie_name.lower() == movie_title.lower():  # Case insensitive comparison
                movie_url = "https://www.imdb.com" + movie_tag["href"]
                break

    if not movie_url:
        print(f"[WARNING] No exact match found for '{movie_title}'.")
        return None

    print(f"[INFO] Found exact match: {movie_url}")

    time.sleep(2)  # Simulating delay

    # Step 5: Open the movie page and extract the first country
    movie_response = requests.get(movie_url, headers=headers)
    movie_soup = BeautifulSoup(movie_response.text, "html.parser")

    country_elements = movie_soup.select(
        "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > div > section > div > div.sc-17af1f66-1.jNEZQr.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(37) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(2) > div > ul > li > a"
    )

    if country_elements:
        first_country = country_elements[0].text.strip()
        print(f"[INFO] Country of '{movie_title}': {first_country}")
        return first_country
    else:
        print(f"[WARNING] No country found for '{movie_title}'.")
        return None

# Run the function
get_movie_country("#blackAF")


[INFO] Searching for '#blackAF' on IMDb...


In [19]:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_movie_country(movie_title):
    def try_with_beautifulsoup():
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }

        search_url = f"https://www.imdb.com/find/?q={movie_title.replace(' ', '+')}&s=tt"
        print(f"[INFO] [BS] Searching for '{movie_title}' on IMDb...")

        time.sleep(2)
        search_response = requests.get(search_url, headers=headers)
        search_soup = BeautifulSoup(search_response.text, "html.parser")

        results_div = search_soup.select_one("div.sc-b03627f1-2.gWHDBT > ul")
        if not results_div:
            print("[WARNING] [BS] No search results found!")
            return None

        movie_url = None
        for result in results_div.find_all("li"):
            movie_tag = result.select_one("div.ipc-metadata-list-summary-item__c div a")
            if movie_tag:
                movie_name = movie_tag.text.strip()
                if movie_name.lower() == movie_title.lower():
                    movie_url = "https://www.imdb.com" + movie_tag["href"]
                    break

        if not movie_url:
            print(f"[WARNING] [BS] No exact match found for '{movie_title}'.")
            return None

        print(f"[INFO] [BS] Found exact match: {movie_url}")
        time.sleep(2)

        movie_response = requests.get(movie_url, headers=headers)
        movie_soup = BeautifulSoup(movie_response.text, "html.parser")

        country_elements = movie_soup.select(
            "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > div > section > div > div.sc-17af1f66-1.jNEZQr.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(45) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(2) > div > ul > li > a"
        )

        if country_elements:
            return country_elements[0].text.strip()
        return None

    def try_with_selenium():
        print(f"[INFO] [SELENIUM] Trying Selenium for '{movie_title}'...")
        options = webdriver.ChromeOptions()
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")
        # options.add_argument("--headless")  # Optional

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        try:
            driver.get("https://www.imdb.com/")
            search_box = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#suggestion-search")))
            search_box.send_keys(movie_title)
            search_box.send_keys(Keys.RETURN)

            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list li")))

            search_results = driver.find_elements(By.CSS_SELECTOR, "ul.ipc-metadata-list li")
            exact_match_element = None

            for result in search_results:
                try:
                    title_element = result.find_element(By.CSS_SELECTOR, "a")
                    if title_element.text.strip().lower() == movie_title.lower():
                        exact_match_element = title_element
                        break
                except:
                    continue

            if not exact_match_element:
                print("[WARNING] [SELENIUM] No exact match found.")
                return None

            exact_match_element.click()

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "section div.sc-17af1f66-1.jNEZQr"))
            )

            country_elements = driver.find_elements(By.CSS_SELECTOR,
                "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > div > section > div > div.sc-17af1f66-1.jNEZQr.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(45) > div.sc-f65f65be-0.dQVJPm > ul > li:nth-child(2) > div > ul > li > a"
            )

            return country_elements[0].text.strip() if country_elements else None

        except Exception as e:
            print(f"[ERROR] Selenium failed: {e}")
            return None
        finally:
            driver.quit()

    # Try BS4 first
    country = try_with_beautifulsoup()

    # If fails, try Selenium
    if not country:
        country = try_with_selenium()

    if country:
        print(f"[INFO] Country of '{movie_title}': {country}")
        return country
    else:
        print(f"[WARNING] No country found for '{movie_title}'.")
        return None

# Example usage
get_movie_country("#blackAF")


[INFO] [BS] Searching for '#blackAF' on IMDb...
[INFO] [SELENIUM] Trying Selenium for '#blackAF'...
[INFO] Country of '#blackAF': United States


'United States'

### For date_added column

In [29]:
update_query = "UPDATE movies_data SET country = NULL WHERE country = %s"
cursor.execute(update_query, ('<NA>',))
conn.commit()


In [32]:
# Fetch rows where one of the specified columns is NULL
query_missing_values = """
SELECT * FROM movies_data
WHERE (director IS NULL AND cast IS NOT NULL AND date_added IS NOT NULL AND rating IS NOT NULL AND duration IS NOT NULL AND description IS NOT NULL AND country IS NOT NULL)
   OR (director IS NOT NULL AND cast IS NULL AND date_added IS NOT NULL AND rating IS NOT NULL AND duration IS NOT NULL AND description IS NOT NULL AND country IS NOT NULL)
   OR (director IS NOT NULL AND cast IS NOT NULL AND date_added IS NULL AND rating IS NOT NULL AND duration IS NOT NULL AND description IS NOT NULL AND country IS NOT NULL)
   OR (director IS NOT NULL AND cast IS NOT NULL AND date_added IS NOT NULL AND rating IS NULL AND duration IS NOT NULL AND description IS NOT NULL AND country IS NOT NULL)
   OR (director IS NOT NULL AND cast IS NOT NULL AND date_added IS NOT NULL AND rating IS NOT NULL AND duration IS NULL AND description IS NOT NULL AND country IS NOT NULL)
   OR (director IS NOT NULL AND cast IS NOT NULL AND date_added IS NOT NULL AND rating IS NOT NULL AND duration IS NOT NULL AND description IS NULL AND country IS NOT NULL)
   OR (director IS NOT NULL AND cast IS NOT NULL AND date_added IS NOT NULL AND rating IS NOT NULL AND duration IS NOT NULL AND description IS NOT NULL AND country IS NULL);
"""

cursor.execute(query_missing_values)
rows_missing = cursor.fetchall()

# Fetch one row where none of the specified columns are NULL
query_complete_row = """
SELECT * FROM movies_data
WHERE director IS NOT NULL
AND cast IS NOT NULL
AND date_added IS NOT NULL
AND rating IS NOT NULL
AND duration IS NOT NULL
AND description IS NOT NULL
LIMIT 1;
"""

cursor.execute(query_complete_row)
row_complete = cursor.fetchall()

# Combine both results
all_rows = rows_missing + row_complete

# Convert to DataFrame
df_missing_data = pd.DataFrame(all_rows, columns=[col[0] for col in cursor.description])
display(df_missing_data)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s5968,Movie,'89,,"Lee Dixon, Ian Wright, Paul Merson",United Kingdom,2018-05-16,2017,TV-PG,87 min,Sports Movies,"Mixing old footage with interviews, this is th..."
1,s5972,Movie,(T)ERROR,"Lyric R. Cabral, David Felix Sutcliffe",,United States,2016-06-30,2015,NR,84 min,Documentaries,This real-life look at FBI counterterrorism op...
2,s2667,TV Show,#blackAF,,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020-04-17,2020,TV-MA,1 Season,TV Comedies,Kenya Barris and his family navigate relations...
3,s5973,Movie,#cats_the_mewvie,Michael Margolis,,Canada,2020-02-05,2020,TV-14,90 min,"Documentaries, International Movies",This pawesome documentary explores how our fel...
4,s5696,Movie,#Rucker50,Robert McCullough Jr.,,United States,2016-12-01,2016,TV-PG,56 min,"Documentaries, Sports Movies",This documentary celebrates the 50th anniversa...
...,...,...,...,...,...,...,...,...,...,...,...,...
3812,s1774,TV Show,Zumbo's Just Desserts,,"Adriano Zumbo, Rachel Khoo",Australia,2020-10-31,2019,TV-PG,1 Season,"International TV Shows, Reality TV",Dessert wizard Adriano Zumbo looks for the nex...
3813,s2867,Movie,ZZ TOP: THAT LITTLE OL' BAND FROM TEXAS,Sam Dunn,,United Kingdom,2020-03-01,2019,TV-MA,90 min,"Documentaries, Music & Musicals",This documentary delves into the mystique behi...
3814,s7102,TV Show,마녀사냥,,"Si-kyung Sung, Se-yoon Yoo, Dong-yup Shin, Ji-...",South Korea,2018-02-19,2015,TV-MA,1 Season,"International TV Shows, Korean TV Shows, Stand...",Four Korean celebrity men and guest stars of b...
3815,s7109,Movie,최강전사 미니특공대 : 영웅의 탄생,Young Jun Lee,"Um Sang-hyun, Yang Jeong-hwa, Jeon Tae-yeol, S...",,2018-09-01,2018,TV-Y7,68 min,Children & Family Movies,"Miniforce, a special task force of elite range..."


In [25]:
# Function to clean the title
def clean_title(title):
    return title.replace('(Dub)', '').replace('(Sub)', '').strip()

# Fetch titles with '(Dub)' or '(Sub)'
cursor.execute("SELECT show_id, title FROM movies_data WHERE title LIKE '%(Dub)%' OR title LIKE '%(Sub)%'")
rows = cursor.fetchall()  # Fetch all rows to clear previous query result

# Loop through each row
for show_id, old_title in rows:
    new_title = clean_title(old_title)

    # Check if cleaned title already exists
    cursor.execute("SELECT show_id FROM movies_data WHERE title = %s", (new_title,))
    existing_row = cursor.fetchall()  # Use fetchall() to clear previous query result

    if existing_row:
        # If duplicate exists, delete the current row
        delete_query = "DELETE FROM movies_data WHERE show_id = %s"
        cursor.execute(delete_query, (show_id,))
        conn.commit()
        print(f"[DELETED] Duplicate '{old_title}' (ID: {show_id}) since '{new_title}' already exists.")
    else:
        # If no duplicate, update the title
        update_query = "UPDATE movies_data SET title = %s WHERE show_id = %s"
        cursor.execute(update_query, (new_title, show_id))
        conn.commit()
        print(f"[UPDATED] {old_title} → {new_title}")

[DELETED] Duplicate '(Sub) Akira' (ID: s2740) since 'Akira' already exists.
[UPDATED] (Sub) Fafner: Heaven and Earth → Fafner: Heaven and Earth
[UPDATED] (Sub) Fairy Tail the Movie: Phoenix Priestess → Fairy Tail the Movie: Phoenix Priestess
[UPDATED] (Sub) K MISSING KINGS → K MISSING KINGS
[UPDATED] (Sub) Legend of Heroes: Trails in the Sky → Legend of Heroes: Trails in the Sky
[UPDATED] (Sub) Ninja Scroll → Ninja Scroll
[DELETED] Duplicate '(Sub) Sword Art Online: The Movie - Ordinal Scale' (ID: s2487) since 'Sword Art Online: The Movie - Ordinal Scale' already exists.
[UPDATED] (Sub) Trigun: Badlands Rumble → Trigun: Badlands Rumble
[DELETED] Duplicate 'PSYCHO-PASS (SUB)' (ID: s1245) since 'PSYCHO-PASS (SUB)' already exists.


In [None]:
# Initialize WebDriver with options
options = Options()
options.add_argument("--headless")  # Run browser in background
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

# Create WebDriver instance
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Function to search movie on IMDb and get missing data
def get_movie_details(title, missing_columns):
    print(f"\n[INFO] Searching for '{title}' on IMDb...")
    driver.get("https://www.imdb.com/")
    
    # Wait and find search bar
    search_box = driver.find_element(By.CSS_SELECTOR, "#suggestion-search")
    search_box.send_keys(title)
    search_box.send_keys(Keys.RETURN)
    time.sleep(3)  # Wait for results to load
    
    # Click on the first relevant result
    # try:
    #     first_result = driver.find_element(By.CSS_SELECTOR, "td.result_text a")
    #     first_result.click()
    #     time.sleep(3)  # Wait for movie page to load
    # except:
    #     print(f"[ERROR] Could not find movie '{title}' on IMDb.")
    #     return {}
    
    try:
    # Wait until the first search result is clickable
        first_result = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#__next > main > div.ipc-page-content-container.ipc-page-content-container--full.sc-54536c3d-0.kFmgEJ > div.ipc-page-content-container.ipc-page-content-container--center > section > div > div.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(4) > div.sc-b03627f1-2.gWHDBT > ul > li:nth-child(1) > div.ipc-metadata-list-summary-item__c > div > a"))
        )  # Wait for movie page to load
        driver.execute_script("arguments[0].click();", first_result)
        time.sleep(5)  # Allow time for movie page to load  # Wait for the movie page to load
    except NoSuchElementException:
        print(f"[ERROR] No search results for '{title}'.")
        return {}

    except TimeoutException:
            print("[ERROR] IMDb search page took too long to load.")
            return {}

    except Exception as e:
        print(f"[ERROR] Unexpected issue: {str(e)}")
        return {}

    movie_data = {}
    # Fetch missing details
    if "director" in missing_columns:
        try:
            # Find all director elements (targeting individual <a> tags inside the <ul>)
            director_elements = driver.find_elements(By.CSS_SELECTOR, "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-1aa68851-0.gncgHj > section > div:nth-child(5) > section > section > div.sc-9a2a0028-4.eeUUGv > div.sc-9a2a0028-6.zHrZh > div.sc-9a2a0028-10.iUfJXd > section > div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) > div > ul > li a")
            
            # Extract text from each director element and clean it
            directors = [element.text.strip() for element in director_elements]

            if directors:
                movie_data["director"] = ", ".join(directors)  # Store as a comma-separated string
                print(f"  [INFO] Found Director(s): {movie_data['director']}")
            else:
                print(f"  [WARNING] No directors found for '{title}'.")

        except Exception as e:
            print(f"  [ERROR] Could not fetch director for '{title}': {str(e)}")


    if "country" in missing_columns:
        try:
            country = driver.find_element(By.XPATH, "//li[@data-testid='title-details-origin']//a").text
            movie_data["country"] = country
            print(f"  [INFO] Found Country: {country}")
        except:
            print(f"  [WARNING] Country not found for '{title}'.")

    if "rating" in missing_columns:
        try:
            rating = driver.find_element(By.CSS_SELECTOR, "span.sc-1fb98db9-1.gTzKIg").text
            movie_data["rating"] = rating
            print(f"  [INFO] Found Rating: {rating}")
        except:
            print(f"  [WARNING] Rating not found for '{title}'.")

    if "date_added" in missing_columns:
        try:
            date_added = driver.find_element(By.XPATH, "//li[@data-testid='title-details-release-date']//a").text
            movie_data["date_addeds"] = date_added
            print(f"  [INFO] Found Release Date: {date_added}")
        except:
            print(f"  [WARNING] Release Date not found for '{title}'.")

    if "cast" in missing_columns:
        try:
            cast = driver.find_element(By.XPATH, "//a[@data-testid='title-cast-item__actor']").text
            movie_data["cast"] = cast
            print(f"  [INFO] Found Lead Actor: {cast}")
        except:
            print(f"  [WARNING] Cast not found for '{title}'.")

    return movie_data

# Iterate through missing_movie_data DataFrame
for index, row in df_missing_data.iterrows():
    title = row["title"]
    missing_columns = [col for col in df_missing_data.columns if pd.isnull(row[col])]

    if not missing_columns:
        print(f"\n[INFO] No missing data for '{title}', skipping...")
        continue

    print(f"\n[INFO] Processing movie: {title}")
    print(f"  [INFO] Missing Columns: {missing_columns}")

    # Fetch missing data from IMDb
    movie_details = get_movie_details(title, missing_columns)

    # Update DataFrame with fetched values
    for col, value in movie_details.items():
        df_missing_data.at[index, col] = value

    print(f"  [SUCCESS] Updated '{title}' with new details.")

# Close WebDriver
driver.quit()

# Display updated DataFrame
print("\n[INFO] Updated DataFrame:")
print(df_missing_data)

In [None]:
display(missing_movie_data)

We can see that we have Sub and Dub written for the movies name. Probably it will be there for the TV shows name. Now will be editing it in the dtabase directly. That would be better.