In [19]:
import pandas as pd
import numpy as np
import mysql.connector
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [8]:
conn = mysql.connector.connect(
    host="localhost",
    user="Himanshu",
    password="Hustling@2000",
    database="movies_db"
)
cursor = conn.cursor()

In [3]:
# SQL query to count null values for each column in movies_data table
sql_query = """
SELECT 
    COUNT(*) as total_rows,
    SUM(CASE WHEN title IS NULL THEN 1 ELSE 0 END) as title_nulls,
    SUM(CASE WHEN type IS NULL THEN 1 ELSE 0 END) as type_nulls,
    SUM(CASE WHEN director IS NULL THEN 1 ELSE 0 END) as director_nulls,
    SUM(CASE WHEN cast IS NULL THEN 1 ELSE 0 END) as cast_nulls,
    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) as country_nulls,
    SUM(CASE WHEN date_added IS NULL THEN 1 ELSE 0 END) as date_added_nulls,
    SUM(CASE WHEN release_year IS NULL THEN 1 ELSE 0 END) as release_year_nulls,
    SUM(CASE WHEN rating IS NULL THEN 1 ELSE 0 END) as rating_nulls,
    SUM(CASE WHEN duration IS NULL THEN 1 ELSE 0 END) as duration_nulls,
    SUM(CASE WHEN listed_in IS NULL THEN 1 ELSE 0 END) as listed_in_nulls,
    SUM(CASE WHEN description IS NULL THEN 1 ELSE 0 END) as description_nulls
FROM movies_data;
"""

# Execute the query using your existing cursor
cursor.execute(sql_query)
results = cursor.fetchall()

# Convert results to a more readable format
column_names = [desc[0] for desc in cursor.description]
missing_values = dict(zip(column_names, results[0]))

# Print the results in a formatted way
print("\nMissing Values Analysis:")
print("-" * 50)
print(f"Total Rows: {missing_values['total_rows']}")
print("-" * 50)
for column, nulls in missing_values.items():
    if column != 'total_rows':
        percentage = (nulls / missing_values['total_rows']) * 100
        print(f"{column.replace('_nulls', '')}: {nulls} nulls ({percentage:.2f}%)")


Missing Values Analysis:
--------------------------------------------------
Total Rows: 3071
--------------------------------------------------
title: 0 nulls (0.00%)
type: 0 nulls (0.00%)
director: 3068 nulls (99.90%)
cast: 3071 nulls (100.00%)
country: 1452 nulls (47.28%)
date_added: 28 nulls (0.91%)
release_year: 0 nulls (0.00%)
rating: 519 nulls (16.90%)
duration: 478 nulls (15.56%)
listed_in: 0 nulls (0.00%)
description: 4 nulls (0.13%)


As we can see that there are many null values in the columns director, cast, country, date_added, rating and durations.
Most of them can be derived from the internet.
So now what I will be doing is web scrapping using BeautifulSoup.

In [6]:
# SQL query to get 10 rows with NULL values in specified columns
sql_query = """
SELECT *
FROM movies_data
WHERE director IS NULL 
   OR cast IS NULL 
   OR country IS NULL 
   OR date_added IS NULL 
   OR rating IS NULL 
   OR duration IS NULL
LIMIT 10;
"""

# You can execute this query using your existing MySQL connection:
cursor.execute(sql_query)
results = cursor.fetchall()

# Convert results to a pandas DataFrame for better visualization
columns = [desc[0] for desc in cursor.description]
df_null_analysis = pd.DataFrame(results, columns=columns)

# Display the results
display(df_null_analysis)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1893,TV Show,¿Dónde Está Elisa?,,,,2020-01-01,2010,TV-14,1 Season,"Crime, Drama, Latino",The lives of the Domínguez family will change ...
1,s2902,TV Show,.hack//Roots,,,Japan,2015-01-30,2006,TV-14,1 Season,"Action, Adventure, Anime","In the online RPG know as ""The World,"" Haseo a..."
2,s2921,TV Show,.hack//SIGN,,,Japan,2014-09-05,2002,TV-Y7,1 Season,"Anime, Drama","Tsukasa wakes up inside The World, a massive o..."
3,s1462,Movie,'71,,,United Kingdom,2020-08-01,2014,R,99 min,"Action, Adventure, Drama",Jack O'Connell is a British solider accidental...
4,s2632,Movie,(Dub) Afro Samurai Resurrection,,,,2017-09-05,2009,TV-MA,101 min,"Action, Adult Animation, Adventure",Afro Samurai found peace after avenging his fa...
5,s2739,Movie,(Dub) Akira,,,,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
6,s2633,Movie,(Dub) Dragon Age: Dawn of the Seeker,,,,2017-09-05,2012,,91 min,Anime,A brash young Seeker - Cassandra - is accused ...
7,s2634,Movie,(Dub) Mass Effect: Paragon Lost,,,,2017-09-05,2013,TV-MA,94 min,"Action, Adventure, Anime","Mass Effect: Paragon Lost, the prequel to best..."
8,s2486,Movie,(Dub) Sword Art Online: The Movie - Ordinal Scale,,,,2018-05-19,2017,,120 min,"Action, Adventure, Anime","Two years after the SAO incident, a new Augmen..."
9,s2740,Movie,(Sub) Akira,,,Japan,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...


In [7]:
import pandas as pd

# SQL query to fetch rows where 'type' is 'Movie' and any of the specified columns are NULL
query = """
    SELECT * FROM movies_data 
    WHERE type = 'Movie' 
    AND (director IS NULL OR cast IS NULL OR country IS NULL OR date_added IS NULL OR rating IS NULL)
"""

# Execute query
cursor.execute(query)

# Fetch all results
missing_data_rows = cursor.fetchall()

# Get column names from the table
column_names = [desc[0] for desc in cursor.description]

# Convert to DataFrame
df_missing_values_movies = pd.DataFrame(missing_data_rows, columns=column_names)

# Display the DataFrame
display(df_missing_values_movies)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1462,Movie,'71,,,United Kingdom,2020-08-01,2014,R,99 min,"Action, Adventure, Drama",Jack O'Connell is a British solider accidental...
1,s2632,Movie,(Dub) Afro Samurai Resurrection,,,,2017-09-05,2009,TV-MA,101 min,"Action, Adult Animation, Adventure",Afro Samurai found peace after avenging his fa...
2,s2739,Movie,(Dub) Akira,,,,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
3,s2633,Movie,(Dub) Dragon Age: Dawn of the Seeker,,,,2017-09-05,2012,,91 min,Anime,A brash young Seeker - Cassandra - is accused ...
4,s2634,Movie,(Dub) Mass Effect: Paragon Lost,,,,2017-09-05,2013,TV-MA,94 min,"Action, Adventure, Anime","Mass Effect: Paragon Lost, the prequel to best..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1478,s1588,Movie,Your Home Made Perfect,,,United Kingdom,2020-06-15,2019,2 Seasons,,Reality,Never before have plans come to life like this...
1479,s847,Movie,Zappa,,,,2021-04-01,2020,,128 min,"Documentaries, Music",ZAPPA is an expansive and intimate portrait of...
1480,s2416,Movie,Zapped,,,"Canada, United States",2018-09-28,2014,TV-G,94 min,"Adventure, Black Stories, Comedy","Zoey, a skilled dancer and straight-A student,..."
1481,s2019,Movie,ZOMBOAT!,,,United Kingdom,2019-10-25,2019,1 Season,,"Comedy, Drama, International",Sisters Kat and Jo realise there's a zombie ap...


In [2]:
# Set up WebDriver (Update PATH to your chromedriver)
driver = webdriver.Chrome()

try:
    # Open IMDb website
    driver.get("https://www.imdb.com/")

    # Locate search bar and type the movie name
    search_box = driver.find_element(By.CSS_SELECTOR, "#suggestion-search")
    search_box.send_keys("Kal Ho Naa Ho")

    # Click the search button
    search_button = driver.find_element(By.CSS_SELECTOR, "#suggestion-search-button")
    search_button.click()

    # Wait for search results to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.sc-b03627f1-2.gWHDBT"))
    )

    # Find and click on the link for "Kal Ho Naa Ho"
    movie_link = driver.find_element(By.LINK_TEXT, "Kal Ho Naa Ho")
    movie_link.click()

    # Wait for the movie page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) a"))
    )

    # Extract the director's name
    director_element = driver.find_element(By.CSS_SELECTOR, "div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) a")
    director_name = director_element.text

    print(f"Director of 'Kal Ho Naa Ho': {director_name}")

finally:
    # Close the browser
    driver.quit()

Director of 'Kal Ho Naa Ho': Nikkhil Advani


In [9]:
# SQL query to fetch top 10 rows where type is 'Movie' and at least one column is NULL
query = """
SELECT * 
FROM movies_data
WHERE type = 'Movie' 
AND (
    director IS NULL OR
    cast IS NULL OR
    country IS NULL OR
    date_added IS NULL OR
    release_year IS NULL OR
    rating IS NULL OR
    duration IS NULL OR
    listed_in IS NULL OR
    description IS NULL
)
LIMIT 10;
"""

# Execute the query and fetch data
cursor.execute(query)
rows = cursor.fetchall()

# Fetch column names
columns = [desc[0] for desc in cursor.description]

# Create a DataFrame from fetched data
missing_movie_data = pd.DataFrame(rows, columns=columns)


# Display DataFrame
display(missing_movie_data)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1462,Movie,'71,,,United Kingdom,2020-08-01,2014,R,99 min,"Action, Adventure, Drama",Jack O'Connell is a British solider accidental...
1,s2632,Movie,(Dub) Afro Samurai Resurrection,,,,2017-09-05,2009,TV-MA,101 min,"Action, Adult Animation, Adventure",Afro Samurai found peace after avenging his fa...
2,s2739,Movie,(Dub) Akira,,,,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
3,s2633,Movie,(Dub) Dragon Age: Dawn of the Seeker,,,,2017-09-05,2012,,91 min,Anime,A brash young Seeker - Cassandra - is accused ...
4,s2634,Movie,(Dub) Mass Effect: Paragon Lost,,,,2017-09-05,2013,TV-MA,94 min,"Action, Adventure, Anime","Mass Effect: Paragon Lost, the prequel to best..."
5,s2486,Movie,(Dub) Sword Art Online: The Movie - Ordinal Scale,,,,2018-05-19,2017,,120 min,"Action, Adventure, Anime","Two years after the SAO incident, a new Augmen..."
6,s2740,Movie,(Sub) Akira,,,Japan,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
7,s2635,Movie,(Sub) Fafner: Heaven and Earth,,,Japan,2017-09-05,2010,TV-14,93 min,"Action, Adventure, Anime",A hiveminded alien force reengages war with Ta...
8,s2636,Movie,(Sub) Fairy Tail the Movie: Phoenix Priestess,,,Japan,2017-09-05,2012,TV-14,86 min,"Anime, Drama",Natsu and his mighty team of mages face evil r...
9,s2861,Movie,(Sub) K MISSING KINGS,,,Japan,2015-10-01,2014,PG,74 min,"Action, Adventure, Anime","Ever since the School Island Incident, in whic..."


In [28]:
# Initialize WebDriver with options
options = Options()
#options.add_argument("--headless")  # Run browser in background
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36")

# Create WebDriver instance
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Function to search movie on IMDb and get missing data
def get_movie_details(title, missing_columns):
    print(f"\n[INFO] Searching for '{title}' on IMDb...")
    driver.get("https://www.imdb.com/")
    
    # Wait and find search bar
    search_box = driver.find_element(By.CSS_SELECTOR, "#suggestion-search")
    search_box.send_keys(title)
    search_box.send_keys(Keys.RETURN)
    time.sleep(3)  # Wait for results to load
    
    # Click on the first relevant result
    # try:
    #     first_result = driver.find_element(By.CSS_SELECTOR, "td.result_text a")
    #     first_result.click()
    #     time.sleep(3)  # Wait for movie page to load
    # except:
    #     print(f"[ERROR] Could not find movie '{title}' on IMDb.")
    #     return {}
    
    try:
    # Wait until the first search result is clickable
        first_result = WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#__next > main > div.ipc-page-content-container.ipc-page-content-container--full.sc-54536c3d-0.kFmgEJ > div.ipc-page-content-container.ipc-page-content-container--center > section > div > div.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(4) > div.sc-b03627f1-2.gWHDBT > ul > li:nth-child(1) > div.ipc-metadata-list-summary-item__c > div > a"))
        )  # Wait for movie page to load
        driver.execute_script("arguments[0].click();", first_result)
        time.sleep(5)  # Allow time for movie page to load  # Wait for the movie page to load
    except NoSuchElementException:
        print(f"[ERROR] No search results for '{title}'.")
        return {}

    except TimeoutException:
            print("[ERROR] IMDb search page took too long to load.")
            return {}

    except Exception as e:
        print(f"[ERROR] Unexpected issue: {str(e)}")
        return {}

    movie_data = {}

    # Fetch missing details
    if "director" in missing_columns:
        try:
            director = driver.find_element(By.CSS_SELECTOR, "li[data-testid='title-pc-principal-credit']:nth-child(1) a").text
            movie_data["director"] = director
            print(f"  [INFO] Found Director: {director}")
        except:
            print(f"  [WARNING] Director not found for '{title}'.")

    if "country" in missing_columns:
        try:
            country = driver.find_element(By.XPATH, "//li[@data-testid='title-details-origin']//a").text
            movie_data["country"] = country
            print(f"  [INFO] Found Country: {country}")
        except:
            print(f"  [WARNING] Country not found for '{title}'.")

    if "rating" in missing_columns:
        try:
            rating = driver.find_element(By.CSS_SELECTOR, "span.sc-1fb98db9-1.gTzKIg").text
            movie_data["rating"] = rating
            print(f"  [INFO] Found Rating: {rating}")
        except:
            print(f"  [WARNING] Rating not found for '{title}'.")

    if "date_added" in missing_columns:
        try:
            date_added = driver.find_element(By.XPATH, "//li[@data-testid='title-details-release-date']//a").text
            movie_data["date_addeds"] = date_added
            print(f"  [INFO] Found Release Date: {date_added}")
        except:
            print(f"  [WARNING] Release Date not found for '{title}'.")

    if "cast" in missing_columns:
        try:
            cast = driver.find_element(By.XPATH, "//a[@data-testid='title-cast-item__actor']").text
            movie_data["cast"] = cast
            print(f"  [INFO] Found Lead Actor: {cast}")
        except:
            print(f"  [WARNING] Cast not found for '{title}'.")

    return movie_data

# Iterate through missing_movie_data DataFrame
for index, row in missing_movie_data.iterrows():
    title = row["title"]
    missing_columns = [col for col in missing_movie_data.columns if pd.isnull(row[col])]

    if not missing_columns:
        print(f"\n[INFO] No missing data for '{title}', skipping...")
        continue

    print(f"\n[INFO] Processing movie: {title}")
    print(f"  [INFO] Missing Columns: {missing_columns}")

    # Fetch missing data from IMDb
    movie_details = get_movie_details(title, missing_columns)

    # Update DataFrame with fetched values
    for col, value in movie_details.items():
        missing_movie_data.at[index, col] = value

    print(f"  [SUCCESS] Updated '{title}' with new details.")

# Close WebDriver
driver.quit()

# Display updated DataFrame
print("\n[INFO] Updated DataFrame:")
print(missing_movie_data)


[INFO] Processing movie: '71
  [INFO] Missing Columns: ['director', 'cast']

[INFO] Searching for ''71' on IMDb...
  [INFO] Found Director: Yann Demange
  [INFO] Found Lead Actor: Jack O'Connell
  [SUCCESS] Updated ''71' with new details.

[INFO] Processing movie: (Dub) Afro Samurai Resurrection
  [INFO] Missing Columns: ['director', 'cast', 'country']

[INFO] Searching for '(Dub) Afro Samurai Resurrection' on IMDb...
[ERROR] Unexpected issue: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=134.0.6998.178)
Stacktrace:
	GetHandleVerifier [0x00FAC7F3+24435]
	(No symbol) [0x00F32074]
	(No symbol) [0x00E006E3]
	(No symbol) [0x00DEFEC0]
	(No symbol) [0x00E0DFDF]
	(No symbol) [0x00E746AF]
	(No symbol) [0x00E8EB19]
	(No symbol) [0x00E6D5B6]
	(No symbol) [0x00E3C54F]
	(No symbol) [0x00E3D894]
	GetHandleVerifier [0x012B70A3+3213347]
	GetHandleVerifier [0x012CB0C9+3295305]
	GetHandleVerifi

InvalidSessionIdException: Message: invalid session id
Stacktrace:
	GetHandleVerifier [0x00FAC7F3+24435]
	(No symbol) [0x00F32074]
	(No symbol) [0x00E0055E]
	(No symbol) [0x00E3B9D8]
	(No symbol) [0x00E6D676]
	(No symbol) [0x00E6904C]
	(No symbol) [0x00E685C6]
	(No symbol) [0x00DD3245]
	(No symbol) [0x00DD379E]
	(No symbol) [0x00DD3C1D]
	GetHandleVerifier [0x012B70A3+3213347]
	GetHandleVerifier [0x012CB0C9+3295305]
	GetHandleVerifier [0x012C558C+3271948]
	GetHandleVerifier [0x01047360+658144]
	(No symbol) [0x00F3B27D]
	(No symbol) [0x00DD2F10]
	(No symbol) [0x00DD2ACA]
	GetHandleVerifier [0x0133E9AC+3768620]
	BaseThreadInitThunk [0x76467BA9+25]
	RtlInitializeExceptionChain [0x76EDC2EB+107]
	RtlClearBits [0x76EDC26F+191]


In [29]:
display(missing_movie_data)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1462,Movie,'71,Yann Demange,Jack O'Connell,United Kingdom,2020-08-01,2014,R,99 min,"Action, Adventure, Drama",Jack O'Connell is a British solider accidental...
1,s2632,Movie,(Dub) Afro Samurai Resurrection,,,,2017-09-05,2009,TV-MA,101 min,"Action, Adult Animation, Adventure",Afro Samurai found peace after avenging his fa...
2,s2739,Movie,(Dub) Akira,,,,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
3,s2633,Movie,(Dub) Dragon Age: Dawn of the Seeker,,,,2017-09-05,2012,,91 min,Anime,A brash young Seeker - Cassandra - is accused ...
4,s2634,Movie,(Dub) Mass Effect: Paragon Lost,,,,2017-09-05,2013,TV-MA,94 min,"Action, Adventure, Anime","Mass Effect: Paragon Lost, the prequel to best..."
5,s2486,Movie,(Dub) Sword Art Online: The Movie - Ordinal Scale,,,,2018-05-19,2017,,120 min,"Action, Adventure, Anime","Two years after the SAO incident, a new Augmen..."
6,s2740,Movie,(Sub) Akira,,,Japan,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
7,s2635,Movie,(Sub) Fafner: Heaven and Earth,,,Japan,2017-09-05,2010,TV-14,93 min,"Action, Adventure, Anime",A hiveminded alien force reengages war with Ta...
8,s2636,Movie,(Sub) Fairy Tail the Movie: Phoenix Priestess,,,Japan,2017-09-05,2012,TV-14,86 min,"Anime, Drama",Natsu and his mighty team of mages face evil r...
9,s2861,Movie,(Sub) K MISSING KINGS,,,Japan,2015-10-01,2014,PG,74 min,"Action, Adventure, Anime","Ever since the School Island Incident, in whic..."


We can see that we have Sub and Dub written for the movies name. Probable will be there for the TV shows name.