In [8]:
import getpass
import os

import mysql.connector
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Open the MySQL connection used by every query cell below.
#
# Connection settings are read from environment variables so that no secret is
# stored in the notebook (notebooks get shared and committed, outputs included).
# Host, user and database fall back to the values this notebook has always
# used; the password has no fallback and is prompted for when MYSQL_PASSWORD
# is unset or empty.
# NOTE(review): the password that used to be hardcoded here should be treated
# as leaked and rotated.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "Himanshu"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: "),
    database=os.environ.get("MYSQL_DATABASE", "movies_db"),
)
# Single shared cursor; later cells call cursor.execute(...) on it directly.
cursor = conn.cursor()

In [3]:
# Count NULLs per column of movies_data in a single pass over the table.
# Each SUM(CASE ...) adds 1 for every row in which that column IS NULL.
sql_query = """
SELECT 
    COUNT(*) as total_rows,
    SUM(CASE WHEN title IS NULL THEN 1 ELSE 0 END) as title_nulls,
    SUM(CASE WHEN type IS NULL THEN 1 ELSE 0 END) as type_nulls,
    SUM(CASE WHEN director IS NULL THEN 1 ELSE 0 END) as director_nulls,
    SUM(CASE WHEN cast IS NULL THEN 1 ELSE 0 END) as cast_nulls,
    SUM(CASE WHEN country IS NULL THEN 1 ELSE 0 END) as country_nulls,
    SUM(CASE WHEN date_added IS NULL THEN 1 ELSE 0 END) as date_added_nulls,
    SUM(CASE WHEN release_year IS NULL THEN 1 ELSE 0 END) as release_year_nulls,
    SUM(CASE WHEN rating IS NULL THEN 1 ELSE 0 END) as rating_nulls,
    SUM(CASE WHEN duration IS NULL THEN 1 ELSE 0 END) as duration_nulls,
    SUM(CASE WHEN listed_in IS NULL THEN 1 ELSE 0 END) as listed_in_nulls,
    SUM(CASE WHEN description IS NULL THEN 1 ELSE 0 END) as description_nulls
FROM movies_data;
"""

# Run the query on the shared cursor. An aggregate query without GROUP BY
# always yields exactly one row, so results[0] is safe even for an empty table.
cursor.execute(sql_query)
results = cursor.fetchall()

# Map each result alias (total_rows, title_nulls, ...) to its value.
column_names = [desc[0] for desc in cursor.description]
missing_values = dict(zip(column_names, results[0]))
total_rows = missing_values['total_rows']

# Print the report.
print("\nMissing Values Analysis:")
print("-" * 50)
print(f"Total Rows: {total_rows}")
print("-" * 50)
if total_rows == 0:
    # On an empty table COUNT(*) is 0 and every SUM(...) is NULL (None in
    # Python), so the percentage below cannot be computed; say so instead of
    # failing with a TypeError.
    print("movies_data has no rows - nothing to analyse.")
else:
    for column, nulls in missing_values.items():
        if column == 'total_rows':
            continue
        percentage = (nulls / total_rows) * 100
        print(f"{column.replace('_nulls', '')}: {nulls} nulls ({percentage:.2f}%)")


Missing Values Analysis:
--------------------------------------------------
Total Rows: 3071
--------------------------------------------------
title: 0 nulls (0.00%)
type: 0 nulls (0.00%)
director: 3068 nulls (99.90%)
cast: 3071 nulls (100.00%)
country: 1452 nulls (47.28%)
date_added: 28 nulls (0.91%)
release_year: 0 nulls (0.00%)
rating: 519 nulls (16.90%)
duration: 478 nulls (15.56%)
listed_in: 0 nulls (0.00%)
description: 4 nulls (0.13%)


As we can see, the columns director, cast, country, date_added, rating and duration contain null values; director and cast are almost entirely empty.
Most of these values can be looked up on the internet.
So the next step is to fill them in by web scraping (the scraping cell below uses Selenium to look titles up on IMDb).

In [6]:
# Peek at the problem: pull at most 10 rows of movies_data in which at least
# one of the sparsely populated columns is NULL.
sql_query = """
SELECT *
FROM movies_data
WHERE director IS NULL 
   OR cast IS NULL 
   OR country IS NULL 
   OR date_added IS NULL 
   OR rating IS NULL 
   OR duration IS NULL
LIMIT 10;
"""

# Run the query on the shared cursor. The column labels come from the cursor
# metadata (first field of every description entry); the rows are then fetched
# in one go.
cursor.execute(sql_query)
columns = [col_meta[0] for col_meta in cursor.description]
results = cursor.fetchall()

# Wrap the rows in a DataFrame so the notebook renders them as a table.
df_null_analysis = pd.DataFrame(data=results, columns=columns)
display(df_null_analysis)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1893,TV Show,¿Dónde Está Elisa?,,,,2020-01-01,2010,TV-14,1 Season,"Crime, Drama, Latino",The lives of the Domínguez family will change ...
1,s2902,TV Show,.hack//Roots,,,Japan,2015-01-30,2006,TV-14,1 Season,"Action, Adventure, Anime","In the online RPG know as ""The World,"" Haseo a..."
2,s2921,TV Show,.hack//SIGN,,,Japan,2014-09-05,2002,TV-Y7,1 Season,"Anime, Drama","Tsukasa wakes up inside The World, a massive o..."
3,s1462,Movie,'71,,,United Kingdom,2020-08-01,2014,R,99 min,"Action, Adventure, Drama",Jack O'Connell is a British solider accidental...
4,s2632,Movie,(Dub) Afro Samurai Resurrection,,,,2017-09-05,2009,TV-MA,101 min,"Action, Adult Animation, Adventure",Afro Samurai found peace after avenging his fa...
5,s2739,Movie,(Dub) Akira,,,,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
6,s2633,Movie,(Dub) Dragon Age: Dawn of the Seeker,,,,2017-09-05,2012,,91 min,Anime,A brash young Seeker - Cassandra - is accused ...
7,s2634,Movie,(Dub) Mass Effect: Paragon Lost,,,,2017-09-05,2013,TV-MA,94 min,"Action, Adventure, Anime","Mass Effect: Paragon Lost, the prequel to best..."
8,s2486,Movie,(Dub) Sword Art Online: The Movie - Ordinal Scale,,,,2018-05-19,2017,,120 min,"Action, Adventure, Anime","Two years after the SAO incident, a new Augmen..."
9,s2740,Movie,(Sub) Akira,,,Japan,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...


In [7]:
# Fetch every row of type 'Movie' that still has a NULL in one of the columns
# to be filled in. pandas is already imported as `pd` in the notebook's import
# cell, so it is not re-imported here.
#
# `duration IS NULL` is included so the filter covers the same columns as the
# 10-row NULL sample query earlier in the notebook; previously a movie whose
# only missing value was its duration would not have been selected.
query = """
    SELECT * FROM movies_data
    WHERE type = 'Movie'
    AND (director IS NULL OR cast IS NULL OR country IS NULL
         OR date_added IS NULL OR rating IS NULL OR duration IS NULL)
"""

# Run the query on the shared cursor and pull all matching rows.
cursor.execute(query)
missing_data_rows = cursor.fetchall()

# Column labels come from the cursor metadata of the SELECT * above.
column_names = [desc[0] for desc in cursor.description]

# Build the working DataFrame of movies with missing values and show it.
df_missing_values_movies = pd.DataFrame(missing_data_rows, columns=column_names)
display(df_missing_values_movies)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1462,Movie,'71,,,United Kingdom,2020-08-01,2014,R,99 min,"Action, Adventure, Drama",Jack O'Connell is a British solider accidental...
1,s2632,Movie,(Dub) Afro Samurai Resurrection,,,,2017-09-05,2009,TV-MA,101 min,"Action, Adult Animation, Adventure",Afro Samurai found peace after avenging his fa...
2,s2739,Movie,(Dub) Akira,,,,2017-03-17,1988,R,125 min,"Action, Adult Animation, Anime",Clandestine army activities threaten the war t...
3,s2633,Movie,(Dub) Dragon Age: Dawn of the Seeker,,,,2017-09-05,2012,,91 min,Anime,A brash young Seeker - Cassandra - is accused ...
4,s2634,Movie,(Dub) Mass Effect: Paragon Lost,,,,2017-09-05,2013,TV-MA,94 min,"Action, Adventure, Anime","Mass Effect: Paragon Lost, the prequel to best..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1478,s1588,Movie,Your Home Made Perfect,,,United Kingdom,2020-06-15,2019,2 Seasons,,Reality,Never before have plans come to life like this...
1479,s847,Movie,Zappa,,,,2021-04-01,2020,,128 min,"Documentaries, Music",ZAPPA is an expansive and intimate portrait of...
1480,s2416,Movie,Zapped,,,"Canada, United States",2018-09-28,2014,TV-G,94 min,"Adventure, Black Stories, Comedy","Zoey, a skilled dancer and straight-A student,..."
1481,s2019,Movie,ZOMBOAT!,,,United Kingdom,2019-10-25,2019,1 Season,,"Comedy, Drama, International",Sisters Kat and Jo realise there's a zombie ap...


In [10]:
IMDB_HOME_URL = "https://www.imdb.com/"
SEARCH_BOX_SELECTOR = "#suggestion-search"
SEARCH_BUTTON_SELECTOR = "#suggestion-search-button"
# NOTE(review): the "sc-<hash>" class names below look auto-generated and are
# likely to change whenever IMDb redeploys its front end - confirm they still
# match before relying on this cell, and prefer a stable attribute if one exists.
SEARCH_RESULTS_SELECTOR = "div.sc-b03627f1-2.gWHDBT"
DIRECTOR_LINK_SELECTOR = "div.sc-70a366cc-3.iwmAOx > div > ul > li:nth-child(1) a"


def fetch_imdb_director(driver, title, timeout=10):
    """Look up `title` on IMDb and return the text of its first director link.

    The function drives the browser through the same steps a user would take:
    open the home page, type the title into the search box, submit the search,
    click the result whose link text equals `title` exactly, and read the first
    credit link on the title page.

    Args:
        driver: A Selenium WebDriver instance. It is not closed here; the
            caller owns its lifetime.
        title: Movie title to search for. The search result is matched by
            exact link text, so it must equal IMDb's displayed title.
        timeout: Seconds each explicit wait may take before giving up.

    Returns:
        The visible text of the element matched by DIRECTOR_LINK_SELECTOR.

    Raises:
        selenium.common.exceptions.TimeoutException: if any awaited element
            does not appear (or become clickable) within `timeout` seconds.
    """
    wait = WebDriverWait(driver, timeout)
    driver.get(IMDB_HOME_URL)

    # until() returns the located element, so no second find_element is needed.
    search_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, SEARCH_BOX_SELECTOR))
    )
    search_box.send_keys(title)
    wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, SEARCH_BUTTON_SELECTOR))
    ).click()

    # Wait for the results container first, then for the exact-title link.
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, SEARCH_RESULTS_SELECTOR))
    )
    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, title))).click()

    director_link = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, DIRECTOR_LINK_SELECTOR))
    )
    return director_link.text


# Proof of concept on a single, known title before scraping the missing rows.
MOVIE_TITLE = "Kal Ho Naa Ho"

# webdriver.Chrome() is called without a driver path, so a matching
# chromedriver has to be discoverable by Selenium on this machine.
driver = webdriver.Chrome()

try:
    director_name = fetch_imdb_director(driver, MOVIE_TITLE)
    print(f"Director of '{MOVIE_TITLE}': {director_name}")
finally:
    # Always close the browser, even if a lookup step failed.
    driver.quit()

Director of 'Kal Ho Naa Ho': Nikkhil Advani
