## Question 2

Develop a basic movie recommendation system that scrapes data from the IMDb website. The program will retrieve information from the Most Popular Movies page on IMDb. The data to be scraped is

1. Movie Titles <br>
2. Runtime <br>
3. Rating <br>
4. Age Restriction <br>
5. Genre <br>
6. Writer(s) <br>
7. Director(s) <br>
8. Fun Movie Trivia <br>

The data will be stored in a Pandas dataframe.

Subsequently, the program will prompt the user to specify their preferences for:

1. Genre <br>
2. Minimum Rating<br>
3. Maximum Runtime<br>
4. Age Restriction<br>

It will then filter the dataframe accordingly and select a movie for recommendation. The recommendation will display the movie title, writer, director, and trivia. If there are no movies that match the user's preferences, the program will recommend any movie from the original dataframe.


In [None]:
import pandas as pd
import time
import numpy as np

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import InvalidSelectorException

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import nltk

In [None]:
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
imb_url = "https://m.imdb.com/chart/moviemeter/?ref_=nv_mv_mpm"    
driver.get(imb_url)    


In [None]:
data_list = []

# Find the element containing movie details
details_element = driver.find_element(By.XPATH, "/html/body/div[2]/main/div/div[3]/section/div/div[2]/div/ul")
details_list = details_element.find_elements(By.CLASS_NAME, "ipc-metadata-list-summary-item__c")

# Loop through each movie detail
for detail in details_list:
    # Extract movie information
    # movie_title = detail.text.split('\n')[0]
    movie_link = detail.find_element(By.TAG_NAME, 'a').get_attribute("href")

    # Open a new tab to get more details about the movie
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(movie_link)

    # Extract genre and other details
    genre_element = driver.find_element(By.XPATH, "/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[1]/div[2]")
    genre_text = genre_element.text
    genre_list = genre_text.split('\n')

    title_element = driver.find_element(By.TAG_NAME, 'h1')
    movie_title = title_element.text

    row_element = driver.find_element(By.XPATH, "/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]")
    row_list = row_element.text.split('\n')

    # Extract runtime and rating
    try:
        ind = row_list.index('IMDb RATING')
        runtime = row_list[ind - 1]
        rating = row_list[ind + 1]
    except ValueError:
        runtime = rating = None

    # Extract age restriction
    age_restriction = row_list[2] 

    # Extract director and writers
    director_element = driver.find_element(By.XPATH, "/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[2]")
    director_data = director_element.text.split("\n")

    result = {'Director': [], 'Writers': []}
    current_key = None

    for item in director_data:
        if item == 'Director':
            current_key = 'Director'
        elif item == 'Writers' or item == "Writer":
            current_key = 'Writers'
        elif current_key:
            result[current_key].append(item)

    # Close the tab and switch back to the main window
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

    data_list.append({"Movie Titles": movie_title, "Runtime": runtime, "Rating": rating,
                      "Age Restriction": age_restriction, "Genre": genre_list, "Writer(s)": result["Writers"],
                      "Director(s)": result["Director"]})

movie_df = pd.DataFrame(data_list)

In [None]:
movie_df
movie = movie_df
movie.replace('POPULARITY', np.nan, inplace=True)

In [None]:
movie.to_csv("Movie_Data.csv")
movie = pd.read_csv('Movie_Data.csv',index_col='Unnamed: 0')
movie.head()

Unnamed: 0,Movie Titles,Runtime,Rating,Age Restriction,Genre,Writer(s),Director(s)
0,The Killer,1h 58m,6.9,15,"['Action', 'Adventure', 'Crime']",['Alexis NolentLuc JacamonAndrew Kevin Walker'...,['David Fincher']
1,The Marvels,1h 45m,6.0,12A,"['Action', 'Adventure', 'Fantasy']","['Nia DaCostaMegan McDonnellElissa Karasik', '...",['Nia DaCosta']
2,The Hunger Games: The Ballad of Songbirds & Sn...,2h 37m,7.2,12A,"['Action', 'Adventure', 'Drama']",['Michael LesslieMichael ArndtSuzanne Collins'...,['Francis Lawrence']
3,Tiger 3,2h 34m,7.7,2h 34m,"['Action', 'Adventure', 'Thriller']",['Shridhar RaghavanAnckur ChaudhryAditya Chopr...,['Maneesh Sharma']
4,Madame Web,,,,"['Action', 'Adventure', 'Sci-Fi']","['Claire ParkerS.J. ClarksonKerem Sanga', 'Sta...",['S.J. Clarkson']


In [20]:
def convert_to_mins(runtime):
    if isinstance(runtime, str):
        division = runtime.split()
        mins = 0

        for part in division:
            if 'h' in part:
                mins += int(part.replace('h',''))*60
            elif 'm' in part:
                mins += int(part.replace('m',''))
        return mins
    
# Apply the helper function to the 'Runtime' column
movie['Runtime'] = movie['Runtime'].apply(convert_to_mins)
movie

Unnamed: 0,Movie Titles,Runtime,Rating,Age Restriction,Genre,Writer(s),Director(s)
0,The Killer,118.0,6.9,15,"['Action', 'Adventure', 'Crime']",['Alexis NolentLuc JacamonAndrew Kevin Walker'...,['David Fincher']
1,The Marvels,105.0,6.0,12A,"['Action', 'Adventure', 'Fantasy']","['Nia DaCostaMegan McDonnellElissa Karasik', '...",['Nia DaCosta']
2,The Hunger Games: The Ballad of Songbirds & Sn...,157.0,7.2,12A,"['Action', 'Adventure', 'Drama']",['Michael LesslieMichael ArndtSuzanne Collins'...,['Francis Lawrence']
3,Tiger 3,154.0,7.7,2h 34m,"['Action', 'Adventure', 'Thriller']",['Shridhar RaghavanAnckur ChaudhryAditya Chopr...,['Maneesh Sharma']
4,Madame Web,,,,"['Action', 'Adventure', 'Sci-Fi']","['Claire ParkerS.J. ClarksonKerem Sanga', 'Sta...",['S.J. Clarkson']
...,...,...,...,...,...,...,...
95,Dungeons & Dragons: Honour Among Thieves,134.0,7.3,2023,"['Action', 'Adventure', 'Comedy']",['Jonathan GoldsteinJohn Francis DaleyMichael ...,[]
96,The Breakfast Club,97.0,7.8,15,"['Comedy', 'Drama']","['John Hughes', 'Stars', 'Emilio EstevezJudd N...",['John Hughes']
97,The Exorcist: Believer,111.0,4.9,15,['Horror'],"['Peter SattlerDavid Gordon GreenScott Teems',...",['David Gordon Green']
98,Gone Girl,149.0,8.1,18,"['Drama', 'Mystery', 'Thriller']","['Gillian Flynn', 'Stars', 'Ben AffleckRosamun...",['David Fincher']


In [23]:
# User input for preferences
genre = input("Enter your preferred genre: ")
min_rating = float(input("Enter your minimum rating preference: "))
max_runtime = int(input("Enter your maximum runtime preference in minutes: "))
age_restriction = input("Enter preferred age restriction ('15','12A','PG','18','R','U'):")

In [24]:
filtered_df = movie[
    (movie['Genre'].str.contains(genre, case=False)) &
    (movie['Rating'] >= min_rating) &
    (movie['Runtime'] <= max_runtime) &
    (movie['Age Restriction'] == age_restriction)
]

# filtered_df

if filtered_df.empty:
    recommendation = movie.sample(1)

filtered_df

Unnamed: 0,Movie Titles,Runtime,Rating,Age Restriction,Genre,Writer(s),Director(s)
1,The Marvels,105.0,6.0,12A,"['Action', 'Adventure', 'Fantasy']","['Nia DaCostaMegan McDonnellElissa Karasik', '...",['Nia DaCosta']
18,Barbie,114.0,7.0,12A,"['Adventure', 'Comedy', 'Fantasy']","['Greta GerwigNoah Baumbach', 'Stars', 'Margot...",['Greta Gerwig']
