<a href="https://colab.research.google.com/github/Gaurav-poddar/Web_Scrapping_projects/blob/main/Web_Scraping_IMDB_Top_250_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective: Scrape relevant data from IMDb

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
url = 'https://www.imdb.com/chart/top/'
# Fetch the IMDb Top 250 chart page and parse it into `soup`.
#
# A plain request (default python-requests User-Agent) was answered with
# HTTP 403. The server presumably blocks known bot/spider user agents, so a
# browser-like User-Agent header is sent instead. Background:
# https://stackoverflow.com/questions/16627227/problem-http-error-403-in-python-3-web-scraping
#
# `timeout` keeps the cell from hanging indefinitely if the server never
# answers (requests has no timeout by default).
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
print(response)

<Response [200]>


### Fetching all the top 250 movie names


---



In [None]:
# Collect the ranked titles ("1. The Shawshank Redemption", ...) from the chart.
#
# The list is located through its readable class names only. The original
# exact-match class string also contained "sc-71ed9118-0 kxsUNk", which look
# like auto-generated style hashes and are likely to change between site
# builds; an exact-string match would then silently return None.
movies = soup.select_one("ul.ipc-metadata-list.compact-list-view")
if movies is None:
    # Give a clear reason instead of "'NoneType' object has no attribute 'find_all'".
    raise RuntimeError(
        "Could not find the Top 250 list in the page; "
        "the IMDb markup may have changed or the request was blocked."
    )

movie_list = movies.find_all('a', class_='ipc-title-link-wrapper')
# Every title link wraps an <h3> holding "<rank>. <title>"; links without an
# <h3> are skipped rather than crashing the whole cell.
top_250 = [movie.h3.text.strip() for movie in movie_list if movie.h3 is not None]

print(top_250)

['1. The Shawshank Redemption', '2. The Godfather', '3. The Dark Knight', '4. The Godfather Part II', '5. 12 Angry Men', "6. Schindler's List", '7. The Lord of the Rings: The Return of the King', '8. Pulp Fiction', '9. The Lord of the Rings: The Fellowship of the Ring', '10. The Good, the Bad and the Ugly', '11. Forrest Gump', '12. Fight Club', '13. The Lord of the Rings: The Two Towers', '14. Inception', '15. Star Wars: Episode V - The Empire Strikes Back', '16. The Matrix', '17. Goodfellas', "18. One Flew Over the Cuckoo's Nest", '19. Se7en', "20. It's a Wonderful Life", '21. Interstellar', '22. Seven Samurai', '23. The Silence of the Lambs', '24. Saving Private Ryan', '25. City of God', '26. Life Is Beautiful', '27. The Green Mile', '28. Terminator 2: Judgment Day', '29. Star Wars: Episode IV - A New Hope', '30. Spider-Man: Across the Spider-Verse', '31. Back to the Future', '32. Spirited Away', '33. The Pianist', '34. Parasite', '35. Psycho', '36. Gladiator', '37. The Lion King', '

## Movie Metadata

In [None]:
# One metadata <div> per chart row; each holds several <span> items
# (in the captured output: release year, runtime and certificate).
div_elements = soup.find_all('div', class_='sc-be6f1408-7 iUtHEN cli-title-metadata')

# Build one string per movie by prefixing every item with a single space,
# so each entry starts with a space, e.g. ' 1994 2h 22m R'.
movie_details = [
    "".join(
        " " + item.text
        for item in row.find_all('span', class_='sc-be6f1408-8 fcCUPU cli-title-metadata-item')
    )
    for row in div_elements
]

movie_details

[' 1994 2h 22m R',
 ' 1972 2h 55m R',
 ' 2008 2h 32m PG-13',
 ' 1974 3h 22m R',
 ' 1957 1h 36m Approved',
 ' 1993 3h 15m R',
 ' 2003 3h 21m PG-13',
 ' 1994 2h 34m R',
 ' 2001 2h 58m PG-13',
 ' 1966 2h 58m Approved',
 ' 1994 2h 22m PG-13',
 ' 1999 2h 19m R',
 ' 2002 2h 59m PG-13',
 ' 2010 2h 28m PG-13',
 ' 1980 2h 4m PG',
 ' 1999 2h 16m R',
 ' 1990 2h 25m R',
 ' 1975 2h 13m R',
 ' 1995 2h 7m R',
 ' 1946 2h 10m PG',
 ' 2014 2h 49m PG-13',
 ' 1954 3h 27m Not Rated',
 ' 1991 1h 58m R',
 ' 1998 2h 49m R',
 ' 2002 2h 10m R',
 ' 1997 1h 56m PG-13',
 ' 1999 3h 9m R',
 ' 1991 2h 17m R',
 ' 1977 2h 1m PG',
 ' 2023 2h 20m PG',
 ' 1985 1h 56m PG',
 ' 2001 2h 5m PG',
 ' 2002 2h 30m R',
 ' 2019 2h 12m R',
 ' 1960 1h 49m R',
 ' 2000 2h 35m R',
 ' 1994 1h 28m G',
 ' 1994 1h 50m R',
 ' 2006 2h 31m R',
 ' 1998 1h 59m R',
 ' 2014 1h 46m R',
 ' 2006 2h 10m PG-13',
 ' 1988 1h 29m Not Rated',
 ' 1962 2h 13m Not Rated',
 ' 1995 1h 46m R',
 ' 1942 1h 42m PG',
 ' 2011 1h 52m R',
 ' 1936 1h 27m G',
 ' 1988 2h 3

## IMDB Rating of each Film



In [None]:
import re

# Collect the IMDb score of every chart row, as strings such as '9.3'.
# One ratings container <div> is expected per movie.
all_ratings = soup.find_all('div', class_='sc-e2dbc1a3-0 ajrIH sc-be6f1408-2 dAeZAQ cli-ratings-container')
all_IMDB_ratings = []
for rating in all_ratings:
    star = rating.find('span', class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating')
    # The span text carries the score followed by the number of voters; only
    # the leading number is wanted. Parsing it with a regex (instead of taking
    # the first three characters) does not depend on the score's exact width.
    match = re.match(r"\s*(\d+(?:\.\d+)?)", star.text) if star is not None else None
    # Keep a placeholder for rows without a readable rating so this list stays
    # index-aligned with the other per-movie lists.
    all_IMDB_ratings.append(match.group(1) if match else None)

print(all_IMDB_ratings)

['9.3', '9.2', '9.0', '9.0', '9.0', '9.0', '9.0', '8.9', '8.9', '8.8', '8.8', '8.8', '8.8', '8.8', '8.7', '8.7', '8.7', '8.7', '8.6', '8.6', '8.7', '8.6', '8.6', '8.6', '8.6', '8.6', '8.6', '8.6', '8.6', '8.6', '8.5', '8.6', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '8.6', '8.5', '8.5', '8.5', '8.5', '8.5', '8.5', '9.2', '8.5', '8.5', '8.5', '8.4', '8.5', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.4', '8.3', '8.4', '8.3', '8.4', '8.4', '8.3', '8.4', '8.3', '8.4', '8.4', '8.4', '8.3', '8.3', '8.4', '8.3', '8.4', '8.4', '8.3', '8.4', '8.3', '8.4', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.4', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.3', '8.2', '8.3', '8.2', '8.3', '8.3', '8.2', '8.3', '8.2', '8.2', '8.2', '8.2', '8.3', '8.2', '8.2', '8.2', '8.2', '8.2', '8.2', '8.2', '8.3', '8.2', '8.3', '8.2', '8.2', '8.2', '8.2', '8.2'

## Director and Actor Names


In [None]:
# Director/actor details live in a popup that opens when a button on the
# chart row is clicked, so look for the popup container in the fetched HTML.
directors_name = []
directors_list = soup.find_all("div", class_="ipc-promptable-base__focus-lock")

# First attempt, kept for reference - it never ran on real data because the
# search above finds nothing:
# for director in directors_list:
#   name = director.find('a', class_ = 'ipc-link ipc-link--baseAlt sc-9bca7e5d-0 ktChvQ').text
#   directors_name.append(name)
#
# Why it is empty: the popup markup is only created after a click in the
# browser. Beautiful Soup just parses the HTML it is handed, and requests
# only downloads the initial page, so neither can trigger that click.
# A browser automation tool (e.g. headless Selenium) is needed to perform the
# click and then hand the resulting HTML to Beautiful Soup.

# Prints 0 when the popup container is absent from the static HTML.
print(len(directors_list))

0


In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.17.2-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.24.0-py3-none-any.whl (460 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.2/460.2 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting typing_extensions>=4.9.0 (from selenium)
  Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [

In [None]:
!pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv (from webdriver_manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver_manager
Successfully installed python-dotenv-1.0.1 webdriver_manager-4.0.1


In [None]:
!pip install chromedriver-autoinstaller

Collecting chromedriver-autoinstaller
  Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl (7.6 kB)
Installing collected packages: chromedriver-autoinstaller
Successfully installed chromedriver-autoinstaller-0.6.4


In [None]:
# from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_autoinstaller

# Configure a headless Chrome session (no display is available in a hosted
# notebook; --no-sandbox is typically required when running as root there).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
# NOTE: the former `chrome_options.headless = True` was dropped: it duplicated
# the '--headless' argument above, and the `headless` property was deprecated
# and later removed in Selenium 4.

# Download a chromedriver matching the installed Chrome and put it on PATH so
# that webdriver.Chrome() can find it without an explicit driver path.
chromedriver_autoinstaller.install()

# set the target URL
url = "https://www.imdb.com/chart/top/"

# Create exactly one driver.
# NOTE: the former `webdriver.Chrome(ChromeDriverManager().install())` call was
# removed. In Selenium 4 the first positional parameter of webdriver.Chrome is
# `options`, not a driver path, so passing the path string there is wrong; it
# is presumably also the call behind the AttributeError shown in this cell's
# output. Had it succeeded, it would have started a second browser that was
# immediately overwritten here and never quit.
driver = webdriver.Chrome(options=chrome_options)

# Navigate to the website with the button that generates the popup
# driver.get('https://www.imdb.com/chart/top/')

# try:
#     # Find and click the button that generates the popup (replace 'button-id' with the actual ID or other locator)
#     popup_button = WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.ID, 'button-id'))
#     )
#     popup_button.click()

#     # Wait for the popup to appear (replace 'popup-id' with the actual ID or other locator)
#     popup = WebDriverWait(driver, 10).until(
#         EC.presence_of_element_located((By.ID, 'popup-id'))
#     )

#     # Extract the HTML content of the popup
#     popup_html = popup.get_attribute('outerHTML')

#     # Use Beautiful Soup to parse the HTML content
#     soup = BeautifulSoup(popup_html, 'html.parser')

#     # Extract data from the parsed HTML using Beautiful Soup methods
#     # (replace 'data-class' with the actual class or tag used in the popup)
#     data_elements = soup.find_all(class_='data-class')

#     for data_element in data_elements:
#         print(data_element.text)

# finally:
#     # Close the WebDriver
#     driver.quit()


AttributeError: 'NoneType' object has no attribute 'split'