Web Scrapping - Selenium and BeautifulSoup

In [None]:
!pip install selenium

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd

# Initialize the WebDriver for Microsoft Edge
driver = webdriver.Edge()
driver.maximize_window()

# URL of the IMDb home page
PATH = 'https://www.imdb.com/?ref_=nv_home'
# Load the web page
driver.get(PATH)

# Wait for the page to load (you can use different wait strategies)
driver.implicitly_wait(10)

# Search for the actor
search_keyword = "Leonardo DiCaprio"
try:
    search_box = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='suggestion-search']"))
    )
    print("Search box found.")
    search_box.send_keys(search_keyword)
    search_box.send_keys(Keys.ENTER)
except TimeoutException:
    print("Timeout while trying to locate the search box.")
    driver.quit()
    exit()

# Wait for the search results page to load and find the section containing the exact matches
try:
    exact_matches_section = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//section[@data-testid='find-results-section-title']"))
    )
    print("Exact matches section found.")
except TimeoutException:
    print("Timeout while trying to locate the exact matches section.")
    driver.quit()
    exit()

# Extract all movie names within the img tag alt value inside the exact matches section
movies_list = []
try:
    img_elements = exact_matches_section.find_elements(By.XPATH, ".//div[contains(@class, 'ipc-media')]//img")
    for img_element in img_elements:
        movie_name = img_element.get_attribute('alt')
        movies_list.append(movie_name)
except Exception as e:
    print(f"An error occurred while extracting movie names: {e}")

# Print the list of movies
for movie in movies_list:
    print(f"Movie: {movie}")

# Optionally, save the data to a DataFrame and then to a CSV file
df = pd.DataFrame(movies_list, columns=['Movie'])
df.to_csv('Leonardo_DiCaprio_Movies.csv', index=False)

# Close the WebDriver
driver.quit()


Search box found.
Exact matches section found.
Movie: Leonardo DiCaprio: Most Wanted! (2021)
Movie: Oppo Find: Leonardo DiCaprio (2011)
Movie: Leonardo DiCaprio: In His Own Words (1998)
Movie: Before the Flood (2016)
Movie: Leonardo DiCaprio - Phantom und Superstar (2015)


In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException
import pandas as pd
import time

# Initialize the WebDriver for Microsoft Edge
driver = webdriver.Edge()
driver.maximize_window()

# URL of the IMDb home page
PATH = 'https://www.imdb.com/?ref_=nv_home'
# Load the web page
driver.get(PATH)

# Wait for the page to load (you can use different wait strategies)
driver.implicitly_wait(10)

# Search for the actor
search_keyword = "Leonardo DiCaprio"
try:
    search_box = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='suggestion-search']"))
    )
    print("Search box found.")
    search_box.send_keys(search_keyword)
    search_box.send_keys(Keys.ENTER)
except TimeoutException:
    print("Timeout while trying to locate the search box.")
    driver.quit()
    exit()

# Wait for the search results page to load and find the section containing "More popular matches"
try:
    more_popular_matches = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//span[contains(@class, 'ipc-see-more__text') and text()='More popular matches']"))
    )
    print("More popular matches link found.")
    
    # Scroll to the element to ensure it's in view
    driver.execute_script("arguments[0].scrollIntoView(true);", more_popular_matches)
    time.sleep(1)  # Adding sleep to ensure scrolling is complete
    
    try:
        more_popular_matches.click()
    except ElementClickInterceptedException:
        print("Element click intercepted, trying to click with JavaScript")
        driver.execute_script("arguments[0].click();", more_popular_matches)
except TimeoutException:
    print("Timeout while trying to locate the more popular matches link.")
    driver.quit()
    exit()

# Scroll down to the bottom of the page
driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
time.sleep(0.5)

# Extract all movie names within the img tag alt value inside the more popular matches section
movies_list = []
try:
    img_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'ipc-media')]//img")
    for img_element in img_elements:
        movie_name = img_element.get_attribute('alt')
        movies_list.append(movie_name)
except Exception as e:
    print(f"An error occurred while extracting movie names: {e}")

# Print the list of movies
for movie in movies_list:
    print(f"Movie: {movie}")

# Optionally, save the data to a DataFrame and then to a CSV file
df = pd.DataFrame(movies_list, columns=['Movie'])
df.to_csv('Leonardo_DiCaprio_Movies.csv', index=False)

# Close the WebDriver
driver.quit()


Search box found.
More popular matches link found.
Movie: Leonardo DiCaprio at an event for Inception (2010)
Movie: Leonardo DiCaprio: Most Wanted! (2021)
Movie: Oppo Find: Leonardo DiCaprio (2011)
Movie: Leonardo DiCaprio: In His Own Words (1998)
Movie: Before the Flood (2016)
Movie: Leonardo DiCaprio - Phantom und Superstar (2015)
Movie: Get the IMDb app


In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
import pandas as pd
import time

# Initialize the WebDriver for Microsoft Edge
driver = webdriver.Edge()
driver.maximize_window()

# URL of the IMDb home page
PATH = 'https://www.imdb.com/?ref_=nv_home'
# Load the web page
driver.get(PATH)

# Wait for the page to load (you can use different wait strategies)
driver.implicitly_wait(10)

# Search for the actor
search_keyword = "Leonardo DiCaprio"
try:
    search_box = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//input[@id='suggestion-search']"))
    )
    print("Search box found.")
    search_box.send_keys(search_keyword)
    search_box.send_keys(Keys.ENTER)
except TimeoutException:
    print("Timeout while trying to locate the search box.")
    driver.quit()
    exit()

# Wait for the search results page to load and find the section containing the header "People"
try:
    people_header = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//h3[contains(@class, 'ipc-title__text') and text()='People']"))
    )
    print("People header found.")
    
    # Scroll to the element to ensure it's in view
    driver.execute_script("arguments[0].scrollIntoView(true);", people_header)
    time.sleep(1)  # Adding sleep to ensure scrolling is complete
    
    try:
        people_header.click()
    except ElementClickInterceptedException:
        print("Element click intercepted, trying to click with JavaScript")
        driver.execute_script("arguments[0].click();", people_header)
except TimeoutException:
    print("Timeout while trying to locate the People header.")
    driver.quit()
    exit()

# Navigate to the actor's page
try:
    actor_link = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//li[@class='ipc-metadata-list-summary-item ipc-metadata-list-summary-item--click find-result-item find-name-result']//div[@class='ipc-metadata-list-summary-item__tc']//a[@class='ipc-metadata-list-summary-item__t' and @aria-disabled='false']"))
    )
    print("Actor link found.")
    driver.execute_script("arguments[0].scrollIntoView(true);", actor_link)
    time.sleep(1)  # Adding sleep to ensure scrolling is complete
    actor_link.click()
except TimeoutException:
    print("Timeout while trying to locate the actor link.")
    driver.quit()
    exit()

# Wait for the actor's filmography section to load
try:
    filmography_section = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//div[@data-testid='Filmography']//section[contains(@class, 'ipc-page-section')]//div[contains(@class, 'sc-817d5c6e-5 jRSBVq filmo-section-actor')]"))
    )
    print("Filmography section found.")
except TimeoutException:
    print("Timeout while trying to locate the filmography section.")
    driver.quit()
    exit()

# Scroll to the filmography section to ensure visibility
driver.execute_script("arguments[0].scrollIntoView(true);", filmography_section)
time.sleep(1)  # Adding sleep to ensure scrolling is complete

# Expand all the accordion items in the filmography section
try:
    expand_button = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//label[@role='button' and @aria-label='Expand Previous' and @aria-expanded='false']"))
    )
    driver.execute_script("arguments[0].scrollIntoView(true);", expand_button)
    time.sleep(1)  # Adding sleep to ensure scrolling is complete
    driver.execute_script("arguments[0].click();", expand_button)
    time.sleep(5)  # Wait for the accordion to expand
    print("Accordion expanded.")
except TimeoutException:
    print("Timeout while trying to locate and expand the accordion.")
    driver.quit()
    exit()

# Extract movie titles
movies_list = []
try:
    ul_element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//div[@class='ipc-accordion__item__content']//ul[contains(@class, 'ipc-metadata-list')]"))
    )
    movie_elements = ul_element.find_elements(By.XPATH, ".//li[contains(@class, 'ipc-metadata-list-summary-item')]")
    
    for movie in movie_elements:
        try:
            title_element = movie.find_element(By.XPATH, ".//a[contains(@class, 'ipc-metadata-list-summary-item__t')]")
            title = title_element.text
            if title:
                print(f"Movie title found: {title}")
                movies_list.append(title)
            else:
                print("Empty title found, skipping.")
        except NoSuchElementException:
            print("Movie title not found in list element.")
except NoSuchElementException:
    print("Movie list not found.")

# Print the list of movies
for movie in movies_list:
    print(f"Movie: {movie}")

# Optionally, save the data to a DataFrame and then to a CSV file
df = pd.DataFrame(movies_list, columns=['Movie'])
df.to_csv('Leonardo_DiCaprio_Movies.csv', index=False)

# Close the WebDriver
driver.quit()

Search box found.
People header found.
Element click intercepted, trying to click with JavaScript
Actor link found.
Filmography section found.
Accordion expanded.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.
Empty title found, skipping.


In [None]:
------------------------

In [7]:
!pip install beautifulsoup4 requests



In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL of the Google search page
url = 'https://www.google.com/search?q=Leonardo+DiCaprio+movies'

# Send a GET request to fetch the raw HTML content
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find all the movie containers
movies = soup.find_all('div', class_='BVG0Nb')

# Extract movie names and years
movie_list = []
for movie in movies:
    name = movie.find('div', class_='BNeawe vvjwJb AP7Wnd').text
    year = movie.find('div', class_='BNeawe UPmit AP7Wnd').text.split(' ')[-1]
    movie_list.append((name, year))

# Create a DataFrame to structure the extracted data
df = pd.DataFrame(movie_list, columns=['Movie Name', 'Year'])

# Print the DataFrame
print(df)

Empty DataFrame
Columns: [Movie Name, Year]
Index: []


In [9]:
import requests
from bs4 import BeautifulSoup

def get_actor_films(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    films = []
    for item in soup.find_all('div', class_='BNeawe deIvCb AP7Wnd'):
        films.append(item.text)

    return films
url = "https://www.google.com/search?q=%22Leonardo+DiCaprio&oq=%22Leonardo+DiCaprio&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQLhhA0gEIMjc0MmowajGoAgCwAgA&sourceid=chrome&ie=UTF-8#wgvs=e&wptab=si:ACC90nyInjzvSQnt4EBukRo6qsOQanuL5iKWn-KNLQkyRapUdSnkLn2qXRP4aI9q1_ASdVzGGOuxcSLkUR4hDhXv1JA1ZSVovOGNq8gTfVCg4STnhD4Jnt4ReEiOefAB70bD-2wa2K5t"
actor_name = "Leonardo DiCaprio"
films = get_actor_films(url)
print(films)

[]


In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define the URL of the Google search page
url = 'https://www.google.com/search?q=Leonardo+DiCaprio+movies'

# Send a GET request to fetch the raw HTML content
response = requests.get(url)
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find all span elements with the specific class and attributes
spans = soup.find_all('span', class_='NQyKp Hyaw8c h4wEae Maj6Tc')

# Extract movie names and years from the relevant span tags
movie_list = []
for span in spans:
    # Navigate to the parent element containing movie details
    parent = span.find_parent('div')
    
    if parent:
        # Extract movie name and year
        name_tag = parent.find('div', class_='BNeawe deIvCb AP7Wnd')
        year_tag = parent.find('div', class_='BNeawe tAd8D AP7Wnd')

        if name_tag and year_tag:
            name = name_tag.text
            year = year_tag.text.split()[-1]
            movie_list.append((name, year))

# Create a DataFrame to structure the extracted data
df = pd.DataFrame(movie_list, columns=['Movie Name', 'Year'])

# Print the DataFrame
print(df)

Empty DataFrame
Columns: [Movie Name, Year]
Index: []


In [16]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup

def get_actor_movies(actor_name):
    # Initialize the WebDriver for Chrome (you can change to any other browser)
    driver = webdriver.Chrome()
    driver.maximize_window()

    # Google search URL
    google_url = f"https://www.google.com/search?q={actor_name} movies"

    # Load the Google search results page
    driver.get(google_url)

    try:
        # Wait for the search results to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div[@class='tF2Cxc']")))
        
        # Get the page source after waiting for results
        page_source = driver.page_source

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find all div elements containing movie information
        movie_divs = soup.find_all('div', class_='tF2Cxc')

        # Extract movie names and years from the relevant div tags
        movie_list = []
        for div in movie_divs:
            name_tag = div.find('div', class_='BNeawe deIvCb AP7Wnd')
            year_tag = div.find('div', class_='BNeawe tAd8D AP7Wnd')

            if name_tag and year_tag:
                name = name_tag.text
                year = year_tag.text
                movie_list.append((name, year))

        # Close the WebDriver
        driver.quit()

        return movie_list

    except TimeoutException:
        print("Timeout while waiting for Google search results.")
        driver.quit()
        return []
    except NoSuchElementException:
        print("Element not found while searching for movie information.")
        driver.quit()
        return []

# Example usage:
actor_name = "Leonardo DiCaprio"
movies = get_actor_movies(actor_name)

# Display the extracted movie names and years
if movies:
    print(f"Movies of {actor_name}:")
    for movie in movies:
        print(f"Movie Name: {movie[0]}, Year: {movie[1]}")
else:
    print(f"No movies found for {actor_name}.")


No movies found for Leonardo DiCaprio.


In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time

def get_actor_movies(actor_name):
    # Initialize the WebDriver for Chrome (you can change to any other browser)
    driver = webdriver.Chrome()
    driver.maximize_window()

    # Google search URL
    google_url = f"https://www.google.com/search?q={actor_name}"

    # Load the Google search results page
    driver.get(google_url)

    try:
        # Wait for the search results to load
        WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "//*[@id='JTPWx']/span[2]")))
        time.sleep(3)  # Adding extra time to ensure all content is loaded

        # Get the page source after waiting for results
        page_source = driver.page_source

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find the specific span tag containing movie information using XPath
        span_tag = soup.find('span', id='JTPWx')

        if span_tag:
            # Find all movie divs within the parent container of the span tag
            parent_div = span_tag.find_parent('div')
            movie_divs = parent_div.find_all('div', class_='BNeawe deIvCb AP7Wnd')

            # Extract movie names and years from the relevant div tags
            movie_list = []
            for div in movie_divs:
                try:
                    # Locate the year tag related to each movie
                    year_tag = div.find_next('div', class_='BNeawe tAd8D AP7Wnd')

                    # Extract the movie name and year if both tags are found
                    if year_tag:
                        name = div.text.strip()
                        year = year_tag.text.strip()
                        movie_list.append((name, year))
                except Exception as e:
                    print(f"Error extracting movie details: {str(e)}")

            # Close the WebDriver
            driver.quit()

            return movie_list

        else:
            print("No movies found in the search results.")
            driver.quit()
            return []

    except TimeoutException:
        print("Timeout while waiting for Google search results.")
        driver.quit()
        return []
    except Exception as e:
        print(f"Error: {str(e)}")
        driver.quit()
        return []

# Example usage:
actor_name = "Leonardo DiCaprio"
movies = get_actor_movies(actor_name)

# Display the extracted movie names and years
if movies:
    print(f"Movies of {actor_name}:")
    for movie in movies:
        print(f"Movie Name: {movie[0]}, Year: {movie[1]}")
else:
    print(f"No movies found for {actor_name}.")


No movies found in the search results.
No movies found for Leonardo DiCaprio.
