# Scraping IMDb data using BeautifulSoup & Selenium

In [55]:
from bs4 import BeautifulSoup
import requests
import time
import os
from pprint import pprint
import re
import codecs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
import pandas as pd
import numpy as np

In [56]:
headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" ,
        "authority": "www.tagesschau.de",
        "method": "GET",
        "path":"/",
        "scheme":"https",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9,de;q=0.8",
        "cache-control": "max-age=0",
        "cookie": "atuserid=%7B%22name%22%3A%22atuserid%22%2C%22val%22%3A%2257ea5dd6-4c35-4982-942f-8a7f8b8c3a4b%22%2C%22options%22%3A%7B%22end%22%3A%222023-02-17T05%3A02%3A59.936Z%22%2C%22path%22%3A%22%2F%22%7D%7D; atidvisitor=%7B%22name%22%3A%22atidvisitor%22%2C%22val%22%3A%7B%22vrn%22%3A%22-595936-%22%7D%2C%22options%22%3A%7B%22path%22%3A%22%2F%22%2C%22session%22%3A15724800%2C%22end%22%3A15724800%7D%7D",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "Windows",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1"
    }

### 1. Go to the page documenting all the movies started by Robert Pattinson.

### 2. Scrape movie name, year, and movie page url.

In [57]:
base_url = "https://www.imdb.com/name/nm1500155/?ref_=fn_al_nm_1"
page = requests.get(base_url, headers=headers)
soup = BeautifulSoup(page.text, "lxml")
filmography = soup.select("div.filmo-category-section")[0]

In [58]:
movie_list = []
movies = filmography.find_all("div", {"class": ["filmo-row odd","filmo-row even"]})
for movie in movies:
    movie_info = {}
    movie_info["Name"] = movie.find('a').contents[0]
    year_str = movie.find("span", {"class": "year_column"}).get_text()
    movie_info['Year'] = re.search(r'(\d{4})',  movie.find("span", {"class": "year_column"}).text).group(1)
    movie_info["Link"] = "https://www.imdb.com" + movie.find('a', href=True)['href']
    movie_list.append(movie_info)
pprint(movie_list)

[{'Link': 'https://www.imdb.com/title/tt1877830/?ref_=nm_flmg_act_1',
  'Name': 'The Batman',
  'Year': '2022'},
 {'Link': 'https://www.imdb.com/title/tt7395114/?ref_=nm_flmg_act_2',
  'Name': 'The Devil All the Time',
  'Year': '2020'},
 {'Link': 'https://www.imdb.com/title/tt6723592/?ref_=nm_flmg_act_3',
  'Name': 'Tenet',
  'Year': '2020'},
 {'Link': 'https://www.imdb.com/title/tt11657754/?ref_=nm_flmg_act_4',
  'Name': "Dior: I'm your Man - Dior Homme",
  'Year': '2020'},
 {'Link': 'https://www.imdb.com/title/tt6149154/?ref_=nm_flmg_act_5',
  'Name': 'Waiting for the Barbarians',
  'Year': '2019'},
 {'Link': 'https://www.imdb.com/title/tt7984766/?ref_=nm_flmg_act_6',
  'Name': 'The King',
  'Year': '2019'},
 {'Link': 'https://www.imdb.com/title/tt7984734/?ref_=nm_flmg_act_7',
  'Name': 'The Lighthouse',
  'Year': '2019'},
 {'Link': 'https://www.imdb.com/title/tt4827558/?ref_=nm_flmg_act_8',
  'Name': 'High Life',
  'Year': '2018'},
 {'Link': 'https://www.imdb.com/title/tt5881528/?r

### 3. Add genres, cast, rating, meta_score, and review link for each movie

In [60]:
for movie in movie_list:
    movie_url = movie["Link"]
    page = requests.get(movie_url, headers=headers)
    soup = BeautifulSoup(page.text, "lxml")
    movie_genres = soup.find_all("a", {"class": "sc-14389611-3 jyOyvn ipc-chip ipc-chip--on-baseAlt"})
    genre_list = []
    for movie_genre in movie_genres:
        genre_list.append(movie_genre.find("span", {"class": "ipc-chip__text"}).text)
    movie["Genres"] = genre_list
    movie["Rating"] = soup.find("span", {"class": "sc-7ab21ed2-1 jGRxWM"}).text
    area = soup.find_all(name="li",attrs={"class":"ipc-metadata-list__item ipc-metadata-list-item--link"})
    
    actors = area[1].find_all(name="li",attrs={'class':'ipc-inline-list__item'})
    actor_list = []
    for actor in actors:
        actor_list.append(actor.text)
    movie["Cast"] = actor_list
    try:
        movie["Meta_score"] = soup.find("span", {"class":"score-meta"}).text
    except:
        movie["Meta_score"] = "None"
    review = soup.find("li", {"class": 'ipc-inline-list__item sc-124be030-1 ghlYSH'})
    review_link = "https://www.imdb.com" + review.find('a', href=True)['href']
    movie['Review Link'] = re.search(r'(.*?)ref', review_link).group(1) + "sort=totalVotes&dir=desc&ratingFilter=0"
pprint(movie_list)

[{'Cast': ['Robert Pattinson', 'Zoë Kravitz', 'Jeffrey Wright'],
  'Genres': ['Action', 'Crime', 'Drama'],
  'Link': 'https://www.imdb.com/title/tt1877830/?ref_=nm_flmg_act_1',
  'Meta_score': '72',
  'Name': 'The Batman',
  'Rating': '8.4',
  'Review Link': 'https://www.imdb.com/title/tt1877830/reviews?sort=totalVotes&dir=desc&ratingFilter=0',
  'Year': '2022'},
 {'Cast': ['Bill Skarsgård', 'Tom Holland', 'Michael Banks Repeta'],
  'Genres': ['Crime', 'Drama', 'Thriller'],
  'Link': 'https://www.imdb.com/title/tt7395114/?ref_=nm_flmg_act_2',
  'Meta_score': '55',
  'Name': 'The Devil All the Time',
  'Rating': '7.1',
  'Review Link': 'https://www.imdb.com/title/tt7395114/reviews?sort=totalVotes&dir=desc&ratingFilter=0',
  'Year': '2020'},
 {'Cast': ['John David Washington', 'Robert Pattinson', 'Elizabeth Debicki'],
  'Genres': ['Action', 'Sci-Fi', 'Thriller'],
  'Link': 'https://www.imdb.com/title/tt6723592/?ref_=nm_flmg_act_3',
  'Meta_score': '69',
  'Name': 'Tenet',
  'Rating': '7.

### 4. Change the data into dataframe and then export as csv.

In [65]:
Name = []
Year = []
Genres = []
Cast = []
Rating = []
Meta_score = []
for movie in movie_list:
    Name.append(movie["Name"])
    Year.append(movie["Year"])
    Genres.append(movie["Genres"])
    Cast.append(movie["Cast"])
    Rating.append(movie["Rating"])
    Meta_score.append(movie["Meta_score"])

data={"Name":Name,
     "Year":Year,
     "Genres":Genres,
     "Cast":Cast,
     "Rating":Rating,
     "Meta_score":Meta_score}
df_movie = pd.DataFrame(data)
df_movie
df_movie.to_csv('movie_info.csv')

### 5. Use Selenium to simulate human to click "Load More" and scrape reviews for each movie.

### 6. Save reviews to csv.

In [44]:
driver = webdriver.Chrome(ChromeDriverManager().install())



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Driver [/Users/neonzhang/.wdm/drivers/chromedriver/mac64/99.0.4844.51/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [49]:
for movie in movie_list:
    driver.get(movie["Review Link"])
    driver.implicitly_wait(1)
    page = 1
    while page < 2:
        try:
            #find the load more button on the webpage
            load_more = driver.find_element_by_id('load-more-trigger')
            #click on that button
            load_more.click()
            page+=1 
            time.sleep(5)
        except:
            #If couldnt find any button to click, stop
            break
    # After fully expand the page, we will grab data from whole website
    reviews = driver.find_elements_by_class_name('lister-item-content')

    comment = []
    rating = []

    for review in reviews:
        try:
            fcontent = review.find_element_by_class_name('content').text
            frating = review.find_element_by_class_name('rating-other-user-rating').text
            frating = re.search(r'(\d{1,2})\/(\d{2})',frating).group(1)


            #Then add them to the respective list
            comment.append(fcontent)
            rating.append(str(frating))

        except:
            continue
    data = {'Review Rating': rating,
            "Comment":comment
       }

    #Build dataframe for each movie to export
    review_list = pd.DataFrame(data = data) 
    name = movie['Name']
    review_list.to_csv(f'{name}.csv')
driver.quit()

  load_more = driver.find_element_by_id('load-more-trigger')
  reviews = driver.find_elements_by_class_name('lister-item-content')
