# Collect data (Rotten Tomatoes)

#### Import libraries in use:
- requests: send HTTP requests
- bs4: parsing HTML elements
- Selenium: helps render and retrieve JavaScript content

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

#### Collect movie links to crawl data
We collect the movie links by genre; the data behind these links is crawled in a later step.

In [3]:
# Collect the relative movie links shown on the "movies at home" browse page of
# every genre, then print how many distinct links were found.
BROWSE_PAGE = 8        # value sent as the `page` query parameter for every genre
REQUEST_TIMEOUT = 30   # seconds; without a timeout requests.get() may block forever

search_params = ['action','adventure','animation','anime','biography','comedy', 'crime', 'documentary', 'drama', 'entertainment',
                 'faith_and_spirituality', 'fantasy', 'game_show', 'history', 'holiday', 'horror', 'kids_and_family', 'music', 
                 'musical', 'mystery_and_thriller', 'reality', 'romance', 'sci_fi', 'sports', 'western', 'war']

# A set removes duplicated links while collecting (a movie can belong to several genres).
found_links = set()
for search in search_params:
    response = requests.get(
        f"https://www.rottentomatoes.com/browse/movies_at_home/genres:{search}?page={BROWSE_PAGE}",
        timeout=REQUEST_TIMEOUT,
    )
    if not response.ok:
        # Do not parse an error page as if it were a listing; report and move on.
        print(f"Skipping genre '{search}': HTTP {response.status_code}")
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    for element in soup.select('div > tile-dynamic > a'):
        # .get() avoids a KeyError on anchors that carry no href attribute.
        href = element.get('href')
        if href:
            found_links.add(href)

# Sorted so the order (and therefore which movies a capped crawl picks first)
# is the same on every kernel run instead of depending on set iteration order.
links = sorted(found_links)
print(len(links))

1205


Remove the links from the `links` list that are already present in the previously crawled data file.

In [4]:
# Load the previously crawled rows and keep only the links whose absolute URL
# is not yet stored in the 'url' column.
df = pd.read_csv('../../data/raw/data.csv')
already_crawled = set(df['url'].values)
# Slice assignment updates the existing `links` list in place.
links[:] = [
    path for path in links
    if f"https://www.rottentomatoes.com{path}" not in already_crawled
]

#### Collect data from movie links
First, we build a function that extracts the text of an HTML element.

In [5]:
def get_element_text(soup, selector):
    """Return the whitespace-normalised text of the first element matching `selector`.

    Parameters:
        soup: parsed document exposing `select_one(selector)`.
        selector: CSS selector of the wanted element.

    Returns:
        The element text with every run of whitespace collapsed to a single
        space and the ends trimmed, or '' when no element matches.
    """
    node = soup.select_one(selector)
    if not node:
        return ''
    # split() without arguments drops all whitespace runs, including newlines.
    return ' '.join(node.get_text().split())

Initialize the field lists for the DataFrame

In [6]:
# One independent, empty list per output column; the crawl loop appends one
# value per movie to each of them.
(Name, Genre,
 Tomatometer_score, Tomatometer_count,
 Audience_score, Audience_count,
 Runtime, Classification_rating,
 Release_year, Original_language,
 Urls) = ([] for _ in range(11))

We use BeautifulSoup to retrieve and parse the HTML content. However, BeautifulSoup selectors do not work for some data fields, such as the Tomatometer score and the audience score, because those elements live inside a shadow root, and a shadow root is not visible from the document root. For these fields we use Selenium to execute JavaScript and retrieve the shadow elements. To do so, we build a function that returns the shadow root of a given parent (host) element:

In [7]:
def expand_shadow_element(element):
    """Return the shadow root attached to `element`.

    The lookup is done by running JavaScript through the module-level
    `driver`, so a WebDriver must be bound to that name before calling.
    """
    return driver.execute_script('return arguments[0].shadowRoot', element)

Crawl data from the movie links: set the number of movies to collect in each run, then extract the HTML text for each data field.

In [8]:
# Crawl one batch of movie pages. For every page all fields are scraped first
# and only then appended to the column lists (Name, Genre, ..., Urls), so a
# failure in the middle of a page can never leave the lists with unequal lengths.
MAX_MOVIES_PER_RUN = 1   # count limit for each crawling run
REQUEST_TIMEOUT = 30     # seconds; without a timeout requests.get() may block forever
INFO_ITEMS_TO_SCAN = 5   # number of leading '#info > li' entries that are inspected

count = 0
#crawl data movies
for url in links[:MAX_MOVIES_PER_RUN]:
    page_url = f"https://www.rottentomatoes.com{url}"
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, 'html.parser')

    #name
    name = get_element_text(soup, '#scoreboard > h1')

    #tomatometer rate and audience rate (these two have to be crawled separately since they're inside Shadow DOM objects)
    # `driver` is kept as a module-level name because expand_shadow_element reads it as a global.
    driver = webdriver.Chrome()
    try:
        driver.get(page_url)
        root1 = driver.find_element(By.CSS_SELECTOR, "#scoreboard")
        shadow_root1 = expand_shadow_element(root1)
        root2a = shadow_root1.find_element(By.CSS_SELECTOR,'div > div.scores-container > div.tomatometer-container > div > score-icon-critic-deprecated')
        shadow_root2a = expand_shadow_element(root2a)
        root2b = shadow_root1.find_element(By.CSS_SELECTOR,'div > div.scores-container > div.audience-container > div > score-icon-audience-deprecated')
        shadow_root2b = expand_shadow_element(root2b)
        tomatometer_score = shadow_root2a.find_element(By.CSS_SELECTOR,'div > span.percentage').text
        audience_score = shadow_root2b.find_element(By.CSS_SELECTOR,'div > span.percentage').text
    finally:
        # Always close the browser, even when an element lookup raises.
        driver.quit()

    #number of tomatometers
    tomatometer_count = get_element_text(soup, '#scoreboard > a:nth-child(3)').split(" ")[0]

    #number of audiences
    audience_count = get_element_text(soup, '#scoreboard > a:nth-child(4)').split(" ")[0]

    #release year and runtime: first and last comma-separated parts of the same paragraph
    scoreboard_parts = get_element_text(soup, '#scoreboard > p').split(", ")
    year = scoreboard_parts[0]
    runtime = scoreboard_parts[-1]

    #Get Classification, Genre and Original Language
    info = {'Rating:': '', 'Genre:': '', 'Original Language:': ''}
    # CSS :nth-child() is 1-based, so start at 1 (index 0 never matches anything).
    for i in range(1, INFO_ITEMS_TO_SCAN + 1):
        label = get_element_text(soup, f'#info > li:nth-child({i}) > p > b')
        if label in info:
            info[label] = get_element_text(soup, f'#info > li:nth-child({i}) > p > span')

    # Everything for this movie was scraped successfully: store the whole row at once.
    Name.append(name)
    Tomatometer_score.append(tomatometer_score)
    Audience_score.append(audience_score)
    Tomatometer_count.append(tomatometer_count)
    Audience_count.append(audience_count)
    Runtime.append(runtime)
    Release_year.append(year)
    Classification_rating.append(info['Rating:'])
    Genre.append(info['Genre:'])
    Original_language.append(info['Original Language:'])
    #link
    Urls.append(page_url)

    count += 1

#### Save collected data

In [9]:
# Assemble the collected columns into a frame (column order matches the CSV
# layout) and append the rows, without header or index, to the raw data file.
data = pd.DataFrame(dict(
    name=Name,
    genre=Genre,
    tomatometer_score=Tomatometer_score,
    tomatometer_count=Tomatometer_count,
    audience_score=Audience_score,
    audience_count=Audience_count,
    classification=Classification_rating,
    runtime=Runtime,
    release_year=Release_year,
    original_language=Original_language,
    url=Urls,
))

data.to_csv('../../data/raw/data.csv', mode='a', header=False, index=False)

In [10]:
# Reload the raw file and drop exact duplicate rows in memory (the CSV on disk
# is not rewritten here), then report the resulting (rows, columns) shape.
df_copy = pd.read_csv('../../data/raw/data.csv').drop_duplicates()

print(df_copy.shape)

(1216, 11)
