# Data Collection

## Install Necessary Packages and Check the HTML Request

In [1]:
#!pip install requests scrapy pandas

In [1]:
import requests                # for sending HTTP requests
from scrapy import Selector    # for parsing HTML content
import pandas as pd            # for creating dataframe

from pprint import pprint

In [2]:
# Ranking page used to check that the site answers a plain GET request.
url = "https://editorial.rottentomatoes.com/guide/best-movies-of-all-time/"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from treating an HTTP error page as the ranking page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content  # raw bytes of the downloaded HTML document

b'<!DOCTYPE html>\n<html lang="en-US" class="hitim">\n<head prefix="og: http://ogp.me/ns# flixstertomatoes: http://ogp.me/ns/apps/flixstertomatoes#">\n    <meta http-equiv="content-type" content="text/html; charset=UTF-8" />\n    \n    <!-- OneTrust Cookies Consent Notice start for rottentomatoes.com -->\n    <script src="https://cdn.cookielaw.org/consent/7e979733-6841-4fce-9182-515fac69187f/otSDKStub.js"\n        type="text/javascript"\n        charset="UTF-8"\n        data-domain-script="7e979733-6841-4fce-9182-515fac69187f"\n        integrity="sha384-TKdmlzVmoD70HzftTw4WtOzIBL5mNx8mXSRzEvwrWjpIJ7FZ/EuX758yMDWXtRUN"\n        crossorigin="anonymous" >\n    </script>\n    <script type="text/javascript">\n        function OptanonWrapper() { }\n    </script>\n    <!-- OneTrust Cookies Consent Notice end for rottentomatoes.com -->\n    <!-- OneTrust IAB US Privacy (USP) -->\n    <script src="https://cdn.cookielaw.org/opt-out/otCCPAiab.js"\n        type="text/javascript"\n        charset="

In [3]:
# Wrap the downloaded page in a scrapy Selector so it can be queried with CSS expressions.
sel = Selector(text=html)

import requests                # for sending HTTP requests
from scrapy import Selector    # for parsing HTML content
import pandas as pd

## Create Functions to Gather Data from Website
We will collect data from Rotten Tomatoes' list of the top 300 movies. We first scrape the ranking page for each movie's link, title, and rank, and then visit each movie's own page to gather its detailed information.

In [11]:
def get_movie_links(url, timeout=30):
    """Collect each movie's hyperlink, title, and rank from the ranking page.

    Args:
        url: Address of the ranking page to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        dict with three lists of strings taken from the page:
        ``'link'`` (the ``href`` of every ``span.details a``),
        ``'title'`` (the text of those same anchors) and
        ``'rank'`` (the text of the first cell of each table row, unmodified).

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of scraping an error page into empty lists.
    response.raise_for_status()
    page = Selector(text=response.content)

    return {
        'link': page.css('span.details a::attr(href)').getall(),
        'title': page.css('span.details a::text').getall(),
        'rank': page.css('td:nth-child(1)::text').getall(),
    }

# Ranking page to scrape; the dataframe-building cell below passes this `url` to get_movie_links().
url = 'https://editorial.rottentomatoes.com/guide/best-movies-of-all-time/'

# Uncomment to inspect the raw scrape result:
#pprint(get_movie_links(url))

{'link': ['https://www.rottentomatoes.com/m/la_confidential',
          'https://www.rottentomatoes.com/m/the_godfather',
          'https://www.rottentomatoes.com/m/1003707-casablanca',
          'https://www.rottentomatoes.com/m/seven_samurai_1956',
          'https://www.rottentomatoes.com/m/parasite_2019',
          'https://www.rottentomatoes.com/m/schindlers_list',
          'https://www.rottentomatoes.com/m/top_gun_maverick',
          'https://www.rottentomatoes.com/m/toy_story_2',
          'https://www.rottentomatoes.com/m/chinatown',
          'https://www.rottentomatoes.com/m/on_the_waterfront',
          'https://www.rottentomatoes.com/m/the_battle_of_algiers',
          'https://www.rottentomatoes.com/m/toy_story',
          'https://www.rottentomatoes.com/m/1017289-rear_window',
          'https://www.rottentomatoes.com/m/modern_times',
          'https://www.rottentomatoes.com/m/how_to_train_your_dragon',
          'https://www.rottentomatoes.com/m/1000626-all_about_eve

In [5]:
# Turn the dict returned by get_movie_links() into a dataframe with one column
# per key ('link', 'title', 'rank').
top_df = pd.DataFrame(get_movie_links(url))
top_df

Unnamed: 0,link,title,rank
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1.
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2.
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3.
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4.
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5.
...,...,...,...
295,https://www.rottentomatoes.com/m/1001902-beaut...,Beauty and the Beast,296.
296,https://www.rottentomatoes.com/m/killing,The Killing,297.
297,https://www.rottentomatoes.com/m/the_rules_of_...,The Rules of the Game,298.
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,299.


In [6]:
def get_movie_info(url, timeout=30):
    """Scrape one movie page and return its details as a key-sorted dict.

    Args:
        url: Address of the movie page to download.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``.

    Returns:
        dict whose keys are in sorted order. It always contains every field
        named in ``link_fields`` and ``text_fields`` below, mapped to the list
        of strings found on the page or to ``None`` when the page has no such
        field, plus ``'critics_score'`` and ``'audience_score'``, which are
        lists of strings (possibly empty).

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of returning a row of all-None fields.
    response.raise_for_status()
    sel = Selector(text=response.content)

    # Some values are rendered as hyperlinks and others as plain text,
    # so each group needs its own selector.
    link_fields = ['Director', 'Producer', 'Screenwriter', 'Genre']
    text_fields = ['Distributor', 'Production Co', 'Rating', 'Original Language',
                   'Release Date (Theaters)', 'Release Date (Streaming)',
                   'Box Office (Gross USA)', 'Runtime', 'Sound Mix', 'Aspect Ratio']

    info_pairs = {}
    for block in sel.css('div.content-wrap div.category-wrap'):
        # .get() yields None for a block without a label, where indexing
        # getall()[0] raised IndexError; the label is also looked up only once.
        label = block.css('dt.key rt-text::text').get()
        if label in link_fields:
            info_pairs[label] = block.css('dd rt-link::text').getall()
        elif label in text_fields:
            info_pairs[label] = block.css('dd rt-text:not(rt-text.delimiter)::text').getall()

    # add rating scores
    info_pairs['critics_score'] = sel.css('rt-button:nth-child(3) > rt-text::text').getall()
    info_pairs['audience_score'] = sel.css('rt-button:nth-child(7) > rt-text::text').getall()

    # Fields absent from the page become None so every movie yields the same columns.
    for field in link_fields + text_fields:
        info_pairs.setdefault(field, None)

    # Sort the keys so the column order is identical for every movie page.
    return {key: info_pairs[key] for key in sorted(info_pairs)}

# Smoke-test the scraper on the top-ranked movie's page; in the captured output
# every field came back populated.
get_movie_info('https://www.rottentomatoes.com/m/la_confidential')

{'Aspect Ratio': ['35mm', 'Scope (2.35:1)'],
 'Box Office (Gross USA)': ['$64.6M'],
 'Director': ['Curtis Hanson'],
 'Distributor': ['Warner Home Vídeo', 'Warner Bros.'],
 'Genre': ['Crime', 'Drama'],
 'Original Language': ['English'],
 'Producer': ['Michael G. Nathanson', 'Arnon Milchan', 'Curtis Hanson'],
 'Production Co': ['Warner Brothers', 'Regency Enterprises'],
 'Rating': ['R'],
 'Release Date (Streaming)': ['Dec 12, 2015'],
 'Release Date (Theaters)': ['Sep 19, 1997, Original'],
 'Runtime': ['2h 16m'],
 'Screenwriter': ['Curtis Hanson', 'James Ellroy', 'Brian Helgeland'],
 'Sound Mix': ['Surround', 'DTS', 'Dolby Digital'],
 'audience_score': ['94%'],
 'critics_score': ['99%']}

In [7]:
# Try a page with a missing field (here 'Box Office (Gross USA)') to confirm
# that the absent value is reported as None instead of raising an error.
pprint(get_movie_info('https://www.rottentomatoes.com/m/saving_private_ryan'))

{'Aspect Ratio': ['Flat (1.85:1)'],
 'Box Office (Gross USA)': None,
 'Director': ['Steven Spielberg'],
 'Distributor': ['Paramount Pictures', 'DreamWorks SKG'],
 'Genre': ['War', 'History', 'Drama'],
 'Original Language': ['English'],
 'Producer': ['Ian Bryce', 'Mark Gordon', 'Gary Levinsohn', 'Steven Spielberg'],
 'Production Co': ['DreamWorks SKG',
                   'Mutual Film Company',
                   'Amblin Entertainment',
                   'Paramount Pictures',
                   'Mark Gordon Productions'],
 'Rating': ['R (Graphic Sequences of War|Graphic Sequences of '
            'Violence|Language)'],
 'Release Date (Streaming)': ['May 27, 2016'],
 'Release Date (Theaters)': ['Jul 24, 1998, Wide'],
 'Runtime': ['2h 49m'],
 'Screenwriter': ['Robert Rodat'],
 'Sound Mix': ['Dolby SR',
               'DTS',
               'Dolby Stereo',
               'Surround',
               'SDDS',
               'Dolby A',
               'Dolby Digital'],
 'audience_score': ['95%'],

In [8]:
# get movie info for all pages in the top 300 list
top_df['info'] = top_df['link'].apply(get_movie_info)
top_df

Unnamed: 0,link,title,rank,info
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1.,"{'Aspect Ratio': ['35mm', 'Scope (2.35:1)'], '..."
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2.,"{'Aspect Ratio': ['Flat (1.85:1)'], 'Box Offic..."
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3.,"{'Aspect Ratio': ['Flat (1.37:1)'], 'Box Offic..."
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4.,"{'Aspect Ratio': ['Flat (1.37:1)'], 'Box Offic..."
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5.,"{'Aspect Ratio': ['Scope (2.35:1)'], 'Box Offi..."
...,...,...,...,...
295,https://www.rottentomatoes.com/m/1001902-beaut...,Beauty and the Beast,296.,"{'Aspect Ratio': ['Flat (1.37:1)'], 'Box Offic..."
296,https://www.rottentomatoes.com/m/killing,The Killing,297.,"{'Aspect Ratio': None, 'Box Office (Gross USA)..."
297,https://www.rottentomatoes.com/m/the_rules_of_...,The Rules of the Game,298.,"{'Aspect Ratio': ['35mm', 'Flat (1.37:1)'], 'B..."
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,299.,"{'Aspect Ratio': None, 'Box Office (Gross USA)..."


In [9]:
# Expand the 'info' dicts into one column per field and replace the 'info' column with them.
# NOTE: not re-runnable as written; after this cell top_df no longer has an 'info'
# column, so running it a second time raises KeyError in drop(). Re-run the previous cell first.
top_df = pd.concat([top_df.drop(['info'], axis=1), top_df['info'].apply(pd.Series)], axis=1)
top_df

Unnamed: 0,link,title,rank,Aspect Ratio,Box Office (Gross USA),Director,Distributor,Genre,Original Language,Producer,Production Co,Rating,Release Date (Streaming),Release Date (Theaters),Runtime,Screenwriter,Sound Mix,audience_score,critics_score
0,https://www.rottentomatoes.com/m/la_confidential,L.A. Confidential,1.,"[35mm, Scope (2.35:1)]",[$64.6M],[Curtis Hanson],"[Warner Home Vídeo, Warner Bros.]","[Crime, Drama]",[English],"[Michael G. Nathanson, Arnon Milchan, Curtis H...","[Warner Brothers, Regency Enterprises]",[R],"[Dec 12, 2015]","[Sep 19, 1997, Original]",[2h 16m],"[Curtis Hanson, James Ellroy, Brian Helgeland]","[Surround, DTS, Dolby Digital]",[94%],[99%]
1,https://www.rottentomatoes.com/m/the_godfather,The Godfather,2.,[Flat (1.85:1)],[$134.8M],[Francis Ford Coppola],[Paramount Pictures],"[Crime, Drama]",[English],[Albert S. Ruddy],[Paramount Pictures],[R],"[Aug 1, 2013]","[Mar 15, 1972, Wide]",[2h 57m],"[Francis Ford Coppola, Mario Puzo, Mario Puzo]",[Mono],[98%],[97%]
2,https://www.rottentomatoes.com/m/1003707-casab...,Casablanca,3.,[Flat (1.37:1)],,[Michael Curtiz],[Warner Bros. Pictures],[Drama],[English],[Hal B. Wallis],[Warner Brothers],[PG],"[Aug 15, 2008]","[Jan 23, 1943, Wide]",[1h 42m],"[Murray Burnett, Joan Alison, Julius J. Epstei...",[Mono],[95%],[99%]
3,https://www.rottentomatoes.com/m/seven_samurai...,Seven Samurai,4.,[Flat (1.37:1)],[$192.9K],[Akira Kurosawa],[Columbia Pictures],[Action],[Japanese],[Sojiro Motoki],[Toho Company],,"[Nov 29, 2011]","[Nov 19, 1956, Wide]",[3h 28m],"[Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni]",[Mono],[97%],[100%]
4,https://www.rottentomatoes.com/m/parasite_2019,Parasite,5.,[Scope (2.35:1)],[$53.4M],[Bong Joon Ho],[Neon],"[Comedy, Mystery & Thriller, Drama]",[Korean],"[Kwak Sin-ae, Moon Yanggwon]",[Barunson E&A],[R (Sexual Content|Language|Some Violence)],"[Oct 11, 2019]","[Nov 1, 2019, Wide]",[2h 12m],"[Bong Joon Ho, Han Jinwon]","[Dolby Atmos, Dolby Digital]",[90%],[99%]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,https://www.rottentomatoes.com/m/1001902-beaut...,Beauty and the Beast,296.,[Flat (1.37:1)],[$138.2K],[Jean Cocteau],,[Fantasy],[Canadian French],,,,"[Jan 14, 2017]","[Jan 1, 1947, Wide]",[1h 35m],"[Jean Cocteau, Jeanne-Marie Leprince de Beaumo...",[Mono],[90%],[96%]
296,https://www.rottentomatoes.com/m/killing,The Killing,297.,,,[Stanley Kubrick],"[United Artists, Criterion Collection]","[Crime, Drama]",[English],[James B. Harris],[Harris-Kubrick Productions],,"[Mar 5, 2016]","[May 20, 1956, Original]",[1h 23m],"[Stanley Kubrick, Jim Thompson, Lionel White]",,[92%],[96%]
297,https://www.rottentomatoes.com/m/the_rules_of_...,The Rules of the Game,298.,"[35mm, Flat (1.37:1)]",,[Jean Renoir],"[Criterion Collection, Cine Classics]","[Comedy, Drama]",[French (France)],[Claude Renoir],[Nouvelles Éditions de Films (NEF)],,"[Jul 21, 2009]","[Jul 8, 1939, Original]",[1h 50m],"[Carl Koch, Jean Renoir]",,[89%],[97%]
298,https://www.rottentomatoes.com/m/eyes_without_...,Eyes Without a Face,299.,,[$52.7K],[Georges Franju],"[United Artists, Lopert Pictures Corp., Rialto...","[Horror, Drama]",[Canadian French],[Jules Borkon],"[Champs-Élysées Production, Lux Film S.p.a.]",,"[Oct 29, 2016]","[Oct 31, 1962, Original]",[1h 30m],"[Pierre Boileau, Pierre Gascar, Thomas Narceja...",,[87%],[97%]


In [14]:
# Save the scraped, still-uncleaned dataframe to a JSON file for the data-cleaning step.
# NOTE(review): assumes the ../data/raw/ directory already exists; confirm before running.
top_df.to_json('../data/raw/movies.json')