In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import os

In [2]:
# Work from the project folder so the relative CSV file names used in this notebook resolve there.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
project_dir = r'C:\Users\Nate P\Desktop\Rotten Tomatoes Movies Analysis'
os.chdir(project_dir)

# URL pattern of a movie page; '{}' is the placeholder for the movie slug.
rotten_tomatoes_template_url = 'https://www.rottentomatoes.com/m/{}'

In [3]:
# Fetch one sample movie page to explore its markup before scraping the whole list.
url = 'https://www.rottentomatoes.com/m/avengers_endgame/'
html_text = requests.get(url).text
# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (and emits a warning), so the parsed tree could differ between machines.
soup = bs(html_text, 'html.parser')

In [4]:
# Rating, Director, and Studio all live in <li> tags whose class is 'meta-row clearfix',
# so list those tags to inspect their structure.
soup.find_all('li', class_='meta-row clearfix')

[<li class="meta-row clearfix">
 <div class="meta-label subtle">Rating: </div>
 <div class="meta-value">PG-13 (for sequences of sci-fi violence and action, and some language)</div>
 </li>, <li class="meta-row clearfix">
 <div class="meta-label subtle">Genre: </div>
 <div class="meta-value">
 <a href="/browse/opening/?genres=1">Action &amp; Adventure</a>, 
                              
                                 <a href="/browse/opening/?genres=9">Drama</a>, 
                              
                                 <a href="/browse/opening/?genres=14">Science Fiction &amp; Fantasy</a>
 </div>
 </li>, <li class="meta-row clearfix">
 <div class="meta-label subtle">Directed By: </div>
 <div class="meta-value">
 <a href="/celebrity/joe_russo">Joe Russo</a>, 
                              
                                 <a href="/celebrity/anthony_russo">Anthony Russo</a>
 </div>
 </li>, <li class="meta-row clearfix">
 <div class="meta-label subtle">Written By: </div>
 <div c

In [6]:
# Earlier approach for extracting Rating, Director(s), and Studio: grab whatever follows ': '
# in each row with a regex lookbehind. The 'Rating' pattern additionally requires a '(',
# which 'G' rated films do NOT have. That was hard to spot at first, because re.search
# simply returns None when no '(' is found.
for row in soup.find_all('li', 'meta-row clearfix'):
    # Collapse the markup's newlines and indentation into single spaces.
    row_text = ' '.join(row.text.split())
    if 'Rating' in row_text:
        rating_match = re.search(r'(?<=: ).+\(', row_text)
        print(rating_match[0].strip('(').strip())
    # Director and studio rows share the same "everything after ': '" pattern.
    for label in ('Directed By', 'Studio'):
        if label in row_text:
            print(re.search(r'(?<=: ).+', row_text)[0])

PG-13
Joe Russo, Anthony Russo
Marvel Studios


In [7]:
# Load the saved movies table; its Title and Release Year columns are used later on.
movies_csv = 'movies_df.csv'
movies_df = pd.read_csv(movies_csv)
movies_df.head(n=5)

Unnamed: 0,Title,Box Office in $,Release Year,Rotten Score
0,Avengers: Endgame,2798000000,2019,94
1,Avatar,2790000000,2009,82
2,Titanic,2188000000,1997,89
3,Star Wars: Episode VII - The Force Awakens,2068000000,2015,93
4,Avengers: Infinity War,2048000000,2018,85


In [8]:
# Accumulators for Studio, Director, and Rating.
# They start empty and are filled by the function 'scrape_movie_page'.
studios, directors, ratings = [], [], []

In [9]:
# Function for extracting studio, director, and rating
def scrape_movie_page(response):
    """Extract rating, director(s), and studio from one movie page and record them.

    Parameters
    ----------
    response : object with a ``text`` attribute
        The page HTML is read from ``response.text`` (the notebook passes the
        result of ``requests.get``).

    Returns
    -------
    None
        Results are appended to the module-level lists ``ratings``,
        ``directors`` and ``studios``.

    Notes
    -----
    Exactly one entry is appended to each list per call. A field that is not
    present on the page is recorded as ``None``, so the three lists always stay
    aligned with the order in which pages were scraped.
    """
    # A fresh soup object is created on every call; the parser is named explicitly so the
    # result does not depend on which optional parsers are installed.
    soup = bs(response.text, 'html.parser')

    # Keyword expected in a row's label -> value found on this page (None until seen).
    found = {'Rating': None, 'Directed By': None, 'Studio': None}

    for tag in soup.find_all('li', 'meta-row clearfix'):
        # Collapse newlines/indentation so the row reads "Label: value".
        text = ' '.join(tag.text.split())
        label, separator, value = text.partition(': ')
        if not separator:
            # Row without a "Label: value" shape - nothing to extract.
            continue
        for keyword in found:
            # Match on the label only (not the value) and keep the first hit per field.
            if keyword in label and found[keyword] is None:
                found[keyword] = value

    rating = found['Rating']
    if rating is not None:
        # Keep only the rating code: drop the optional "(for ...)" descriptor. Cutting at the
        # FIRST '(' also handles descriptors that contain further parentheses, and ratings
        # without any descriptor (e.g. 'G') pass through unchanged.
        rating = rating.split('(', 1)[0].strip() or None

    ratings.append(rating)
    directors.append(found['Directed By'])
    studios.append(found['Studio'])

In [10]:
# Titles and release years pulled out of the table as plain lists; both feed the URL building.
movie_titles = [movie_title for movie_title in movies_df['Title']]
movie_release_years = [release_year for release_year in movies_df['Release Year']]

In [12]:
# Practice with format. Note that an apostrophe has to be removed (replaced with an empty
# string) before the words are joined.
slug_words = re.sub(r'[:\-]',' ','Toy Story 4').replace('\'','').lower().split()
url = rotten_tomatoes_template_url.format('_'.join(slug_words))
url

'https://www.rottentomatoes.com/m/toy_story_4'

In [13]:
# Loop over the movie titles, build each page URL from the title, and scrape the page.
# URL creation took some care:
#   * An apostrophe cannot become a space, or "Marvel's" would turn into "marvel_s".
#   * Non-letter characters cannot simply be dropped, or "Spider-Man" would become
#     "spiderman" rather than "spider_man".
# So ':' and '-' become spaces, apostrophes are removed, and the lower-cased title is split
# on whitespace and re-joined with '_'.
root_url = 'https://www.rottentomatoes.com/m/{}'
for title, release_year in zip(movie_titles, movie_release_years):
    slug = '_'.join(re.sub(r'[:\-]',' ',title).replace('\'','').lower().split())
    url = root_url.format(slug)
    year = '_' + str(release_year)
    # Some titles repeat (e.g. The Lion King is on the list twice), so the '<slug>_<year>'
    # page is tried first. Not every movie has such a page, so fall back to plain '<slug>'.
    # The first response is kept and reused, so each page is downloaded only once.
    response = requests.get(url + year)
    if not response.ok:
        response = requests.get(url)
    scrape_movie_page(response)

In [16]:
# Check that all categories were scraped: the three counts printed below should match.
for scraped_values in (studios, directors, ratings):
    print(len(scraped_values))

50
50
50


In [19]:
# Success!! Collect titles, ratings, studios, and directors in a DataFrame so there is no
# need to scrape again.
# Building from a dict names each column directly and raises if the lists differ in length,
# instead of silently padding a short list and leaving values on the wrong movie's row.
studios_df = pd.DataFrame({
    'Title': movie_titles,
    'Ratings': ratings,
    'Studio': studios,
    'Director(s)': directors,
})

In [20]:
# Write the scraped table to disk, leaving out the row index.
output_csv = 'studios_df_v2.csv'
studios_df.to_csv(output_csv, index=False)

In [21]:
# Display the assembled table of titles, ratings, studios, and directors.
studios_df

Unnamed: 0,Title,Ratings,Studio,Director(s)
0,Avengers: Endgame,PG-13,Marvel Studios,"Joe Russo, Anthony Russo"
1,Avatar,PG-13,20th Century Fox,James Cameron
2,Titanic,PG-13,Paramount Pictures,James Cameron
3,Star Wars: Episode VII - The Force Awakens,PG-13,Walt Disney Pictures,J.J. Abrams
4,Avengers: Infinity War,PG-13,Walt Disney Pictures,"Anthony Russo, Joe Russo"
5,Jurassic World,PG-13,Universal Pictures,Colin Trevorrow
6,The Lion King,PG,Walt Disney Pictures,Jon Favreau
7,Marvel's The Avengers,PG-13,Walt Disney Pictures,"Joss Whedon, Robert Downey Jr."
8,Furious 7,PG-13,Universal Pictures,James Wan
9,Avengers: Age of Ultron,PG-13,Walt Disney Pictures,Joss Whedon
