In [4]:
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd

In [5]:
"""URL is link to Top 50 highest-grossing movies. """
# Rotten Tomatoes editorial article listing the 50 highest-grossing movies of all time.
url = 'https://editorial.rottentomatoes.com/article/highest-grossing-movies-all-time/'
# A timeout keeps the cell from hanging forever on a stalled connection.
top_50_response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of silently parsing an error page.
top_50_response.raise_for_status()
# The page HTML as text (name kept from the original cell).
top_50_request = top_50_response.text
# Name the parser explicitly: otherwise BeautifulSoup picks whichever optional parser
# happens to be installed (and warns), so the parse tree can differ between machines.
top_50_soup = bs(top_50_request, 'html.parser')

In [6]:
# Sanity check: a top-50 list should contain exactly 50 movie-title containers,
# so the number of matching <div> tags is compared against 50.
len(
    top_50_soup.find_all('div', class_='col-sm-17 article_movie_title')
) == 50

True

In [40]:
"""
The h2 tags contain everything I want, from the box office amount to the title of the movie. However, the two alternate: every other tag is a box office amount, and the ones in between are titles.
How should I separate titles and box office amounts?
"""
# Walk every other <h2> tag in positions 2..103 and print True where its text is empty.
for heading in top_50_soup.find_all('h2')[2:104:2]:
    print(not heading.text)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [53]:
# Gather the text of the <h2> tags in positions 2..103, skipping empty ones.
# An empty heading appears around movie 19 or 20 and previously broke the strict
# 'every other' alternation; ignoring empty strings entirely restores it.
h2_list = [
    text
    for text in (tag.text for tag in top_50_soup.find_all('h2')[2:104])
    if text
]
# The remaining headings follow a 'Revenue, Movie, Revenue, Movie' pattern with the
# revenue first, so the even positions are the box office figures ...
movie_box_office = h2_list[0::2]
# ... and the odd positions are the movie titles.
movie_titles = h2_list[1::2]

In [54]:
# Take the first raw box office string as a sample and display it to see what needs cleaning.
avengers_box_office = movie_box_office[0]
avengers_box_office

'1. $2.798 Billion\xa0'

In [55]:
# Use the first box office entry as the sample string.
avengers_box_office = movie_box_office[0]
# A literal '$' followed by one or more characters that are digits or decimal points.
pattern = re.compile(r'\$[\d\.]+')
# Try the pattern on the sample, then show the matched text with the '$' removed.
sample_match = pattern.search(avengers_box_office)
sample_match[0].strip('$')


'2.798'

In [56]:
# Convert every scraped box office string (e.g. '1. $2.798 Billion\xa0') into a
# whole-dollar integer.
# The scale is read from the unit word in the text instead of being guessed from the
# leading digit: a "starts with 9 means millions" rule mis-scales values such as
# '$9.1 Billion' or '$876 Million'.
UNIT_MULTIPLIERS = {'million': 1_000_000, 'billion': 1_000_000_000}
# '$', the number, optional whitespace, then the unit word (case-insensitive).
amount_pattern = re.compile(r'\$\s*(\d+(?:\.\d+)?)\s*(million|billion)', re.IGNORECASE)

# Cleaned box office numbers, in the same order as movie_box_office.
box_office_numerical = []
for box_office in movie_box_office:
    amount_match = amount_pattern.search(box_office)
    if amount_match is None:
        # Fail with the offending text rather than an opaque TypeError on None.
        raise ValueError(f'Could not parse a box office amount from {box_office!r}')
    amount, unit = amount_match.groups()
    # round() removes floating-point noise from the multiplication and returns an int.
    box_office_numerical.append(round(float(amount) * UNIT_MULTIPLIERS[unit.lower()]))
    



In [57]:
# Show how many box office values were converted (one is appended per entry in movie_box_office).
len(box_office_numerical)

50

In [58]:
# Clean movie titles. Remember, want 'Title', 'Year', and 'Rotten Score'.
# The first entry of movie_titles serves as the sample string for trying out patterns.
avengers_endgame_title = movie_titles[0]

In [63]:
# Search for Title
# The pattern matches the first run of word characters, ':' characters, spaces, and '-' characters.
re.search(r'[\w:  -]+', avengers_endgame_title)
# Above extracts the title! Note that the match can end in a trailing space, which still needs stripping.

<re.Match object; span=(0, 18), match='Avengers: Endgame '>

In [64]:
# Search for year
re.search(r'[0-9]{4}', avengers_endgame_title)
# Pattern matches the first run of exactly four consecutive digits 0-9 anywhere in the string.

<re.Match object; span=(19, 23), match='2019'>

In [65]:
# Search for Rotten Score
re.search(r' [0-9]{2}',avengers_endgame_title)[0].strip(' ')
# Pattern matches a space followed by exactly two digits 0-9; the leading space is then stripped.
# NOTE(review): because exactly two digits are taken, a three-digit score such as 100 would come back as '10'.

'94'

In [66]:
# Create lists of titles, years, and scores (all kept as strings) from the scraped headings.
# Assumes each heading looks like 'Title (YYYY) NN...' -- TODO confirm against the live page.
# One pattern anchored on the parenthesised year replaces three independent searches, so that:
#   * the title is everything before '(YYYY)' and is no longer cut at '.', ',', '&' or '!';
#   * a four-digit number inside the title is not mistaken for the release year;
#   * the score is the first number after the year and may have 1-3 digits (so 100 is not cut to '10').
entry_pattern = re.compile(
    r'(?P<title>.+?)\s*\((?P<year>[0-9]{4})\)\D*(?P<score>[0-9]{1,3})'
)

movie_title = []
movie_year = []
movie_score = []
for movie in movie_titles:
    entry_match = entry_pattern.search(movie)
    if entry_match is None:
        # Report the offending heading instead of failing with a TypeError on None.
        raise ValueError(f'Unexpected movie heading format: {movie!r}')
    movie_title.append(entry_match['title'].strip())
    movie_year.append(entry_match['year'])
    movie_score.append(entry_match['score'])
    

In [67]:
# Display the extracted titles for a visual check.
movie_title

['Avengers: Endgame',
 'Avatar',
 'Titanic',
 'Star Wars: Episode VII - The Force Awakens',
 'Avengers: Infinity War',
 'Jurassic World',
 'The Lion King',
 "Marvel's The Avengers",
 'Furious 7',
 'Avengers: Age of Ultron',
 'Frozen II',
 'Black Panther',
 'Harry Potter and the Deathly Hallows - Part 2',
 'Star Wars: The Last Jedi',
 'Jurassic World: Fallen Kingdom',
 'Frozen',
 'Beauty and the Beast',
 'Incredibles 2',
 'The Fate of the Furious',
 'Iron Man 3',
 'Minions',
 'Captain America: Civil War',
 'Aquaman',
 'Spider-Man: Far From Home',
 'Captain Marvel',
 'Transformers: Dark of the Moon',
 'The Lord of the Rings: The Return of the King',
 'Skyfall',
 'Transformers: Age of Extinction',
 'The Dark Knight Rises',
 'Toy Story 4',
 'Toy Story 3',
 "Pirates of the Caribbean: Dead Man's Chest",
 'Joker',
 'Rogue One: A Star Wars Story',
 'Aladdin',
 'Pirates of the Caribbean: On Stranger Tides',
 'Despicable Me 3',
 'Jurassic Park',
 'Finding Dory',
 'Star Wars: Episode I - The Phan

In [68]:
# Create a dataframe containing the movie title, the box office gross, release year,
# and Rotten Tomatoes score, one row per movie.
# Building from a dict of columns (instead of a list of rows followed by .T) lets each
# column keep its own dtype -- the box office column stays integer rather than object --
# and raises if the lists differ in length instead of silently padding with missing values.
movie_df = pd.DataFrame({
    'Title': movie_title,
    'Box Office in $': box_office_numerical,
    'Release Year': movie_year,
    'Rotten Score': movie_score,
})

In [69]:
# Completed! Display the finished dataframe.
movie_df

Unnamed: 0,0,1,2,3
0,Avengers: Endgame,2798000000,2019,94
1,Avatar,2790000000,2009,82
2,Titanic,2188000000,1997,89
3,Star Wars: Episode VII - The Force Awakens,2068000000,2015,93
4,Avengers: Infinity War,2048000000,2018,85
5,Jurassic World,1672000000,2015,72
6,The Lion King,1655000000,2019,53
7,Marvel's The Avengers,1519000000,2012,91
8,Furious 7,1516000000,2015,81
9,Avengers: Age of Ultron,1405000000,2015,75


In [70]:
# Save to csv at the relative path 'movies_df.csv'; index = False leaves the row index out of the file.
movie_df.to_csv('movies_df.csv', index = False)