# MicroFilms: 2019 Box Office Scraped Data 

This notebook is compiled of the scraped data from the top 100 Worlwide Grossing Movies in 2019, according to www.the-numbers.com.

In [344]:
#Import libraries
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

#### Here we're scraping the website for the 2019 Box Office records

In [346]:
#Testing to confirm the response from the website. 200 Response means we're good to go.
url = 'https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019'
response = requests.get(url)
response

<Response [200]>

In [347]:
#Using BeautifulSoup to scrape the website
soup = BeautifulSoup(response.content, 'html.parser')

In [351]:
#Scraping the Rank, Movie, Worldwide Box Office, Domestic Box Office, International Box Office & Domestic Share
film=[]
for row in soup.find('div', id = 'main').findAll('td')[3:]:
    film.append(row.text)    
film

['1',
 'Avengers: Endgame',
 '$2,797,800,564',
 '$858,373,000',
 '$1,939,427,564',
 '30.68%',
 '2',
 'The Lion King',
 '$1,654,381,934',
 '$543,638,043',
 '$1,110,743,891',
 '32.86%',
 '3',
 'Frozen II',
 '$1,447,244,586',
 '$477,373,578',
 '$969,871,008',
 '32.99%',
 '4',
 'Spider-Man: Far From Home',
 '$1,131,219,645',
 '$390,532,085',
 '$740,687,560',
 '34.52%',
 '5',
 'Captain Marvel',
 '$1,129,727,388',
 '$426,829,839',
 '$702,897,549',
 '37.78%',
 '6',
 'Toy Story 4',
 '$1,073,080,329',
 '$434,038,008',
 '$639,042,321',
 '40.45%',
 '7',
 'Star Wars: The Rise of Skywalker',
 '$1,072,944,222',
 '$515,202,542',
 '$557,741,680',
 '48.02%',
 '8',
 'Joker',
 '$1,072,507,517',
 '$335,451,311',
 '$737,056,206',
 '31.28%',
 '9',
 'Aladdin',
 '$1,046,759,512',
 '$355,559,216',
 '$691,200,296',
 '33.97%',
 '10',
 'Jumanji: The Next Level',
 '$800,128,637',
 '$316,831,246',
 '$483,297,391',
 '39.60%',
 '11',
 'Fast & Furious Presents: Hobbs & Shaw',
 '$760,372,607',
 '$173,956,935',
 '$586,4

In [361]:
#Convert scraped list into DataFrame
num1=0
num2=6
movies=[]
for num in range(len(soup.find('div', id = 'main').findAll('td')[3:])//6):
    movies.append(film[num1:num2])
    num1+=6
    num2+=6
        
df = pd.DataFrame(movies)
df.columns = ['Rank', 
              'Movie', 
              'Worldwide Box Office', 
              'Domestic Box Office', 
              'International Box Office',
              'Domestic Share']

In [353]:
#Convert Dataframe to CSV File
df.to_csv('2019_Movie_List.csv', index=False)

### The box office numbers are strings. In order to measure them we need to convert them into integers.

In [370]:
print(df[df.columns[2:5]].dtypes)
df[df.columns[2:5]]

Worldwide Box Office        object
Domestic Box Office         object
International Box Office    object
dtype: object


Unnamed: 0,Worldwide Box Office,Domestic Box Office,International Box Office
0,"$2,797,800,564","$858,373,000","$1,939,427,564"
1,"$1,654,381,934","$543,638,043","$1,110,743,891"
2,"$1,447,244,586","$477,373,578","$969,871,008"
3,"$1,131,219,645","$390,532,085","$740,687,560"
4,"$1,129,727,388","$426,829,839","$702,897,549"
...,...,...,...
95,"$67,044,017","$478,949","$66,565,068"
96,"$64,686,515","$28,148,130","$36,538,385"
97,"$63,191,904",,"$63,191,904"
98,"$62,599,159","$32,138,862","$30,460,297"


In [371]:
#Function to convert string of numbers to integer
def convert_numbers(df):
    for col in df.columns[2:6]:
        df[col]=df[col].str.replace(',','').str.replace('$','').str.replace('%','')
        
    for col in df.columns[2:6]:
        df[col]=df[col].replace('','0')
        
    df['Domestic Share']=df['Domestic Share'].astype(float)
    df['Worldwide Box Office']=df['Worldwide Box Office'].astype(int)
    df['Domestic Box Office']=df['Domestic Box Office'].astype(int)
    df['International Box Office']=df['International Box Office'].astype(int)
    
    return df

In [356]:
#Assigning the new DataFrame with integers in the place of string numbers
df=convert_numbers(df)

In [357]:
df

Unnamed: 0,Rank,Movie,Worldwide Box Office,Domestic Box Office,International Box Office,Domestic Share
0,1,Avengers: Endgame,2797800564,858373000,1939427564,30.68
1,2,The Lion King,1654381934,543638043,1110743891,32.86
2,3,Frozen II,1447244586,477373578,969871008,32.99
3,4,Spider-Man: Far From Home,1131219645,390532085,740687560,34.52
4,5,Captain Marvel,1129727388,426829839,702897549,37.78
...,...,...,...,...,...,...
95,96,ek-si-teu,67044017,478949,66565068,0.71
96,97,Happy Death Day 2U,64686515,28148130,36538385,43.51
97,98,Eiga Doraemon: Nobita no Getsumen Tansaki,63191904,0,63191904,0.00
98,99,Cold Pursuit,62599159,32138862,30460297,51.34


In [358]:
#Add International Share to Dataframe
int_share=[]
for num in range(len(df)):
    int_share.append(100 - float(df['Domestic Share'][num]))
df['International Share']=int_share

In [359]:
df

Unnamed: 0,Rank,Movie,Worldwide Box Office,Domestic Box Office,International Box Office,Domestic Share,International Share
0,1,Avengers: Endgame,2797800564,858373000,1939427564,30.68,69.32
1,2,The Lion King,1654381934,543638043,1110743891,32.86,67.14
2,3,Frozen II,1447244586,477373578,969871008,32.99,67.01
3,4,Spider-Man: Far From Home,1131219645,390532085,740687560,34.52,65.48
4,5,Captain Marvel,1129727388,426829839,702897549,37.78,62.22
...,...,...,...,...,...,...,...
95,96,ek-si-teu,67044017,478949,66565068,0.71,99.29
96,97,Happy Death Day 2U,64686515,28148130,36538385,43.51,56.49
97,98,Eiga Doraemon: Nobita no Getsumen Tansaki,63191904,0,63191904,0.00,100.00
98,99,Cold Pursuit,62599159,32138862,30460297,51.34,48.66


In [300]:
#Convert Dataframe to CSV File
df.to_csv('2019_Movies_Numbers.csv', index=False)

In [301]:
#Create list of the links for each movie
links=[]
for l in soup.find('div', id = 'main').findAll('a'):
    if "/movie" in l.get('href'):
        links.append(l.get('href'))
links

['/movie/Avengers-Endgame-(2019)#tab=summary',
 '/movie/Lion-King-The-(Live-Action)-(2019)#tab=summary',
 '/movie/Frozen-II-(2019)#tab=summary',
 '/movie/Spider-Man-Far-From-Home-(2019)#tab=summary',
 '/movie/Captain-Marvel-(2019)#tab=summary',
 '/movie/Toy-Story-4-(2019)#tab=summary',
 '/movie/Star-Wars-The-Rise-of-Skywalker-(2019)#tab=summary',
 '/movie/Joker-(2019)#tab=summary',
 '/movie/Aladdin-(2019)#tab=summary',
 '/movie/Jumanji-The-Next-Level#tab=summary',
 '/movie/Fast-and-Furious-Presents-Hobbs-and-Shaw-(2019)#tab=summary',
 '/movie/Ne-Zha-Zhi-Mo-Tong-Jiang-Shi-(China)#tab=summary',
 '/movie/Liu-Lang-Di-Qiu-(China)-(2019)#tab=summary',
 '/movie/How-to-Train-Your-Dragon-The-Hidden-World-(2019)#tab=summary',
 '/movie/Maleficent-Mistress-of-Evil-(2019)#tab=summary',
 '/movie/It-Chapter-Two-(2019)#tab=summary',
 '/movie/Wo-he-wo-de-zu-guo-(China)#tab=summary',
 '/movie/Secret-Life-of-Pets-2-The-(2019)#tab=summary',
 '/movie/Pokemon-Detective-Pikachu-(2019)#tab=summary',
 '/movie/

In [302]:
#Function to scrape each movie's page to see if it's a part of a franchise, re-make (nastalgia), or sequel
def franchise(url): 
    response2 = requests.get(url)
    soup2 = BeautifulSoup(response2.content, 'html.parser')
    for name in soup2.find('div', id = 'summary').findAll('td'): 
        if "Franchise" in name.text or "Remake" in name.text:
            return 'Yes'
    return 'No'

In [303]:
#Creating new series in the dataframe called 'Franchise' to identify movies a part of a Franchise
fran=[]
for l in links: 
    url = 'https://www.the-numbers.com/' + l
    fran.append(franchise(url))
df['Franchise'] = fran

In [304]:
#Convert Dataframe to CSV File
df.to_csv('2019_Franchise_Numbers.csv', index=False)

In [339]:
#Function to pull the ratings from the movie web pages
def rating(url): 
    response3 = requests.get(url)
    soup3 = BeautifulSoup(response3.content, 'html.parser')
    for rate in soup3.find('div', id = 'summary').findAll('a', href=True, text=['PG','PG-13','Not Rated','R','G']):
        return rate.text

In [340]:
#Adding the MPAA Ratings to the dataframe
ratings=[]
for l in links: 
    url = 'https://www.the-numbers.com/' + l
    ratings.append(rating(url))
df['MPAA Rating'] = ratings

In [341]:
#Replace the NaN with 'N/A'
df = df.fillna('N/A')

In [342]:
df

Unnamed: 0,Rank,Movie,Worldwide Box Office,Domestic Box Office,International Box Office,Domestic Share,International Share,Franchise,MPAA Rating
0,1,Avengers: Endgame,2797800564,858373000,1939427564,30.68,69.32,Yes,PG-13
1,2,The Lion King,1654381934,543638043,1110743891,32.86,67.14,Yes,PG
2,3,Frozen II,1447244586,477373578,969871008,32.99,67.01,Yes,PG
3,4,Spider-Man: Far From Home,1131219645,390532085,740687560,34.52,65.48,Yes,PG-13
4,5,Captain Marvel,1129727388,426829839,702897549,37.78,62.22,Yes,PG-13
...,...,...,...,...,...,...,...,...,...
95,96,ek-si-teu,67044017,478949,66565068,0.71,99.29,No,Not Rated
96,97,Happy Death Day 2U,64686515,28148130,36538385,43.51,56.49,Yes,PG-13
97,98,Eiga Doraemon: Nobita no Getsumen Tansaki,63191904,0,63191904,0.00,100.00,No,
98,99,Cold Pursuit,62599159,32138862,30460297,51.34,48.66,Yes,R


In [343]:
#Convert Dataframe to CSV File
df.to_csv('_2019_MPAA_Numbers.csv', index=False)