This notebook contains the code for scrabing film data from the website BoxOfficeMojo.com.
It collects the top 15 films by week for every three weeks (to avoid duplicate film records) from January 2010 to December 2021.

In [1]:
#import the necesary libraries and plugins

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
from IPython.core.display import display, HTML

In [2]:
#begin webscraping

base_url = "https://www.boxofficemojo.com"    #this is the base url for the site, which will get used later.
url_10 = 'https://www.boxofficemojo.com/weekly/by-year/2010/' # the url for scraping films released in 2010
response = requests.get(url_10)               #this grabs the html content from the above url
response.status_code                          #check that html content request was successful. It produced code #200 = success!
page_10 = response.text                       #this grabs the text from the html content. I label it ten for the year 2010. 
soup_10 = bs(page_10, "lxml")                #parse the html on this page using beautiful soup (bs)
all_weeks_10 = soup_10.find('div', id= 'table') #grab the table containing the links to all weeks' top films in 2010. this table gets used in the next cell.


In [3]:
# Grab the links for the 2010 movie release weeks. To reduce duplicates, grab links for every three weeks.
#I will then grab the links for the top 15 movies in each week. This should amount to between 1500 and 2000 movies, depending on how a year is split into weeks and how many duplicates we get.

weeks10 = []
for tr in all_weeks_10:              #loop through table rows in our web-scraped table
    week = tr.find_all('a')[7::9]    #find the links to each week. Position 7 is when the first correct link occurs and then they occur every three links. I want every three weeks so I jump by 9 links
    for w in week:                   #loop through the links list.. the <a> tab links
        link = w['href']             # get the href link inside each <a> tag. 
#        print(link)                 #print to make sure it's going well
        weeks10.append(link)         #compile hrefs in our weeks list to be turned into complete urls next, when combined with base url above.
#print(weeks10)                      #print to check quality control

urls_10 = []                             #create the empty list where full urls for weeks in 2010 will be added.
def get_10(weeks):                        # this function creates the list of urls for weekly standings. This list will be used to gather individual film urls.
    for path in weeks10:                  # loop through the list of partial urls gathered from the table.       
        urls_10.append(base_url + path)   #combine partial url with base url for weekly standing and add to growing list
    return(urls_10)                       #return the complete list of links to weekly standings.
get_10(weeks10)                           #

['https://www.boxofficemojo.com/weekly/2010W53/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2010W50/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2010W47/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2010W44/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2010W41/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2010W38/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2010W35/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2010W32/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2010W29/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2010W26/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2010W23/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2010W20/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2010W17/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2010W14/?ref_=bo_wly_table_40',
 'https:/

After success with the 2010 weekly standings, repeat the steps above for other years of interest. 
I would like to return to this and write the program to run through a list of years without having to craft all of this repeated code manually.

### return to this step and streamline

In [5]:
#Do the same steps above for the movies in 2011
url11 = 'https://www.boxofficemojo.com/weekly/by-year/2011/' 
response = requests.get(url11)
response.status_code             #200 = success!
page_11 = response.text          
soup_11 = bs(page_11, "lxml")    
all_weeks_11 = soup_11.find('div', id= 'table') #grab the table

weeks11 = []
for tr in all_weeks_11:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks11.append(link)          
urls_11 = []
def get_11(weeks):                        
    for path in weeks11:                         
        urls_11.append(base_url + path)   
    return(urls_11)                       
get_11(weeks11)         #urls_11 is the url list to append in all_weeks_urls      

['https://www.boxofficemojo.com/weekly/2011W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2011W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2011W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2011W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2011W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2011W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2011W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2011W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2011W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2011W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2011W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2011W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2011W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2011W13/?ref_=bo_wly_table_40',
 'https:/

In [6]:
#Do the same steps above for the movies in 2012
url12 = 'https://www.boxofficemojo.com/weekly/by-year/2012/' 
response = requests.get(url12)
response.status_code             #200 = success!
page_12 = response.text          
soup_12 = bs(page_12, "lxml")    
all_weeks_12 = soup_12.find('div', id= 'table') #grab the table

weeks12 = []
for tr in all_weeks_12:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks12.append(link)          

urls_12 = []
def get_12(weeks):                        
    for path in weeks12:                         
        urls_12.append(base_url + path)   
    return(urls_12)                       
get_12(weeks12) 

['https://www.boxofficemojo.com/weekly/2012W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2012W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2012W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2012W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2012W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2012W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2012W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2012W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2012W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2012W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2012W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2012W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2012W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2012W13/?ref_=bo_wly_table_40',
 'https:/

In [7]:
#Do the same steps above for the movies in 2013
url13 = 'https://www.boxofficemojo.com/weekly/by-year/2013/' 
response = requests.get(url13)
response.status_code             #200 = success!
page_13 = response.text          
soup_13 = bs(page_13, "lxml")    
all_weeks_13 = soup_13.find('div', id= 'table') #grab the table

weeks13 = []
for tr in all_weeks_13:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks13.append(link)          

urls_13 = []
def get_13(weeks):                        
    for path in weeks13:                         
        urls_13.append(base_url + path)   
    return(urls_13)                       
get_13(weeks13) 

['https://www.boxofficemojo.com/weekly/2013W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2013W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2013W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2013W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2013W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2013W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2013W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2013W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2013W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2013W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2013W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2013W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2013W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2013W13/?ref_=bo_wly_table_40',
 'https:/

In [8]:
#Do the same steps above for the movies in 2014
url14 = 'https://www.boxofficemojo.com/weekly/by-year/2014/' 
response = requests.get(url14)
response.status_code             #200 = success!
page_14 = response.text          
soup_14 = bs(page_14, "lxml")    
all_weeks_14 = soup_14.find('div', id= 'table') #grab the table

weeks14 = []
for tr in all_weeks_14:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks14.append(link)          

urls_14 = []
def get_14(weeks):                        
    for path in weeks14:                         
        urls_14.append(base_url + path)   
    return(urls_14)                       
get_14(weeks14) 

['https://www.boxofficemojo.com/weekly/2014W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2014W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2014W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2014W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2014W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2014W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2014W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2014W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2014W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2014W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2014W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2014W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2014W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2014W13/?ref_=bo_wly_table_40',
 'https:/

In [9]:
#Do the same steps above for the movies in 2015
url15 = 'https://www.boxofficemojo.com/weekly/by-year/2015/' 
response = requests.get(url15)
response.status_code             #200 = success!
page_15 = response.text          
soup_15 = bs(page_15, "lxml")    
all_weeks_15 = soup_15.find('div', id= 'table') #grab the table

weeks15 = []
for tr in all_weeks_15:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks15.append(link)          

urls_15 = []
def get_15(weeks):                        
    for path in weeks15:                         
        urls_15.append(base_url + path)   
    return(urls_15)                       
get_15(weeks15) 

['https://www.boxofficemojo.com/weekly/2015W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2015W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2015W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2015W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2015W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2015W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2015W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2015W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2015W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2015W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2015W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2015W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2015W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2015W13/?ref_=bo_wly_table_40',
 'https:/

In [10]:
#Do the same steps above for the movies in 2016
url16 = 'https://www.boxofficemojo.com/weekly/by-year/2016/' 
response = requests.get(url16)
response.status_code             #200 = success!
page_16 = response.text          
soup_16 = bs(page_16, "lxml")    
all_weeks_16 = soup_16.find('div', id= 'table') #grab the table

weeks16 = []
for tr in all_weeks_16:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks16.append(link)          

urls_16 = []
def get_16(weeks):                        
    for path in weeks16:                         
        urls_16.append(base_url + path)   
    return(urls_16)                       
get_16(weeks16) 

['https://www.boxofficemojo.com/weekly/2016W53/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2016W50/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2016W47/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2016W44/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2016W41/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2016W38/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2016W35/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2016W32/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2016W29/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2016W26/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2016W23/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2016W20/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2016W17/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2016W14/?ref_=bo_wly_table_40',
 'https:/

In [11]:
#Do the same steps above for the movies in 2017
url17 = 'https://www.boxofficemojo.com/weekly/by-year/2017/' 
response = requests.get(url17)
response.status_code             #200 = success!
page_17 = response.text          
soup_17 = bs(page_17, "lxml")    
all_weeks_17 = soup_17.find('div', id= 'table') #grab the table

weeks17 = []
for tr in all_weeks_17:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks17.append(link)          

urls_17 = []
def get_17(weeks):                        
    for path in weeks17:                         
        urls_17.append(base_url + path)   
    return(urls_17)                       
get_17(weeks17) 

['https://www.boxofficemojo.com/weekly/2017W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2017W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2017W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2017W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2017W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2017W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2017W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2017W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2017W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2017W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2017W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2017W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2017W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2017W13/?ref_=bo_wly_table_40',
 'https:/

In [12]:
#Do the same steps above for the movies in 2018
url18 = 'https://www.boxofficemojo.com/weekly/by-year/2018/' 
response = requests.get(url18)
response.status_code             #200 = success!
page_18 = response.text          
soup_18 = bs(page_18, "lxml")    
all_weeks_18 = soup_18.find('div', id= 'table') #grab the table

weeks18 = []
for tr in all_weeks_18:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks18.append(link)          

urls_18 = []
def get_18(weeks):                        
    for path in weeks18:                         
        urls_18.append(base_url + path)   
    return(urls_18)                       
get_18(weeks18) 

['https://www.boxofficemojo.com/weekly/2018W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2018W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2018W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2018W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2018W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2018W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2018W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2018W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2018W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2018W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2018W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2018W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2018W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2018W13/?ref_=bo_wly_table_40',
 'https:/

In [13]:
#Do the same steps above for the movies in 2019
url19 = 'https://www.boxofficemojo.com/weekly/by-year/2019/' 
response = requests.get(url19)
response.status_code             #200 = success!
page_19 = response.text          
soup_19 = bs(page_19, "lxml")    
all_weeks_19 = soup_19.find('div', id= 'table') #grab the table

weeks19 = []
for tr in all_weeks_19:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks19.append(link)          

urls_19 = []
def get_19(weeks):                        
    for path in weeks19:                         
        urls_19.append(base_url + path)   
    return(urls_19)                       
get_19(weeks19) 

['https://www.boxofficemojo.com/weekly/2019W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2019W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2019W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2019W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2019W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2019W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2019W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2019W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2019W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2019W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2019W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2019W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2019W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2019W13/?ref_=bo_wly_table_40',
 'https:/

In [14]:
#Do the same steps above for the movies in 2020
url20 = 'https://www.boxofficemojo.com/weekly/by-year/2020/' 
response = requests.get(url20)
response.status_code             #200 = success!
page_20 = response.text          
soup_20 = bs(page_20, "lxml")    
all_weeks_20 = soup_20.find('div', id= 'table') #grab the table

weeks20 = []
for tr in all_weeks_20:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks20.append(link)          

urls_20 = []
def get_20(weeks):                        
    for path in weeks20:                         
        urls_20.append(base_url + path)   
    return(urls_20)                       
get_20(weeks20) 

['https://www.boxofficemojo.com/weekly/2020W52/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2020W49/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2020W46/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2020W43/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2020W40/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2020W37/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2020W34/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2020W31/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2020W28/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2020W25/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2020W22/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2020W19/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2020W16/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2020W13/?ref_=bo_wly_table_40',
 'https:/

In [15]:
#Do the same steps above for the movies in 2021
url21 = 'https://www.boxofficemojo.com/weekly/by-year/2021/' 
response = requests.get(url21)
response.status_code             #200 = success!
page_21 = response.text          
soup_21 = bs(page_21, "lxml")    
all_weeks_21 = soup_21.find('div', id= 'table') #grab the table

weeks21 = []
for tr in all_weeks_21:              
    week = tr.find_all('a')[7::9]    
    for w in week:                   
        link = w['href']             
#        print(link)                 
        weeks21.append(link)          

urls_21 = []
def get_21(weeks):                        
    for path in weeks21:                         
        urls_21.append(base_url + path)   
    return(urls_21)                       
get_21(weeks21) 

['https://www.boxofficemojo.com/weekly/2021W53/?ref_=bo_wly_table_1',
 'https://www.boxofficemojo.com/weekly/2021W50/?ref_=bo_wly_table_4',
 'https://www.boxofficemojo.com/weekly/2021W47/?ref_=bo_wly_table_7',
 'https://www.boxofficemojo.com/weekly/2021W44/?ref_=bo_wly_table_10',
 'https://www.boxofficemojo.com/weekly/2021W41/?ref_=bo_wly_table_13',
 'https://www.boxofficemojo.com/weekly/2021W38/?ref_=bo_wly_table_16',
 'https://www.boxofficemojo.com/weekly/2021W35/?ref_=bo_wly_table_19',
 'https://www.boxofficemojo.com/weekly/2021W32/?ref_=bo_wly_table_22',
 'https://www.boxofficemojo.com/weekly/2021W29/?ref_=bo_wly_table_25',
 'https://www.boxofficemojo.com/weekly/2021W26/?ref_=bo_wly_table_28',
 'https://www.boxofficemojo.com/weekly/2021W23/?ref_=bo_wly_table_31',
 'https://www.boxofficemojo.com/weekly/2021W20/?ref_=bo_wly_table_34',
 'https://www.boxofficemojo.com/weekly/2021W17/?ref_=bo_wly_table_37',
 'https://www.boxofficemojo.com/weekly/2021W14/?ref_=bo_wly_table_40',
 'https:/

At this point we have all of the links to the weekly standings for every three weeks in 2010-2021. The weeks' links are in lists by year.

Next, combine the links from each year into one list of all weeks' full urls for the entire span of time.

In [17]:
#combine the lists of urls for all of the weeks. Collecting movie data will come next. 

def get_all_weeks(weeks_list):           #function for compiling all of the weeks' urls as an iterable list 
    all_weeks_urls = []                  #establish our final returned variable
    for weeks in weeks_list:             #iterate through the urls for each year's weeks
        for url in weeks:                #second for loop needed to iterate through the urls within each week
            all_weeks_urls.append(url)   #append each link onto our final list
    return (all_weeks_urls)              

weeks_list = [urls_10, urls_11, urls_12, urls_13, urls_14, urls_15, urls_16, urls_17, urls_18, urls_19, urls_20, urls_21]
all_weeks_urls = get_all_weeks(weeks_list) #with this list, we can write a function to get titles, links to movies, and some basic info
print(all_weeks_urls)



['https://www.boxofficemojo.com/weekly/2010W53/?ref_=bo_wly_table_1', 'https://www.boxofficemojo.com/weekly/2010W50/?ref_=bo_wly_table_4', 'https://www.boxofficemojo.com/weekly/2010W47/?ref_=bo_wly_table_7', 'https://www.boxofficemojo.com/weekly/2010W44/?ref_=bo_wly_table_10', 'https://www.boxofficemojo.com/weekly/2010W41/?ref_=bo_wly_table_13', 'https://www.boxofficemojo.com/weekly/2010W38/?ref_=bo_wly_table_16', 'https://www.boxofficemojo.com/weekly/2010W35/?ref_=bo_wly_table_19', 'https://www.boxofficemojo.com/weekly/2010W32/?ref_=bo_wly_table_22', 'https://www.boxofficemojo.com/weekly/2010W29/?ref_=bo_wly_table_25', 'https://www.boxofficemojo.com/weekly/2010W26/?ref_=bo_wly_table_28', 'https://www.boxofficemojo.com/weekly/2010W23/?ref_=bo_wly_table_31', 'https://www.boxofficemojo.com/weekly/2010W20/?ref_=bo_wly_table_34', 'https://www.boxofficemojo.com/weekly/2010W17/?ref_=bo_wly_table_37', 'https://www.boxofficemojo.com/weekly/2010W14/?ref_=bo_wly_table_40', 'https://www.boxoffice

Next create a program that goes into all of these links and gets the links to the top 15 movies in all of these weeks. 

In [18]:
movies = {}                                              #create dictionary for our final list of movies and some basic info, which will be turned into a df
def get_movies(all_weeks_urls):                          #this function turns the links for weeks into a df with movies
    for url in all_weeks_urls:                           #iterating through list of urls for weeks of data
        response = requests.get(url)                     #requesting the info on each webpage 
        page = response.text                             #gathering the text on each page to grab our data
        soup = bs(page,'lxml')                           #parsing the text into html to let our program know what the structure of the text is
        table = soup.find('div', id= 'table')            #getting the table with movies and basic info to begin compiling
        rows = [row for row in table.find_all('tr')]     #creating a variable for all the table rows to iterate through with specific commands
        for row in rows[1:16]:                           #iterating through the top 15 movie rows to begin gathering info
            items = row.find_all('td')                   #getting all data from each row to search
            link = items[2].find('a')                    #grabbing the link for each movie. It is in position 2, and has an 'a' tag
            title, url = link.text, link['href']         #getting the title for each movie and getting the url stub that takes us to that movie. this will be used to get more data
            movies[title] = [url] + [i.text for i in items]  #creating a key and value pair for each movie in our dictionary. The key is the movie title, the values are the url stub, and all of the info in each item of each movie row   
    return movies                                        #we've got all movies into a dictionary!

movies = get_movies(all_weeks_urls)
print(movies)

{'Little Fockers': ['/release/rl929203713/?ref_=bo_wl_table_2', '2', '1', 'Little Fockers', '$16,885,510', '-49.4%', '3,675', '+121', '$4,594', '$127,086,025', '4', 'Universal Pictures\n\n', 'false', 'false'], 'True Grit': ['/release/rl3564602881/?ref_=bo_wl_table_1', '1', '2', 'True Grit', '$19,781,873', '-40.4%', '3,124', '+41', '$6,332', '$115,211,895', '4', 'Paramount Pictures\n\n', 'false', 'false'], 'TRON: Legacy': ['/release/rl4000810497/?ref_=bo_wl_table_4', '4', '3', 'TRON: Legacy', '$13,116,178', '-48.7%', '3,013', '-352', '$4,353', '$151,238,596', '4', 'Walt Disney Studios Motion Pictures\n\n', 'false', 'false'], 'Yogi Bear': ['/release/rl2793047553/?ref_=bo_wl_table_9', '9', '4', 'Yogi Bear', '$7,954,551', '-48.4%', '3,288', '-227', '$2,419', '$76,750,072', '4', 'Warner Bros.\n\n', 'false', 'false'], 'The Fighter': ['/release/rl1213433345/?ref_=bo_wl_table_6', '6', '5', 'The Fighter', '$9,800,222', '-32.2%', '2,528', '-6', '$3,876', '$60,644,664', '5', 'Paramount Pictures\n

In [19]:
#Turn the film dictionary into a dataframe with the following columns
all_movies = pd.DataFrame(movies).T  #transpose
all_movies.columns = ['link_stub', 'rank', 'last week',
                    'title', 'total gross', '%+-LW','theaters',
                    'change', 'average', 'total gross', 'weeks',
                    'Distributor', 'dump','dump2']
all_movies.tail(15)

Unnamed: 0,link_stub,rank,last week,title,total gross,%+-LW,theaters,change,average,total gross.1,weeks,Distributor,dump,dump2
Jathi Ratnalu,/release/rl2725217025/?ref_=bo_wl_table_10,10,-,Jathi Ratnalu,"$470,000",-,130,-,"$3,615","$470,000",1,Flyhigh Cinemas\n\n,True,True
Long Weekend,/release/rl2305655553/?ref_=bo_wl_table_11,11,-,Long Weekend,"$336,481",-,814,-,$413,"$336,481",1,Stage 6 Films\n\n,True,False
Judas and the Black Messiah,/release/rl782991873/?ref_=bo_wl_table_3,3,2,Judas and the Black Messiah,"$1,109,513",-54.7%,1906,+18,$582,"$3,560,574",2,Warner Bros.\n\n,False,False
Lamb of God: The Concert Film,/release/rl527270657/?ref_=bo_wl_table_13,13,-,Lamb of God: The Concert Film,"$223,541",-,104,-,"$2,149","$223,541",1,Excel Entertainment\n\n,True,False
Dutch,/release/rl2573632257/?ref_=bo_wl_table_15,15,-,Dutch,"$190,603",-,202,-,$943,"$190,603",1,Faith Media Distribution\n\n,True,False
Land,/release/rl3798106881/?ref_=bo_wl_table_7,7,6,Land,"$642,800",-42.9%,1251,+20,$513,"$1,769,005",2,Focus Features\n\n,False,False
Nomadland,/release/rl1414169345/?ref_=bo_wl_table_8,8,-,Nomadland,"$503,000",-,1175,-,$428,"$503,000",1,Searchlight Pictures\n\n,True,True
The Mauritanian,/release/rl3395388161/?ref_=bo_wl_table_12,12,12,The Mauritanian,"$190,258",-11.3%,287,+39,$662,"$404,810",2,STX Entertainment\n\n,False,False
Blithe Spirit,/release/rl2874442497/?ref_=bo_wl_table_14,14,-,Blithe Spirit,"$120,453",-,239,-,$503,"$120,453",1,IFC Films\n\n,True,False
Our Friend,/release/rl2321515265/?ref_=bo_wl_table_9,9,8,Our Friend,"$182,191",-41.4%,818,+275,$222,"$493,251",2,Gravitas Ventures\n\n,False,False


At this point, we've got a dataframe containing film data and links. Export it for safe keeping and begin to look at the data.

In [21]:
all_movies.to_csv('All_movies_df.csv')

In [22]:
#how many movies did we get? looks like 1739
all_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1739 entries, Little Fockers to Alien
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   link_stub    1739 non-null   object
 1   rank         1739 non-null   object
 2   last week    1739 non-null   object
 3   title        1739 non-null   object
 4   total gross  1739 non-null   object
 5   %+-LW        1739 non-null   object
 6   theaters     1739 non-null   object
 7   change       1739 non-null   object
 8   average      1739 non-null   object
 9   total gross  1739 non-null   object
 10  weeks        1739 non-null   object
 11  Distributor  1739 non-null   object
 12  dump         1739 non-null   object
 13  dump2        1739 non-null   object
dtypes: object(14)
memory usage: 203.8+ KB


In [23]:
#Do we have any duplicate titles? Looks like no.
all_movies.title.value_counts().sort_values(ascending = False)

Little Fockers                        1
Unstoppable                           1
Yogi Bear                             1
The Fighter                           1
Black Swan                            1
                                     ..
Supernova                             1
Remember the Titans2021 Re-release    1
Shadow in the Cloud                   1
In the Earth                          1
Alien2020 Re-release                  1
Name: title, Length: 1739, dtype: int64

Now compile all movie urls for use in future functions and data scraping

In [20]:

def get_movie_url(all_movies):                   #this functino will compile all film urls for additional data scraping
    all_movies_urls = []                         #Create the empty list variable
    base_url = 'https://www.boxofficemojo.com'   #first part of link

    for link in all_movies.link_stub:            #iterate through the links in our df column "link_stub"
        url = base_url + link                    #Create full url to scrape
        all_movies_urls.append(url)              #put into a list to use later
    return all_movies_urls

all_movies_urls = get_movie_url(all_movies)
print(all_movies_urls)

['https://www.boxofficemojo.com/release/rl929203713/?ref_=bo_wl_table_2', 'https://www.boxofficemojo.com/release/rl3564602881/?ref_=bo_wl_table_1', 'https://www.boxofficemojo.com/release/rl4000810497/?ref_=bo_wl_table_4', 'https://www.boxofficemojo.com/release/rl2793047553/?ref_=bo_wl_table_9', 'https://www.boxofficemojo.com/release/rl1213433345/?ref_=bo_wl_table_6', 'https://www.boxofficemojo.com/release/rl2286388737/?ref_=bo_wl_table_5', 'https://www.boxofficemojo.com/release/rl2657388033/?ref_=bo_wl_table_11', 'https://www.boxofficemojo.com/release/rl980256257/?ref_=bo_wl_table_10', 'https://www.boxofficemojo.com/release/rl4200039937/?ref_=bo_wl_table_12', 'https://www.boxofficemojo.com/release/rl3042739713/?ref_=bo_wl_table_8', 'https://www.boxofficemojo.com/release/rl1819772417/?ref_=bo_wl_table_13', 'https://www.boxofficemojo.com/release/rl341673473/?ref_=bo_wl_table_15', 'https://www.boxofficemojo.com/release/rl1248560641/?ref_=bo_wl_table_14', 'https://www.boxofficemojo.com/rel

# Jenica pick up here trouble shoot breaking code

In [44]:
import dateutil.parser
import re                                           #importing regex to help with finding fields
import time                                         #importing time to set pause because I was reaching an error with boxofficemojo
""" for each film, collect: 
        -title
        -worldwide total gross
        -runtime
        -genres
        -release date (return to this to save as date type in download process)
        -rating        
        -budget
        """
movie_info_dict = {}                                #create empty dict to add all info to, to build our dataframe later
def get_movies(all_movies_urls):                    #this function gets info for all of our movies, which will then be joined with our earlier dataframe
    minutes = int()                                 #establish variable for collecting film runtime
    for url in all_movies_urls:                     #iterate through the list of all movie urls and gather data
        response = requests.get(url)                #gather webpage content
        page = response.text                        #get content text for parsing
        soup = bs(page,'lxml')                      #use beautifulsoup to parse text into a searchable form
#        print(response.status_code)                #check for status code #200 for funtionality success

        def get_movie_value(soup,field_name):        #nested function bypasses issue with global vs local variables for smooth scraping. 
            obj = soup.find(text=re.compile(field_name)) #versatile object that allows us to grab info of interest below
            if not obj:                               #iterate through the text on a film's webpage, looking for exaclty what we want
                return None                           #skip everything we are not interested in
            next_element = obj.findNext()             #on our webpage, the object is labeled, and the next element is the value we want
    
            if next_element:                          #if next element exists..
                return next_element.text              # add the text for the next element to our data
            else:                                     #if there is no next element, return "none"
                return None
        
        def money_to_int(moneystring):                #this function formats any monetary data collected, changing from a string to an integer 
            moneystring = moneystring.replace('$', '').replace(',', '') #remove the dollar sign and commas
            return int(moneystring)                   #convert from string to integer for calculations and modeling later

        def to_date(datestring):                     #this function formats date data, changing date strings to date type
            date = dateutil.parser.parse(datestring) #dateutil is a library we imported above
            return date
        
        def runtime_to_minutes(rt):                  #this function formats the runtime "x hours and y minutes" to total minutes as an integer
            minutes = int()                          #establish the variable and its type: integer
            #runtime = rt                            #
            try:                                     #bypass errors when runtime is absent
                minutes = int(rt[0])*60 + int(rt[2]) #turn runtime into an integer and multiply the first value (the hours) by 60 for minute conversion. Add that to the value of the third position (after a space), converted to an integer.
                return minutes
            except:                                   #If there is no runtime, return "none"
                return None

        

        #Begin now gathering specific data of interest from each film's page
        
        
        #get film title
        title_string = soup.find('title').text      #find the films title on the page
        title = title_string.split('-')[0].strip()   #The title is followed by a str we don't want, split it at the dash, take the first portion (the title) and strip it of the white space
        #print(title)                                #print to test code

        #get worldwide total gross data               
        try:                                         #try is needed in several of films. This alls the program to persist when no wtg is reported for a film
            wtg = soup.find(class_='mojo-performance-summary-table').find_all('span', class_='money')[2].text  
            worldwide_total_gross = money_to_int(wtg) #WTG is in position 2 in the summary table (seen one line up)
        except:                                       #when no wtg is reported,
            worldwide_total_gross = [float('NaN')]    #record 'NaN' for ease of processing and recognition of absent vslue
        #print(worldwide_total_gross)                 #print to test code
        
        #get runtime data
        rt_string = soup.find(text=re.compile('Run')) #handy code shared by Metis. Running Time is the label for runtime. Use regex.compile to approximate the label 'run' for runtime. Search the page for anything that approximates "runtime"?
        rt = rt_string.findNext().text                #The next element after the "run" approximation, is the value we need in our dataframe 
        rt = rt.split()                               #runtime is in the format of hours and mins. Needs converting to a numeric total minutes
        minutes = runtime_to_minutes(rt)              #use the function above to format the runtime data
        #print(minutes)                               #print to test code
        
        # genre
        genre = get_movie_value(soup,'Genres')        #genre often has multiple values per film. Use the function above to get genre data
        if genre == None:                             #if the field is empty, record "NaN" for easy processing and bypassing errors
            genre = [float('NaN')]
        else:                                          
            genre = genre.replace(' ','').split('\n\n') #remove spaces and split the string into a list at line breaks in the genre field when collecting the data
        #print(genre)                                 #print to test code

        
        #Get release date
        try: 
            raw_release_date = get_movie_value(soup,'Release Date').split('-')[0].strip()
        except:
            pass
        try:                                            #occasional problematic release date requires splitting at "("
            release_date = raw_release_date.split('(')[0].strip()
        except:
            pass

        
        # get film rating data
        rating = get_movie_value(soup,'MPAA')            #easy peezy with the formula above. Rating is called 'MPAA' for Motion Picture Association of America


        #get widest release of theaters data             #max number of theaters showing a film will be useful in the linear regression and predicting the target (revenue), though may present some significant collinearity
        try: 
            thtr_count_string = get_movie_value(soup,'Widest Release')
            thtr_count = thtr_count_string.split(' ')[0].strip().replace(',', '') #remove the word "theaters" from the returned value by grabbing only the first oject after splitting. Remove the comma and turn to integer
            thtr_count = int(thtr_count)
        except:
            thtr_count = [float('NaN')]
        
        #get budget data
        try:                                            #budget data will likely be a significant variable in predicting the target (gross revenue) 
            budget = get_movie_value(soup,'Budget')     #use the function above to get the value
            budget = money_to_int(budget)               #use the function above to format into integer
        except:
            budget = [float('NaN')]                             #films with no budget reported will be dropped later. Return here to streamline this in the download process.
        #print(budget)                                 #print to test code


        #Now combine all of the scraped data into a dictionary with film as key and all of the data as values. 
        movie_info_dict[title] = [url] + [worldwide_total_gross] + [minutes] + [genre] + [release_date] + [rating] + [thtr_count] + [budget]
    time.sleep(1)  #slow down the scraping program to avoid error from boxofficemojo
    return movie_info_dict

In [45]:
#run the above code on a small list of films for finetuning the code and checking the output
movie_info_dict_1to10 = get_movies(all_movies_urls[0:11]) 
movie_info_dict_1to10

{'Little Fockers': ['https://www.boxofficemojo.com/release/rl929203713/?ref_=bo_wl_table_2',
  310650585,
  98,
  ['Comedy', 'Romance'],
  'Dec 22, 2010',
  'PG-13',
  3675,
  100000000],
 'True Grit': ['https://www.boxofficemojo.com/release/rl3564602881/?ref_=bo_wl_table_1',
  252276927,
  110,
  ['Drama', 'Western'],
  'Dec 22, 2010',
  'PG-13',
  3464,
  38000000],
 'TRON: Legacy': ['https://www.boxofficemojo.com/release/rl4000810497/?ref_=bo_wl_table_4',
  400062763,
  125,
  ['Action', 'Adventure', 'Sci-Fi'],
  'Dec 17, 2010',
  'PG',
  3451,
  170000000],
 'Yogi Bear': ['https://www.boxofficemojo.com/release/rl2793047553/?ref_=bo_wl_table_9',
  203509374,
  81,
  ['Adventure', 'Animation', 'Comedy', 'Family'],
  'Dec 17, 2010',
  'PG',
  3515,
  80000000],
 'The Fighter': ['https://www.boxofficemojo.com/release/rl1213433345/?ref_=bo_wl_table_6',
  129190869,
  116,
  ['Action', 'Biography', 'Drama', 'Sport'],
  'Dec 10, 2010',
  'R',
  2534,
  25000000],
 'Black Swan': ['https://

Run the functions above for subsets of films to reduce program-crashing errors or interruptsion from the website

In [46]:
movie_info_dict_1to500 = get_movies(all_movies_urls[0:501]) 

In [47]:
movie_info_dict_501to1000 = get_movies(all_movies_urls[501:1001]) 

In [48]:
movie_info_dict_1001to1500 = get_movies(all_movies_urls[1001:1501])

In [49]:
movie_info_dict_1501to1739 = get_movies(all_movies_urls[1501:1740])

Now, combine all of the film dictionaries into one dictionary for conversion to a dataframe and merge with earlier dataframe

In [50]:
#a list of all of the dictionaries we just created
dicts_list = [movie_info_dict_1to500,  
              movie_info_dict_501to1000, 
              movie_info_dict_1001to1500, 
              movie_info_dict_1501to1739
              ]
result = {}                                       #create empty dictionary variable for compiling all films
for d in dicts_list:                              #iterate through the keys (and values) dictionary
    result.update(d)                              #add the keys and vlaues to the dictionary called 'results' for conversion to dataframe
print(result)

{'Little Fockers': ['https://www.boxofficemojo.com/release/rl929203713/?ref_=bo_wl_table_2', 310650585, 98, ['Comedy', 'Romance'], 'Dec 22, 2010', 'PG-13', 3675, 100000000], 'True Grit': ['https://www.boxofficemojo.com/release/rl3564602881/?ref_=bo_wl_table_1', 252276927, 110, ['Drama', 'Western'], 'Dec 22, 2010', 'PG-13', 3464, 38000000], 'TRON: Legacy': ['https://www.boxofficemojo.com/release/rl4000810497/?ref_=bo_wl_table_4', 400062763, 125, ['Action', 'Adventure', 'Sci-Fi'], 'Dec 17, 2010', 'PG', 3451, 170000000], 'Yogi Bear': ['https://www.boxofficemojo.com/release/rl2793047553/?ref_=bo_wl_table_9', 203509374, 81, ['Adventure', 'Animation', 'Comedy', 'Family'], 'Dec 17, 2010', 'PG', 3515, 80000000], 'The Fighter': ['https://www.boxofficemojo.com/release/rl1213433345/?ref_=bo_wl_table_6', 129190869, 116, ['Action', 'Biography', 'Drama', 'Sport'], 'Dec 10, 2010', 'R', 2534, 25000000], 'Black Swan': ['https://www.boxofficemojo.com/release/rl2286388737/?ref_=bo_wl_table_5', 329398046,

In [52]:
#convert the dictionary to a dataframe for cleaning and modeling, and export dataframe to a csv for safe keeping

movie_info_df = pd.DataFrame(result).T  #transpose
movie_info_df.columns = ['url', 'wtg', 'runtime',
                    'genres', 'release_date', 'rating','thtr_count','budget'
                                 ]
movie_info_df.to_csv('all_movies_info_df3.csv')
movie_info_df.sample(n=35)

Unnamed: 0,url,wtg,runtime,genres,release_date,rating,thtr_count,budget
I Feel Pretty,https://www.boxofficemojo.com/release/rl395965...,94539426,110.0,"[Comedy, Romance]","Apr 20, 2018",PG-13,3440,32000000
Jason Bourne,https://www.boxofficemojo.com/release/rl253798...,415484914,123.0,"[Action, Thriller]","Jul 29, 2016",PG-13,4039,120000000
Diary of a Wimpy Kid: Rodrick Rules,https://www.boxofficemojo.com/release/rl312577...,72526996,100.0,"[Comedy, Family]","Mar 25, 2011",PG,3169,21000000
No Good Deed,https://www.boxofficemojo.com/release/rl321123...,54323210,84.0,[nan],"Sep 12, 2014",PG-13,2175,13200000
Ex Machina,https://www.boxofficemojo.com/release/rl148167...,36869414,108.0,"[Drama, Sci-Fi, Thriller]","Apr 10, 2015",R,2004,15000000
Cut Throat City,https://www.boxofficemojo.com/release/rl224339...,864928,123.0,"[Action, Crime, Drama, Thriller]","Aug 21, 2020",R,389,[nan]
Aloha,https://www.boxofficemojo.com/release/rl279009...,26250020,105.0,"[Comedy, Drama, Romance]","May 29, 2015",PG-13,2815,37000000
Spy Kids 4: All the Time in the World,https://www.boxofficemojo.com/release/rl195372...,85564310,89.0,"[Action, Adventure, Animation, Comedy, Crime, ...","Aug 19, 2011",PG,3305,27000000
Cop Out,https://www.boxofficemojo.com/release/rl206407...,55611001,107.0,"[Action, Comedy, Crime]","Feb 26, 2010",R,3150,30000000
You're Next,https://www.boxofficemojo.com/release/rl410167...,26895481,95.0,"[Horror, Thriller]","Aug 23, 2013",R,2437,[nan]


In [53]:
movie_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1726 entries, Little Fockers to Alien 2020 Re
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           1726 non-null   object
 1   wtg           1726 non-null   object
 2   runtime       1686 non-null   object
 3   genres        1726 non-null   object
 4   release_date  1726 non-null   object
 5   rating        1624 non-null   object
 6   thtr_count    1726 non-null   object
 7   budget        1726 non-null   object
dtypes: object(8)
memory usage: 121.4+ KB


In [54]:
movie_df = all_movies.merge(movie_info_df, left_index=True, right_index= True)
movie_df.sample(n=2, random_state = 42)

Unnamed: 0,link_stub,rank,last week,title,total gross,%+-LW,theaters,change,average,total gross.1,...,dump,dump2,url,wtg,runtime,genres,release_date,rating,thtr_count,budget
Whiskey Tango Foxtrot,/release/rl2776401409/?ref_=bo_wl_table_5,5,4,Whiskey Tango Foxtrot,"$6,498,937",-34.8%,2413,+39,"$2,693","$16,470,840",...,False,False,https://www.boxofficemojo.com/release/rl277640...,24972139,112,"[Biography, Comedy, Drama, War]","Mar 4, 2016",R,2413,35000000
Step Up,/release/rl3665004033/?ref_=bo_wl_table_3,3,-,Step Up,"$22,939,221",-,2435,-,"$9,420","$22,939,221",...,True,False,https://www.boxofficemojo.com/release/rl366500...,159291809,107,"[Drama, Music, Romance]","Aug 6, 2010",PG-13,2439,30000000


In [55]:
movie_df3 = movie_df[['title', 'wtg', 'thtr_count', 'Distributor','runtime', 'genres', 'release_date', 'rating', 'budget']]

In [56]:
movie_df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1655 entries, Little Fockers to Shadow in the Cloud
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1655 non-null   object
 1   wtg           1655 non-null   object
 2   thtr_count    1655 non-null   object
 3   Distributor   1655 non-null   object
 4   runtime       1617 non-null   object
 5   genres        1655 non-null   object
 6   release_date  1655 non-null   object
 7   rating        1570 non-null   object
 8   budget        1655 non-null   object
dtypes: object(9)
memory usage: 129.3+ KB


In [57]:
#remove the \n\n from the end of the distributor name
#movie_df3['Distributor'] = movie_df3['Distributor'].str[:-2] #don't run again 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df3['Distributor'] = movie_df3['Distributor'].str[:-2] #don't run again


In [58]:
#delete the rows with no worldwide total gross value
movie_df3 = movie_df3.loc[movie_df3['wtg'] !='NaN']
movie_df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1655 entries, Little Fockers to Shadow in the Cloud
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1655 non-null   object
 1   wtg           1655 non-null   object
 2   thtr_count    1655 non-null   object
 3   Distributor   1655 non-null   object
 4   runtime       1617 non-null   object
 5   genres        1655 non-null   object
 6   release_date  1655 non-null   object
 7   rating        1570 non-null   object
 8   budget        1655 non-null   object
dtypes: object(9)
memory usage: 129.3+ KB


In [59]:
#delete the rows with no budget
movie_df4 = movie_df3.loc[movie_df3['budget'] !='NaN']
movie_df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1655 entries, Little Fockers to Shadow in the Cloud
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1655 non-null   object
 1   wtg           1655 non-null   object
 2   thtr_count    1655 non-null   object
 3   Distributor   1655 non-null   object
 4   runtime       1617 non-null   object
 5   genres        1655 non-null   object
 6   release_date  1655 non-null   object
 7   rating        1570 non-null   object
 8   budget        1655 non-null   object
dtypes: object(9)
memory usage: 129.3+ KB


In [60]:
#pickling for easy import into the next notebook
movie_df4.to_pickle("./movie_df4.pkl")

In [61]:
movie_df4.head()

Unnamed: 0,title,wtg,thtr_count,Distributor,runtime,genres,release_date,rating,budget
Little Fockers,Little Fockers,310650585,3675,Universal Pictures,98,"[Comedy, Romance]","Dec 22, 2010",PG-13,100000000
True Grit,True Grit,252276927,3464,Paramount Pictures,110,"[Drama, Western]","Dec 22, 2010",PG-13,38000000
TRON: Legacy,TRON: Legacy,400062763,3451,Walt Disney Studios Motion Pictures,125,"[Action, Adventure, Sci-Fi]","Dec 17, 2010",PG,170000000
Yogi Bear,Yogi Bear,203509374,3515,Warner Bros.,81,"[Adventure, Animation, Comedy, Family]","Dec 17, 2010",PG,80000000
The Fighter,The Fighter,129190869,2534,Paramount Pictures,116,"[Action, Biography, Drama, Sport]","Dec 10, 2010",R,25000000


In [67]:
#rename the thtr_count column to easier name
movie_df4.rename({'thtr_count':'theaters'}, axis = 1, inplace=True)

In [66]:
movie_df4.head()

Unnamed: 0,title,wtg,theaters,Distributor,runtime,genres,release_date,rating,budget
Little Fockers,Little Fockers,310650585,3675,Universal Pictures,98,"[Comedy, Romance]","Dec 22, 2010",PG-13,100000000
True Grit,True Grit,252276927,3464,Paramount Pictures,110,"[Drama, Western]","Dec 22, 2010",PG-13,38000000
TRON: Legacy,TRON: Legacy,400062763,3451,Walt Disney Studios Motion Pictures,125,"[Action, Adventure, Sci-Fi]","Dec 17, 2010",PG,170000000
Yogi Bear,Yogi Bear,203509374,3515,Warner Bros.,81,"[Adventure, Animation, Comedy, Family]","Dec 17, 2010",PG,80000000
The Fighter,The Fighter,129190869,2534,Paramount Pictures,116,"[Action, Biography, Drama, Sport]","Dec 10, 2010",R,25000000
