In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re
import numpy as np
import time
import matplotlib as plt
import dateutil.parser

In [3]:
# Collect the title and release-page link stub of every movie listed on the
# Box Office Mojo yearly pages for 2010 through 2019.
names = []
link_stubs = []

for year in range(2010, 2020):
    year_url = f"https://www.boxofficemojo.com/year/{year}/?grossesOption=calendarGrosses"

    # Fetch each yearly page exactly once. The earlier version looped over the
    # characters of the URL string and re-downloaded the same page for every
    # character before using only the last response.
    response = requests.get(year_url, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    time.sleep(1)  # pause between consecutive requests

    soup = bs(response.text, 'html5lib')
    table = soup.find(id='table')
    if table is None:
        raise RuntimeError(f"No element with id='table' found on {year_url}")

    # The first <tr> is skipped, as before (presumably the header row — confirm
    # against the page); rows without an <a> tag contribute nothing.
    for row in table.find_all('tr')[1:]:
        link = row.find('a')
        if link is not None:
            names.append(link.text)
            link_stubs.append(link['href'])

In [4]:
# Preview the first 10 scraped titles, then the first 10 link stubs
for scraped_values in (names, link_stubs):
    print(scraped_values[:10])

['Avatar', 'Toy Story 3', 'Alice in Wonderland', 'Iron Man 2', 'The Twilight Saga: Eclipse', 'Inception', 'Harry Potter and the Deathly Hallows: Part 1', 'Despicable Me', 'Shrek Forever After', 'How to Train Your Dragon']
['/release/rl876971521/?ref_=bo_yld_table_1', '/release/rl1383564801/?ref_=bo_yld_table_2', '/release/rl3393226241/?ref_=bo_yld_table_3', '/release/rl1515881985/?ref_=bo_yld_table_4', '/release/rl659654145/?ref_=bo_yld_table_5', '/release/rl2908456449/?ref_=bo_yld_table_6', '/release/rl1248560641/?ref_=bo_yld_table_7', '/release/rl2018477569/?ref_=bo_yld_table_8', '/release/rl57771521/?ref_=bo_yld_table_9', '/release/rl2908259841/?ref_=bo_yld_table_10']


In [5]:
# Map each movie title to a one-element list holding its link stub.
# A title that appears more than once keeps the stub that was seen last.
movies = {title: [stub] for title, stub in zip(names, link_stubs)}

In [6]:
# Build a dataframe indexed by movie title with a single 'link_stub' column.
# The dict yields one column per movie, so it is transposed to one row per movie.
stub_table = pd.DataFrame(movies).transpose()
stub_table.columns = ['link_stub']
raw_movies_df = stub_table

In [7]:
def movie_item(soup, field_name):
    """Return the text of the element that follows a matching label.

    Searches ``soup`` for the first string matching the regular expression
    ``field_name`` and returns the ``.text`` of the next element after it.

    Parameters
    ----------
    soup : parsed document exposing Beautiful Soup's ``find`` method.
    field_name : str
        Pattern passed to ``re.compile``; it is NOT escaped, so regex
        metacharacters in it are interpreted as such.

    Returns
    -------
    str or None
        The following element's text, or None when no string matches or no
        element follows the match.
    """
    # `string=` and `find_next()` are the current spellings of the deprecated
    # `text=` argument and `findNext()` method.
    label = soup.find(string=re.compile(field_name))
    if not label:
        return None
    following = label.find_next()
    return following.text if following else None

In [10]:
def _to_int(raw_text):
    """Parse text such as '$1,234,567' into an int; None if missing or not a plain number."""
    if raw_text is None:
        return None
    try:
        return int(raw_text.replace(',', '').replace('$', '').strip())
    except ValueError:
        return None


def _leading_int(raw_text):
    """Return the integer that starts ``raw_text`` ('3,452 theaters' -> 3452), or None."""
    if raw_text is None:
        return None
    found = re.match(r'\s*(\d[\d,]*)', raw_text)
    return _to_int(found.group(1)) if found else None


def _runtime_to_minutes(raw_text):
    """Convert '2 hr 42 min', '2 hr' or '55 min' to total minutes; None if neither unit is found."""
    if raw_text is None:
        return None
    hours = re.search(r'(\d+)\s*hr', raw_text)
    minutes = re.search(r'(\d+)\s*min', raw_text)
    if hours is None and minutes is None:
        return None
    total = 0
    if hours is not None:
        total += 60 * int(hours.group(1))
    if minutes is not None:
        total += int(minutes.group(1))
    return total


def _parse_date(raw_text):
    """Parse a date string with dateutil; return None when it cannot be parsed."""
    try:
        return dateutil.parser.parse(raw_text)
    except (ValueError, OverflowError, TypeError):
        return None


def get_movie_dict(link):
    """Scrape one Box Office Mojo page and return its fields as a dict.

    Parameters
    ----------
    link : str
        Site-relative path (link stub); it is appended to
        'https://www.boxofficemojo.com'.

    Returns
    -------
    dict
        Keys: 'movie_title', 'release_date', 'domestic_gross', 'genres',
        'budget', 'distributor', 'theaters', 'runtime(minutes)',
        'in_release(day)', 'rating'. A field that is absent from the page or
        cannot be parsed is None, so a single odd page does not abort a long
        scraping run or leave raw text in a numeric field.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    base_url = 'https://www.boxofficemojo.com'
    page_url = base_url + link
    new_response = requests.get(page_url, timeout=30)
    # Without this check an error page would be parsed into an all-None record.
    new_response.raise_for_status()
    time.sleep(1)  # pause between consecutive requests
    soup = bs(new_response.text, 'html5lib')

    # Movie title: drop only the trailing ' - <site name>' part of the page
    # title. Splitting on every '-' used to truncate hyphenated titles.
    title_tag = soup.find('title')
    movie_title = None
    if title_tag is not None:
        movie_title = title_tag.text.rsplit(' - ', 1)[0].strip()

    # Release date: only the first line of the field is parsed.
    release_text = movie_item(soup, 'Release Date')
    date = _parse_date(release_text.split('\n')[0]) if release_text else None

    # Domestic gross: the last line of the element following the 'Grosses' label.
    gross_text = movie_item(soup, 'Grosses')
    domestic_gross = None
    if gross_text is not None:
        domestic_gross = _to_int(gross_text.strip().split('\n')[-1])

    # Genres: spaces are removed, then the text is split on remaining whitespace.
    genres_text = movie_item(soup, 'Genres')
    genres = genres_text.replace(' ', '').split() if genres_text is not None else None

    # Production budget
    budget = _to_int(movie_item(soup, 'Budget'))

    # Distributor: everything before the first 'See' in the field text.
    distributor_text = movie_item(soup, 'Distributor')
    distributor = distributor_text.split('See')[0] if distributor_text is not None else None

    # Number of theaters: the leading number of the 'Widest Release' field.
    theaters = _leading_int(movie_item(soup, 'Widest Release'))

    # Running time in minutes; also handles values with only hours or only minutes.
    runtime_minutes = _runtime_to_minutes(movie_item(soup, 'Running'))

    # Days in release: the leading number of the 'In Release' field.
    in_release = _leading_int(movie_item(soup, 'In Release'))

    # MPAA rating, kept as the raw field text.
    rating = movie_item(soup, 'MPAA')

    return {
        'movie_title': movie_title,
        'release_date': date,
        'domestic_gross': domestic_gross,
        'genres': genres,
        'budget': budget,
        'distributor': distributor,
        'theaters': theaters,
        'runtime(minutes)': runtime_minutes,
        'in_release(day)': in_release,
        'rating': rating,
    }

In [11]:
# Scrape every movie page in turn. Records are appended one at a time, so the
# ones gathered so far remain in the list if a later page raises.
movie_info_list = []
for stub in raw_movies_df['link_stub']:
    record = get_movie_dict(stub)
    movie_info_list.append(record)

In [13]:
# Number of movie records that were scraped
record_count = len(movie_info_list)
print(record_count)

1803


In [14]:
# Turn the list of per-movie dicts into the dataframe used for modeling
# (one row per scraped record, one column per dict key)
movies_df = pd.DataFrame(data=movie_info_list)

In [15]:
# Write the dataframe to a CSV file in the current working directory
output_path = 'movie_data.csv'
movies_df.to_csv(output_path)