# Box Office Crawler

Example of a release page of the kind this crawler parses: https://www.boxofficemojo.com/release/rl1930593025/?ref_=bo_gr_rls

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import re

In [2]:
def scrape(path, year):
    """Crawl one yearly listing page and scrape every movie linked from it.

    For each data row of the listing table, the row's first link is followed;
    on that page the first link inside the first table is taken as the movie's
    release page (named "domestic" here -- presumably the domestic release,
    confirm against the site) and handed to ``scrape_movie_page``.

    Args:
        path: URL of the yearly listing page.
        year: Year value passed through unchanged to ``scrape_movie_page``.

    Returns:
        A list with one entry per listing row, in page order. Each entry is
        whatever ``scrape_movie_page`` returned for that movie.
    """
    base_url = "https://www.boxofficemojo.com"

    response = requests.get(path)
    soup = BeautifulSoup(response.text, "html.parser")
    movie_tables = soup.find('div', attrs={'id': 'table'})
    movie_list = movie_tables.find('table')
    movies = movie_list.find_all('tr')[1:]  # the first row is skipped (assumed header)

    movies_info = []

    for movie in movies:
        link = base_url + movie.find('a')['href']
        release_response = requests.get(link)
        release_soup = BeautifulSoup(release_response.text, 'html.parser')
        domestic_link = base_url + release_soup.find('table').find('a')['href']

        # Diagnostic only: report pages whose first table has no 31st row with
        # a 10th cell. The movie is still scraped afterwards either way.
        # Note that this downloads the page once more than strictly needed.
        # `except Exception` (not a bare `except`) so that Ctrl-C / SystemExit
        # still stop a long crawl instead of being reported as a page error.
        try:
            response_chk = requests.get(domestic_link)
            soup_chk = BeautifulSoup(response_chk.text, 'html.parser')
            soup_chk.find('table').find_all('tr')[30].find_all('td')[9].text
        except Exception:
            print("Error in movie page: " + domestic_link)

        movies_info.append(scrape_movie_page(domestic_link, year))

    return movies_info

def scrape_movie_page(path, year, max_days=30):
    """Scrape the per-day figures from the first table of a release page.

    Each data row of the table is read by cell position: cell 9 is the day
    number, cell 3 the daily gross, cell 4 the gross change and cell 6 the
    theater count (column meanings follow the variable names used here;
    verify against the site if its layout changes). Rows whose day cell is
    ``'-'`` are skipped, and reading stops at the row whose day cell is
    ``max_days + 1``.

    Args:
        path: URL of the release page.
        year: Year to store in the result; converted with ``int``.
        max_days: Number of days to collect. Defaults to 30, i.e. reading
            stops at the row for day 31.

    Returns:
        A dict with ``movie_name`` (text of the page's first ``<h1>``),
        ``year`` and, for every collected day ``d``, the keys
        ``daily_bo<d>``, ``theaters_count<d>`` and ``gross_change<d>`` in
        that order. Returns ``[]`` when the page has no table or no ``<h1>``,
        when a row has too few cells or a value that cannot be parsed, or
        when the collected day numbers are not exactly 1, 2, 3, ... in page
        order.
    """
    response = requests.get(path)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table')
    if table is None:
        return []

    stop_marker = str(max_days + 1)
    days = []
    daily_bo = []
    theaters_count = []
    gross_change = []

    for row in table.find_all('tr')[1:]:  # the first row is skipped (assumed header)
        cells = row.find_all('td')
        try:
            day_text = cells[9].text
            if day_text == stop_marker:
                break
            if day_text == '-':
                continue

            day = int(day_text)
            gross = int(re.sub('[^0-9]', '', cells[3].text))
            theaters = int(re.sub('[^0-9]', '', cells[6].text))

            change_text = cells[4].text
            if change_text == '-':
                change = 0
            else:
                change = float(re.sub('[^0-9-+.]', '', change_text))
        except (IndexError, ValueError):
            # Too few cells or an unparseable number: treat the page as unusable.
            return []

        days.append(day)
        daily_bo.append(gross)
        theaters_count.append(theaters)
        gross_change.append(change)

    # Require an unbroken 1..n sequence so every key suffix really matches the
    # row it came from. Gaps were already rejected before; repeated,
    # out-of-order or non-positive day numbers used to pick another row's values.
    if days != list(range(1, len(days) + 1)):
        return []

    heading = soup.find('h1')
    if heading is None:
        return []

    movie = {
        'movie_name': heading.text,
        'year': int(year)
    }

    for day, gross, theaters, change in zip(days, daily_bo, theaters_count, gross_change):
        movie['daily_bo' + str(day)] = gross
        movie['theaters_count' + str(day)] = theaters
        movie['gross_change' + str(day)] = change

    return movie


In [3]:
# Crawl the yearly listing page for every year in the range.
# `movies` ends up holding one list per year (the return value of scrape()).
years = range(2018, 2024)
movies = []
for year in years:
    movies.append(
        scrape(
            f"https://www.boxofficemojo.com/year/world/{year}/?sort=domesticGrossToDate&ref_=bo_ydw__resort#table",
            year,
        )
    )

movies

Error in movie page: https://www.boxofficemojo.com/release/rl1191544321/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl2264958465/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl2432992769/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl3456271873/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl1812301313/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl4110910977/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl4027024897/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl637896193/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl3523708417/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl2382398977/?ref_=bo_gr_rls
Error in movie page: https://www.boxofficemojo.com/release/rl3489760769/?ref_=bo_gr_rls
Error in movie page: https://www.

[[{'movie_name': 'Black Panther',
   'year': 2018,
   'daily_bo1': 75941146,
   'theaters_count1': 4020,
   'gross_change1': 0,
   'daily_bo2': 65995366,
   'theaters_count2': 4020,
   'gross_change2': -13.1,
   'daily_bo3': 60067439,
   'theaters_count3': 4020,
   'gross_change3': -9.0,
   'daily_bo4': 40151729,
   'theaters_count4': 4020,
   'gross_change4': -33.2,
   'daily_bo5': 20857361,
   'theaters_count5': 4020,
   'gross_change5': -48.1,
   'daily_bo6': 14688057,
   'theaters_count6': 4020,
   'gross_change6': -29.6,
   'daily_bo7': 14253324,
   'theaters_count7': 4020,
   'gross_change7': -3.0,
   'daily_bo8': 28789877,
   'theaters_count8': 4020,
   'gross_change8': 102.0,
   'daily_bo9': 47553478,
   'theaters_count9': 4020,
   'gross_change9': 65.2,
   'daily_bo10': 35315480,
   'theaters_count10': 4020,
   'gross_change10': -25.7,
   'daily_bo11': 8098481,
   'theaters_count11': 4020,
   'gross_change11': -77.1,
   'daily_bo12': 10204038,
   'theaters_count12': 4020,
   '

In [4]:
# Flatten the per-year results into a single list of movies.
# Empty-list entries (the value scrape_movie_page returns on failure) are
# dropped; they are removed from the per-year lists inside `movies` as well.
movies_list = []

for year in movies:
    year[:] = [entry for entry in year if entry != []]
    movies_list.extend(year)

In [5]:
# Show the keys (future CSV column names) of the first scraped movie.
movies_list[0].keys()

dict_keys(['movie_name', 'year', 'daily_bo1', 'theaters_count1', 'gross_change1', 'daily_bo2', 'theaters_count2', 'gross_change2', 'daily_bo3', 'theaters_count3', 'gross_change3', 'daily_bo4', 'theaters_count4', 'gross_change4', 'daily_bo5', 'theaters_count5', 'gross_change5', 'daily_bo6', 'theaters_count6', 'gross_change6', 'daily_bo7', 'theaters_count7', 'gross_change7', 'daily_bo8', 'theaters_count8', 'gross_change8', 'daily_bo9', 'theaters_count9', 'gross_change9', 'daily_bo10', 'theaters_count10', 'gross_change10', 'daily_bo11', 'theaters_count11', 'gross_change11', 'daily_bo12', 'theaters_count12', 'gross_change12', 'daily_bo13', 'theaters_count13', 'gross_change13', 'daily_bo14', 'theaters_count14', 'gross_change14', 'daily_bo15', 'theaters_count15', 'gross_change15', 'daily_bo16', 'theaters_count16', 'gross_change16', 'daily_bo17', 'theaters_count17', 'gross_change17', 'daily_bo18', 'theaters_count18', 'gross_change18', 'daily_bo19', 'theaters_count19', 'gross_change19', 'daily

In [6]:
import csv

# Export one CSV row per movie.
# The header is the union of every movie's keys in first-seen order, not just
# the keys of the first movie: movies can carry a different number of day
# columns, and a header taken from movies_list[0] alone would leave some
# columns unnamed or rows shorter than the header.
fieldnames = list(dict.fromkeys(key for movie in movies_list for key in movie))

# Explicit UTF-8 so titles with non-ASCII characters are written the same way
# on every platform instead of depending on the locale's default encoding.
with open('mycsvfile.csv', 'w', newline='', encoding='utf-8') as f:
    # restval='' pads movies that lack some day columns with empty fields.
    w = csv.DictWriter(f, fieldnames=fieldnames, restval='')
    if fieldnames:  # with no movies at all, leave the file empty
        w.writeheader()
    for movie in movies_list:
        w.writerow(movie)