# Code to scrape movie data from Boxoffice Mojo

In [4]:
#Imports
import re
import pandas as pd
import requests
import numpy as np
import dateutil.parser

In [5]:
from bs4 import BeautifulSoup


In [22]:
#Add scrapped items to a dictionary
movies={}
website = 'http://www.boxofficemojo.com'
#num, is a variable generate numbers for the different pages that will be scrapped
num = range(1,6)

#Helper functions to parse the scrapped data
def to_date(datestring):
    date = dateutil.parser.parse(datestring)
    return date

def money_to_int(moneystring):
    if ' million' in moneystring:
        moneystring = moneystring.replace('$', '').replace(' million', '')
        moneystring=int(moneystring)*1000000
        return int(moneystring)
    if '$' in moneystring: 
        moneystring = moneystring.replace('$', '').replace(',', '')
        return int(moneystring)
        
def runtime_to_minutes(runtimestring):
    runtime = runtimestring.split()
    try:
        minutes = int(runtime[0])*60 + int(runtime[2])
        return minutes
    except:
        return None

#1st for loop to loop through multiple pages of data
for i in num:
    url = 'http://www.boxofficemojo.com/genres/chart/?view=main&sort=gross&order=DESC&pagenum='+str(i)+'&id=animation.htm'
    

    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page,"lxml")
    soup.prettify()
#Find tags and scrape the required data
    tables = soup.find_all('table')
    rows=[row for row in tables[3].find_all('tr')]


#2nd for loop, to loop through the lines in the table
#try and except statements, to move onto to the next item, if data not available
    for row in rows[1:101]:
        try:
            directors=[]
            writers=[]
            actors=[]
            items = row.find_all('td')
            name = items[1].find('a').text
            movie_link= items[1].find('a')['href']
            release_date = to_date(items[7].find('a').text)
            domestic_gross = money_to_int(items[3].find('b').text)
        except:
            next

#movie_url, will take us to the url of the movie, from where we will get data on foreign gross, world gross, prodn budget, distributor etc
        movie_url = website+movie_link
        movie_soup = BeautifulSoup(requests.get(movie_url).text, "lxml")

        movie_table = movie_soup.find('table').find_all('table')[1].find_all('table')[0].find_all('td')
        try:
            
            other_table = movie_soup.find('table').find_all('table')[1].find_all('table')[3].find_all('table')[0].find_all('td')
            people_table = other_table[0].find_all('div', class_ = "mp_box")[2].find_all('a')
            
            foreign_gross = money_to_int(other_table[1].find_all('div')[0].find_all('td')[4].text)
            world_gross = money_to_int(other_table[1].find_all('div')[0].find_all('td')[8].text)
            production_budget = money_to_int(movie_table[1].find_all('td')[7].find('b').text)
            distributor = movie_table[1].find_all('td')[2].find('a').text
            runtime = runtime_to_minutes(movie_table[1].find_all('td')[5].find('b').text)
            rating = movie_table[1].find_all('td')[6].find('b').text





            for i in people_table:
                if '/people/chart/?view=Director' in i['href'] :
                    directors.append(i.text)
                if '/people/chart/?view=Writer' in i['href']:
                    writers.append(i.text)
                if '/people/chart/?view=Actor' in i['href'] and ((i.text)[-1] != '*'):
                    actors.append(i.text)
        except:
            next


#assign the scrapped items to the dictionary
        movies[name] = [domestic_gross,foreign_gross,world_gross,production_budget,release_date,distributor,runtime,rating,directors,writers,actors]
len(movies.items())

423

In [26]:
movies.items()[400]

(u'Sinbad: Legend of the Seven Seas',
 [26483452,
  54284432,
  80767884,
  60000000,
  datetime.datetime(2003, 7, 2, 0, 0),
  u'DreamWorks',
  86,
  u'PG',
  [u'Tim Johnson'],
  [u'John Logan'],
  [u'Brad Pitt', u'Catherine Zeta-Jones', u'Michelle Pfeiffer']])

In [27]:
#convert from dictionary to dataframe
movies_df = pd.DataFrame.from_dict(movies, orient = 'index')

In [28]:
movies_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Everyone's Hero,14523101.0,2104087.0,16627188,,2006-09-15,Fox,88.0,G,[],[],"[Raven-Symone, Forest Whitaker]"
Fire and Ice,760883.0,,857522,,1983-08-26,GKIDS,93.0,PG,[],[],[]
Light Years,370698.0,12572253.0,12947880,,1988-01-29,Eleven Arts,116.0,Unrated,[],[],[]
Ratchet & Clank,8821329.0,,8821329,,2016-04-29,Focus Features,94.0,PG,[],[],"[Rosario Dawson, Paul Giamatti, John Goodman, ..."
Home (2015),177397510.0,208644097.0,386041607,135000000.0,2015-03-27,Fox,94.0,PG,[Tim Johnson],"[Tom J. Astle, Matt Ember]","[Rihanna, Steve Martin, Jennifer Lopez]"


In [29]:
len(movies_df)

423

In [30]:
#Clean-up the data frame
movies_df = movies_df.reset_index()

In [31]:
movies_df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10
0,Everyone's Hero,14523101.0,2104087.0,16627188,,2006-09-15,Fox,88.0,G,[],[],"[Raven-Symone, Forest Whitaker]"
1,Fire and Ice,760883.0,,857522,,1983-08-26,GKIDS,93.0,PG,[],[],[]
2,Light Years,370698.0,12572253.0,12947880,,1988-01-29,Eleven Arts,116.0,Unrated,[],[],[]
3,Ratchet & Clank,8821329.0,,8821329,,2016-04-29,Focus Features,94.0,PG,[],[],"[Rosario Dawson, Paul Giamatti, John Goodman, ..."
4,Home (2015),177397510.0,208644097.0,386041607,135000000.0,2015-03-27,Fox,94.0,PG,[Tim Johnson],"[Tom J. Astle, Matt Ember]","[Rihanna, Steve Martin, Jennifer Lopez]"


In [32]:
movies_df.columns

Index([u'index', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='object')

In [33]:
movies_df.columns = ['movie_name','domestic_gross','foreign_gross','world_gross','production_budget',
                           'release_date','distributor','runtime','rating','directors','writers','actors']

In [34]:
movies_df.columns

Index([u'movie_name', u'domestic_gross', u'foreign_gross', u'world_gross',
       u'production_budget', u'release_date', u'distributor', u'runtime',
       u'rating', u'directors', u'writers', u'actors'],
      dtype='object')

In [35]:
movies_df.head()

Unnamed: 0,movie_name,domestic_gross,foreign_gross,world_gross,production_budget,release_date,distributor,runtime,rating,directors,writers,actors
0,Everyone's Hero,14523101.0,2104087.0,16627188,,2006-09-15,Fox,88.0,G,[],[],"[Raven-Symone, Forest Whitaker]"
1,Fire and Ice,760883.0,,857522,,1983-08-26,GKIDS,93.0,PG,[],[],[]
2,Light Years,370698.0,12572253.0,12947880,,1988-01-29,Eleven Arts,116.0,Unrated,[],[],[]
3,Ratchet & Clank,8821329.0,,8821329,,2016-04-29,Focus Features,94.0,PG,[],[],"[Rosario Dawson, Paul Giamatti, John Goodman, ..."
4,Home (2015),177397510.0,208644097.0,386041607,135000000.0,2015-03-27,Fox,94.0,PG,[Tim Johnson],"[Tom J. Astle, Matt Ember]","[Rihanna, Steve Martin, Jennifer Lopez]"


In [36]:
movies_df = movies_df.fillna('')

In [37]:
movies_df.head()

Unnamed: 0,movie_name,domestic_gross,foreign_gross,world_gross,production_budget,release_date,distributor,runtime,rating,directors,writers,actors
0,Everyone's Hero,14523100.0,2104090.0,16627188,,2006-09-15,Fox,88,G,[],[],"[Raven-Symone, Forest Whitaker]"
1,Fire and Ice,760883.0,,857522,,1983-08-26,GKIDS,93,PG,[],[],[]
2,Light Years,370698.0,12572300.0,12947880,,1988-01-29,Eleven Arts,116,Unrated,[],[],[]
3,Ratchet & Clank,8821330.0,,8821329,,2016-04-29,Focus Features,94,PG,[],[],"[Rosario Dawson, Paul Giamatti, John Goodman, ..."
4,Home (2015),177398000.0,208644000.0,386041607,135000000.0,2015-03-27,Fox,94,PG,[Tim Johnson],"[Tom J. Astle, Matt Ember]","[Rihanna, Steve Martin, Jennifer Lopez]"


In [38]:
movies_df.tail()

Unnamed: 0,movie_name,domestic_gross,foreign_gross,world_gross,production_budget,release_date,distributor,runtime,rating,directors,writers,actors
418,The Wild Life (2016),8005590.0,31538000.0,39543581,,2016-09-09,Lionsgate/Summit,90,PG,[],[],[]
419,Mr. Peabody & Sherman,111506000.0,164192000.0,275698039,145000000.0,2014-03-07,Fox,90,PG,[Rob Minkoff],[Craig Wright],"[Ty Burrell, Stanley Tucci]"
420,Foodfight!,120.0,27186300.0,27187375,,2014-09-20,Millennium Ent.,84,Unrated,[],[],[]
421,Tamala 2010,3386.0,11163600.0,11167501,,2004-04-02,The Bigger Picture,132,Unrated,[],[],[]
422,The Painting (2013),21008.0,,21008,,2013-05-10,GKIDS,76,Unrated,[],[],[]


In [39]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423 entries, 0 to 422
Data columns (total 12 columns):
movie_name           423 non-null object
domestic_gross       423 non-null object
foreign_gross        423 non-null object
world_gross          423 non-null int64
production_budget    423 non-null object
release_date         423 non-null datetime64[ns]
distributor          423 non-null object
runtime              423 non-null object
rating               423 non-null object
directors            423 non-null object
writers              423 non-null object
actors               423 non-null object
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 39.7+ KB


In [40]:
movies_df.production_budget

0              
1              
2              
3              
4      1.35e+08
5       3.7e+07
6              
7         8e+07
8         8e+06
9              
10      1.5e+08
11             
12      1.7e+07
13             
14             
15        3e+07
16             
17             
18             
19        3e+07
20        8e+06
21             
22     1.35e+08
23             
24      8.5e+07
25        4e+07
26             
27             
28      7.5e+07
29             
         ...   
393       2e+08
394            
395       3e+06
396            
397            
398            
399            
400       6e+07
401            
402            
403     1.4e+08
404            
405            
406            
407     2.5e+07
408            
409            
410            
411    1.25e+08
412            
413     2.1e+07
414            
415            
416            
417            
418            
419    1.45e+08
420            
421            
422            
Name: production_budget,

In [41]:
movies_df.to_csv('boxofficemojo_animation.csv')

In [42]:
less boxofficemojo_animation.csv