In [1]:
# import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import string
import pickle

In [29]:
def get_link_list(url):
    '''
    Collect the urls of every page of a paginated Box Office Mojo chart.

    Parameters
    ----------
    url : str
        Url of the first page of the chart.

    Returns
    -------
    list
        `url` itself, followed by one absolute url for every anchor that
        carries an href inside the third <b> element of the page, in
        document order.

    Raises
    ------
    IndexError
        If the page holds fewer than three <b> elements.
    '''
    response = requests.get(url)
    # NOTE(review): no parser is named, so bs4 chooses among the installed
    # ones; the positional lookup below depends on the tree that parser
    # builds, which is why the choice is deliberately left untouched here.
    soup = BeautifulSoup(response.text)
    # The links to the subsequent chart pages sit in the third <b> tag.
    pagination = soup.find_all('b')[2]

    base_url = 'http://www.boxofficemojo.com'
    url_list = [url]
    for anchor in pagination.find_all('a'):
        # Read the href attribute rather than slicing the rendered tag text:
        # the attribute value is already entity-decoded ('&amp;' -> '&') and
        # does not depend on href being the first attribute of the tag.
        href = anchor.get('href')
        if href:
            url_list.append(base_url + href)

    return url_list

In [30]:
def get_movie_table(url):
    '''
    Download a Box Office Mojo chart page and return its table as a dataframe.

    The rows come from the first table nested inside the third <table>
    element of the page. Every <tr> becomes one dataframe row made of the
    stripped text of its <td> cells; cells whose text is empty are left out,
    so the remaining values of such a row move to the left.
    '''
    page_html = requests.get(url).text
    soup = BeautifulSoup(page_html)
    outer_table = soup.find_all('table')[2]
    table_rows = outer_table.find('table').find_all('tr')

    def non_empty_cells(row):
        # Stripped text of every <td> in the row, skipping blank cells.
        texts = (cell.text.strip() for cell in row.find_all('td'))
        return [text for text in texts if text]

    return pd.DataFrame([non_empty_cells(row) for row in table_rows])

In [10]:
def get_film_urls(series):
    '''
    Build the guessed Box Office Mojo main-page url for every film title.

    Each title is cut at its first '(' (dropping a trailing "(Country)"
    tag), lower-cased, stripped of spaces and of every character in
    string.punctuation, and then wrapped as
    'http://www.boxofficemojo.com/movies/?id=<title>.htm'.
    Characters outside string.punctuation (including non-ASCII letters)
    are kept as they are.

    Parameters
    ----------
    series : pandas.Series
        Film titles as strings.

    Returns
    -------
    list
        One url per title, in the order of the series. The urls are
        guesses derived from the title only; nothing checks that the
        page exists.
    '''
    url_prefix = 'http://www.boxofficemojo.com/movies/?id='
    url_suffix = '.htm'
    punctuation = set(string.punctuation)

    title_url_list = []
    # Iterating the series directly yields its values and works on every
    # pandas version (Series.iteritems is gone from recent releases).
    for raw_title in series:
        base_title = raw_title.split('(')[0].lower().replace(' ', '')
        # Filter character by character instead of using the Python-2-only
        # str.translate(None, deletechars) form.
        slug = ''.join(ch for ch in base_title if ch not in punctuation)
        title_url_list.append(url_prefix + slug + url_suffix)

    return title_url_list

In [32]:
def all_movie_tables(url):
    '''
    Scrape every page of a paginated Box Office Mojo chart into one dataframe.

    Parameters
    ----------
    url : str
        Url of the first page of the chart; the remaining pages are
        discovered with get_link_list.

    Returns
    -------
    pandas.DataFrame
        The tables of all pages stacked in page order under a fresh
        0..n-1 index, with the eight columns named below. Rows are taken
        as scraped, so header and summary rows are still included.

    Notes
    -----
    The combined table must have exactly eight columns; otherwise pandas
    rejects the column assignment.
    '''
    # Gather the per-page frames first and concatenate once: appending to a
    # dataframe inside the loop copies all previous rows on every iteration,
    # and DataFrame.append no longer exists in recent pandas releases.
    tables = [get_movie_table(page_url) for page_url in get_link_list(url)]
    df = pd.concat(tables, ignore_index=True)

    df.columns = ['rank', 'title',
                  'studio', 'gross',
                  'theatres_life', 'opening',
                  'theaters_opening', 'date']
    return df

In [33]:
# First page of the Box Office Mojo "foreign" genre chart.
url = 'http://www.boxofficemojo.com/genres/chart/?id=foreign.htm'

# Scrape this page and every follow-up page of the chart into one dataframe.
df = all_movie_tables(url)

In [34]:
# Preview the first and the last rows of the scraped chart.
print(df.head())
print(df.tail())

   rank                                   title  studio  \
0  Rank                   Title (click to view)  Studio   
1     1  Crouching Tiger, Hidden Dragon(Taiwan)     SPC   
2     2                Life Is Beautiful(Italy)   Mira.   
3     3                             Hero(China)   Mira.   
4     4               Instructions Not Included     LGF   

                       gross       theatres_life      opening  \
0  Lifetime Gross / Theaters  Opening / Theaters         Date   
1               $128,078,872               2,027     $663,205   
2                $57,563,264               1,136     $118,920   
3                $53,710,019               2,175  $18,004,319   
4                $44,467,206                 978   $7,846,426   

  theaters_opening      date  
0             None      None  
1               16   12/8/00  
2                6  10/23/98  
3            2,031   8/27/04  
4              348   8/30/13  
                              rank           title    studio       g

In [39]:
# Save the raw scraped table. Pickle data is a byte stream, so the file is
# opened in binary mode.
with open('dirty_for_table.pk1', 'wb') as picklefile:
    pickle.dump(df, picklefile)

In [40]:
# Shell escape: list the files in the current working directory.
!ls

02-luther_link_list.ipynb   get_movie_table.ipynb
design_draft_format.md      main_for_page.pk1
dirty_for_table             main_page_soup.pk1
dirty_for_table.pk1         url_list.pk1
get_all_filmpage_urls.ipynb url_title_list.pk1
get_all_foreign_urls.ipynb


In [42]:
# Print the whole scraped table.
print(df)

                              rank  \
0                             Rank   
1                                1   
2                                2   
3                                3   
4                                4   
5                                5   
6                                6   
7                                7   
8                                8   
9                                9   
10                              10   
11                              11   
12                              12   
13                              13   
14                              14   
15                              15   
16                              16   
17                              17   
18                              18   
19                              19   
20                              20   
21                              21   
22                              22   
23                              23   
24                              24   
25          

In [5]:
# Reload the raw scraped table. Pickle data is a byte stream, so the file is
# opened in binary mode.
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# wrote itself.
with open('dirty_for_table.pk1', 'rb') as picklefile:
    df = pickle.load(picklefile)

In [6]:
# Number of rows in the table as loaded.
print(len(df))

1926


In [7]:
# Header and summary rows carry one of these labels in the 'rank' column
# instead of a rank; keep only the rows that are not labelled this way.
drop_list = [
    'Rank',
    'TOTAL (All Movies):',
    'AVERAGE (All Movies):',
    'TOTAL(Wide Releases Only):',
    'AVERAGE(Wide Releases Only):',
]

non_film_rows = df['rank'].isin(drop_list)
df = df[~non_film_rows]

In [8]:
# Number of rows left after the non-film rows were removed.
print(len(df))

1831


In [11]:
# Build the guessed main-page url for every film title left in the table
# (the result is a list of urls, not of titles).

url_list = get_film_urls(df['title'])

In [12]:
# Save the list of film-page urls. Pickle data is a byte stream, so the file
# is opened in binary mode.
with open('all_film_url_list.pk1', 'wb') as picklefile:
    pickle.dump(url_list, picklefile)

In [13]:
import urlparse
import os

def get_file(url):
    '''
    Return the response for `url`, requesting it at most once.

    The response object is pickled below the current working directory, in
    a directory tree derived from the url, so a later call with the same
    url loads the pickle instead of making a new request. The function only
    checks whether the cache file exists; it never checks whether the page
    has changed since it was downloaded.

    Parameters
    ----------
    url : str
        Address to fetch.

    Returns
    -------
    The object returned by requests.get(url), or the unpickled copy of it
    when the cache file already exists. If the status code of a fresh
    request is not 200, an error line is printed, nothing is cached and
    None is returned.
    '''
    # Every non-empty '/'-separated piece of every component of the parsed
    # url becomes one path segment: the last one names the cache file, the
    # others the directories above it.
    segments = [
        piece
        for component in urlparse.urlparse(url)
        for piece in component.split("/")
        if piece != ''
    ]
    outpath = '/'.join(segments[:-1])
    cache_file = outpath + '/' + segments[-1] + '.p'

    # The directories are created before the request is made, so they exist
    # even when the request below fails.
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    if os.path.isfile(cache_file):
        # NOTE: unpickling can execute arbitrary code; this is acceptable
        # only because the cache files are written by this function.
        with open(cache_file, 'rb') as cached:
            return pickle.load(cached)

    r = requests.get(url)
    if r.status_code != requests.codes.ok:
        print('Error: request.get(url) Status NOT 200')
        return None
    # The with-block closes the file even if pickling fails.
    with open(cache_file, 'wb') as cached:
        pickle.dump(r, cached)
    return r

In [14]:
# Request every film page once. get_file pickles each successful response to
# disk, so the return value is not needed here; a url that does not answer
# with status 200 only prints an error line.
for url in url_list:
    get_file(url)

Error: request.get(url) Status NOT 200
Error: request.get(url) Status NOT 200


In [1]:
from lxml import html

# NOTE(review): `page` is not defined anywhere in the visible notebook, and
# the recorded output of this cell is a NameError. Presumably it should be a
# response object such as the one returned by get_file(...) - TODO confirm
# and define it before this line.
tree = html.fromstring(page.text)

NameError: name 'page' is not defined