In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def Get_Top_List(url):
    """
    Converts IMDB table into pandas dataframe.
    Input: url link from IMDB table page for "Top Lifetime Grosses"
    Output: formatted dataframe of table
    """
    
    response = requests.get(url)

    page = response.text

    soup = BeautifulSoup(page)

    table = soup.find(id='table')
    
    rows = [row for row in table.find_all('tr')]

    movies = {} #can also make this a global variable outside of the for loop
    #then pass all the URLS--take URL function and pass it through this function
    #append everything to the movies dictionary and 

    for row in rows[1:]:
        items = row.find_all('td')
        link = items[1].find('a')
        title, url = link.text, link['href']
        movies[title] = [url] + [i.text for i in items]
     
    movie_page = pd.DataFrame(movies).T  #transpose
    movie_page.columns = ['Link', 'Rank', 'Title', 'World_Gross', 
                    'Domestic_Gross', 'Domestic_Percentage', 'Foreign_Gross', 'Foreign_Percentage','Year']

    return movie_page 

In [3]:
Get_Top_List('https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/')

Unnamed: 0,Link,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_1,1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%,2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_2,2,Avatar,"$2,790,439,092","$760,507,625",27.2%,"$2,029,931,467",72.8%,2009
Titanic,/title/tt0120338/?ref_=bo_cso_table_3,3,Titanic,"$2,195,169,869","$659,363,944",30%,"$1,535,805,925",70%,1997
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_4,4,Star Wars: Episode VII - The Force Awakens,"$2,068,224,036","$936,662,225",45.3%,"$1,131,561,811",54.7%,2015
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%,2018
...,...,...,...,...,...,...,...,...,...
The Revenant,/title/tt1663202/?ref_=bo_cso_table_196,196,The Revenant,"$532,950,503","$183,637,894",34.5%,"$349,312,609",65.5%,2015
The Meg,/title/tt4779682/?ref_=bo_cso_table_197,197,The Meg,"$530,259,473","$145,443,742",27.4%,"$384,815,731",72.6%,2016
Ralph Breaks the Internet,/title/tt5848272/?ref_=bo_cso_table_198,198,Ralph Breaks the Internet,"$529,323,962","$201,091,711",38%,"$328,232,251",62%,2018
Hotel Transylvania 3: Summer Vacation,/title/tt5220122/?ref_=bo_cso_table_199,199,Hotel Transylvania 3: Summer Vacation,"$528,583,774","$167,510,016",31.7%,"$361,073,758",68.3%,2018


In [15]:
def Compile_List(pages):
    """
    Concatenates all the scraped pages of IMDB's "Top Lifetime Grosses" table.
    Input: list of partial URLs referring to pages of IMDB's "Top Lifetime Grosses" table.
    Output: concatenated df of entire table across all pages 
    """
    url = "https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset={}"
    dfs = []
    for counter, page in enumerate (pages):
        page_url = url.format(page)
        counter = Get_Top_List(page_url) #should be a list of the dataframes
        dfs.append(counter) 
    return pd.concat(dfs)

    

In [16]:
pages = [0, 200, 400, 600, 800, 1000]

top_movies_df = Compile_List(pages)

In [17]:
top_movies_df.tail()

Unnamed: 0,Link,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
The Wolfman,/title/tt0780653/?ref_=bo_cso_table_196,1215,The Wolfman,"$139,789,765","$61,979,680",44.3%,"$77,810,085",55.7%,2010
Saw IV,/title/tt0890870/?ref_=bo_cso_table_197,1216,Saw IV,"$139,352,633","$63,300,095",45.4%,"$76,052,538",54.6%,2007
Mr. Six,/title/tt4701702/?ref_=bo_cso_table_198,1217,Mr. Six,"$139,191,345","$1,415,450",1%,"$137,775,895",99%,2015
Brüno,/title/tt0889583/?ref_=bo_cso_table_199,1218,Brüno,"$138,805,831","$60,054,530",43.3%,"$78,751,301",56.7%,2009
Bird on a Wire,/title/tt0099141/?ref_=bo_cso_table_200,1219,Bird on a Wire,"$138,697,012","$70,978,012",51.2%,"$67,719,000",48.8%,1990


In [18]:
top_movies_df.duplicated()

Avengers: Endgame                             False
Avatar                                        False
Titanic                                       False
Star Wars: Episode VII - The Force Awakens    False
Avengers: Infinity War                        False
                                              ...  
The Wolfman                                   False
Saw IV                                        False
Mr. Six                                       False
Brüno                                         False
Bird on a Wire                                False
Length: 1195, dtype: bool

In [22]:
top_movies_df.info

<bound method DataFrame.info of                                                                                Link  \
Avengers: Endgame                             /title/tt4154796/?ref_=bo_cso_table_1   
Avatar                                        /title/tt0499549/?ref_=bo_cso_table_2   
Titanic                                       /title/tt0120338/?ref_=bo_cso_table_3   
Star Wars: Episode VII - The Force Awakens    /title/tt2488496/?ref_=bo_cso_table_4   
Avengers: Infinity War                        /title/tt4154756/?ref_=bo_cso_table_5   
...                                                                             ...   
The Wolfman                                 /title/tt0780653/?ref_=bo_cso_table_196   
Saw IV                                      /title/tt0890870/?ref_=bo_cso_table_197   
Mr. Six                                     /title/tt4701702/?ref_=bo_cso_table_198   
Brüno                                       /title/tt0889583/?ref_=bo_cso_table_199   
Bird on a W

In [24]:
top_movies_df.tail()

Unnamed: 0,Link,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
The Wolfman,/title/tt0780653/?ref_=bo_cso_table_196,1215,The Wolfman,"$139,789,765","$61,979,680",44.3%,"$77,810,085",55.7%,2010
Saw IV,/title/tt0890870/?ref_=bo_cso_table_197,1216,Saw IV,"$139,352,633","$63,300,095",45.4%,"$76,052,538",54.6%,2007
Mr. Six,/title/tt4701702/?ref_=bo_cso_table_198,1217,Mr. Six,"$139,191,345","$1,415,450",1%,"$137,775,895",99%,2015
Brüno,/title/tt0889583/?ref_=bo_cso_table_199,1218,Brüno,"$138,805,831","$60,054,530",43.3%,"$78,751,301",56.7%,2009
Bird on a Wire,/title/tt0099141/?ref_=bo_cso_table_200,1219,Bird on a Wire,"$138,697,012","$70,978,012",51.2%,"$67,719,000",48.8%,1990


In [34]:
import pickle

In [35]:
top_movies_df.to_pickle('top_movies_df.pickle')