In [2]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [8]:
def Get_Top_List(url):
    """
    Scrape one page of Box Office Mojo's "Top Lifetime Grosses" chart into a
    pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of a chart page, e.g.
        'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/'.

    Returns
    -------
    pandas.DataFrame
        One row per movie, indexed by the digits of the title id found in the
        movie link (e.g. '4154796' for '/title/tt4154796/'). Columns are URL,
        Rank, Title, World_Gross, Domestic_Gross, Domestic_Percentage,
        Foreign_Gross, Foreign_Percentage and Year; every value is the raw
        cell text. A table without data rows yields an empty frame that still
        carries these columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no element with id 'table', or a movie link does not
        contain a 'tt<digits>' id.
    """
    columns = ['URL', 'Rank', 'Title', 'World_Gross',
               'Domestic_Gross', 'Domestic_Percentage', 'Foreign_Gross',
               'Foreign_Percentage', 'Year']

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find(id='table')
    if table is None:
        raise ValueError("No element with id 'table' found at {}".format(url))

    movies = {}

    # Skip the first <tr>; it is treated as the column-header row.
    for row in table.find_all('tr')[1:]:
        items = row.find_all('td')
        link = items[1].find('a')
        url_string = link['href']
        # Pull the numeric part of the 'tt…' title id out of the link path.
        # A regex works for relative and absolute hrefs alike, unlike counting
        # occurrences of the letter 't'.
        match = re.search(r'tt(\d+)', url_string.split('?', 1)[0])
        if match is None:
            raise ValueError(
                "No title id found in link {!r}".format(url_string))
        # Key on the id rather than the title: titles repeat (remakes), ids do not.
        movies[match.group(1)] = [url_string] + [i.text for i in items]

    # orient='index' makes each dict entry a row, so no transpose is needed and
    # an empty dict still produces a frame with the expected columns.
    return pd.DataFrame.from_dict(movies, orient='index', columns=columns)

In [9]:
# Smoke-test the scraper on the first page of the worldwide lifetime-gross chart.
df_test = Get_Top_List('https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/')

In [10]:
# Preview the first 55 scraped rows to check that the columns were parsed as expected.
df_test.head(55)

Unnamed: 0,URL,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
4154796,/title/tt4154796/?ref_=bo_cso_table_1,1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%,2019
499549,/title/tt0499549/?ref_=bo_cso_table_2,2,Avatar,"$2,790,439,092","$760,507,625",27.2%,"$2,029,931,467",72.8%,2009
120338,/title/tt0120338/?ref_=bo_cso_table_3,3,Titanic,"$2,195,169,869","$659,363,944",30%,"$1,535,805,925",70%,1997
2488496,/title/tt2488496/?ref_=bo_cso_table_4,4,Star Wars: Episode VII - The Force Awakens,"$2,068,224,036","$936,662,225",45.3%,"$1,131,561,811",54.7%,2015
4154756,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%,2018
369610,/title/tt0369610/?ref_=bo_cso_table_6,6,Jurassic World,"$1,670,401,444","$652,270,625",39%,"$1,018,130,819",61%,2015
6105098,/title/tt6105098/?ref_=bo_cso_table_7,7,The Lion King,"$1,656,963,790","$543,638,043",32.8%,"$1,113,325,747",67.2%,2019
848228,/title/tt0848228/?ref_=bo_cso_table_8,8,The Avengers,"$1,518,815,515","$623,357,910",41%,"$895,457,605",59%,2012
2820852,/title/tt2820852/?ref_=bo_cso_table_9,9,Furious 7,"$1,515,048,151","$353,007,020",23.3%,"$1,162,041,131",76.7%,2015
4520988,/title/tt4520988/?ref_=bo_cso_table_10,10,Frozen II,"$1,450,026,933","$477,373,578",32.9%,"$972,653,355",67.1%,2019


In [11]:
def Compile_List(pages):
    """
    Scrape several pages of Box Office Mojo's worldwide "Top Lifetime Grosses"
    chart and stack them into a single DataFrame.

    Parameters
    ----------
    pages : iterable
        Offset values substituted into the chart URL's ``offset`` query
        parameter, one per page to fetch (e.g. ``[0, 200, 400]``).

    Returns
    -------
    pandas.DataFrame
        The frames returned by ``Get_Top_List`` for each offset, concatenated
        in the order the offsets were given. Each frame's own index is kept.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``pages`` is empty.
    """
    url = "https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset={}"
    # Collect every per-page frame first, then concatenate once.
    dfs = [Get_Top_List(url.format(page)) for page in pages]
    return pd.concat(dfs)

    

In [12]:
# Chart offsets to scrape: 0, 200, 400, 600, 800, 1000.
pages = list(range(0, 1200, 200))

top_movies_df = Compile_List(pages)

In [16]:
# Size of the combined table as (rows, columns).
top_movies_df.shape

(1200, 9)

In [17]:
# Flag rows that repeat an earlier row in every column (i.e. exact duplicates).
top_movies_df.duplicated()

4154796    False
0499549    False
0120338    False
2488496    False
4154756    False
           ...  
0780653    False
0890870    False
4701702    False
0889583    False
0099141    False
Length: 1200, dtype: bool

In [18]:
# Inspect the last rows of the combined table.
top_movies_df.tail()

Unnamed: 0,URL,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
780653,/title/tt0780653/?ref_=bo_cso_table_196,1215,The Wolfman,"$139,789,765","$61,979,680",44.3%,"$77,810,085",55.7%,2010
890870,/title/tt0890870/?ref_=bo_cso_table_197,1216,Saw IV,"$139,352,633","$63,300,095",45.4%,"$76,052,538",54.6%,2007
4701702,/title/tt4701702/?ref_=bo_cso_table_198,1217,Mr. Six,"$139,191,345","$1,415,450",1%,"$137,775,895",99%,2015
889583,/title/tt0889583/?ref_=bo_cso_table_199,1218,Brüno,"$138,805,831","$60,054,530",43.3%,"$78,751,301",56.7%,2009
99141,/title/tt0099141/?ref_=bo_cso_table_200,1219,Bird on a Wire,"$138,697,012","$70,978,012",51.2%,"$67,719,000",48.8%,1990


In [23]:
# Check for repeated titles. duplicated(['Title']) flags the second and later
# occurrence of each title in row order, so the rows shown are films that share
# a title with one appearing earlier in the table (originals and remakes of
# each other), each with its own URL/id.
top_movies_df[top_movies_df.duplicated(['Title'])]

Unnamed: 0,URL,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
110357,/title/tt0110357/?ref_=bo_cso_table_51,51,The Lion King,"$968,511,805","$422,783,777",43.6%,"$545,728,028",56.4%,1994
103639,/title/tt0103639/?ref_=bo_cso_table_12,212,Aladdin,"$504,050,219","$217,350,219",43.1%,"$286,700,000",56.9%,1992
101414,/title/tt0101414/?ref_=bo_cso_table_69,269,Beauty and the Beast,"$440,118,382","$218,967,620",49.8%,"$221,150,762",50.2%,1991
2345759,/title/tt2345759/?ref_=bo_cso_table_98,298,The Mummy,"$409,231,607","$80,227,895",19.6%,"$329,003,712",80.4%,2017
120685,/title/tt0120685/?ref_=bo_cso_table_131,331,Godzilla,"$379,014,294","$136,314,294",36%,"$242,700,000",64%,1998
1267297,/title/tt1267297/?ref_=bo_cso_table_36,641,Hercules,"$244,819,862","$72,688,614",29.7%,"$172,131,248",70.3%,2014
1289401,/title/tt1289401/?ref_=bo_cso_table_85,690,Ghostbusters,"$229,147,509","$128,350,574",56%,"$100,796,935",44%,2016
61852,/title/tt0061852/?ref_=bo_cso_table_172,778,The Jungle Book,"$210,310,084","$141,843,612",67.4%,"$68,466,472",32.6%,1967
100758,/title/tt0100758/?ref_=bo_cso_table_7,814,Teenage Mutant Ninja Turtles,"$201,965,915","$135,265,915",67%,"$66,700,000",33%,1990
1386703,/title/tt1386703/?ref_=bo_cso_table_22,829,Total Recall,"$198,467,168","$58,877,969",29.7%,"$139,589,199",70.3%,2012


In [19]:
import pickle

In [20]:
# Save the combined table to 'top_movies_df.pickle' (relative path, so it lands
# in the current working directory). DataFrame.to_pickle is a pandas method and
# does not rely on the separate `import pickle`.
top_movies_df.to_pickle('top_movies_df.pickle')