In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [32]:
def Get_Top_List(url):
    """
    Converts IMDB table into pandas dataframe.
    Input: url link from IMDB table page for "Top Lifetime Grosses"
    Output: formatted dataframe of table
    """
    
    response = requests.get(url)

    page = response.text

    soup = BeautifulSoup(page)

    table = soup.find(id='table')
    
    rows = [row for row in table.find_all('tr')]

    movies = {}

    for row in rows[1:]:
        items = row.find_all('td')
        link = items[1].find('a')
        title, url = link.text, link['href']
        movies[title] = [url] + [i.text for i in items]
     
    movie_page = pd.DataFrame(movies).T  #transpose
    movie_page.columns = ['Link', 'Rank', 'Title', 'World_Gross', 
                    'Domestic_Gross', 'Domestic_Percentage', 'Foreign_Gross', 'Foreign_Percentage','Year']

    return movie_page

In [33]:
Get_Top_List('https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/')

Unnamed: 0,Link,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
Avengers: Endgame,/title/tt4154796/?ref_=bo_cso_table_1,1,Avengers: Endgame,"$2,797,800,564","$858,373,000",30.7%,"$1,939,427,564",69.3%,2019
Avatar,/title/tt0499549/?ref_=bo_cso_table_2,2,Avatar,"$2,790,439,092","$760,507,625",27.2%,"$2,029,931,467",72.8%,2009
Titanic,/title/tt0120338/?ref_=bo_cso_table_3,3,Titanic,"$2,195,169,869","$659,363,944",30%,"$1,535,805,925",70%,1997
Star Wars: Episode VII - The Force Awakens,/title/tt2488496/?ref_=bo_cso_table_4,4,Star Wars: Episode VII - The Force Awakens,"$2,068,224,036","$936,662,225",45.3%,"$1,131,561,811",54.7%,2015
Avengers: Infinity War,/title/tt4154756/?ref_=bo_cso_table_5,5,Avengers: Infinity War,"$2,048,359,754","$678,815,482",33.1%,"$1,369,544,272",66.9%,2018
...,...,...,...,...,...,...,...,...,...
The Revenant,/title/tt1663202/?ref_=bo_cso_table_196,196,The Revenant,"$532,950,503","$183,637,894",34.5%,"$349,312,609",65.5%,2015
The Meg,/title/tt4779682/?ref_=bo_cso_table_197,197,The Meg,"$530,259,473","$145,443,742",27.4%,"$384,815,731",72.6%,2016
Ralph Breaks the Internet,/title/tt5848272/?ref_=bo_cso_table_198,198,Ralph Breaks the Internet,"$529,323,962","$201,091,711",38%,"$328,232,251",62%,2018
Hotel Transylvania 3: Summer Vacation,/title/tt5220122/?ref_=bo_cso_table_199,199,Hotel Transylvania 3: Summer Vacation,"$528,583,774","$167,510,016",31.7%,"$361,073,758",68.3%,2018


In [109]:
def Compile_List(pages):
    url = "https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset={}"
    dfs = []
    for page in pages:
        page_url = url.format(page)
        page_df = Get_Top_List(url) #should be a list of the dataframes
        dfs.append(page_df) 
    return pd.concat(dfs)

    

In [111]:
pages = [200, 400, 600, 800, 1000]

top_movies_df = Compile_List(pages)

In [113]:
top_movies_df.tail()

Unnamed: 0,Link,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
The Revenant,/title/tt1663202/?ref_=bo_cso_table_196,196,The Revenant,"$532,950,503","$183,637,894",34.5%,"$349,312,609",65.5%,2015
The Meg,/title/tt4779682/?ref_=bo_cso_table_197,197,The Meg,"$530,259,473","$145,443,742",27.4%,"$384,815,731",72.6%,2016
Ralph Breaks the Internet,/title/tt5848272/?ref_=bo_cso_table_198,198,Ralph Breaks the Internet,"$529,323,962","$201,091,711",38%,"$328,232,251",62%,2018
Hotel Transylvania 3: Summer Vacation,/title/tt5220122/?ref_=bo_cso_table_199,199,Hotel Transylvania 3: Summer Vacation,"$528,583,774","$167,510,016",31.7%,"$361,073,758",68.3%,2018
The Boss Baby,/title/tt3874544/?ref_=bo_cso_table_200,200,The Boss Baby,"$527,965,936","$175,003,033",33.2%,"$352,962,903",66.8%,2017


In [114]:
top_movies_df.duplicated()

Avengers: Endgame                             False
Avatar                                        False
Titanic                                       False
Star Wars: Episode VII - The Force Awakens    False
Avengers: Infinity War                        False
                                              ...  
The Revenant                                   True
The Meg                                        True
Ralph Breaks the Internet                      True
Hotel Transylvania 3: Summer Vacation          True
The Boss Baby                                  True
Length: 995, dtype: bool

In [65]:
page_2 = Get_Top_List('https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=200')

In [67]:
page_2.tail()

Unnamed: 0,Link,Rank,Title,World_Gross,Domestic_Gross,Domestic_Percentage,Foreign_Gross,Foreign_Percentage,Year
Mary Poppins Returns,/title/tt5028340/?ref_=bo_cso_table_196,397,Mary Poppins Returns,"$349,545,966","$171,958,438",49.2%,"$177,587,528",50.8%,2018
Deep Impact,/title/tt0120647/?ref_=bo_cso_table_197,398,Deep Impact,"$349,464,664","$140,464,664",40.2%,"$209,000,000",59.8%,1998
The Lorax,/title/tt1482459/?ref_=bo_cso_table_198,399,The Lorax,"$348,840,316","$214,030,500",61.4%,"$134,809,816",38.6%,2012
The Maze Runner,/title/tt1790864/?ref_=bo_cso_table_199,400,The Maze Runner,"$348,319,861","$102,427,862",29.4%,"$245,891,999",70.6%,2014
The Smurfs 2,/title/tt2017020/?ref_=bo_cso_table_200,401,The Smurfs 2,"$347,545,360","$71,017,784",20.4%,"$276,527,576",79.6%,2013


In [68]:
def url_filler(pages):
    urls = []
    url = "https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset={}"
    for page in pages:
        page_url = url.format(page)
        urls.append(page_url)
    return urls

In [69]:
pages = [0, 200, 400, 600, 800, 1000]

url_filler(pages)

['https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=0',
 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=200',
 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=400',
 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=600',
 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=800',
 'https://www.boxofficemojo.com/chart/ww_top_lifetime_gross/?offset=1000']