In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def Get_Titles(urls):
    """
    Concatenates scraped titles of The Numbers' "All Time Domestic Sequel Box Office " table.
    Input: list of partial URLs referring to pages of table. 
    For our purposes, we only need the first 3 pages as they're already sorted by top domestic gross.
    Output: concatenated list of title column of table across all inputed pages. 
    """ 
    sequels = []
    
    page_1 = 'https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/sequel/{}'
    
    for url in (urls):
        url = page_1.format(url)
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page)
        table = soup.find('table')
        rows = [row for row in table.find_all('tr')]
        for row in rows[1:]:
            items = row.find_all('td')
            link = items[2].find('a')
            title= link.text
            sequels.append(title)
    return sequels
    
    

In [3]:
urls = [0, 101, 201, 301, 401, 501, 601, 701, 801]
sequels = Get_Titles(urls)

In [4]:
len(sequels)

900

In [5]:
sequels

['Star Wars Ep. VII: The Force Awakens',
 'Avengers: Endgame',
 'Avengers: Infinity War',
 'Jurassic World',
 'Star Wars Ep. VIII: The Last Jedi',
 'Incredibles 2',
 'The Dark Knight',
 'Star Wars: The Rise of Skywalker',
 'Finding Dory',
 'Frozen II',
 'Star Wars Ep. I: The Phantom Menace',
 'Avengers: Age of Ultron',
 'The Dark Knight Rises',
 'Shrek 2',
 'Toy Story 4',
 'The Hunger Games: Catching Fire',
 'Pirates of the Caribbean: Dead Manâ\x80\x99s Chest',
 'Jurassic World: Fallen Kingdom',
 'Toy Story 3',
 'Iron Man 3',
 'Captain America: Civil War',
 'Jumanji: Welcome to the Jungle',
 'Transformers: Revenge of the Fallen',
 'Spider-Man: Far From Home',
 'Guardians of the Galaxy Vol 2',
 'Harry Potter and the Deathly Hallows: Part II',
 'Star Wars Ep. III: Revenge of the Sith',
 'The Lord of the Rings: The Return of the King',
 'Spider-Man 2',
 'Despicable Me 2',
 'Furious 7',
 'Transformers: Dark of the Moon',
 'The Lord of the Rings: The Two Towers',
 'The Hunger Games: Mocking

In [8]:
clean_df = pd.read_pickle('clean_df_with_adaptations.pickle')

In [9]:
clean_df.head()

Unnamed: 0,id,Title,World_Gross,Domestic_Gross,Foreign_Gross,Year,domestic_opening_gross,rating,foreign_market_count,genres,distributor,director,actor_1,actor_2,Years_Since_Release,Is_Adaptation
269,803096,Warcraft,"$439,048,914","$47,365,290","$391,683,624",2016,"$24,166,110",PG-13,66,Action\n \n Adventure\n \n ...,Universal Pictures,Duncan Jones,Travis Fimmel,Paula Patton,4,0
762,1473832,Bridget Jones's Baby,"$211,952,420","$24,252,420","$187,700,000",2016,"$8,571,785",R,61,Comedy\n \n Drama\n \n Rom...,Universal Pictures,Sharon Maguire,Renée Zellweger,Gemma Jones,4,0
403,1293847,xXx: Return of Xander Cage,"$346,118,277","$44,898,413","$301,219,864",2017,"$20,130,142",PG-13,55,Action\n \n Adventure\n \n ...,Paramount Pictures,D.J. Caruso,Vin Diesel,Donnie Yen,3,0
420,2034800,The Great Wall,"$334,933,831","$45,540,830","$289,393,001",2016,"$18,469,620",PG-13,64,Action\n \n Adventure\n \n ...,Universal Pictures,Yimou Zhang,Matt Damon,Tian Jing,4,0
674,453451,Mr. Bean's Holiday,"$232,225,908","$33,302,167","$198,923,741",2007,"$9,889,780",G,68,Comedy\n \n Family,Universal Pictures,Steve Bendelack,Rowan Atkinson,Willem Dafoe,13,0


In [11]:
movies = list(clean_df['Title'])
len(movies)

848

In [12]:
#create new list with 0 being not a sequel and 1 being a sequel, by checking my main df movie titles against
#sequel list on The Numbers and sequel naming conventions
is_sequel = []
for movie in movies:
    #check to see if movie titles are in the list of sequels
    if movie in sequels:
        is_sequel.append(1)
    #check to see if movie title has colon--signifying a sequel/franchise title
    elif ":" in movie:
        is_sequel.append(1)
    else:
        is_sequel.append(0)
      

In [15]:
len(is_sequel)

848

In [16]:
clean_df['Is_Sequel'] = is_sequel

In [17]:
clean_df.head()

Unnamed: 0,id,Title,World_Gross,Domestic_Gross,Foreign_Gross,Year,domestic_opening_gross,rating,foreign_market_count,genres,distributor,director,actor_1,actor_2,Years_Since_Release,Is_Adaptation,Is_Sequel
269,803096,Warcraft,"$439,048,914","$47,365,290","$391,683,624",2016,"$24,166,110",PG-13,66,Action\n \n Adventure\n \n ...,Universal Pictures,Duncan Jones,Travis Fimmel,Paula Patton,4,0,0
762,1473832,Bridget Jones's Baby,"$211,952,420","$24,252,420","$187,700,000",2016,"$8,571,785",R,61,Comedy\n \n Drama\n \n Rom...,Universal Pictures,Sharon Maguire,Renée Zellweger,Gemma Jones,4,0,0
403,1293847,xXx: Return of Xander Cage,"$346,118,277","$44,898,413","$301,219,864",2017,"$20,130,142",PG-13,55,Action\n \n Adventure\n \n ...,Paramount Pictures,D.J. Caruso,Vin Diesel,Donnie Yen,3,0,1
420,2034800,The Great Wall,"$334,933,831","$45,540,830","$289,393,001",2016,"$18,469,620",PG-13,64,Action\n \n Adventure\n \n ...,Universal Pictures,Yimou Zhang,Matt Damon,Tian Jing,4,0,0
674,453451,Mr. Bean's Holiday,"$232,225,908","$33,302,167","$198,923,741",2007,"$9,889,780",G,68,Comedy\n \n Family,Universal Pictures,Steve Bendelack,Rowan Atkinson,Willem Dafoe,13,0,0


In [19]:
clean_df[clean_df['Is_Sequel'] ==1]

Unnamed: 0,id,Title,World_Gross,Domestic_Gross,Foreign_Gross,Year,domestic_opening_gross,rating,foreign_market_count,genres,distributor,director,actor_1,actor_2,Years_Since_Release,Is_Adaptation,Is_Sequel
403,1293847,xXx: Return of Xander Cage,"$346,118,277","$44,898,413","$301,219,864",2017,"$20,130,142",PG-13,55,Action\n \n Adventure\n \n ...,Paramount Pictures,D.J. Caruso,Vin Diesel,Donnie Yen,3,0,1
569,317198,Bridget Jones: The Edge of Reason,"$265,126,918","$40,226,215","$224,900,703",2004,"$8,684,055",R,60,Comedy\n \n Drama\n \n Rom...,Universal Pictures,Beeban Kidron,Renée Zellweger,Colin Firth,16,0,1
717,3062096,Inferno,"$220,021,259","$34,343,574","$185,677,685",2016,"$14,860,425",PG-13,63,Action\n \n Adventure\n \n ...,Sony Pictures Entertainment (SPE),Ron Howard,Tom Hanks,Felicity Jones,4,1,1
298,3416828,Ice Age: Collision Course,"$408,579,038","$64,063,008","$344,516,030",2016,"$21,373,064",PG,53,Adventure\n \n Animation\n \n ...,Twentieth Century Fox,Mike Thurmeier,Ray Romano,Denis Leary,4,0,1
649,1855325,Resident Evil: Retribution,"$240,159,255","$42,345,531","$197,813,724",2012,"$21,052,227",R,52,Action\n \n Horror\n \n Sc...,Screen Gems,Paul W.S. Anderson,Milla Jovovich,Sienna Guillory,8,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,295178,Austin Powers in Goldmember,"$296,938,801","$213,307,889","$83,630,912",2002,"$73,071,188",PG-13,52,Action\n \n Adventure\n \n ...,New Line Cinema,Jay Roach,Mike Myers,Beyoncé,18,0,1
956,1229340,Anchorman 2: The Legend Continues,"$173,649,931","$127,352,707","$46,297,224",2013,"$26,232,425",PG-13,19,Comedy,Paramount Pictures,Adam McKay,Will Ferrell,Christina Applegate,7,0,1
887,1114740,Paul Blart: Mall Cop,"$183,348,429","$146,336,178","$37,012,251",2009,"$31,832,636",PG,47,Action\n \n Comedy\n \n Cr...,Sony Pictures Entertainment (SPE),Steve Carr,Kevin James,Keir O'Donnell,11,0,1
460,2592614,Resident Evil: The Final Chapter,"$312,242,626","$26,830,068","$285,412,558",2016,"$13,601,682",R,58,Action\n \n Horror\n \n Sc...,Screen Gems,Paul W.S. Anderson,Milla Jovovich,Iain Glen,4,0,1


In [21]:
clean_df.to_pickle('clean_df_with_adaptations_sequels.pickle')