In [0]:
%%capture
!pip install wikipedia -q
import wikipedia
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "List_of_film_sequels_by_box-office_improvement",
    "prop": "links",
    "pllimit": 500
}

# Collect the title of every link on the list page.
# The API returns at most `pllimit` links per response and signals that more
# are available with a top-level "continue" object; its entries have to be
# merged into the next request, otherwise the remaining links are lost.
sequels = []
request_params = dict(PARAMS)
while True:
    R = S.get(url=URL, params=request_params, timeout=30)
    R.raise_for_status()  # fail loudly instead of parsing an error page
    DATA = R.json()

    PAGES = DATA["query"]["pages"]
    for page_info in PAGES.values():
        # A missing page (or a batch without links) has no "links" key.
        for link in page_info.get("links", []):
            sequels.append(link["title"])

    if "continue" not in DATA:
        break
    request_params = {**PARAMS, **DATA["continue"]}

In [0]:
# Build two index-aligned lists: for every linked page whose summary text
# mentions the title of another linked page, record the mentioned title in
# first_movie_list and the page being read in sequel_movie_list.
first_movie_list = []
sequel_movie_list = []
skipped_titles = []  # titles whose page could not be fetched
for sequel in sequels:
    try:
        # auto_suggest=False: the titles are exact link targets, so the
        # library's search suggestion must not replace them with another page.
        summary = wikipedia.page(sequel, auto_suggest=False).summary
    except wikipedia.exceptions.WikipediaException:
        # Missing or ambiguous pages (and request timeouts) should not abort
        # the whole crawl; remember them and move on.
        skipped_titles.append(sequel)
        continue
    for first_movie in sequels:
        if first_movie in summary and first_movie != sequel:
            first_movie_list.append(first_movie)
            sequel_movie_list.append(sequel)

if skipped_titles:
    print(f"Skipped {len(skipped_titles)} of {len(sequels)} titles that could not be loaded")

In [0]:
%%capture
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [0]:
# Options shared by the three IMDb files: tab-separated, with the literal "\N"
# as the only missing-value marker (pandas' default NA strings are disabled so
# that ordinary text is not turned into NaN).
# quoting=3 is csv.QUOTE_NONE: the files are plain TSV, so a double quote inside
# a title is data. With the default quoting it would be read as the start of a
# quoted field and could swallow the columns/rows that follow it.
IMDB_READ_OPTIONS = dict(sep='\t', na_values="\\N", keep_default_na=False, quoting=3)

# One row per (title, credited person); `ordering` stays a string because it is
# later concatenated onto the role prefix.
cast = pd.read_csv('title.principals.tsv',
                   usecols=['tconst', 'ordering', 'nconst', 'category'],
                   dtype={'tconst': str, 'ordering': str, 'nconst': str, 'category': str},
                   **IMDB_READ_OPTIONS)
# One row per title; startYear uses the nullable integer dtype so missing years
# do not force the column to float.
titles = pd.read_csv('title.basics.tsv',
                     usecols=['tconst', 'primaryTitle', 'startYear', 'titleType', 'genres'],
                     dtype={'tconst': str, 'primaryTitle': str, 'startYear': 'Int64',
                            'titleType': str, 'genres': str},
                     **IMDB_READ_OPTIONS)
# One row per person.
names = pd.read_csv('name.basics.tsv',
                    usecols=['nconst', 'primaryName'],
                    dtype={'nconst': str, 'primaryName': str},
                    **IMDB_READ_OPTIONS)

In [0]:
def remove_film_year(sequel):
    """Strip a trailing parenthetical suffix such as " (2018 film)" from a title.

    Only a parenthetical at the very end of the title is removed, so titles
    that merely contain parentheses elsewhere (e.g. "(500) Days of Summer")
    are returned unchanged.

    Args:
        sequel: Page title as a string.

    Returns:
        The title without its trailing "(...)" suffix and the whitespace
        before it, or the original string when there is no such suffix or
        when nothing would be left after removing it.
    """
    title = sequel.rstrip()
    if title.endswith(")"):
        start = title.rfind("(")
        # start == 0 would strip the whole title; start == -1 means no "(".
        if start > 0:
            return title[:start].rstrip()
    return sequel

In [0]:
# Run every title in both lists through remove_film_year; element order is
# preserved, so the two lists stay paired position by position.
first_movie_list = list(map(remove_film_year, first_movie_list))
sequel_movie_list = list(map(remove_film_year, sequel_movie_list))

In [10]:
# Two-column frame with one row per (first_movie, sequel) pair.
pair_columns = {
    "first_movie": first_movie_list,
    "sequel": sequel_movie_list,
}
sequel_pairs = pd.DataFrame(pair_columns)
sequel_pairs.head()

Unnamed: 0,first_movie,sequel
0,22 Jump Street,21 Jump Street
1,For a Few Dollars More,A Fistful of Dollars
2,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge
3,A Nightmare on Elm Street,A Nightmare on Elm Street 3: Dream Warriors
4,A Nightmare on Elm Street 4: The Dream Master,A Nightmare on Elm Street 3: Dream Warriors


In [11]:
# Keep only titles of type 'movie' whose primaryTitle appears in either list,
# then attach each credited person (via tconst) and that person's name
# (via nconst). The id columns are dropped once the joins are done.
candidate_titles = first_movie_list + sequel_movie_list
is_candidate_movie = (titles.titleType == 'movie') & titles.primaryTitle.isin(candidate_titles)
movie_titles = (
    titles[is_candidate_movie]
    .merge(cast, on='tconst')
    .merge(names, on='nconst')
    .drop(columns=['titleType', 'tconst', 'nconst'])
)
movie_titles.head()

Unnamed: 0,primaryTitle,startYear,genres,ordering,category,primaryName
0,Before Midnight,1925,Drama,1,actor,William Russell
1,Before Midnight,1925,Drama,2,actress,Barbara Bedford
2,Before Midnight,1925,Drama,3,actor,Brinsley Shaw
3,Before Midnight,1925,Drama,4,actor,Alan Roscoe
4,Before Midnight,1925,Drama,5,director,John G. Adolfi


In [21]:
# Short prefixes used to build the role labels ('act1', 'dir5', ...).
ROLE_PREFIX = {'actor': 'act', 'actress': 'act', 'director': 'dir'}

# Keep only acting and directing credits.
movie_information = movie_titles[movie_titles.category.isin(list(ROLE_PREFIX))].copy()
print(movie_information.shape)

# Recode the category column only. A frame-wide replace would also rewrite any
# title or person name that happens to equal 'actor', 'actress' or 'director'.
movie_information['category'] = movie_information['category'].map(ROLE_PREFIX)
# `ordering` is a string column, so this concatenates to labels like 'act1'.
movie_information['role'] = movie_information.category + movie_information.ordering

# One row per (title, year, genres) with one column per role label; when the
# same label occurs more than once in a group, the first name is kept.
movie_summary = pd.pivot_table(movie_information, columns = 'role', values = 'primaryName',
                               index = ['primaryTitle','startYear','genres'], aggfunc = 'first')
# Only these five labels are kept: actors credited at positions 1-4 and a
# director credited at position 5. Other positions are discarded.
movie_summary = movie_summary[['act1','act2','act3','act4','dir5']].reset_index()
movie_summary.head()

(1112, 6)


role,primaryTitle,startYear,genres,act1,act2,act3,act4,dir5
0,21 Jump Street,2012,"Action,Comedy,Crime",Jonah Hill,Channing Tatum,Ice Cube,Brie Larson,Phil Lord
1,22 Jump Street,2014,"Action,Comedy,Crime",Channing Tatum,Jonah Hill,Ice Cube,Nick Offerman,Phil Lord
2,A Fistful of Dollars,1964,"Drama,Western",Clint Eastwood,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,Sergio Leone
3,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven
4,A Nightmare on Elm Street,2010,"Crime,Drama,Horror",Jackie Earle Haley,Rooney Mara,Kyle Gallner,Katie Cassidy,Samuel Bayer


In [0]:
# Split movie_summary into the rows whose title appears as a first movie and
# the rows whose title appears as a sequel (a title can be in both), and
# prefix every column name so the two sets do not collide when joined.
is_first_movie = movie_summary.primaryTitle.isin(first_movie_list)
is_sequel = movie_summary.primaryTitle.isin(sequel_movie_list)

first_movie_titles = movie_summary.loc[is_first_movie]
first_movie_titles.columns = 'fm_' + first_movie_titles.columns

sequel_movie_titles = movie_summary.loc[is_sequel]
sequel_movie_titles.columns = 's_' + sequel_movie_titles.columns

In [23]:
# Preview the first rows of the 'fm_'-prefixed frame.
first_movie_titles.head()

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5
1,22 Jump Street,2014,"Action,Comedy,Crime",Channing Tatum,Jonah Hill,Ice Cube,Nick Offerman,Phil Lord
2,A Fistful of Dollars,1964,"Drama,Western",Clint Eastwood,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,Sergio Leone
3,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven
4,A Nightmare on Elm Street,2010,"Crime,Drama,Horror",Jackie Earle Haley,Rooney Mara,Kyle Gallner,Katie Cassidy,Samuel Bayer
6,A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell


In [29]:
# Step 1: attach the (first_movie, sequel) pairs to the first-movie rows.
first_with_pairs = first_movie_titles.merge(
    sequel_pairs, left_on = 'fm_primaryTitle', right_on = 'first_movie', how = 'inner')
# Step 2: attach the sequel's own row to each pair.
paired_movies = first_with_pairs.merge(
    sequel_movie_titles, left_on = 'sequel', right_on = 's_primaryTitle', how = 'inner')
# Keep only pairs where the first movie's year is not later than the sequel's.
released_in_order = paired_movies.fm_startYear <= paired_movies.s_startYear
sequel_table = paired_movies[released_in_order].copy()
sequel_table.head()

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,first_movie,sequel,s_primaryTitle,s_startYear,s_genres,s_act1,s_act2,s_act3,s_act4,s_dir5
1,A Fistful of Dollars,1964,"Drama,Western",Clint Eastwood,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,Sergio Leone,A Fistful of Dollars,For a Few Dollars More,For a Few Dollars More,1965,Western,Clint Eastwood,Lee Van Cleef,Gian Maria Volontè,Mario Brega,Sergio Leone
2,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge,A Nightmare on Elm Street 2: Freddy's Revenge,1985,Horror,Robert Englund,Mark Patton,Kim Myers,Robert Rusler,Jack Sholder
4,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell
7,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street,A Nightmare on Elm Street 4: The Dream Master,A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin
9,A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 4: The Dream Master,A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin


In [30]:
# (rows, columns) of the filtered pair table.
print(sequel_table.shape)

(113, 18)


In [0]:
# Hand-picked titles that are looked up in the IMDb titles table below.
non_sequel_movies = [
    'The Searchers',
    'Chisum',
    'She Wore a Yellow Ribbon',
    'There Will Be Blood',
    'Phantom Thread',
    'Little Big Man',
    'Bonnie and Clyde',
    'Out of Africa',
    'Silkwood',
    'Florence Foster Jenkins',
    'The Shawshank Redemption',
    'Escape from Alcatraz',
    'Little Big Horn',
    'American Beauty',
    'An American in Paris',
]

In [0]:
# Rows of the IMDb titles table that are movies and carry one of the
# hand-picked titles (matching is by exact primaryTitle).
is_movie = titles.titleType == 'movie'
is_non_sequel = titles.primaryTitle.isin(non_sequel_movies)
non_sequel_titles = titles[is_movie & is_non_sequel]

In [53]:
# Display the matched rows.
non_sequel_titles

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres
41152,tt0041866,movie,She Wore a Yellow Ribbon,1949,Western
42536,tt0043278,movie,An American in Paris,1951,"Drama,Musical,Romance"
43000,tt0043747,movie,Little Big Horn,1951,Western
48860,tt0049730,movie,The Searchers,1956,"Adventure,Drama,Western"
60255,tt0061418,movie,Bonnie and Clyde,1967,"Action,Biography,Crime"
64264,tt0065547,movie,Chisum,1970,"Biography,Western"
64695,tt0065988,movie,Little Big Man,1970,"Adventure,Comedy,Drama"
77474,tt0079116,movie,Escape from Alcatraz,1979,"Biography,Crime,Drama"
84464,tt0086312,movie,Silkwood,1983,"Biography,Drama,History"
87820,tt0089755,movie,Out of Africa,1985,"Biography,Drama,Romance"


In [0]:
# Movie credits for one named person: filter the titles table to movies and
# the names table to the person, then join person -> credits -> movie titles.
titles1 = titles.loc[titles.titleType == 'movie']
names1 = names.loc[names.primaryName.isin(['Meryl Streep'])]
cast1 = names1.merge(cast, on = 'nconst')
titles2 = cast1.merge(titles1, on = 'tconst')

In [46]:
# Display the joined credits table.
titles2

Unnamed: 0,nconst,primaryName,tconst,ordering,category,titleType,primaryTitle,startYear,genres
0,nm0000658,Meryl Streep,tt0079417,2,actress,movie,Kramer vs. Kramer,1979,Drama
1,nm0000658,Meryl Streep,tt0079875,3,actress,movie,The Seduction of Joe Tynan,1979,Drama
2,nm0000658,Meryl Streep,tt0082416,1,actress,movie,The French Lieutenant's Woman,1981,"Drama,Romance"
3,nm0000658,Meryl Streep,tt0084707,1,actress,movie,Sophie's Choice,1982,"Drama,Romance"
4,nm0000658,Meryl Streep,tt0084732,2,actress,movie,Still of the Night,1982,"Crime,Drama,Mystery"
5,nm0000658,Meryl Streep,tt0086312,1,actress,movie,Silkwood,1983,"Biography,Drama,History"
6,nm0000658,Meryl Streep,tt0087233,2,actress,movie,Falling in Love,1984,"Drama,Romance"
7,nm0000658,Meryl Streep,tt0089755,1,actress,movie,Out of Africa,1985,"Biography,Drama,Romance"
8,nm0000658,Meryl Streep,tt0089816,1,actress,movie,Plenty,1985,Drama
9,nm0000658,Meryl Streep,tt0091188,1,actress,movie,Heartburn,1986,"Comedy,Drama"


In [33]:
# Number of unordered pairs among n items, n * (n - 1) / 2.
# NOTE(review): 15 presumably is the number of entries in non_sequel_movies; confirm.
n = 15
n * (n - 1) / 2

105.0