In [0]:
%%capture
!pip install wikipedia -q
import wikipedia
import requests

In [0]:
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "List_of_film_sequels_by_box-office_improvement",
    "prop": "links",
    "pllimit": 500
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

PAGES = DATA["query"]["pages"]

sequels = []
for k, v in PAGES.items():
    for l in v["links"]:
        sequels.append(l["title"])

In [0]:
first_movie_list = []
sequel_movie_list = []
for sequel in sequels:
    page = wikipedia.page(sequel)
    summary = page.summary
    for first_movie in sequels:
        if first_movie in summary and first_movie != sequel:
            first_movie_list.append(first_movie)
            sequel_movie_list.append(sequel)

In [0]:
%%capture
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
cast = pd.read_csv('title.principals.tsv', sep='\t', na_values = "\\N", keep_default_na = False, 
                   usecols = ['tconst','ordering','nconst','category'],
                   dtype = {'tconst': str, 'ordering': 'Int64', 'nconst': str, 'category': str})
titles = pd.read_csv('title.basics.tsv', sep='\t', na_values = "\\N", keep_default_na = False,
                     usecols = ['tconst','primaryTitle','startYear','titleType','genres'],
                    dtype = {'tconst': str, 'primaryTitle': str, 'startYear': 'Int64','titleType':str, 'genres': str})
names = pd.read_csv('name.basics.tsv', sep='\t', na_values = "\\N", keep_default_na = False,
                    usecols = ['nconst','primaryName'], 
                    dtype = {'nconst': str, 'primaryName': str})

In [0]:
def remove_film_year(sequel):
  if "(" in sequel:
    i = sequel.index("(")
    return sequel[0:i-1]
  return sequel

In [0]:
first_movie_list = [remove_film_year(first_movie) for first_movie in first_movie_list]
sequel_movie_list = [remove_film_year(sequel) for sequel in sequel_movie_list]

In [9]:
sequel_pairs = pd.DataFrame({"first_movie" : first_movie_list, "sequel" : sequel_movie_list})
sequel_pairs.head()

Unnamed: 0,first_movie,sequel
0,22 Jump Street,21 Jump Street
1,For a Few Dollars More,A Fistful of Dollars
2,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge
3,A Nightmare on Elm Street,A Nightmare on Elm Street 3: Dream Warriors
4,A Nightmare on Elm Street 4: The Dream Master,A Nightmare on Elm Street 3: Dream Warriors


In [40]:
movie_titles = titles[titles.titleType == 'movie']
movie_titles = movie_titles[movie_titles.primaryTitle.isin(first_movie_list + sequel_movie_list)]
movie_titles = pd.merge(movie_titles, cast, on = 'tconst')
movie_titles = pd.merge(movie_titles, names, on = 'nconst')
movie_titles = movie_titles.drop(columns = ['titleType','tconst','nconst'])
movie_titles.head()

Unnamed: 0,primaryTitle,startYear,genres,ordering,category,primaryName
0,Before Midnight,1925,Drama,1,actor,William Russell
1,Before Midnight,1925,Drama,2,actress,Barbara Bedford
2,Before Midnight,1925,Drama,3,actor,Brinsley Shaw
3,Before Midnight,1925,Drama,4,actor,Alan Roscoe
4,Before Midnight,1925,Drama,5,director,John G. Adolfi


In [68]:
movie_titles = movie_titles[movie_titles.category.isin(['actor','actress','director'])]
print(movie_titles.shape)
movie_titles = movie_titles[~((movie_titles.category == 'director') & (movie_titles.ordering != 5))]
print(movie_titles.shape)
pd.pivot_table(movie_titles, columns = 'ordering', values = 'primaryName', index = ['primaryTitle','startYear','genres'], aggfunc = 'first')

(203, 6)
(203, 6)


Unnamed: 0_level_0,Unnamed: 1_level_0,ordering,5
primaryTitle,startYear,genres,Unnamed: 3_level_1
21 Jump Street,2012,"Action,Comedy,Crime",Phil Lord
22 Jump Street,2014,"Action,Comedy,Crime",Phil Lord
A Fistful of Dollars,1964,"Drama,Western",Sergio Leone
A Nightmare on Elm Street,1984,Horror,Wes Craven
A Nightmare on Elm Street,2010,"Crime,Drama,Horror",Samuel Bayer
A Nightmare on Elm Street 2: Freddy's Revenge,1985,Horror,Jack Sholder
A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Chuck Russell
A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Renny Harlin
Ace Ventura: Pet Detective,1994,Comedy,Tom Shadyac
Ace Ventura: When Nature Calls,1995,"Adventure,Comedy,Crime",Steve Oedekerk


In [0]:
first_movie_titles = movie_titles[movie_titles.primaryTitle.isin(first_movie_list)]
first_movie_titles.columns = 'fm_' + first_movie_titles.columns
sequel_movie_titles = movie_titles[movie_titles.primaryTitle.isin(sequel_movie_list)]
sequel_movie_titles.columns = 's_' + sequel_movie_titles.columns

In [28]:
first_movie_titles.head()

Unnamed: 0,fm_tconst,fm_titleType,fm_primaryTitle,fm_startYear,fm_genres
57371,tt0058461,movie,A Fistful of Dollars,1964,"Drama,Western"
58467,tt0059578,movie,For a Few Dollars More,1965,Western
65684,tt0066999,movie,Dirty Harry,1971,"Action,Crime,Thriller"
68950,tt0070355,movie,Magnum Force,1973,"Action,Crime,Mystery"
77852,tt0079501,movie,Mad Max,1979,"Action,Adventure,Sci-Fi"


In [29]:
pd.merge(first_movie_titles, sequel_pairs, left_on = 'fm_primaryTitle', right_on = 'first_movie', how = 'inner')

Unnamed: 0,fm_tconst,fm_titleType,fm_primaryTitle,fm_startYear,fm_genres,first_movie,sequel
0,tt0058461,movie,A Fistful of Dollars,1964,"Drama,Western",A Fistful of Dollars,For a Few Dollars More
1,tt0058461,movie,A Fistful of Dollars,1964,"Drama,Western",A Fistful of Dollars,"The Good, the Bad, and the Ugly"
2,tt0059578,movie,For a Few Dollars More,1965,Western,For a Few Dollars More,A Fistful of Dollars
3,tt0059578,movie,For a Few Dollars More,1965,Western,For a Few Dollars More,"The Good, the Bad, and the Ugly"
4,tt0066999,movie,Dirty Harry,1971,"Action,Crime,Thriller",Dirty Harry,Magnum Force
5,tt0066999,movie,Dirty Harry,1971,"Action,Crime,Thriller",Dirty Harry,Sudden Impact
6,tt0066999,movie,Dirty Harry,1971,"Action,Crime,Thriller",Dirty Harry,The Enforcer
7,tt0070355,movie,Magnum Force,1973,"Action,Crime,Mystery",Magnum Force,Dirty Harry
8,tt0079501,movie,Mad Max,1979,"Action,Adventure,Sci-Fi",Mad Max,Mad Max: Fury Road
9,tt0079501,movie,Mad Max,1979,"Action,Adventure,Sci-Fi",Mad Max,Mad Max 2


In [33]:
len(sequels)

324

In [0]:
sequels_titles = sequels_titles[~sequels_titles.primaryTitle.duplicated()]
sequels_titles.shape
sequels_titles_kept = list(sequels_titles.primaryTitle)

In [0]:
sequels_pairs_kept = {}
for movie in sequels_pairs:
  if movie in sequels_titles_kept and sequels_pairs[movie] in sequels_titles_kept:
    sequels_pairs_kept[movie] = sequels_pairs[movie]

In [60]:
sequels_pairs_kept.keys() + sequels_pairs_kept.values()

TypeError: ignored

In [0]:
sequels_titles = sequels_titles[sequels_titles.primaryTitle.isin(list(sequels_pairs_kept.keys()) + list(sequels_pairs_kept.values()))]

In [63]:
sequels_titles.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres
65684,tt0066999,movie,Dirty Harry,1971,"Action,Crime,Thriller"
68950,tt0070355,movie,Magnum Force,1973,"Action,Crime,Mystery"
77852,tt0079501,movie,Mad Max,1979,"Action,Adventure,Sci-Fi"
78158,tt0079817,movie,Rocky II,1979,"Drama,Sport"
80672,tt0082418,movie,Friday the 13th Part 2,1981,"Horror,Mystery,Thriller"
81879,tt0083658,movie,Blade Runner,1982,"Action,Sci-Fi,Thriller"
82123,tt0083907,movie,The Evil Dead,1981,Horror
82159,tt0083944,movie,First Blood,1982,"Action,Adventure"
82187,tt0083972,movie,Friday the 13th Part III,1982,"Horror,Thriller"
82797,tt0084602,movie,Rocky III,1982,"Drama,Sport"


In [66]:
sequels_pairs["A Fistful of Dollars"]

'The Good, the Bad, and the Ugly'