In [0]:
%%capture
!pip install wikipedia -q
import wikipedia
import requests

In [0]:
# Collect the title of every page linked from the Wikipedia article
# "List of film sequels by box-office improvement" via the MediaWiki
# Action API (action=query, prop=links).
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "List_of_film_sequels_by_box-office_improvement",
    "prop": "links",
    "pllimit": 500
}

sequels = []
continue_params = {}
while True:
    # One response carries at most `pllimit` links. When more remain, the API
    # returns a "continue" object whose fields must be sent back verbatim to
    # obtain the next batch; without this loop the list is silently truncated.
    R = S.get(url=URL, params={**PARAMS, **continue_params})
    R.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error body
    DATA = R.json()

    PAGES = DATA["query"]["pages"]

    for page in PAGES.values():
        # A page entry may lack the "links" key (e.g. when the title does not
        # resolve), so fall back to an empty list rather than raising KeyError.
        for link in page.get("links", []):
            sequels.append(link["title"])

    if "continue" not in DATA:
        break
    continue_params = DATA["continue"]

In [0]:
# For every linked title, load its Wikipedia article summary and record each
# *other* linked title that appears in that summary. The check is a plain
# substring test, so a pair is recorded for whichever article mentions the
# other title; the code itself does not decide which film came first.
first_movie_list = []
sequel_movie_list = []
skipped_titles = []  # titles whose article could not be loaded
for sequel in sequels:
    try:
        summary = wikipedia.page(sequel).summary
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError):
        # A single missing or ambiguous title should not abort the whole
        # (slow, network-bound) scan; remember it and move on.
        skipped_titles.append(sequel)
        continue
    for first_movie in sequels:
        if first_movie != sequel and first_movie in summary:
            first_movie_list.append(first_movie)
            sequel_movie_list.append(sequel)

In [0]:
%%capture
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
def _read_imdb_tsv(path, column_types):
    """Read one tab-separated IMDb dump, keeping only the listed columns.

    `column_types` maps each wanted column name to its dtype. Only the literal
    "\\N" marker is treated as missing; pandas' default NA strings are turned
    off, so values such as "NA" or "None" are kept as ordinary text.
    """
    return pd.read_csv(
        path,
        sep='\t',
        na_values="\\N",
        keep_default_na=False,
        usecols=list(column_types),
        dtype=column_types,
    )


# Who worked on which title, and in what billing position / job category.
cast = _read_imdb_tsv(
    'title.principals.tsv',
    {'tconst': str, 'ordering': str, 'nconst': str, 'category': str},
)
# Basic title metadata; startYear uses the nullable integer dtype.
titles = _read_imdb_tsv(
    'title.basics.tsv',
    {'tconst': str, 'primaryTitle': str, 'startYear': 'Int64', 'titleType': str, 'genres': str},
)
# Person identifier -> display name.
names = _read_imdb_tsv(
    'name.basics.tsv',
    {'nconst': str, 'primaryName': str},
)

In [0]:
def remove_film_year(sequel):
  """Strip a trailing parenthesised disambiguator from a Wikipedia title.

  "A Nightmare on Elm Street (2010 film)" -> "A Nightmare on Elm Street".

  Only a parenthetical at the very end of the title is removed, so titles
  that merely contain parentheses elsewhere (e.g. "(500) Days of Summer")
  are returned unchanged. Whitespace before the parenthetical is dropped
  whether or not it is present.

  Args:
    sequel: the page title as a string.

  Returns:
    The title without its trailing "(...)" suffix, or the original string
    when there is no such suffix.
  """
  title = sequel.rstrip()
  if title.endswith(")"):
    i = title.rfind("(")
    # i == 0 would mean the whole title is parenthesised; stripping it would
    # leave nothing, so such titles are left alone. i == -1 means no "(".
    if i > 0:
      return title[:i].rstrip()
  return sequel

In [0]:
# Normalise both title lists by passing every entry through remove_film_year.
first_movie_list = list(map(remove_film_year, first_movie_list))
sequel_movie_list = list(map(remove_film_year, sequel_movie_list))

In [9]:
# The two lists are parallel: entry i of each forms one matched pair of titles.
pair_columns = {
    "first_movie": first_movie_list,
    "sequel": sequel_movie_list,
}
sequel_pairs = pd.DataFrame(data=pair_columns)
sequel_pairs.head(n=5)

Unnamed: 0,first_movie,sequel
0,22 Jump Street,21 Jump Street
1,For a Few Dollars More,A Fistful of Dollars
2,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge
3,A Nightmare on Elm Street,A Nightmare on Elm Street 3: Dream Warriors
4,A Nightmare on Elm Street 4: The Dream Master,A Nightmare on Elm Street 3: Dream Warriors


In [16]:
# Restrict IMDb to feature films whose primary title matches one of the paired
# titles, then attach each film's credited people and their names. Matching is
# by title text only, so unrelated films sharing a title are included too.
movie_titles = (
    titles
    .loc[titles.titleType == 'movie']
    .loc[lambda frame: frame.primaryTitle.isin(first_movie_list + sequel_movie_list)]
    .merge(cast, on='tconst')
    .merge(names, on='nconst')
    .drop(columns=['titleType', 'tconst', 'nconst'])
)
movie_titles.head()

Unnamed: 0,primaryTitle,startYear,genres,ordering,category,primaryName
0,Before Midnight,1925,Drama,1,actor,William Russell
1,Before Midnight,1925,Drama,2,actress,Barbara Bedford
2,Before Midnight,1925,Drama,3,actor,Brinsley Shaw
3,Before Midnight,1925,Drama,4,actor,Alan Roscoe
4,Before Midnight,1925,Drama,5,director,John G. Adolfi


In [21]:
# Keep only acting and directing credits, then spread them into one row per
# film with a column per role slot ("act1", "act2", ..., "dir5", ...).
movie_information = movie_titles[movie_titles.category.isin(['actor','actress','director'])].copy()
print(movie_information.shape)
# Shorten the category labels. The replacement is limited to the `category`
# column: a frame-wide replace would also rewrite any title or person name
# that happens to be exactly "actor", "actress" or "director".
movie_information['category'] = movie_information['category'].replace(['actor','actress','director'], ['act','act','dir'])
# `ordering` was loaded as a string, so this concatenation yields labels such
# as "act1" or "dir5".
movie_information['role'] = movie_information.category + movie_information.ordering
# aggfunc='first' keeps a single name if several rows share a film and role.
pd.pivot_table(movie_information, columns = 'role', values = 'primaryName', index = ['primaryTitle','startYear','genres'], aggfunc = 'first')

(1112, 6)


Unnamed: 0_level_0,Unnamed: 1_level_0,role,act1,act10,act2,act3,act4,act6,act7,act8,act9,dir1,dir2,dir4,dir5,dir6,dir7,dir8,dir9
primaryTitle,startYear,genres,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
21 Jump Street,2012,"Action,Comedy,Crime",Jonah Hill,,Channing Tatum,Ice Cube,Brie Larson,,,,,,,,Phil Lord,Christopher Miller,,,
22 Jump Street,2014,"Action,Comedy,Crime",Channing Tatum,,Jonah Hill,Ice Cube,Nick Offerman,,,,,,,,Phil Lord,Christopher Miller,,,
A Fistful of Dollars,1964,"Drama,Western",Clint Eastwood,,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,,,,,,,,Sergio Leone,,,,
A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,,Johnny Depp,Robert Englund,John Saxon,,,,,,,,Wes Craven,,,,
A Nightmare on Elm Street,2010,"Crime,Drama,Horror",Jackie Earle Haley,,Rooney Mara,Kyle Gallner,Katie Cassidy,,,,,,,,Samuel Bayer,,,,
A Nightmare on Elm Street 2: Freddy's Revenge,1985,Horror,Robert Englund,,Mark Patton,Kim Myers,Robert Rusler,,,,,,,,Jack Sholder,,,,
A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,,Robert Englund,Craig Wasson,Patricia Arquette,,,,,,,,Chuck Russell,,,,
A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Robert Englund,,Rodney Eastman,John Beckman,Kisha Brackel,,,,,,,,Renny Harlin,,,,
Ace Ventura: Pet Detective,1994,Comedy,Jim Carrey,,Courteney Cox,Sean Young,Tone Loc,,,,,,,,Tom Shadyac,,,,
Ace Ventura: When Nature Calls,1995,"Adventure,Comedy,Crime",Jim Carrey,,Ian McNeice,Simon Callow,Maynard Eziashi,,,,,,,,Steve Oedekerk,,,,


In [0]:
# Split the credits into the films listed as "first movie" and those listed as
# "sequel", prefixing every column so the two frames can later sit side by side.
is_first_movie = movie_titles.primaryTitle.isin(first_movie_list)
is_sequel_movie = movie_titles.primaryTitle.isin(sequel_movie_list)

first_movie_titles = movie_titles[is_first_movie].add_prefix('fm_')
sequel_movie_titles = movie_titles[is_sequel_movie].add_prefix('s_')

In [13]:
# Preview the prefixed first-movie credits.
first_movie_titles.head(n=5)

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_ordering,fm_category,fm_primaryName
83,A Fistful of Dollars,1964,"Drama,Western",1,actor,Clint Eastwood
84,For a Few Dollars More,1965,Western,1,actor,Clint Eastwood
85,Dirty Harry,1971,"Action,Crime,Thriller",1,actor,Clint Eastwood
86,Magnum Force,1973,"Action,Crime,Mystery",1,actor,Clint Eastwood
88,Sudden Impact,1983,"Action,Crime,Thriller",1,actor,Clint Eastwood


In [14]:
# Attach to each first-movie credit every pair in which that film is the
# "first_movie"; films paired more than once repeat once per pair.
first_movie_titles.merge(
    sequel_pairs,
    how='inner',
    left_on='fm_primaryTitle',
    right_on='first_movie',
)

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_ordering,fm_category,fm_primaryName,first_movie,sequel
0,A Fistful of Dollars,1964,"Drama,Western",1,actor,Clint Eastwood,A Fistful of Dollars,For a Few Dollars More
1,A Fistful of Dollars,1964,"Drama,Western",1,actor,Clint Eastwood,A Fistful of Dollars,"The Good, the Bad, and the Ugly"
2,A Fistful of Dollars,1964,"Drama,Western",2,actor,Gian Maria Volontè,A Fistful of Dollars,For a Few Dollars More
3,A Fistful of Dollars,1964,"Drama,Western",2,actor,Gian Maria Volontè,A Fistful of Dollars,"The Good, the Bad, and the Ugly"
4,A Fistful of Dollars,1964,"Drama,Western",3,actress,Marianne Koch,A Fistful of Dollars,For a Few Dollars More
5,A Fistful of Dollars,1964,"Drama,Western",3,actress,Marianne Koch,A Fistful of Dollars,"The Good, the Bad, and the Ugly"
6,A Fistful of Dollars,1964,"Drama,Western",4,actor,Wolfgang Lukschy,A Fistful of Dollars,For a Few Dollars More
7,A Fistful of Dollars,1964,"Drama,Western",4,actor,Wolfgang Lukschy,A Fistful of Dollars,"The Good, the Bad, and the Ugly"
8,For a Few Dollars More,1965,Western,1,actor,Clint Eastwood,For a Few Dollars More,A Fistful of Dollars
9,For a Few Dollars More,1965,Western,1,actor,Clint Eastwood,For a Few Dollars More,"The Good, the Bad, and the Ugly"
