In [0]:
%%capture
!pip install wikipedia -q
import wikipedia
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

# Ask the MediaWiki API for the links on the list article, 500 per response.
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "List_of_film_sequels_by_box-office_improvement",
    "prop": "links",
    "pllimit": 500
}

# Titles of every page linked from the list article, in API order.
sequels = []

# A single response holds at most `pllimit` links. When more remain, the API
# returns a "continue" object whose fields must be sent back with the next
# request, so keep requesting until no "continue" object is returned.
continuation = {}
while True:
    R = S.get(url=URL, params={**PARAMS, **continuation}, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    R.raise_for_status()
    DATA = R.json()

    PAGES = DATA.get("query", {}).get("pages", {})
    for page_info in PAGES.values():
        # A missing page (or one without links) has no "links" key.
        for link in page_info.get("links", []):
            sequels.append(link["title"])

    if "continue" not in DATA:
        break
    continuation = DATA["continue"]

In [0]:
# Build two index-aligned lists: for every linked page, record each *other*
# linked title that is mentioned verbatim in that page's summary.
first_movie_list = []
sequel_movie_list = []
for sequel in sequels:
    try:
        page = wikipedia.page(sequel)
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError) as err:
        # wikipedia.page raises for missing or ambiguous titles; skip that
        # link rather than aborting the whole (slow, network-bound) crawl.
        print('page not found: %s (%s)' % (sequel, type(err).__name__))
        continue
    summary = page.summary
    for first_movie in sequels:
        # Plain substring match of the link title against the summary text.
        if first_movie != sequel and first_movie in summary:
            first_movie_list.append(first_movie)
            sequel_movie_list.append(sequel)

In [0]:
%%capture
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [0]:
# Load only the needed columns from the three IMDb dumps. The literal "\N" is
# the only token treated as missing (pandas' default NA strings are disabled).

# One row per (title, credited person); `ordering` is kept as text.
cast = pd.read_csv(
    'title.principals.tsv',
    sep='\t',
    na_values="\\N",
    keep_default_na=False,
    usecols=['tconst', 'ordering', 'nconst', 'category'],
    dtype={'tconst': str, 'ordering': str, 'nconst': str, 'category': str},
)

# One row per title; the nullable Int64 dtype lets startYear hold missing values.
titles = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    na_values="\\N",
    keep_default_na=False,
    usecols=['tconst', 'primaryTitle', 'startYear', 'titleType', 'genres'],
    dtype={'tconst': str, 'primaryTitle': str, 'startYear': 'Int64',
           'titleType': str, 'genres': str},
)

# Person id -> display name.
names = pd.read_csv(
    'name.basics.tsv',
    sep='\t',
    na_values="\\N",
    keep_default_na=False,
    usecols=['nconst', 'primaryName'],
    dtype={'nconst': str, 'primaryName': str},
)

In [0]:
def remove_film_year(sequel):
  """Strip a trailing parenthetical disambiguator from a title.

  For example ``"Bad Boys (1995 film)"`` becomes ``"Bad Boys"``.

  Only a parenthetical that *ends* the string is removed, so titles that
  merely contain parentheses (``"(500) Days of Summer"``) are left intact.
  Whitespace before the parenthesis is trimmed, whether or not any is present.

  Args:
    sequel: title string, possibly ending in ``" (...)"``.

  Returns:
    The title without its trailing parenthetical, or the input unchanged when
    there is none (or when removing it would leave an empty title).
  """
  if sequel.endswith(")"):
    i = sequel.rfind("(")
    # i == 0 means the whole string is parenthesised; keep it as is.
    if i > 0:
      stripped = sequel[:i].rstrip()
      if stripped:
        return stripped
  return sequel

In [0]:
# Apply remove_film_year to every title on both sides of the pairing; the two
# lists stay index-aligned because each is mapped element by element.
first_movie_list = list(map(remove_film_year, first_movie_list))
sequel_movie_list = list(map(remove_film_year, sequel_movie_list))

In [8]:
# Two-column frame of the candidate pairs; row i pairs first_movie_list[i]
# with sequel_movie_list[i].
sequel_pairs = pd.DataFrame(dict(first_movie=first_movie_list, sequel=sequel_movie_list))
sequel_pairs.head()

Unnamed: 0,first_movie,sequel
0,22 Jump Street,21 Jump Street
1,For a Few Dollars More,A Fistful of Dollars
2,A Nightmare on Elm Street 2: Freddy's Revenge,A Nightmare on Elm Street
3,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge
4,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 2: Freddy's Revenge


In [0]:
# Keep only rows whose titleType is exactly 'movie'.
titles = titles.loc[titles['titleType'] == 'movie']

In [0]:
def prepare_data(first_movie_list, sequel_movie_list):
  """Build a table of cast/director credits for pairs of films.

  Uses the module-level IMDb frames ``titles``, ``cast`` and ``names``.
  Films are matched to IMDb rows by ``primaryTitle`` only, so every IMDb row
  carrying a requested title takes part in the result.

  Args:
    first_movie_list: titles for the ``fm_`` side of each pair.
    sequel_movie_list: titles for the ``s_`` side; index-aligned with
      ``first_movie_list``.

  Returns:
    DataFrame with columns ``fm_primaryTitle, fm_startYear, fm_act1..fm_act4,
    fm_dir5`` and the same set prefixed ``s_``. Only rows where
    ``fm_startYear <= s_startYear`` are kept. A role column that no matched
    film provides is filled with NaN.
  """
  # Role labels are "<short category><ordering>", e.g. an actor credited at
  # ordering 1 is "act1". Only a director credited at ordering 5 is kept.
  role_columns = ['act1','act2','act3','act4','dir5']
  short_category = {'actor': 'act', 'actress': 'act', 'director': 'dir'}

  first_movie_list = list(first_movie_list)
  sequel_movie_list = list(sequel_movie_list)

  movie_titles = titles[titles.primaryTitle.isin(first_movie_list + sequel_movie_list)]
  movie_titles = pd.merge(movie_titles, cast, on = 'tconst')
  movie_titles = pd.merge(movie_titles, names, on = 'nconst')
  movie_titles = movie_titles.drop(columns = ['titleType','tconst','nconst'])

  movie_information = movie_titles[movie_titles.category.isin(list(short_category))].copy()
  # Rename categories in the `category` column only; a frame-wide replace
  # would also rewrite any title or person name equal to one of these words.
  movie_information['category'] = movie_information['category'].map(short_category)
  movie_information['role'] = movie_information.category + movie_information.ordering
  # One row per (title, year), one column per role. tconst was dropped above,
  # so films sharing title and year collapse here; 'first' picks one name.
  # TODO: add genres back.
  movie_summary = pd.pivot_table(movie_information, columns = 'role', values = 'primaryName',
                                 index = ['primaryTitle','startYear'], aggfunc = 'first')
  # reindex (rather than [[...]]) so a role missing from the pivot yields a
  # NaN column instead of a KeyError.
  movie_summary = movie_summary.reindex(columns = role_columns).reset_index()

  first_movie_titles = movie_summary[movie_summary.primaryTitle.isin(first_movie_list)]
  first_movie_titles = first_movie_titles.add_prefix('fm_')
  sequel_movie_titles = movie_summary[movie_summary.primaryTitle.isin(sequel_movie_list)]
  sequel_movie_titles = sequel_movie_titles.add_prefix('s_')

  sequel_pairs = pd.DataFrame({"first_movie" : first_movie_list, "sequel" : sequel_movie_list})

  # Attach credits to both sides of every requested pair.
  first_side = pd.merge(first_movie_titles, sequel_pairs,
                        left_on = 'fm_primaryTitle', right_on = 'first_movie', how = 'inner')
  sequel_table = pd.merge(first_side, sequel_movie_titles,
                          left_on = 'sequel', right_on = 's_primaryTitle', how = 'inner')
  # Drop pairs where the "first" film was released after the "sequel".
  sequel_table = sequel_table[sequel_table.fm_startYear <= sequel_table.s_startYear]
  sequel_table = sequel_table.drop(columns = ['first_movie','sequel'])

  return sequel_table

In [0]:
# Credits table for the candidate pairs collected from Wikipedia.
sequel_table = prepare_data(first_movie_list, sequel_movie_list)

In [12]:
# Preview the first rows of the paired credits table.
sequel_table.head()

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_act1,s_act2,s_act3,s_act4,s_dir5
1,A Fistful of Dollars,1964,Clint Eastwood,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,Sergio Leone,For a Few Dollars More,1965,Clint Eastwood,Lee Van Cleef,Gian Maria Volontè,Mario Brega,Sergio Leone
2,A Nightmare on Elm Street,1984,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street 2: Freddy's Revenge,1985,Robert Englund,Mark Patton,Kim Myers,Robert Rusler,Jack Sholder
5,A Nightmare on Elm Street,1984,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street 3: Dream Warriors,1987,Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell
8,A Nightmare on Elm Street,1984,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street 4: The Dream Master,1988,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin
10,A Nightmare on Elm Street 3: Dream Warriors,1987,Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell,A Nightmare on Elm Street 4: The Dream Master,1988,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin


In [0]:
# Hand-written list of titles; they are paired with one another further down
# to build the non-sequel comparison table.
non_sequel_movies = [
    'The Searchers',
    'Chisum',
    'She Wore a Yellow Ribbon',
    'There Will Be Blood',
    'Phantom Thread',
    'Little Big Man',
    'Bonnie and Clyde',
    'Out of Africa',
    'Silkwood',
    'Florence Foster Jenkins',
    'The Shawshank Redemption',
    'Escape from Alcatraz',
    'Little Big Horn',
    'American Beauty',
    'An American in Paris',
]

In [0]:
# Every ordered pair of distinct titles from non_sequel_movies, in the same
# order a nested loop would produce them (outer title first, inner second).
ns_pairs = [
  (first_movie, sequel_movie)
  for first_movie in non_sequel_movies
  for sequel_movie in non_sequel_movies
  if first_movie != sequel_movie
]
ns_first_movie_list = [first_movie for first_movie, _ in ns_pairs]
ns_sequel_movie_list = [sequel_movie for _, sequel_movie in ns_pairs]

In [0]:
# Same credits table, built from the hand-picked non-sequel pairs.
non_sequel_table = prepare_data(ns_first_movie_list, ns_sequel_movie_list)

In [16]:
# Preview the first rows of the non-sequel credits table.
non_sequel_table.head()

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_act1,s_act2,s_act3,s_act4,s_dir5
1,An American in Paris,1951,Gene Kelly,Leslie Caron,Oscar Levant,Georges Guétary,Vincente Minnelli,The Searchers,1956,John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
6,Little Big Horn,1951,Lloyd Bridges,John Ireland,Marie Windsor,Reed Hadley,Charles Marquis Warren,The Searchers,1956,John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
10,She Wore a Yellow Ribbon,1949,John Wayne,Joanne Dru,John Agar,Ben Johnson,John Ford,The Searchers,1956,John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
15,An American in Paris,1951,Gene Kelly,Leslie Caron,Oscar Levant,Georges Guétary,Vincente Minnelli,Chisum,1970,John Wayne,Forrest Tucker,Christopher George,Ben Johnson,Andrew V. McLaglen
16,Bonnie and Clyde,1967,Warren Beatty,Faye Dunaway,Michael J. Pollard,Gene Hackman,Arthur Penn,Chisum,1970,John Wayne,Forrest Tucker,Christopher George,Ben Johnson,Andrew V. McLaglen


In [0]:
def get_plot(movie_table):
  """Look up a Wikipedia plot text for every film named in ``movie_table``.

  For each (title, year) taken from the ``fm_`` and ``s_`` columns, the
  article is chosen by trying, in order, ``"<title> (<year> film)"``,
  ``"<title> (film)"`` and the bare title. A suffixed candidate is used only
  when it is exactly the top search hit. The "Plot" section is preferred,
  with "Synopsis" as a fallback.

  Args:
    movie_table: DataFrame with columns ``fm_primaryTitle``, ``fm_startYear``,
      ``s_primaryTitle`` and ``s_startYear``.

  Returns:
    dict mapping title -> plot text. Titles are keyed without the year, so
    the first successful lookup for a title wins. A title whose lookup failed
    is tried again if it appears again (possibly with another year).
    Failures are reported with ``print`` and omitted from the result.
  """
  def first_hit(query):
    # wikipedia.search returns a list, which is empty when nothing matches.
    hits = wikipedia.search(query)
    return hits[0] if hits else None

  plot_list = {}
  movie_list = np.array(pd.concat([movie_table.fm_primaryTitle, movie_table.s_primaryTitle]))
  year_list = np.array(pd.concat([movie_table.fm_startYear, movie_table.s_startYear]))
  for movie, year in zip(movie_list, year_list):
    if movie in plot_list:
      continue
    movie_y_f, movie_f = movie + ' (' + str(year) + ' film)', movie + ' (film)'
    if first_hit(movie_y_f) == movie_y_f:
      search = movie_y_f
    elif first_hit(movie_f) == movie_f:
      search = movie_f
    else:
      search = movie
    try:
      page = wikipedia.page(search)
    except (wikipedia.exceptions.PageError,
            wikipedia.exceptions.DisambiguationError):
      # wikipedia.page raises rather than returning an empty value, so a
      # truthiness check on the result can never report a missing page.
      print('movie not found: %s' %movie)
      continue
    plot = page.section("Plot") or page.section("Synopsis")
    if plot:
      plot_list[movie] = plot
    else:
      print('plot not found: %s' %movie)

  return plot_list

In [0]:
# Fetch plot texts for every film in the non-sequel table (network-bound).
non_sequel_plot_list = get_plot(non_sequel_table)

In [46]:
# Display the title -> plot mapping (the output is long).
non_sequel_plot_list

{'American Beauty': "Lester Burnham is a middle-aged magazine executive who despises his job and is unhappily married to Carolyn, a neurotic and ambitious real estate broker; their sixteen-year-old daughter, Jane, abhors her parents and has low self-esteem. The Burnhams' new neighbors are retired US Marine colonel Frank Fitts and his near-catatonic wife, Barbara. The Fitts' teenage son, Ricky, obsessively films his surroundings with a camcorder, collecting hundreds of recordings on video tapes in his bedroom, while using his part-time job as a waiter to serve as a front for his secret marijuana dealings. Frank is a strict disciplinarian who has previously forced Ricky into a military academy and a psychiatric hospital. Jim Olmeyer and Jim Berkley, a gay couple who live nearby, welcome the family to the neighborhood; Frank later reveals his homophobia when angrily discussing the incident with Ricky.\nLester becomes infatuated with Jane's vain cheerleader friend, Angela Hayes, after seei

In [47]:
# Fetch plot texts for every film in the sequel table; titles without a
# "Plot" or "Synopsis" section are reported by get_plot via print.
sequel_plot_list = get_plot(sequel_table)

plot not found: Jackass: The Movie
plot not found: Bad Boys
plot not found: Ice Age
plot not found: Jackass Number Two
plot not found: Rush Hour


In [60]:
# Diagnostic: replay get_plot's article selection by hand for one title/year
# and print the top search hit for each candidate title.
movie = 'Ice Age'
year = 2015
movie_y_f = '%s (%s film)' % (movie, year)
movie_f = '%s (film)' % movie
search1 = wikipedia.search(movie_y_f)[0]
print(search1)
search2 = wikipedia.search(movie_f)[0]
print(search2)
# Fall back to the bare title unless a suffixed candidate is the exact top hit.
search = movie
if search1 == movie_y_f:
  search = movie_y_f
elif search2 == movie_f:
  search = movie_f
page = wikipedia.page(search)

Ice Age (franchise)
Ice Age (franchise)


In [53]:
# Show the "Plot" section of the page currently bound to `page`.
# NOTE(review): this cell's execution count (53) is lower than that of the
# lookup cell above it (60), so `page` may come from an earlier run.
page.section("Plot")

"A saber-toothed squirrel known as Scrat attempts to find a place to store his acorn for the winter. Eventually, as he tries to stomp it into the ground, he inadvertently causes a large crack to form in the ice that extends for miles before setting off a large avalanche which nearly crushes him. He barely escapes but finds himself getting stepped on by a herd of prehistoric animals migrating south in order to escape the forthcoming ice age. Sid, a clumsy ground sloth, is left behind by his family and decides to move on by himself, but is attacked by two prehistoric rhinos after ruining their meal and making them angry. Sid is soon rescued by Manny, a gruff woolly mammoth heading north, who fights the rhinos off and continues on his path. Sid joins Manny, not wanting to be alone and unprotected. Manny is annoyed by Sid's outgoing demeanor and wishes to migrate on his own, but Sid nonetheless continues to follow Manny. Meanwhile, Soto, the leader of a pack of saber-toothed tigers, wants 

In [59]:
# Rows where 'Ice Age' sits on the sequel side. The output shows the 2015
# IMDb entry with that title paired after 'Ice Age: The Meltdown' (2006).
sequel_table[sequel_table.s_primaryTitle == 'Ice Age']

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_act1,s_act2,s_act3,s_act4,s_dir5
144,Ice Age: The Meltdown,2006,Ray Romano,John Leguizamo,Denis Leary,Seann William Scott,Carlos Saldanha,Ice Age,2015,Mahtab Keramati,Bahram Radan,Farhad Aslani,Sahar Dolatshahi,Mostafa Kiayee


In [61]:
# All IMDb movies titled exactly 'Ice Age'. The output lists three (1975,
# 2002, 2015), so matching on title alone is ambiguous.
titles[titles.primaryTitle == 'Ice Age']

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres
70232,tt0071672,movie,Ice Age,1975,
257384,tt0268380,movie,Ice Age,2002,
3643240,tt4418168,movie,Ice Age,2015,


In [0]:
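### TODO: get rid of duplicate title matches (e.g. three IMDb movies are titled 'Ice Age'); match films on title and release year rather than title alone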