In [0]:
%%capture
!pip install wikipedia -q
import wikipedia
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
# Collect the title of every page linked from the Wikipedia list of film sequels,
# using the MediaWiki "query" API with prop=links.
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "List_of_film_sequels_by_box-office_improvement",
    "prop": "links",
    "pllimit": 500
}

# One response carries at most `pllimit` links. When more remain, the API adds a
# "continue" object whose entries must be sent back with the next request;
# ignoring it would silently truncate the list.
sequels = []
continuation = {}
while True:
    R = S.get(url=URL, params={**PARAMS, **continuation}, timeout=30)
    R.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error body
    DATA = R.json()

    PAGES = DATA["query"]["pages"]
    for page_info in PAGES.values():
        # Guard against entries that carry no "links" key (e.g. a missing page).
        for link in page_info.get("links", []):
            sequels.append(link["title"])

    if "continue" not in DATA:
        break
    continuation = DATA["continue"]

In [0]:
# For every linked page, fetch its Wikipedia summary and record each *other*
# linked title that appears verbatim in it. The two lists are kept aligned by
# position: entry i of each forms one (first_movie, sequel) candidate pair.
first_movie_list = []
sequel_movie_list = []
for sequel in sequels:
    page = wikipedia.page(sequel)
    summary = page.summary
    mentioned = [title for title in sequels if title != sequel and title in summary]
    first_movie_list.extend(mentioned)
    sequel_movie_list.extend([sequel] * len(mentioned))

In [0]:
%%capture
# Download the three gzipped IMDb dataset files (principals, title basics,
# name basics) into the current working directory.
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

# Decompress them so the .tsv files read by the following cell are available.
# NOTE(review): on a re-run the .tsv files already exist; gunzip may refuse to
# overwrite them without -f — confirm the behaviour in this environment.
! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [0]:
# Parsing options shared by all three IMDb dumps: tab-separated, with "\N" as
# the only missing-value marker (pandas' default NA strings are switched off).
imdb_read_options = dict(sep = '\t', na_values = "\\N", keep_default_na = False)

# Column -> dtype maps; the keys double as the list of columns to load.
# `ordering` is read as text, and `startYear` as a nullable integer.
cast_dtypes = {'tconst': str, 'ordering': str, 'nconst': str, 'category': str}
title_dtypes = {'tconst': str, 'primaryTitle': str, 'startYear': 'Int64', 'titleType': str, 'genres': str}
name_dtypes = {'nconst': str, 'primaryName': str}

cast = pd.read_csv('title.principals.tsv', usecols = list(cast_dtypes),
                   dtype = cast_dtypes, **imdb_read_options)
titles = pd.read_csv('title.basics.tsv', usecols = list(title_dtypes),
                     dtype = title_dtypes, **imdb_read_options)
names = pd.read_csv('name.basics.tsv', usecols = list(name_dtypes),
                    dtype = name_dtypes, **imdb_read_options)

In [0]:
def remove_film_year(sequel):
  """Strip a parenthetical disambiguator such as " (film)" or " (2004 film)".

  Everything from the first "(" onward is removed, together with any
  whitespace directly before it.

  Args:
    sequel: a page title (str).

  Returns:
    The title without its parenthetical suffix. The title is returned
    unchanged when it contains no "(" or when it starts with one, since
    stripping would then leave nothing.
  """
  i = sequel.find("(")
  if i <= 0:
    # -1: no parenthesis at all; 0: the title itself begins with "(".
    return sequel
  # rstrip() rather than a fixed i-1 cut, so a title written without a space
  # before "(" does not lose its last real character.
  return sequel[:i].rstrip()

In [0]:
# Reduce both sides of every candidate pair to bare titles by removing any
# parenthetical suffix.
first_movie_list = list(map(remove_film_year, first_movie_list))
sequel_movie_list = list(map(remove_film_year, sequel_movie_list))

In [9]:
# One row per candidate pair; the two lists are aligned by position.
sequel_pairs = pd.DataFrame(
    {
        "first_movie" : first_movie_list,
        "sequel" : sequel_movie_list,
    }
)
sequel_pairs.head(n = 5)

Unnamed: 0,first_movie,sequel
0,22 Jump Street,21 Jump Street
1,For a Few Dollars More,A Fistful of Dollars
2,A Nightmare on Elm Street 2: Freddy's Revenge,A Nightmare on Elm Street
3,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge
4,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 2: Freddy's Revenge


In [0]:
# Keep only the rows whose titleType is exactly 'movie'.
titles = titles.loc[titles.titleType.eq('movie')]

In [0]:
def prepare_data(first_movie_list, sequel_movie_list):
  """Build a side-by-side cast/director table for pairs of movies.

  Relies on the notebook-level IMDb frames `titles`, `cast` and `names`.

  Args:
    first_movie_list: titles of the first movie of each pair.
    sequel_movie_list: titles of the second movie of each pair, aligned by
      position with `first_movie_list`.

  Returns:
    A DataFrame with one row per pair for which both titles were found in
    `titles` and the first movie's startYear is not later than the second's.
    Columns are primaryTitle, startYear, genres, act1..act4 and dir5, prefixed
    with `fm_` for the first movie and `s_` for the second.
  """
  # A role label is the category prefix plus the credit ordering, e.g. 'act1'
  # or 'dir5'; only these five labels are kept in the summary.
  role_columns = ['act1','act2','act3','act4','dir5']
  role_prefix = {'actor': 'act', 'actress': 'act', 'director': 'dir'}

  first_titles = list(first_movie_list)
  sequel_titles = list(sequel_movie_list)

  # NOTE(review): titles are matched by name only, so distinct films sharing a
  # primaryTitle all take part in the joins below.
  movie_titles = titles[titles.primaryTitle.isin(first_titles + sequel_titles)]
  movie_titles = pd.merge(movie_titles, cast, on = 'tconst')
  movie_titles = pd.merge(movie_titles, names, on = 'nconst')
  movie_titles = movie_titles.drop(columns = ['titleType','tconst','nconst'])

  movie_information = movie_titles[movie_titles.category.isin(list(role_prefix))].copy()
  # Shorten the labels in the category column only; a frame-wide replace would
  # also rewrite any title or name cell equal to 'actor'/'actress'/'director'.
  movie_information['category'] = movie_information.category.map(role_prefix)
  movie_information['role'] = movie_information.category + movie_information.ordering

  movie_summary = pd.pivot_table(movie_information, columns = 'role', values = 'primaryName',
                                 index = ['primaryTitle','startYear','genres'], aggfunc = 'first')
  # reindex instead of [[...]] so a role that no selected movie has becomes an
  # all-NaN column rather than a KeyError.
  movie_summary = movie_summary.reindex(columns = role_columns).reset_index()

  # add_prefix returns new frames, avoiding assignment to .columns on a slice.
  first_movie_titles = movie_summary[movie_summary.primaryTitle.isin(first_titles)].add_prefix('fm_')
  sequel_movie_titles = movie_summary[movie_summary.primaryTitle.isin(sequel_titles)].add_prefix('s_')

  sequel_pairs = pd.DataFrame({"first_movie" : first_titles, "sequel" : sequel_titles})

  sequel_table = pd.merge(pd.merge(first_movie_titles, sequel_pairs, left_on = 'fm_primaryTitle', right_on = 'first_movie', how = 'inner'),
                        sequel_movie_titles, left_on = 'sequel', right_on = 's_primaryTitle', how = 'inner')
  # Pairs may be supplied in either direction; keep the chronological one.
  sequel_table = sequel_table[sequel_table.fm_startYear <= sequel_table.s_startYear]

  return sequel_table.drop(columns = ['first_movie','sequel'])

In [0]:
# Cast/director comparison table for the sequel candidate pairs.
sequel_table = prepare_data(first_movie_list = first_movie_list,
                            sequel_movie_list = sequel_movie_list)

In [13]:
# Preview the first five rows of the sequel table.
sequel_table.head(n = 5)

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_genres,s_act1,s_act2,s_act3,s_act4,s_dir5
1,A Fistful of Dollars,1964,"Drama,Western",Clint Eastwood,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,Sergio Leone,For a Few Dollars More,1965,Western,Clint Eastwood,Lee Van Cleef,Gian Maria Volontè,Mario Brega,Sergio Leone
2,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street 2: Freddy's Revenge,1985,Horror,Robert Englund,Mark Patton,Kim Myers,Robert Rusler,Jack Sholder
5,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell
8,A Nightmare on Elm Street,1984,Horror,Heather Langenkamp,Johnny Depp,Robert Englund,John Saxon,Wes Craven,A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin
10,A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell,A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin


In [0]:
# Hand-picked titles used to build the non-sequel comparison pairs.
non_sequel_movies = [
    'The Searchers',
    'Chisum',
    'She Wore a Yellow Ribbon',
    'There Will Be Blood',
    'Phantom Thread',
    'Little Big Man',
    'Bonnie and Clyde',
    'Out of Africa',
    'Silkwood',
    'Florence Foster Jenkins',
    'The Shawshank Redemption',
    'Escape from Alcatraz',
    'Little Big Horn',
    'American Beauty',
    'An American in Paris',
]

In [0]:
# Every ordered pair of two different titles from the non-sequel list, split
# into two position-aligned lists.
ns_pairs = [(first, second)
            for first in non_sequel_movies
            for second in non_sequel_movies
            if first != second]
ns_first_movie_list = [first for first, _ in ns_pairs]
ns_sequel_movie_list = [second for _, second in ns_pairs]

In [0]:
# Same comparison table, built from the non-sequel pairs.
non_sequel_table = prepare_data(first_movie_list = ns_first_movie_list,
                                sequel_movie_list = ns_sequel_movie_list)

In [17]:
# Preview the first five rows of the non-sequel table.
non_sequel_table.head(n = 5)

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_genres,s_act1,s_act2,s_act3,s_act4,s_dir5
1,An American in Paris,1951,"Drama,Musical,Romance",Gene Kelly,Leslie Caron,Oscar Levant,Georges Guétary,Vincente Minnelli,The Searchers,1956,"Adventure,Drama,Western",John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
6,Little Big Horn,1951,Western,Lloyd Bridges,John Ireland,Marie Windsor,Reed Hadley,Charles Marquis Warren,The Searchers,1956,"Adventure,Drama,Western",John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
10,She Wore a Yellow Ribbon,1949,Western,John Wayne,Joanne Dru,John Agar,Ben Johnson,John Ford,The Searchers,1956,"Adventure,Drama,Western",John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
15,An American in Paris,1951,"Drama,Musical,Romance",Gene Kelly,Leslie Caron,Oscar Levant,Georges Guétary,Vincente Minnelli,Chisum,1970,"Biography,Western",John Wayne,Forrest Tucker,Christopher George,Ben Johnson,Andrew V. McLaglen
16,Bonnie and Clyde,1967,"Action,Biography,Crime",Warren Beatty,Faye Dunaway,Michael J. Pollard,Gene Hackman,Arthur Penn,Chisum,1970,"Biography,Western",John Wayne,Forrest Tucker,Christopher George,Ben Johnson,Andrew V. McLaglen


In [0]:
def get_plot(movie_list):
  """Fetch the Wikipedia plot text for each movie title.

  For every title, the Wikipedia search results are checked for an exact
  match, preferring "<title> (film)" over the bare title. The matching page's
  "Plot" section is used, falling back to "Synopsis".

  Args:
    movie_list: iterable of movie titles (str).

  Returns:
    A list of plot texts in input order. Titles with no exact search match,
    or whose page has neither section, are printed and skipped, so the
    result can be shorter than `movie_list`.
  """
  plot_list = []
  for movie in movie_list:
    search = wikipedia.search(movie)
    if movie + ' (film)' in search:
      search_movie = movie + ' (film)'
    elif movie in search:
      search_movie = movie
    else:
      # No exact match. Skip this title: falling through would use an unbound
      # search_movie on the first iteration, or the previous movie's page
      # (and so append the wrong plot) on later ones.
      print(movie)
      continue
    # NOTE(review): wikipedia.page() may resolve to a different page than the
    # requested title — confirm whether auto-suggestion should be disabled.
    page = wikipedia.page(search_movie)
    plot = page.section("Plot") or page.section("Synopsis")
    if plot:
      plot_list.append(plot)
    else:
      print(movie)
  return plot_list
      


In [56]:
# Plot texts for the non-sequel titles; titles that could not be resolved are printed.
non_sequel_plot_list = get_plot(movie_list = non_sequel_movies)

American Beauty


In [57]:
# Plot texts for every distinct title on either side of the sequel table.
sequel_table_titles = pd.concat([sequel_table.fm_primaryTitle, sequel_table.s_primaryTitle])
l = get_plot(np.unique(sequel_table_titles))

Bad Boys
Friday
Hellboy
Ice Age
Jackass Number Two
Jackass: The Movie
Rush Hour
The Enforcer
The Mummy


In [52]:
# Exploratory check: which page does the library resolve a "(film)"-suffixed
# title to? In the recorded run it returned the 'Die Hard (film series)' page.
movie = 'Die Hard (film)'
wikipedia.page(movie)

<WikipediaPage 'Die Hard (film series)'>

In [54]:
# Exploratory check: list the search results for the same title, to see which
# exact page titles are available to match against.
wikipedia.search(movie)

['Die Hard (film series)',
 'Die Hard',
 'Live Free or Die Hard',
 'Die Hard with a Vengeance',
 'A Good Day to Die Hard',
 'Die Hard 2',
 'Die hard',
 'Die Hard Trilogy',
 'List of Die Hard characters',
 'To Die Is Hard']

In [62]:
# Exploratory check: look a film up with a free-text "title film year" query
# and show the summary of whichever page is returned.
wikipedia.page("Amadeus film 1984").summary

"Amadeus is a 1984 American period drama film directed by Miloš Forman and adapted by Peter Shaffer from his stage play Amadeus. The story is set in Vienna, Austria during the latter half of the 18th century, and is a fictionalized biography of Wolfgang Amadeus Mozart. Mozart's music is heard extensively in the soundtrack of the film. The film follows a fictional rivalry between Mozart and Italian composer Antonio Salieri at the court of Emperor Joseph II.\nAmadeus is considered one of the best films of all time. It was nominated for 53 awards and received 40, which included eight Academy Awards, four BAFTA Awards, four Golden Globe Awards, and a Directors Guild of America award. As of 2019, it is the most recent film to have more than one nomination in the Academy Award for Best Actor category. In 1998, the American Film Institute ranked it 53rd on its 100 Years... 100 Movies list."