In [0]:
%%capture
!pip install wikipedia -q
import wikipedia
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [0]:
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "List_of_film_sequels_by_box-office_improvement",
    "prop": "links",
    "pllimit": 500
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

PAGES = DATA["query"]["pages"]

sequels = []
for k, v in PAGES.items():
    for l in v["links"]:
        sequels.append(l["title"])

In [0]:
first_movie_list = []
sequel_movie_list = []
for sequel in sequels:
    page = wikipedia.page(sequel)
    summary = page.summary
    for first_movie in sequels:
        if first_movie in summary and first_movie != sequel:
            first_movie_list.append(first_movie)
            sequel_movie_list.append(sequel)

In [0]:
%%capture
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [0]:
cast = pd.read_csv('title.principals.tsv', sep='\t', na_values = "\\N", keep_default_na = False, 
                   usecols = ['tconst','ordering','nconst','category'],
                   dtype = {'tconst': str, 'ordering': str, 'nconst': str, 'category': str})
titles = pd.read_csv('title.basics.tsv', sep='\t', na_values = "\\N", keep_default_na = False,
                     usecols = ['tconst','primaryTitle','startYear','titleType','genres'],
                    dtype = {'tconst': str, 'primaryTitle': str, 'startYear': 'Int64','titleType':str, 'genres': str})
names = pd.read_csv('name.basics.tsv', sep='\t', na_values = "\\N", keep_default_na = False,
                    usecols = ['nconst','primaryName'], 
                    dtype = {'nconst': str, 'primaryName': str})

In [0]:
def remove_film_year(sequel):
  if "(" in sequel:
    i = sequel.index("(")
    return sequel[0:i-1]
  return sequel

In [0]:
first_movie_list = [remove_film_year(first_movie) for first_movie in first_movie_list]
sequel_movie_list = [remove_film_year(sequel) for sequel in sequel_movie_list]

In [9]:
sequel_pairs = pd.DataFrame({"first_movie" : first_movie_list, "sequel" : sequel_movie_list})
sequel_pairs.head()

Unnamed: 0,first_movie,sequel
0,22 Jump Street,21 Jump Street
1,For a Few Dollars More,A Fistful of Dollars
2,A Nightmare on Elm Street 2: Freddy's Revenge,A Nightmare on Elm Street
3,A Nightmare on Elm Street,A Nightmare on Elm Street 2: Freddy's Revenge
4,A Nightmare on Elm Street 3: Dream Warriors,A Nightmare on Elm Street 2: Freddy's Revenge


In [0]:
titles = titles[titles.titleType == 'movie']

In [0]:
def prepare_data(first_movie_list, sequel_movie_list):
  movie_titles = titles[titles.primaryTitle.isin(first_movie_list + sequel_movie_list)].copy()
  movie_titles.drop_duplicates(subset = 'primaryTitle', keep = False, inplace = True)
  movie_titles = pd.merge(movie_titles, cast, on = 'tconst')
  movie_titles = pd.merge(movie_titles, names, on = 'nconst')
  movie_titles = movie_titles.drop(columns = ['titleType','tconst','nconst'])
  
  movie_information = movie_titles[movie_titles.category.isin(['actor','actress','director'])].copy()
  movie_information.replace(['actor','actress','director'], ['act','act','dir'], inplace = True)
  movie_information['role'] = movie_information.category + movie_information.ordering
  movie_summary = pd.pivot_table(movie_information, columns = 'role', values = 'primaryName', index = ['primaryTitle','startYear','genres'], aggfunc = 'first') #add genres back
  movie_summary = movie_summary[['act1','act2','act3','act4','dir5']].reset_index()
  
  first_movie_titles = movie_summary[movie_summary.primaryTitle.isin(first_movie_list)]
  first_movie_titles = first_movie_titles.add_prefix('fm_')
  sequel_movie_titles = movie_summary[movie_summary.primaryTitle.isin(sequel_movie_list)]
  sequel_movie_titles = sequel_movie_titles.add_prefix('s_')
  
  sequel_pairs = pd.DataFrame({"first_movie" : first_movie_list, "sequel" : sequel_movie_list})
  
  sequel_table = pd.merge(pd.merge(first_movie_titles, sequel_pairs, left_on = 'fm_primaryTitle', right_on = 'first_movie', how = 'inner'),
                        sequel_movie_titles, left_on = 'sequel', right_on = 's_primaryTitle', how = 'inner')
  sequel_table = sequel_table[sequel_table.fm_startYear <= sequel_table.s_startYear].copy()
  sequel_table.drop(columns = ['first_movie','sequel'], inplace = True)

  return sequel_table

In [0]:
sequel_table = prepare_data(first_movie_list, sequel_movie_list)

In [127]:
sequel_table.head()

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_genres,s_act1,s_act2,s_act3,s_act4,s_dir5
1,A Fistful of Dollars,1964,"Drama,Western",Clint Eastwood,Gian Maria Volontè,Marianne Koch,Wolfgang Lukschy,Sergio Leone,For a Few Dollars More,1965,Western,Clint Eastwood,Lee Van Cleef,Gian Maria Volontè,Mario Brega,Sergio Leone
3,A Nightmare on Elm Street 3: Dream Warriors,1987,"Fantasy,Horror",Heather Langenkamp,Robert Englund,Craig Wasson,Patricia Arquette,Chuck Russell,A Nightmare on Elm Street 4: The Dream Master,1988,Horror,Robert Englund,Rodney Eastman,John Beckman,Kisha Brackel,Renny Harlin
5,Ace Ventura: Pet Detective,1994,Comedy,Jim Carrey,Courteney Cox,Sean Young,Tone Loc,Tom Shadyac,Ace Ventura: When Nature Calls,1995,"Adventure,Comedy,Crime",Jim Carrey,Ian McNeice,Simon Callow,Maynard Eziashi,Steve Oedekerk
7,Alvin and the Chipmunks,2007,"Animation,Comedy,Family",Jason Lee,Ross Bagdasarian Jr.,Janice Karman,David Cross,Tim Hill,Alvin and the Chipmunks: The Squeakquel,2009,"Animation,Comedy,Family",Jason Lee,Zachary Levi,David Cross,Justin Long,Betty Thomas
9,Anchorman: The Legend of Ron Burgundy,2004,Comedy,Will Ferrell,Christina Applegate,Steve Carell,Paul Rudd,Adam McKay,Anchorman 2: The Legend Continues,2013,Comedy,Will Ferrell,Christina Applegate,Paul Rudd,Steve Carell,Adam McKay


In [0]:
non_sequel_movies = ['The Searchers','Chisum','She Wore a Yellow Ribbon','There Will Be Blood','Phantom Thread','Little Big Man',
                     'Bonnie and Clyde','Out of Africa','Silkwood','Florence Foster Jenkins','The Shawshank Redemption',
                     'Escape from Alcatraz','Little Big Horn','American Beauty','An American in Paris']

In [0]:
ns_first_movie_list = []
ns_sequel_movie_list = []
for first_movie in non_sequel_movies:
  for sequel_movie in non_sequel_movies:
    if first_movie != sequel_movie:
      ns_first_movie_list.append(first_movie)
      ns_sequel_movie_list.append(sequel_movie)

In [0]:
non_sequel_table = prepare_data(ns_first_movie_list, ns_sequel_movie_list)

In [131]:
non_sequel_table.head()

Unnamed: 0,fm_primaryTitle,fm_startYear,fm_genres,fm_act1,fm_act2,fm_act3,fm_act4,fm_dir5,s_primaryTitle,s_startYear,s_genres,s_act1,s_act2,s_act3,s_act4,s_dir5
1,An American in Paris,1951,"Drama,Musical,Romance",Gene Kelly,Leslie Caron,Oscar Levant,Georges Guétary,Vincente Minnelli,The Searchers,1956,"Adventure,Drama,Western",John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
6,Little Big Horn,1951,Western,Lloyd Bridges,John Ireland,Marie Windsor,Reed Hadley,Charles Marquis Warren,The Searchers,1956,"Adventure,Drama,Western",John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
10,She Wore a Yellow Ribbon,1949,Western,John Wayne,Joanne Dru,John Agar,Ben Johnson,John Ford,The Searchers,1956,"Adventure,Drama,Western",John Wayne,Jeffrey Hunter,Vera Miles,Ward Bond,John Ford
15,An American in Paris,1951,"Drama,Musical,Romance",Gene Kelly,Leslie Caron,Oscar Levant,Georges Guétary,Vincente Minnelli,Chisum,1970,"Biography,Western",John Wayne,Forrest Tucker,Christopher George,Ben Johnson,Andrew V. McLaglen
16,Bonnie and Clyde,1967,"Action,Biography,Crime",Warren Beatty,Faye Dunaway,Michael J. Pollard,Gene Hackman,Arthur Penn,Chisum,1970,"Biography,Western",John Wayne,Forrest Tucker,Christopher George,Ben Johnson,Andrew V. McLaglen


In [0]:
def get_plot(movie_table):
  plot_list = {}
  movie_list = np.array(pd.concat([movie_table.fm_primaryTitle, movie_table.s_primaryTitle]))
  year_list = np.array(pd.concat([movie_table.fm_startYear, movie_table.s_startYear]))
  for movie, year in zip(movie_list, year_list):
    if movie not in plot_list:
      movie_y_f, movie_f = movie + ' (' + str(year) + ' film)', movie + ' (film)'
      search1 = wikipedia.search(movie_y_f)[0]
      search2 = wikipedia.search(movie_f)[0]
      if search1 == movie_y_f:
        search = movie_y_f
      elif search2 == movie_f:
        search = movie_f
      else:
        search = movie
      page = wikipedia.page(search)
      if not page:
        print('movie not found: %s' %movie)
      plot = page.section("Plot")
      if not plot:
        plot = page.section("Synopsis")
      if not plot:
        plot = page.section("Plot summary")
      if not plot:
        plot = page.section("Summary")
      if plot:
        plot_list[movie] = plot
      else:
        print('plot not found: %s' %movie)
      
  return plot_list

In [0]:
non_sequel_plot_list = get_plot(non_sequel_table)

In [134]:
non_sequel_plot_list

{'American Beauty': "Lester Burnham is a middle-aged magazine executive who despises his job and is unhappily married to Carolyn, a neurotic and ambitious real estate broker. Their sixteen-year-old daughter, Jane, abhors her parents and has low self-esteem. The Burnhams' new neighbors are retired US Marine colonel Frank Fitts and his near-catatonic wife, Barbara. The Fitts' teenage son, Ricky, obsessively films his surroundings with a camcorder, collecting hundreds of recordings on video tapes in his bedroom, while using his part-time job as a waiter to serve as a front for his secret marijuana dealings. Frank is a strict disciplinarian who has previously forced Ricky into a military academy and a psychiatric hospital. Jim Olmeyer and Jim Berkley, a gay couple who live nearby, welcome the family to the neighborhood; Frank later reveals his homophobia when angrily discussing the incident with Ricky.\nLester becomes infatuated with Jane's vain cheerleader friend, Angela Hayes, after seei

In [0]:
sequel_plot_list = get_plot(sequel_table)

In [0]:
documents = list(sequel_plot_list.values()) + list(non_sequel_plot_list.values())
doc_titles = list(sequel_plot_list.keys()) + list(non_sequel_plot_list.keys())

In [47]:
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
stopwords_set = set(stopwords.words('english'))

In [0]:
def clean_text(document):
  document_lower = document.lower() # set everything to lower case
  document_no_punctuation = document_lower.translate(str.maketrans('', '', string.punctuation)) # strip all punctuation
  words = document_no_punctuation.split()
  words_no_stopwords = [word for word in words if word not in stopwords_set]
  return words_no_stopwords[0:100]

In [0]:
documents_text = [clean_text(document) for document in documents]

In [0]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [0]:
documents_tagged = [TaggedDocument(doc, [doc_titles[i]]) for i, doc in enumerate(documents_text)]
model = Doc2Vec(documents_tagged, vector_size=100, window=2, min_count=1, workers=4)

In [104]:
max_epochs = 100
vec_size = 100
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(documents_tagged)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(documents_tagged,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha



iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [105]:
model.docvecs[0]

array([ 0.3804458 ,  3.691443  , -1.954659  , -0.6016292 ,  1.9102367 ,
       -1.0174367 , -2.2215443 , -2.168076  ,  2.8967307 , -1.053612  ,
       -0.22260585, -0.72344714,  0.986865  ,  3.0682468 , -2.9749432 ,
        0.06206861, -4.0364714 ,  1.9659051 ,  5.499632  , -0.48202178,
        3.675675  ,  0.40855452, -4.282472  ,  1.7784104 , -0.84649026,
       -2.3853812 ,  1.3036832 ,  1.5193775 , -1.5720742 ,  1.7434992 ,
       -0.77530384, -2.8826067 ,  1.5125058 , -0.3398718 , -1.2742558 ,
       -1.7283915 , -0.77502555,  2.7208173 ,  0.17053077, -2.7088947 ,
        5.568589  ,  1.7030735 ,  0.8085599 , -2.9064994 ,  3.079656  ,
        4.700389  ,  0.7404981 , -1.9263504 ,  0.23175745, -1.6121739 ,
       -1.9777411 , -2.3657954 , -2.0585055 ,  0.7524896 , -1.3533297 ,
        0.712349  ,  3.853114  ,  0.13830939, -4.4413548 ,  3.0549693 ,
       -0.6353029 ,  1.1829141 ,  3.9013078 ,  5.6095    ,  1.6636267 ,
       -4.0986805 , -0.07022345,  4.008235  , -2.0157113 ,  5.36

In [107]:
model.docvecs.most_similar([model.docvecs[2]])

  if np.issubdtype(vec.dtype, np.int):


[('Ace Ventura: Pet Detective', 1.0000001192092896),
 ('Ace Ventura: When Nature Calls', 0.6049174070358276),
 ('An American in Paris', 0.5183841586112976),
 ('Toy Story 2', 0.5153937339782715),
 ('Iron Man 3', 0.5052570104598999),
 ('Blade Runner', 0.47711917757987976),
 ('Alvin and the Chipmunks', 0.47579318284988403),
 ("Madagascar 3: Europe's Most Wanted", 0.4736562967300415),
 ('Avengers: Infinity War', 0.46840426325798035),
 ('Rocky III', 0.46799221634864807)]

In [117]:
model.docvecs.most_similar([model.docvecs['Magnum Force']], topn = 5)

  if np.issubdtype(vec.dtype, np.int):


[('Magnum Force', 0.9999999403953552),
 ('Dirty Harry', 0.5580133199691772),
 ('John Wick', 0.5205960273742676),
 ('The Matrix', 0.48892438411712646),
 ('The Transporter', 0.4875659644603729)]

In [168]:
model.most_similar('revolver')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('colt', 0.9125268459320068),
 ('38', 0.8047251105308533),
 ('detective', 0.7897595167160034),
 ('special', 0.7544097900390625),
 ('labatouche', 0.7451115250587463),
 ('commissioner', 0.7410504817962646),
 ('head—from', 0.7282149791717529),
 ('department', 0.707581639289856),
 ('tactics', 0.6921036243438721),
 ('spencer', 0.6731761693954468)]

In [0]:
from sklearn.metrics.pairwise import cosine_similarity

In [181]:
cosine_similarity([model.docvecs['Dirty Harry']],[model.docvecs['Magnum Force']])

array([[0.5580134]], dtype=float32)