# Exploring the dataset

In [None]:
import pandas as pd
import nltk
import re
import numpy as np
import matplotlib.pyplot as plt
import pickle
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import os
import time
from gensim.models import KeyedVectors
import faiss
from sentence_transformers import SentenceTransformer

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.max_colwidth =  None
df = pd.read_csv('wiki_movie_plots_deduped.csv')


In [2]:
df.shape

(34886, 8)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [4]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\r\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."


In [5]:
df.isnull().sum()

Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1422
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64

In [6]:
df.isna().sum()

Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1422
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64

In [7]:
print('Number of of unknown values in each column:')
for col in df.columns:
    if df[col].dtype == 'object':
        print(f"{col}: {df[col].str.contains('unknown',case=False,na=True).sum()}")

Number of of unknown values in each column:
Title: 18
Origin/Ethnicity: 0
Director: 1124
Cast: 1423
Genre: 6083
Wiki Page: 18
Plot: 1322


In [8]:
df['Genre'].str.lower().unique()

array(['unknown', 'western', 'comedy', ...,
       'adventure, romance, fantasy film', 'ero',
       'horror romantic comedy'], dtype=object)

In [9]:
len(df['Genre'].str.lower().unique())

2265

In [10]:
df['Genre'].str.lower().value_counts()

Genre
unknown                                                                              6083
drama                                                                                5964
comedy                                                                               4379
horror                                                                               1167
action                                                                               1098
thriller                                                                              966
romance                                                                               923
western                                                                               865
crime                                                                                 568
adventure                                                                             526
musical                                                                               467
crim

In [11]:
len(df['Genre'].str.lower().value_counts())

2265

In [12]:
df[df['Genre'].str.contains('unknown',case=False,na=True)].head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\r\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film),"Alice follows a large white rabbit down a ""Rabbit-hole"". She finds a tiny door. When she finds a bottle labeled ""Drink me"", she does, and shrinks, but not enough to pass through the door. She then eats something labeled ""Eat me"" and grows larger. She finds a fan when enables her to shrink enough to get into the ""Garden"" and try to get a ""Dog"" to play with her. She enters the ""White Rabbit's tiny House,"" but suddenly resumes her normal size. In order to get out, she has to use the ""magic fan.""\r\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. ""The Duchess's Cheshire Cat"" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's ""Mad Tea-Party."" After a while, she leaves.\r\nThe Queen invites Alice to join the ""ROYAL PROCESSION"": a parade of marching playing cards and others headed by the White Rabbit. When Alice ""unintentionally offends the Queen"", the latter summons the ""Executioner"". Alice ""boxes the ears"", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train_Robbery,"The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The ""Bandit Queen,"" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The ""Bandit Queen"" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.\r\nThe next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the ""valuables,"" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.\r\nIn the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the ""plunder."" The police, however, have struck the right trail and are in close pursuit. While the ""plunder"" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the ""Bandit Queen."""
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film),"Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents."
25,1909,The Lure of the Gown,American,D.W. Griffith,Marion Leonard,unknown,https://en.wikipedia.org/wiki/The_Lure_of_the_Gown,The story as told by Moving Picture World reads:\r\n
27,1910,A Christmas Carol,American,J. Searle Dawley,"Marc McDermott, Charles Stanton Ogle",unknown,https://en.wikipedia.org/wiki/A_Christmas_Carol_(1910_film),"The day before Christmas, Ebenezer Scrooge refuses to contribute to the Charity Relief Committee, and then rudely rejects his nephew Fred when he visits Scrooge in his office. When Scrooge returns home, he sees the ghost of his former business partner Jacob Marley, who warns him of the punishment he will suffer in the next life if he does not change his ways. That night, Scrooge is visited by three more spirits, who show him his past, present, and future him."


Find the duplicates with the same Title

In [13]:
df[df['Title'].duplicated(keep=False)].shape

(4525, 8)

In [14]:
df[df['Title'].duplicated(keep=False)].head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film),"Alice follows a large white rabbit down a ""Rabbit-hole"". She finds a tiny door. When she finds a bottle labeled ""Drink me"", she does, and shrinks, but not enough to pass through the door. She then eats something labeled ""Eat me"" and grows larger. She finds a fan when enables her to shrink enough to get into the ""Garden"" and try to get a ""Dog"" to play with her. She enters the ""White Rabbit's tiny House,"" but suddenly resumes her normal size. In order to get out, she has to use the ""magic fan.""\r\nShe enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. ""The Duchess's Cheshire Cat"" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's ""Mad Tea-Party."" After a while, she leaves.\r\nThe Queen invites Alice to join the ""ROYAL PROCESSION"": a parade of marching playing cards and others headed by the White Rabbit. When Alice ""unintentionally offends the Queen"", the latter summons the ""Executioner"". Alice ""boxes the ears"", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film),"Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents."
12,1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film),"Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.\r\nFilm historian Charles Musser writes of Porter's adaptation, ""O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.""[1]"
15,1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film,"The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets ""catches"" the laughter from her, including a vendor and police officers."
19,1908,The Call of the Wild,American,D. W. Griffith,Charles Inslee,adventure,https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film),"A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as ""The Biograph Girl."""
20,1908,A Christmas Carol,American,Unknown,Tom Ricketts,drama,https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film),"No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life."
27,1910,A Christmas Carol,American,J. Searle Dawley,"Marc McDermott, Charles Stanton Ogle",unknown,https://en.wikipedia.org/wiki/A_Christmas_Carol_(1910_film),"The day before Christmas, Ebenezer Scrooge refuses to contribute to the Charity Relief Committee, and then rudely rejects his nephew Fred when he visits Scrooge in his office. When Scrooge returns home, he sees the ghost of his former business partner Jacob Marley, who warns him of the punishment he will suffer in the next life if he does not change his ways. That night, Scrooge is visited by three more spirits, who show him his past, present, and future him."
28,1910,Frankenstein,American,J. Searle Dawley,"Augustus Phillips, Charles Stanton Ogle, Mary Fuller",unknown,https://en.wikipedia.org/wiki/Frankenstein_(1910_film),"Described as ""a liberal adaptation of Mrs. Shelley's famous story"", the plot description in the Edison Kinetogram was:[3]"
32,1910,Pocahontas,American,Unknown,"Anna Rosemond, George Barnes, Frank H. Crane",short fantasy,https://en.wikipedia.org/wiki/Pocahontas_(1910_film),"Though the film is presumed lost, a synopsis survives in The Moving Picture World from October 15, 1910. It states: ""Captain John Smith comes to America as the head of a band of English colonists and settles in Jamestown, Virginia. While at the head of the colony Smith makes a trip of exploration into the interior and is captured there by King Powhatan, the acknowledged head of all of the red men in Virginia. Powhatan orders his prisoner's execution. Just as the fatal club is about to descend, Pocahontas, the favorite daughter of the King, throws herself before her father. She begs so fervently that the white man's life be spared that Powhatan relents and orders his release. Captain Smith returns in safety to his friends. Later Pocahontas is taken prisoner by the English and held as hostage. While a prisoner, she is converted to Christianity, and falls in love with Rolfe, a handsome young Englishman. They are married in a rude little church at Jamestown, and the Indian princess sails away with her husband to England. There she is received with royal honors by King James I, but the foreign flower cannot stand transplanting. She soon sickens and dies, and in her last hours is visited by visions of the home in the wilderness that she would fly back to if she could.""[1]"


In [15]:
df[df['Title'].str.contains('Jack and the Beanstalk',case=False,na=True)].head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."
5651,1952,Jack and the Beanstalk,American,Jean Yarbrough,Abbott and Costello,comedy,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1952_film),"Mr. Dinkle and Jack (Abbott and Costello) look for work at the Cosman Employment Agency. Jack makes advances to Cosman employee Polly (Dorothy Ford), but he is thwarted by the arrival of her boyfriend, a towering police officer (Buddy Baer). Polly assigns Dinkle and Jack to babysit for Eloise Larkin's brother and infant sister, while Eloise (Shaye Cogan) and her fiancé (James Alexander) are out for the evening. The babysitting duties are complicated by the fact that Donald (David Stollery) is something of a prodigy, as well as a self-proclaimed ""problem child"". The dull-witted Jack is soon outclassed by the child, and an attempt to lull the boy to sleep by reading the fairy tale Jack and the Beanstalk (Jack's ""favorite novel"") aloud fails when Jack stumbles over the larger words. Bemused by Jack's incompetence, Donald reads the story instead--a role-reversal made complete when Jack falls asleep as Donald reads. In his slumber, Jack dreams that he is the young Jack of the fairy tale.\r\nIn his dream Jack learns that the Giant (Buddy Baer), who lives in a castle in the sky, has stolen all of the land's wealth and food. The situation obliges the kingdom's princess to marry The Prince (James Alexander) of a neighboring kingdom, whom she has never met.\r\nJack must also make sacrifices, when his mother sends him to sell the last family possession, their beloved cow ""Henry"", to the local butcher, Mr. Dinklepuss. Along the way Jack meets The Prince, disguised as a troubador (who is kidnapped by the Giant soon afterward). The unscrupulous Dinklepuss pays Jack five ""magic"" beans for the cow. Upon returning home, Jack learns that the Giant has also kidnapped The Princess (Shaye Cogan) and Henry.\r\nUndeterred by his mother's disappointment over bringing home only beans, Jack plants them and a gigantic beanstalk grows overnight. He decides to climb the beanstalk to rescue everyone from the Giant's clutches, as well as to retrieve ""Nellie"", the golden-egg laying hen that the Giant previously stole from Jack's family. Upon learning of Nellie's existence, Dinklepuss decides to join Jack on the adventure.\r\nWhen they reach the top of the beanstalk Jack and Dinklepuss are captured by the Giant and imprisoned with the prince and princess. The princess falls for the troubador after he serenades her, only to later learn this is the same prince she was betrothed. After the Giant releases Dinklepuss and Jack from the dungeon in order to toil around his castle, they befriend his housekeeper, Polly, who helps them escape over the castle wall along with the royal prisoners, Nellie and some of the Giant's stolen gems (pilfered by the greedy Dinklepuss). They flee down the beanstalk with the Giant in pursuit, as Polly escapes the castle behind him, astride Henry. During the descent, Dinklepuss loses Nellie (who falls into the arms of Jack's mother) and then the gems, which rain down upon the impoverished townsfolk below. Once all are on the ground, Jack chops down the beanstalk, sending the Giant falling to his death. The villagers rejoice over their liberation by dancing around the hole the Giant made from his fall while amusingly singing ""He Never Looked Better in his Life"".\r\nJust before being rewarded by the King for heroism, Jack is rudely awakened from his dream by Donald, who breaks a vase over Jack's head as Eloise and Arthur return home. Jack's angry outburst over Donald's behavior results in a second blow to the head from Dinkle, which returns Jack to his dream state. After greeting the others as their storybook counterparts, Jack walks off into the night with the bravado of ""Jack the Giant-Killer""."


Find the duplicates with the same title and release year

In [16]:
df.duplicated(subset=['Title','Release Year'], keep=False).sum()

551

Remove the duplicates

In [17]:
df.drop_duplicates(subset=['Title', 'Release Year'], keep='first',inplace=True)

check if there are any duplicates title release year

In [18]:
df.duplicated(subset=['Title','Release Year'], keep=False).sum()

0

In [4]:
df.shape

(34886, 8)

Add the release year to the duplicated titles only

In [20]:
duplicates = df[df['Title'].duplicated(keep=False)]

df.loc[duplicates.index, 'Title'] = (
    df.loc[duplicates.index, 'Title'] + ' ' + df.loc[duplicates.index, 'Release Year'].astype(str) 
)
 

check for duplicated titles again

In [21]:
df[df['Title'].duplicated(keep=False)].shape

(0, 8)

In [22]:
df[df['Title'].str.contains('Jack and the Beanstalk',case=False,na=True)].head(10)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
4,1902,Jack and the Beanstalk 1902,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."
5651,1952,Jack and the Beanstalk 1952,American,Jean Yarbrough,Abbott and Costello,comedy,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1952_film),"Mr. Dinkle and Jack (Abbott and Costello) look for work at the Cosman Employment Agency. Jack makes advances to Cosman employee Polly (Dorothy Ford), but he is thwarted by the arrival of her boyfriend, a towering police officer (Buddy Baer). Polly assigns Dinkle and Jack to babysit for Eloise Larkin's brother and infant sister, while Eloise (Shaye Cogan) and her fiancé (James Alexander) are out for the evening. The babysitting duties are complicated by the fact that Donald (David Stollery) is something of a prodigy, as well as a self-proclaimed ""problem child"". The dull-witted Jack is soon outclassed by the child, and an attempt to lull the boy to sleep by reading the fairy tale Jack and the Beanstalk (Jack's ""favorite novel"") aloud fails when Jack stumbles over the larger words. Bemused by Jack's incompetence, Donald reads the story instead--a role-reversal made complete when Jack falls asleep as Donald reads. In his slumber, Jack dreams that he is the young Jack of the fairy tale.\r\nIn his dream Jack learns that the Giant (Buddy Baer), who lives in a castle in the sky, has stolen all of the land's wealth and food. The situation obliges the kingdom's princess to marry The Prince (James Alexander) of a neighboring kingdom, whom she has never met.\r\nJack must also make sacrifices, when his mother sends him to sell the last family possession, their beloved cow ""Henry"", to the local butcher, Mr. Dinklepuss. Along the way Jack meets The Prince, disguised as a troubador (who is kidnapped by the Giant soon afterward). The unscrupulous Dinklepuss pays Jack five ""magic"" beans for the cow. Upon returning home, Jack learns that the Giant has also kidnapped The Princess (Shaye Cogan) and Henry.\r\nUndeterred by his mother's disappointment over bringing home only beans, Jack plants them and a gigantic beanstalk grows overnight. He decides to climb the beanstalk to rescue everyone from the Giant's clutches, as well as to retrieve ""Nellie"", the golden-egg laying hen that the Giant previously stole from Jack's family. Upon learning of Nellie's existence, Dinklepuss decides to join Jack on the adventure.\r\nWhen they reach the top of the beanstalk Jack and Dinklepuss are captured by the Giant and imprisoned with the prince and princess. The princess falls for the troubador after he serenades her, only to later learn this is the same prince she was betrothed. After the Giant releases Dinklepuss and Jack from the dungeon in order to toil around his castle, they befriend his housekeeper, Polly, who helps them escape over the castle wall along with the royal prisoners, Nellie and some of the Giant's stolen gems (pilfered by the greedy Dinklepuss). They flee down the beanstalk with the Giant in pursuit, as Polly escapes the castle behind him, astride Henry. During the descent, Dinklepuss loses Nellie (who falls into the arms of Jack's mother) and then the gems, which rain down upon the impoverished townsfolk below. Once all are on the ground, Jack chops down the beanstalk, sending the Giant falling to his death. The villagers rejoice over their liberation by dancing around the hole the Giant made from his fall while amusingly singing ""He Never Looked Better in his Life"".\r\nJust before being rewarded by the King for heroism, Jack is rudely awakened from his dream by Donald, who breaks a vase over Jack's head as Eloise and Arthur return home. Jack's angry outburst over Donald's behavior results in a second blow to the head from Dinkle, which returns Jack to his dream state. After greeting the others as their storybook counterparts, Jack walks off into the night with the bravado of ""Jack the Giant-Killer""."


In [51]:
df.to_csv('wiki_movie_plots_deduped_updated.csv', index=False)

# Preprocessing

In [23]:


notebook_dir = os.path.dirname(os.path.abspath('__file__'))

nltk_data_path = os.path.join(notebook_dir, 'nltk_data')

if not os.path.exists(nltk_data_path):
    os.makedirs(nltk_data_path)

nltk.data.path.append(nltk_data_path)

nltk.download('punkt_tab', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)

print(f"Resources downloaded to: {nltk_data_path}")

[nltk_data] Downloading package punkt_tab to f:\Projects\python
[nltk_data]     projects\NLP_Project\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to f:\Projects\python
[nltk_data]     projects\NLP_Project\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to f:\Projects\python
[nltk_data]     projects\NLP_Project\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Resources downloaded to: f:\Projects\python projects\NLP_Project\nltk_data


In [24]:


def preprocess_text (plot ,lemmatize = True,return_tokens = False):



    plot = re.sub(r'[\r\n]+', ' ', plot)  
    plot = re.sub(r'\s+', ' ', plot) 
    plot = re.sub(r"'s\b", "", plot) 

    plot = plot.lower()

    tokens = word_tokenize(plot)

    stop_words = set(stopwords.words('english'))
    puncts = set(string.punctuation)
    puncts.update(["``", "''", '"', "'", "`","--"])
    

    tokens = [word for word in tokens if word not in stop_words]

    tokens = [word for word in tokens if word not in puncts]

    tokens = [word for word in tokens if not word.isnumeric()]

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    if return_tokens:
        return tokens    

    return ' '.join(tokens)



In [28]:
text = df.iloc[5651]['Plot']
print(preprocess_text(text))

mr. dinkle jack abbott costello look work cosman employment agency jack make advance cosman employee polly dorothy ford thwarted arrival boyfriend towering police officer buddy baer polly assigns dinkle jack babysit eloise larkin brother infant sister eloise shaye cogan fiancé james alexander evening babysitting duty complicated fact donald david stollery something prodigy well self-proclaimed problem child dull-witted jack soon outclassed child attempt lull boy sleep reading fairy tale jack beanstalk jack favorite novel aloud fails jack stumble larger word bemused jack incompetence donald read story instead role-reversal made complete jack fall asleep donald read slumber jack dream young jack fairy tale dream jack learns giant buddy baer life castle sky stolen land wealth food situation obliges kingdom princess marry prince james alexander neighboring kingdom never met jack must also make sacrifice mother sends sell last family possession beloved cow henry local butcher mr. dinklepuss

In [27]:
text = df.iloc[4]['Plot']
tokens =preprocess_text(text)
print(tokens)

earliest known adaptation classic fairytale film show jack trading cow bean mother forcing drop front yard beig forced upstairs sleep jack visited fairy show glimpse await ascends bean stalk version jack son deposed king jack wake find beanstalk grown climb top enters giant home giant find jack narrowly escape giant chase jack bean stalk jack able cut giant get safety fall killed jack celebrates fairy reveals jack may return home prince


In [27]:
text = df.iloc[5651]['Plot']
print(preprocess_text(text,return_tokens=True))

['mr.', 'dinkle', 'jack', 'abbott', 'costello', 'look', 'work', 'cosman', 'employment', 'agency', 'jack', 'make', 'advance', 'cosman', 'employee', 'polly', 'dorothy', 'ford', 'thwarted', 'arrival', 'boyfriend', 'towering', 'police', 'officer', 'buddy', 'baer', 'polly', 'assigns', 'dinkle', 'jack', 'babysit', 'eloise', 'larkin', 'brother', 'infant', 'sister', 'eloise', 'shaye', 'cogan', 'fiancé', 'james', 'alexander', 'evening', 'babysitting', 'duty', 'complicated', 'fact', 'donald', 'david', 'stollery', 'something', 'prodigy', 'well', 'self-proclaimed', 'problem', 'child', 'dull-witted', 'jack', 'soon', 'outclassed', 'child', 'attempt', 'lull', 'boy', 'sleep', 'reading', 'fairy', 'tale', 'jack', 'beanstalk', 'jack', 'favorite', 'novel', 'aloud', 'fails', 'jack', 'stumble', 'larger', 'word', 'bemused', 'jack', 'incompetence', 'donald', 'read', 'story', 'instead', 'role-reversal', 'made', 'complete', 'jack', 'fall', 'asleep', 'donald', 'read', 'slumber', 'jack', 'dream', 'young', 'jack',

# Text Representation

## TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess_text,min_df=2,max_df=0.85)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['Plot'])

tfidf_features = tfidf_vectorizer.get_feature_names_out()

time to create Vectorizer and matrix : 1 minute and 50 seconds

save tfidfc vectorizer and matrix

In [32]:
with open('models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

with open('models/tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

In [31]:
tfidf_matrix.shape

(34608, 63422)

In [None]:
"""
with open('tfidf_vocabulary.txt', 'w', encoding='utf-8') as f:
    for word in sorted(tfidf_vectorizer.get_feature_names_out()):
        f.write(word + '\n')
print(f"\nComplete vocabulary saved to 'tfidf_vocabulary.txt'")
"""


Complete vocabulary saved to 'tfidf_vocabulary.txt'


## Word2Vec

In [29]:
model = KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)


Time to load : 1 minute and 30 seconds

In [30]:
def get_plot_vector(plot):
    tokens = preprocess_text(plot, return_tokens=True)
    
    if not tokens:
        return np.zeros(model.vector_size)
    
    vectors = [model[token] for token in tokens if token in model]

    if not vectors:
        return np.zeros(model.vector_size)

    plot_vector = np.mean(vectors, axis=0)
    return plot_vector

In [31]:
word_embedding = get_plot_vector(df.iloc[26820]['Plot'])
print(word_embedding)
print("-----------------------------------------------")
print(f"Word embedding shape: {word_embedding.shape}")

[ 0.05745317  0.01896296  0.0087656   0.05588874 -0.07189389  0.03924698
  0.07147234 -0.0317508   0.09356     0.07413302  0.01116326 -0.13643046
 -0.06437306  0.04764051 -0.13249078  0.12053895  0.05570967  0.06536477
 -0.03160018 -0.04461867  0.02660044  0.01450879  0.08709829 -0.0029158
  0.00975242 -0.06069706 -0.08111907  0.01925939  0.02027602 -0.01563391
 -0.0430003  -0.04475626 -0.00397393  0.06320036  0.01091874 -0.0047376
  0.06479979  0.0011775   0.06642271  0.03420335  0.05655525 -0.05378997
  0.05097353  0.00720232 -0.07425349 -0.06611399 -0.07033359  0.01216931
  0.0055338   0.03204243 -0.04512915  0.00675021 -0.00782964 -0.01604196
 -0.0055173   0.01723652 -0.03263564 -0.08134426  0.00965418 -0.0896273
 -0.00978791  0.08051286 -0.02225023 -0.03395338 -0.02719939  0.03610409
 -0.00805793  0.03660189 -0.04258642  0.03780339  0.05120951  0.03800634
  0.05219569  0.02713953 -0.12194276 -0.02831414  0.05711725  0.0813482
  0.05129186  0.06733704  0.04386645  0.00942003  0.037

In [28]:
word_embedding = get_plot_vector(df.iloc[5651]['Plot'])
print(word_embedding)
print("-----------------------------------------------")
print(f"Word embedding shape: {word_embedding.shape}")


[ 2.67906673e-02  4.11355123e-02 -4.16282490e-02  5.00962995e-02
 -5.46791777e-03 -3.53162475e-02  1.99096240e-02 -6.23505823e-02
  6.59983903e-02  6.90712482e-02  3.38982828e-02 -1.01685889e-01
 -5.87129258e-02 -1.55450376e-02 -1.07435890e-01  8.03548768e-02
  3.34187336e-02  6.78160042e-02  2.68023796e-02 -4.79682721e-02
  2.15817541e-02  2.93865539e-02  7.03679025e-02  1.22435940e-02
  5.93221374e-02 -4.35976982e-02 -1.21020719e-01  8.70508179e-02
  9.29541066e-02 -5.07460386e-02  5.59019530e-03 -7.70210521e-03
 -4.67363484e-02  2.69011240e-02 -3.82471122e-02 -2.22264975e-02
  1.01821437e-01 -3.77576426e-03  1.89743042e-02  4.75145467e-02
  1.16714038e-01 -5.72783500e-02  1.35411710e-01 -3.60830538e-02
  4.82306927e-02 -5.02482355e-02 -8.02980587e-02  2.34624604e-03
  5.54820038e-02 -2.34705047e-03 -9.05666426e-02  9.83230397e-02
  6.85047954e-02 -2.90866774e-02  7.17226462e-03 -4.66982322e-03
 -6.38645887e-02 -8.45813602e-02  1.22909183e-02 -6.37924522e-02
  4.29033265e-02  8.36892

## Bert

In [2]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using device: {device}")

bert_model = SentenceTransformer('all-mpnet-base-v2',device)

df = pd.read_csv('wiki_movie_plots_deduped_updated.csv')



Using device: cuda


In [3]:
def get_bert_embedding(plot,model):
    
    if not plot:
        return np.zeros(768)
    
    embedding = model.encode(plot,batch_size=64)
    return embedding

In [26]:
print(f"Embedding for plot {df.iloc[26820]['Plot']}:")
print("-----------------------------------------------")
print(get_bert_embedding(df.iloc[26820]['Plot'], bert_model))
print("-----------------------------------------------")
print(f"Embedding shape: {get_bert_embedding(df.iloc[26820]['Plot'],bert_model).shape}")

Embedding for plot Meinu Ek Ladki Chaahiye is a comical satirical Hindi movie starring Raghubir Yadav, Puru Chibber, Reecha Sinha, Zakir Hussain, Yatin Karyekar & Rashee Bindal. Govind (Raghubir Yadav) & his assistant Shishupal (Puru Chibber) get their first legal case of firoz( irfan razaa khan) after a lot of difficulties even though Govind’s father is a renowned lawyer. Govind, a jovial person does everything that he can to save the accused. It’s during the investigation that Govind and Shishupul come across some startling incidents. The case is of a serious nature but these two solve it in a very comical and hilarious way. However this case turns Govind’s life upside down. Harassment by the police, getting locked up inside a jail, wife and daughter’s hatred resulting in strained relations with them...Govind endures a lot. Even after all this, Govind is determined to find out the truth. This movie touches upon several issues in the country however in an entertaining manner. Since it

In [21]:
bert_model.save('models/bert_model_all-mpnet-base-v2')

In [13]:
bert_model2 = SentenceTransformer('BAAI/bge-large-en-v1.5',device=device)

In [14]:
print(f"Embedding for plot {df.iloc[26820]['Plot']}:")
print("-----------------------------------------------")
print(get_bert_embedding( df.iloc[26820]['Plot'] , bert_model2 ))
print("-----------------------------------------------")
print(f"Embedding shape: {get_bert_embedding( df.iloc[26820]['Plot'] , bert_model2 ).shape}")

Embedding for plot Meinu Ek Ladki Chaahiye is a comical satirical Hindi movie starring Raghubir Yadav, Puru Chibber, Reecha Sinha, Zakir Hussain, Yatin Karyekar & Rashee Bindal. Govind (Raghubir Yadav) & his assistant Shishupal (Puru Chibber) get their first legal case of firoz( irfan razaa khan) after a lot of difficulties even though Govind’s father is a renowned lawyer. Govind, a jovial person does everything that he can to save the accused. It’s during the investigation that Govind and Shishupul come across some startling incidents. The case is of a serious nature but these two solve it in a very comical and hilarious way. However this case turns Govind’s life upside down. Harassment by the police, getting locked up inside a jail, wife and daughter’s hatred resulting in strained relations with them...Govind endures a lot. Even after all this, Govind is determined to find out the truth. This movie touches upon several issues in the country however in an entertaining manner. Since it

In [24]:
bert_model2.save('models/bert_model_bge-large-en-v1.5')

# Indexing

## Inverted Index

In [None]:


inverted_index = defaultdict(list)

for term_idx , term in enumerate(tfidf_features):
    for doc_idx in tfidf_matrix[:, term_idx].nonzero()[0]:

        tf_idf_score = tfidf_matrix[doc_idx, term_idx] 
        inverted_index[term].append((doc_idx, tf_idf_score))
        




Time to create inverse index : 15 minutes and 20 seconds

sort by tfidf score

In [34]:
for term in inverted_index:
    inverted_index[term].sort(key=lambda x: x[1], reverse=True)

test the inverted index

In [35]:
inverted_index['jack']

[(np.int32(23293), np.float64(0.7742107140428304)),
 (np.int32(16387), np.float64(0.7023112175872542)),
 (np.int32(11916), np.float64(0.69823091459789)),
 (np.int32(20115), np.float64(0.6923995546460648)),
 (np.int32(15939), np.float64(0.6735974934638481)),
 (np.int32(5651), np.float64(0.6488526218009987)),
 (np.int32(10372), np.float64(0.6422375768607566)),
 (np.int32(8569), np.float64(0.6355830966214756)),
 (np.int32(4), np.float64(0.618869899505765)),
 (np.int32(4370), np.float64(0.6106623512069875)),
 (np.int32(11609), np.float64(0.6104107418913691)),
 (np.int32(41), np.float64(0.6044558021173301)),
 (np.int32(21459), np.float64(0.6000510621626688)),
 (np.int32(12569), np.float64(0.5992219474395601)),
 (np.int32(13646), np.float64(0.5974533248525302)),
 (np.int32(17013), np.float64(0.5950808322908806)),
 (np.int32(21481), np.float64(0.5950808322908806)),
 (np.int32(22023), np.float64(0.594335244842615)),
 (np.int32(20548), np.float64(0.5810650708233327)),
 (np.int32(12755), np.floa

In [34]:
df.iloc[239]['Plot']

'As described in a film magazine,[3] Iris Lee (Minter) has tired of her humdrum country existence and one night steals away from the home of Martha Kane (Schaefer), with whom she had been living, and goes to the city intent on becoming a singer. Slowly rising from a church soloist to a prima donna, her dreams are finally realized. While in the city she meets Jack Andrews (Forrest) and falls in love with him, but when he comes to her one night intoxicated, she sends him away. Grieving over Jack she returns to her home town where, having his manhood returned, he finds her by accident.'

In [35]:
df.iloc[15792]['Plot']

'Ali Rose (Aguilera) moves to Los Angeles after she quits her bar job when her boss refuses to pay her. Once in L.A., she tries and fails at every audition she does until one night, she finds herself unknowingly in a burlesque club when she hears the music on the street. She finds Tess (Cher) and the dancers performing “Welcome to Burlesque” and decides to pursue a career on stage once she meets Jack (Gigandet) at the club\'s bar. Jack refers her to Tess for an audition, but she is instantly rejected and ushered out by Sean (Tucci). Instead of leaving, Ali begins serving customers at the club as a waitress, while Jack asks Tess to give Ali a chance.\r\nWhen Georgia (Hough) becomes pregnant, auditions are held to replace her. Ali begins her audition when everyone leaves, and after performing "Wagon Wheel Watusi", persuades Tess to allow her to become one of the club\'s dancers, much to the annoyance of Nikki (Bell), the lead performer who is always late and caught drinking before number

In [39]:
inverted_index['aguilera']

[(np.int32(15792), np.float64(0.04276427862935188)),
 (np.int32(16986), np.float64(0.044323085299058304))]

In [41]:
df.iloc[16986]['Plot']

'Three years after winning the previous competition, the Barden Bellas are now led by Beca Mitchell and three-time super senior Chloe Beale. The Bellas have become ICCA champions each of these three years. However, the group becomes involved in a national scandal, dubbed Muffgate, when a wardrobe malfunction causes Fat Amy\'s pants to rip in front of President Barack Obama, showing her genitals to the public since she did not wear underwear, leading to the Bellas\' suspension from the ICCAs. Beca makes a deal to allow the Bellas to be reinstated should they win the World Championship of a cappella.\r\nFreshman Emily Junk begins her college career, hoping to follow in her mother Katherine\'s footsteps by being a Bella. At orientation, she watches an a cappella performance by the Treblemakers, now led by Beca\'s boyfriend Jesse Swanson. Benjamin "Benji" Applebaum, Jesse\'s best friend, overhears Emily, leading to a crush on her.\r\nThe Bellas learn that Das Sound Machine (DSM), a German 

Save the inverted index

In [36]:
with open("models/inverted_index.pkl", "wb") as f:
    pickle.dump(inverted_index, f)

## FAISS(Facebook AI Similarity Search)

In [29]:
index = faiss.IndexFlatL2(model.vector_size)
print(model.vector_size)


300


In [30]:
df.shape

(34608, 8)

Add the vectors to the index

In [31]:
for plot in df['Plot']:
    plot_vector = get_plot_vector(plot)
    plot_vector = plot_vector.astype('float32')
    index.add(plot_vector.reshape(1, -1))
    
print(f"Added plot vector to index: {index.ntotal} vectors")
print(f"Index dimension: {index.d}")

Added plot vector to index: 34608 vectors
Index dimension: 300


Time to add vectors: 3 minutes and 30 seconds

In [33]:
print(index.ntotal)

34608


In [34]:
faiss.write_index(index, 'models/faiss_index.index')

## FAISS for Bert

first model

In [None]:
bert_index1 = faiss.IndexFlatIP(768)

for plot in df['Plot']:
    plot_embedding = get_bert_embedding(plot, bert_model)
    plot_embedding = plot_embedding.astype('float32')
    bert_index1.add(plot_embedding.reshape(1, -1))

Time to create vectors and add them to the index using google collab : 16 minutes

In [None]:
faiss.write_index(bert_index1, 'models/bert_index1.index')

second model

In [None]:
bert_index2 = faiss.IndexFlatIP(1024)

for plot in df['Plot']:
    plot_embedding = get_bert_embedding(plot, bert_model2)
    plot_embedding = plot_embedding.astype('float32')
    bert_index2.add(plot_embedding.reshape(1, -1))

Time to create vectors and add them to the index using google collab : 45 minutes

In [None]:
faiss.write_index(bert_index2, 'models/bert_index2.index')

# Retrieve and Rank Results

## Use the inverted index for search

load the inverted index and the tfidfvectorizer

In [33]:
with open("models/inverted_index.pkl", "rb") as f:
    inverted_index = pickle.load(f)

with open("models/tfidf_vectorizer.pkl", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

with open("models/tfidf_matrix.pkl", "rb") as f:
    tfidf_matrix = pickle.load(f)

searching with inverted index

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

def search_inverted_index(query,top_k=10,depth=None,term_count_score=0.0):

    query_vector = tfidf_vectorizer.transform([query])

    query_terms = [tfidf_features[i] for i in query_vector.nonzero()[1]]
    term_count = len(query_terms)
    
    matching_docs = defaultdict(int)

    for term in query_terms:
        if term in inverted_index:
            if depth is not None:
                matches = inverted_index[term][:depth]
            else:
                matches = inverted_index[term]
            for doc_id ,score in matches:
                    matching_docs[doc_id] += 1


    matching_docs_matrix = tfidf_matrix[list(matching_docs.keys())]

    similarities = cosine_similarity(query_vector, matching_docs_matrix).flatten()


    results =[]
    for i, doc_id in enumerate(list(matching_docs.keys())):
        score = similarities[i]  + ( term_count_score * ( matching_docs[doc_id] / term_count ) )
        results.append((doc_id,score,similarities[i]))

    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_k]







Example 

In [42]:
results = search_inverted_index("jack and the beanstalk",top_k=10,term_count_score=1)
for doc_id, score, similarity in results:
    print(f"Document ID: {doc_id}, Score: {score:.4f}, Similarity: {similarity:.4f}")
    
    

Document ID: 5651, Score: 1.5343, Similarity: 0.5343
Document ID: 16571, Score: 1.4926, Similarity: 0.4926
Document ID: 16570, Score: 1.4093, Similarity: 0.4093
Document ID: 4, Score: 1.3958, Similarity: 0.3958
Document ID: 6246, Score: 1.2494, Similarity: 0.2494
Document ID: 16906, Score: 1.1942, Similarity: 0.1942
Document ID: 6931, Score: 1.1777, Similarity: 0.1777
Document ID: 12269, Score: 1.1696, Similarity: 0.1696
Document ID: 16094, Score: 1.1134, Similarity: 0.1134
Document ID: 20478, Score: 1.0312, Similarity: 0.0312


In [45]:
results = search_inverted_index("jack and the beanstalk",top_k=10)
for doc_id,score,similarity in results:
    print(f"Document ID: {doc_id}, Score: {score}  Similarity: {similarity:.4f}")
   

Document ID: 5651, Score: 0.5343020274339227  Similarity: 0.5343
Document ID: 16571, Score: 0.4925805246333418  Similarity: 0.4926
Document ID: 16570, Score: 0.4092626148872634  Similarity: 0.4093
Document ID: 4, Score: 0.39580026940712854  Similarity: 0.3958
Document ID: 23293, Score: 0.3299882066665093  Similarity: 0.3300
Document ID: 16387, Score: 0.2993428210302054  Similarity: 0.2993
Document ID: 11916, Score: 0.29760369259695857  Similarity: 0.2976
Document ID: 20115, Score: 0.2951182193555961  Similarity: 0.2951
Document ID: 15939, Score: 0.28710430487648153  Similarity: 0.2871
Document ID: 10372, Score: 0.2737379144954603  Similarity: 0.2737


## Use FAISS for search

load the index

In [25]:
index = faiss.read_index('models/faiss_index.index')

In [32]:



def search_faiss_index(query, top_k=10):
    query_vector = get_plot_vector(query)
    query_vector = query_vector.astype('float32')
    query_vector = query_vector.reshape(1, -1)
    D, I = index.search(query_vector, top_k)
    
    results = []
    for rank, (distance, idx) in enumerate(zip(D[0], I[0]), 1):
        results.append((idx, distance))
    
    results.sort(key=lambda x: x[1])

    return results

# Example usage

answers = search_faiss_index("jack and the beanstalk", top_k=10)

for  doc_id, distance  in answers:
    print(f"Document ID: {doc_id}, Distance: {distance:.4f}")




Document ID: 4, Distance: 4.3515
Document ID: 16571, Distance: 4.4894
Document ID: 5651, Distance: 4.5679
Document ID: 16570, Distance: 4.7049
Document ID: 16387, Distance: 5.0265
Document ID: 23293, Distance: 5.0378
Document ID: 15030, Distance: 5.0420
Document ID: 6246, Distance: 5.0580
Document ID: 22704, Distance: 5.0753
Document ID: 5363, Distance: 5.1144


## Use FAISS for search(Bert)

First model

In [None]:
bert_index1 = faiss.read_index('models/bert_index1.index')

def search_faiss_bert_index(query, model,top_k=10):
    query_vector = get_bert_embedding(query,model)
    query_vector = query_vector.astype('float32')
    query_vector = query_vector.reshape(1, -1)
    S, I = bert_index1.search(query_vector, top_k)
    
    results = []
    for rank, (similarity, idx) in enumerate(zip(S[0], I[0]), 1):
        results.append((idx, similarity))
    
    results.sort(key=lambda x: x[0])

    return results

In [12]:
answers = search_faiss_bert_index("jack and the beanstalk", bert_model, top_k=10)

for doc_id, similarity in answers:
    print(f"Document ID: {doc_id}, Similarity: {similarity:.4f}")

Document ID: 4, Similarity: 0.6917
Document ID: 5651, Similarity: 0.5810
Document ID: 6246, Similarity: 0.4959
Document ID: 7139, Similarity: 0.4998
Document ID: 12492, Similarity: 0.5125
Document ID: 16570, Similarity: 0.6817
Document ID: 16571, Similarity: 0.5315
Document ID: 19662, Similarity: 0.4952
Document ID: 20548, Similarity: 0.5023
Document ID: 33398, Similarity: 0.5075


second model

In [15]:
bert_index2 = faiss.read_index('models/bert_index2.index')

def search_faiss_bert_index(query, model,top_k=10):
    query_vector = get_bert_embedding(query,model)
    query_vector = query_vector.astype('float32')
    query_vector = query_vector.reshape(1, -1)
    S, I = bert_index2.search(query_vector, top_k)
    
    results = []
    for rank, (similarity, idx) in enumerate(zip(S[0], I[0]), 1):
        results.append((idx, similarity))
    
    results.sort(key=lambda x: x[0])

    return results

In [17]:
answers = search_faiss_bert_index("jack and the beanstalk", bert_model2, top_k=10)

for doc_id, similarity in answers:
    print(f"Document ID: {doc_id}, Similarity: {similarity:.4f}")

Document ID: 4, Similarity: 0.7414
Document ID: 5651, Similarity: 0.6677
Document ID: 6246, Similarity: 0.7213
Document ID: 6931, Similarity: 0.6580
Document ID: 12269, Similarity: 0.6676
Document ID: 15359, Similarity: 0.6486
Document ID: 16094, Similarity: 0.6608
Document ID: 16570, Similarity: 0.7252
Document ID: 16571, Similarity: 0.6887
Document ID: 33398, Similarity: 0.6761


# Examples of TF_IDF_Search_Engine

In [62]:
from TF_IDF_SearchEngine import TF_IDF_SearchEngine

with open("models/tfidf_search_engine.pkl", "rb") as f:
    search_engine1 = pickle.load(f)



time to load : 1 minute

In [63]:
results = search_engine1.search("jack and the beanstalk", top_k=10)
for doc_id,title, plot ,score, similarity in results:
    print(f"Document ID: {doc_id}, Title: {title}, Score: {score:.4f}, Similarity: {similarity:.4f}, Plot: {plot[:50]}...")

Document ID: 5651, Title: Jack and the Beanstalk 1952, Score: 0.5343, Similarity: 0.5343, Plot: Mr. Dinkle and Jack (Abbott and Costello) look for...
Document ID: 16571, Title: Jack the Giant Slayer, Score: 0.4926, Similarity: 0.4926, Plot: In the Kingdom of Cloister, Jack, a young farm boy...
Document ID: 16570, Title: Jack the Giant Killer 2013, Score: 0.4093, Similarity: 0.4093, Plot: After climbing a giant beanstalk, Jack discovers a...
Document ID: 4, Title: Jack and the Beanstalk 1902, Score: 0.3958, Similarity: 0.3958, Plot: The earliest known adaptation of the classic fairy...
Document ID: 23293, Title: Blackjack, Score: 0.3300, Similarity: 0.3300, Plot: Jack Devlin (Dolph Lundgren) is a U.S. Marshall tu...
Document ID: 16387, Title: A Thousand Words, Score: 0.2993, Similarity: 0.2993, Plot: Jack McCall (Eddie Murphy) is a literary agent who...
Document ID: 11916, Title: Traces of Red, Score: 0.2976, Similarity: 0.2976, Plot: The movie begins with Jack Dobson, a homicide dete..

In [64]:
results = search_engine1.search("jack and the beanstalk", top_k=10, term_count_score=0.2)
for doc_id,title, plot ,score, similarity in results:
    print(f"Document ID: {doc_id}, Title: {title}, Score: {score:.4f}, Similarity: {similarity:.4f}, Plot: {plot[:50]}...")

Document ID: 5651, Title: Jack and the Beanstalk 1952, Score: 0.7343, Similarity: 0.5343, Plot: Mr. Dinkle and Jack (Abbott and Costello) look for...
Document ID: 16571, Title: Jack the Giant Slayer, Score: 0.6926, Similarity: 0.4926, Plot: In the Kingdom of Cloister, Jack, a young farm boy...
Document ID: 16570, Title: Jack the Giant Killer 2013, Score: 0.6093, Similarity: 0.4093, Plot: After climbing a giant beanstalk, Jack discovers a...
Document ID: 4, Title: Jack and the Beanstalk 1902, Score: 0.5958, Similarity: 0.3958, Plot: The earliest known adaptation of the classic fairy...
Document ID: 6246, Title: Beanstalk Bunny, Score: 0.4494, Similarity: 0.2494, Plot: The story begins with Daffy Duck in the role of Ja...
Document ID: 23293, Title: Blackjack, Score: 0.4300, Similarity: 0.3300, Plot: Jack Devlin (Dolph Lundgren) is a U.S. Marshall tu...
Document ID: 16387, Title: A Thousand Words, Score: 0.3993, Similarity: 0.2993, Plot: Jack McCall (Eddie Murphy) is a literary agent who.

How much time it takes to answer 30 queries

In [None]:
queries =[ 

    "Romantic comedy",
    "Science fiction adventure",
    "Action thriller",
    "Historical drama",
    "Horror film",
    "Fantasy epic",
    "Biographical film",
    "Crime investigation",
    "Political thriller",
    "Coming of age",
    "Family friendly",
    "Independent film",
    "War story",
    "Western classic",
    "Documentary",
    "When a flying saucer lands in Washington",
    "Jack and the Beanstalk",
    "The first Asian Nobel Laureate, Rabindranath Tagore, is still revered",
    "A major international financier is found dead at his Hampshire",
    "Inspector Amar and Inspector Chhaya are after the criminal Hardayal",
    "In a future post-apocalyptic Earth in the year 2293, the",
    "Illustrated Press society editor Lorelei Kilbourne (Hillary Brooke) is assigned",
    "Sivan (Vinu Mohan) is a medical college student better known",
    "In the prologue, a young Max Skinner, whose parents have",
    "Five men, criminals Ray, Dave, Stevie, Julian as a",
    "Prince Edward of Euphrania returns home after meeting the princess",
    "With the outbreak of World War I, a young woman",
    "The Shintai Sobu fitness club is formed by the chief",
    "The city of Kozhikode is victimized by a massive communal",
    "A spy (Essie Lin Chia (fr)) discovers that the Chinese"
        ]

In [69]:
start_time = time.time()

for query in queries:
    results = search_engine1.search(query, top_k=10, term_count_score=0.2)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time for {len(queries)} queries: {execution_time:.2f} seconds")
    

Execution time for 30 queries: 2.48 seconds


# Examples of Word2VecSearchEngine

In [1]:
from Word2VecSearchEngine import Word2VecSearchEngine
import faiss
import pandas as pd

index = faiss.read_index('models/faiss_index.index')
df = pd.read_csv('wiki_movie_plots_deduped_updated.csv')

search_engine2 = Word2VecSearchEngine(model_path='models/GoogleNews-vectors-negative300.bin',faiss_index=index, df=df)

[nltk_data] Downloading package punkt_tab to C:\Users\John
[nltk_data]     skoul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\John
[nltk_data]     skoul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\John
[nltk_data]     skoul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Time to load Word2VecSearchEngine : 2 minutes and 30 seconds

In [2]:
results = search_engine2.search(query="jack and the beanstalk", top_k=10)
for doc_id,title, plot ,distance in results:
    print(f"Document ID: {doc_id}, Title: {title}, Distance: {distance:.4f}, Plot: {plot[:50]}...")

Document ID: 4, Title: Jack and the Beanstalk 1902, Distance: 4.3515, Plot: The earliest known adaptation of the classic fairy...
Document ID: 16571, Title: Jack the Giant Slayer, Distance: 4.4894, Plot: In the Kingdom of Cloister, Jack, a young farm boy...
Document ID: 5651, Title: Jack and the Beanstalk 1952, Distance: 4.5679, Plot: Mr. Dinkle and Jack (Abbott and Costello) look for...
Document ID: 16570, Title: Jack the Giant Killer 2013, Distance: 4.7049, Plot: After climbing a giant beanstalk, Jack discovers a...
Document ID: 16387, Title: A Thousand Words, Distance: 5.0265, Plot: Jack McCall (Eddie Murphy) is a literary agent who...
Document ID: 23293, Title: Blackjack, Distance: 5.0378, Plot: Jack Devlin (Dolph Lundgren) is a U.S. Marshall tu...
Document ID: 15030, Title: Pirates of the Caribbean: Dead Man's Chest, Distance: 5.0420, Plot: The wedding of Will Turner and Elizabeth Swann is ...
Document ID: 6246, Title: Beanstalk Bunny, Distance: 5.0580, Plot: The story begins with

In [3]:
results = search_engine2.search("Pirates", top_k=10)
for doc_id,title, plot ,distance in results:
    print(f"Document ID: {doc_id}, Title: {title}, Distance: {distance:.4f}, Plot: {plot[:50]}...")

Document ID: 580, Title: The Black Pirate, Distance: 8.4920, Plot: The film begins with the looting of a ship already...
Document ID: 8245, Title: The King's Pirate, Distance: 8.7911, Plot: A British naval officer volunteers for a dangerous...
Document ID: 653, Title: The Road to Romance, Distance: 8.8354, Plot: Serafina (Marceline Day) is captured by Don Baltha...
Document ID: 3594, Title: Jack London, Distance: 8.9751, Plot: The film follows the adventures of the writer-adve...
Document ID: 21172, Title: The Pirates! In an Adventure with Scientists!, Distance: 9.0019, Plot: In 1837, the Pirate Captain (Hugh Grant), inexpert...
Document ID: 10113, Title: The Pirates of Penzance, Distance: 9.3465, Plot: Frederic was sent in the care of his nursemaid, Ru...
Document ID: 360, Title: Treasure Island 1920, Distance: 9.4517, Plot: Young Jim Hawkins is caught up with the pirate Lon...
Document ID: 8867, Title: Treasure Island 1972, Distance: 9.5347, Plot: Enchanted by the idea of locating tr

How much time it takes to answer 30 queries

In [6]:
import time

start_time = time.time()

for query in queries:
    results = search_engine2.search(query, top_k=10)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time for {len(queries)} queries: {execution_time:.2f} seconds")

Execution time for 30 queries: 0.74 seconds


# Examples of BertSearchEngine

all-mpnet-base-v2 model

In [5]:
from BertSearchEngine import BertSearchEngine
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'


index = faiss.read_index('models/bert_index1.index')

df = pd.read_csv('wiki_movie_plots_deduped_updated.csv')

bert_search_engine = BertSearchEngine(model_path='models/bert_model_all-mpnet-base-v2', index=index, df=df)



In [6]:
results = bert_search_engine.search(query="jack and the beanstalk", top_k=10)
for doc_id,similarity,title, plot  in results:
    print(f"Document ID: {doc_id},Similarity: {similarity:.4f}, Title: {title},  Plot: {plot[:50]}...")

Document ID: 4,Similarity: 0.6917, Title: Jack and the Beanstalk 1902,  Plot: The earliest known adaptation of the classic fairy...
Document ID: 16570,Similarity: 0.6817, Title: Jack the Giant Killer 2013,  Plot: After climbing a giant beanstalk, Jack discovers a...
Document ID: 5651,Similarity: 0.5810, Title: Jack and the Beanstalk 1952,  Plot: Mr. Dinkle and Jack (Abbott and Costello) look for...
Document ID: 16571,Similarity: 0.5315, Title: Jack the Giant Slayer,  Plot: In the Kingdom of Cloister, Jack, a young farm boy...
Document ID: 12492,Similarity: 0.5125, Title: Magic Island (film),  Plot: Jack Carlisle is a disillusioned 13-year-old boy. ...
Document ID: 33398,Similarity: 0.5075, Title: Magic Tree House,  Plot: Jack is a shy but confident bookworm and his siste...
Document ID: 20548,Similarity: 0.5023, Title: The Cement Garden,  Plot: Jack is a narcissistic 15-year-old boy, helping hi...
Document ID: 7139,Similarity: 0.4998, Title: Tom Thumb,  Plot: Jonathan, a poor but hones

In [7]:
results = bert_search_engine.search(query="Horror Film", top_k=10)
for doc_id,similarity,title, plot  in results:
    print(f"Document ID: {doc_id},Similarity: {similarity:.4f}, Title: {title},  Plot: {plot[:50]}...")

Document ID: 23423,Similarity: 0.6090, Title: The Heirloom,  Plot: The film begins with the description of an obscure...
Document ID: 21097,Similarity: 0.6050, Title: Grave Tales,  Plot: A young, genealogist (Heather Darcy) whiles away h...
Document ID: 11734,Similarity: 0.6048, Title: There's Nothing Out There,  Plot: A frog-like alien attacks a group of teenagers who...
Document ID: 34392,Similarity: 0.6006, Title: Tunnel 3D,  Plot: A group of friends are invited to the launch party...
Document ID: 10737,Similarity: 0.5922, Title: I Was a Teenage Zombie,  Plot: The film begins like a 1980s comedy with teens loo...
Document ID: 10297,Similarity: 0.5788, Title: Terror in the Aisles,  Plot: Director Andrew J. Kuehn has excerpted brief segme...
Document ID: 18799,Similarity: 0.5771, Title: Escape from Broadmoor,  Plot: An insane killer escapes from Broadmoor Hospital, ...
Document ID: 23327,Similarity: 0.5740, Title: Troublesome Night 7,  Plot: A film crew travels to a remote island to s

How much time it takes to answer 30 queries

In [10]:
import time

start_time = time.time()

for query in queries:
    results = bert_search_engine.search(query, top_k=10)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time for {len(queries)} queries: {execution_time:.2f} seconds")

Execution time for 30 queries: 2.22 seconds


bge-large-en-v1.5 model

In [11]:
from BertSearchEngine import BertSearchEngine
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'


index2 = faiss.read_index('models/bert_index2.index')

df = pd.read_csv('wiki_movie_plots_deduped_updated.csv')

bert_search_engine2 = BertSearchEngine(model_path='models/bert_model_bge-large-en-v1.5', index=index2, df=df)

In [14]:
results = bert_search_engine2.search(query="jack and the beanstalk", top_k=10)
for doc_id,similarity,title, plot  in results:
    print(f"Document ID: {doc_id},Similarity: {similarity:.4f}, Title: {title},  Plot: {plot[:50]}...")

Document ID: 4,Similarity: 0.7414, Title: Jack and the Beanstalk 1902,  Plot: The earliest known adaptation of the classic fairy...
Document ID: 16570,Similarity: 0.7252, Title: Jack the Giant Killer 2013,  Plot: After climbing a giant beanstalk, Jack discovers a...
Document ID: 6246,Similarity: 0.7213, Title: Beanstalk Bunny,  Plot: The story begins with Daffy Duck in the role of Ja...
Document ID: 16571,Similarity: 0.6887, Title: Jack the Giant Slayer,  Plot: In the Kingdom of Cloister, Jack, a young farm boy...
Document ID: 33398,Similarity: 0.6761, Title: Magic Tree House,  Plot: Jack is a shy but confident bookworm and his siste...
Document ID: 5651,Similarity: 0.6677, Title: Jack and the Beanstalk 1952,  Plot: Mr. Dinkle and Jack (Abbott and Costello) look for...
Document ID: 12269,Similarity: 0.6676, Title: The Pagemaster,  Plot: Pessimistic 10-year-old Richard Tyler lives life b...
Document ID: 16094,Similarity: 0.6608, Title: Puss in Boots 2011,  Plot: Puss in Boots (Antonio B

In [13]:
results = bert_search_engine2.search(query="Horror Film", top_k=10)
for doc_id,similarity,title, plot  in results:
    print(f"Document ID: {doc_id},Similarity: {similarity:.4f}, Title: {title},  Plot: {plot[:50]}...")

Document ID: 21097,Similarity: 0.7081, Title: Grave Tales,  Plot: A young, genealogist (Heather Darcy) whiles away h...
Document ID: 16700,Similarity: 0.6989, Title: V/H/S/2,  Plot: The film is presented as an anthology of short hor...
Document ID: 14797,Similarity: 0.6805, Title: Abominable,  Plot: The film begins with a farmer named Billy Hoss (Re...
Document ID: 9471,Similarity: 0.6798, Title: A Distant Thunder,  Plot: Patty Myers is lying awake one night in a church b...
Document ID: 23671,Similarity: 0.6788, Title: Basement 2014,  Plot: A group of people gets stuck in a basement parking...
Document ID: 20904,Similarity: 0.6772, Title: Wild Country,  Plot: The plot of the film revolves around a group of Gl...
Document ID: 8556,Similarity: 0.6752, Title: I Drink Your Blood,  Plot: The film opens on a Satanic ritual conducted by Ho...
Document ID: 11033,Similarity: 0.6745, Title: Phantasm II,  Plot: The film introduces Liz Reynolds, a young woman wh...
Document ID: 10952,Similarity: 

In [16]:
import time

start_time = time.time()

for query in queries:
    results = bert_search_engine2.search(query, top_k=10)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time for {len(queries)} queries: {execution_time:.2f} seconds")

Execution time for 30 queries: 5.88 seconds
