# Simple Elastic search engine

## Import Elastic

In [1]:
#import Elasticsearch class from package elasticsearch
from elasticsearch import Elasticsearch

## Connect to Elastic

In [2]:
#enter url, auth credentials and certificate of the locally running elastic server
es = Elasticsearch(
    "https://localhost:9200/",
    basic_auth=("elastic", "yy9VKrgkGF54ci28GPlJ"),
    ca_certs="C:/Users/Simov/Downloads/elasticsearch-8.17.0/config/certs/http_ca.crt"
)
#ping the server to test the connection
es.ping()

True

## Prepare the data

In [3]:
#import pandas package for working with data frames and use the alias pd
import pandas as pd
#use the read_csv function from pandas to load the entries from the csv dataset into a data frame
df = pd.read_csv("movies_metadata.csv", low_memory=False)
#check the first 5 documents loaded into the data frame
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
#check overview column, which is of interest because it contains the plot summary
df["overview"].isna().value_counts()

overview
False    44512
True       954
Name: count, dtype: int64

In [5]:
#fill the the NaN movie overviews with empty string
df["overview"] = df["overview"].fillna("")
#check for NaN once again to verify the previous operation
df["overview"].isna().value_counts()

overview
False    45466
Name: count, dtype: int64

In [6]:
#check the rest of the columns for NaN values
df.isna().value_counts()

adult  belongs_to_collection  budget  genres  homepage  id     imdb_id  original_language  original_title  overview  popularity  poster_path  production_companies  production_countries  release_date  revenue  runtime  spoken_languages  status  tagline  title  video  vote_average  vote_count
False  True                   False   False   True      False  False    False              False           False     False       False        False                 False                 False         False    False    False             False   True     False  False  False         False         19388
                                                                                                                                                                                                                                                    False    False  False  False         False         14096
                                              False     False  False    False              False          

In [7]:
#fill the the NaN values with None string
with pd.option_context("future.no_silent_downcasting", True):
    df = df.fillna("None").infer_objects(copy=False)
#check for NaN once again to verify the previous operation
df.isna().value_counts()

adult  belongs_to_collection  budget  genres  homepage  id     imdb_id  original_language  original_title  overview  popularity  poster_path  production_companies  production_countries  release_date  revenue  runtime  spoken_languages  status  tagline  title  video  vote_average  vote_count
False  False                  False   False   False     False  False    False              False           False     False       False        False                 False                 False         False    False    False             False   False    False  False  False         False         45466
Name: count, dtype: int64

## Convert the relevant field to Vector using a pre-trained BERT model

In [8]:
#import SentenceTransformer class from package sentence_transformers
from sentence_transformers import SentenceTransformer
#load all-mpnet-base-v2 pre-trained model from https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer("all-mpnet-base-v2")

In [9]:
#use the transformer model to create an additional column which will contain the embeddings of the overviews
df["overview_vector"] = df["overview"].apply(lambda o: model.encode(o))
#compare the first 5 entries from the extra column which the model created in the last operation with the original column
df[["overview", "overview_vector"]].head()

Unnamed: 0,overview,overview_vector
0,"Led by Woody, Andy's toys live happily in his ...","[0.05644015, 0.059867132, -0.010645737, 0.0240..."
1,When siblings Judy and Peter discover an encha...,"[0.04063317, -0.012308189, -0.048614934, 0.051..."
2,A family wedding reignites the ancient feud be...,"[-0.013073963, 0.02507021, -0.021723554, -0.00..."
3,"Cheated on, mistreated and stepped on, the wom...","[0.046399668, 0.0318815, -0.008322174, -0.0191..."
4,Just when George Banks has recovered from his ...,"[-0.015505952, -0.01025263, -0.020241242, -0.0..."


## Create new index in Elastic

In [10]:
#import index mapping from a file
from index_mappings import index_mappings
#create a new index by passing in name of the index and its mappings
es.indices.create(index="movie_overviews", mappings=index_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movie_overviews'})

## Ingest the data into the index

In [11]:
#convert the data frame into json record format
records = df.to_dict("records")
records[0]

{'adult': 'False',
 'belongs_to_collection': "{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",
 'budget': '30000000',
 'genres': "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': '862',
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': '21.946943',
 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg',
 'production_companies': "[{'name': 'Pixar Animation Studios', 'id': 3}]",
 'production_countries': "[{'iso_3166_1':

In [22]:
for record in records:
    try:
        es.index(index="movie_overviews", document=record, id=record["id"])
    except Exception as e:
        print(e)

BadRequestError(400, 'document_parsing_exception', "[1:521] failed to parse field [runtime] of type [float] in document with id '287305'. Preview of field's value: 'None'")
BadRequestError(400, 'document_parsing_exception', "[1:472] failed to parse field [runtime] of type [float] in document with id '339428'. Preview of field's value: 'None'")
BadRequestError(400, 'document_parsing_exception', "[1:433] failed to parse field [runtime] of type [float] in document with id '278978'. Preview of field's value: 'None'")
BadRequestError(400, 'document_parsing_exception', "[1:609] failed to parse field [release_date] of type [date] in document with id '365371'. Preview of field's value: 'None'")
BadRequestError(400, 'document_parsing_exception', "[1:1293] failed to parse field [release_date] of type [date] in document with id '215107'. Preview of field's value: 'None'")
BadRequestError(400, 'document_parsing_exception', "[1:445] failed to parse field [runtime] of type [float] in document with i

## Search

In [23]:
def search(raw_query, k, n):
    query_vector = model.encode(raw_query)
    query = {
        "knn":{
            "field" : "overview_vector",
            "query_vector" : query_vector,
            "k" : k,
            "num_candidates": n
        },
        "fields" : ["original_title", "overview", "vote_average", "vote_count"]
    }
    res = es.search(index="movie_overviews", body=query)
    return [x["fields"] for x in res["hits"]["hits"]]

In [24]:
search("clownfish father and forgetful fish", 5, 500)

[{'overview': ["Nemo, an adventurous young clownfish, is unexpectedly taken from his Great Barrier Reef home to a dentist's office aquarium. It's up to his worrisome father Marlin and a friendly but forgetful fish Dory to bring Nemo home -- meeting vegetarian sharks, surfer dude turtles, hypnotic jellyfish, hungry seagulls, and more along the way."],
  'original_title': ['Finding Nemo'],
  'vote_count': [6292],
  'vote_average': [7.6]},
 {'overview': ['An old lighthouse keeper who lives with his daughter secretly keeps a prehistoric fish-man by feeding it scraps and fish. One day he misses the feeding and all hell breaks loose.'],
  'original_title': ['The Monster of Piedras Blancas'],
  'vote_count': [3],
  'vote_average': [5.0]},
 {'overview': ["In this story set at a seaside fishing village and inspired by a Charles Kingsley poem, a young couple's happy life is turned about by an accident. The husband, although saved from drowning, loses his memory. A child is on the way, and soon a

In [25]:
search("professional thieves crew versus predatory restless detective", 5, 500)

[{'overview': ['Obsessive master thief, Neil McCauley leads a top-notch crew on various insane heists throughout Los Angeles while a mentally unstable detective, Vincent Hanna pursues him without rest. Each man recognizes and respects the ability and the dedication of the other even though they are aware their cat-and-mouse game may end in violence.'],
  'original_title': ['Heat'],
  'vote_count': [1886],
  'vote_average': [7.7]},
 {'overview': ["A thief with a unique code of professional ethics is double-crossed by his crew and left for dead. Assuming a new disguise and forming an unlikely alliance with a woman on the inside, he looks to hijack the score of the crew's latest heist."],
  'original_title': ['Parker'],
  'vote_count': [1467],
  'vote_average': [5.7]},
 {'overview': ['A reformed jewel thief helps detectives track down a criminal.'],
  'original_title': ['Arsène Lupin Returns'],
  'vote_count': [1],
  'vote_average': [6.5]},
 {'overview': ['A crooked detective masterminds 

In [26]:
search("genious mathematician marries student, goes insane and then wins nobel prize", 5, 500)

[{'overview': ["At Princeton University, John Nash struggles to make a worthwhile contribution to serve as his legacy to the world of mathematics. He finally makes a revolutionary breakthrough that will eventually earn him the Nobel Prize. After graduate school he turns to teaching, becoming romantically involved with his student Alicia. Meanwhile the government asks his help with breaking Soviet codes, which soon gets him involved in a terrifying conspiracy plot. Nash grows more and more paranoid until a discovery that turns his entire world upside down. Now it is only with Alicia's help that he will be able to recover his mental strength and regain his status as the great mathematician we know him as today.."],
  'original_title': ['A Beautiful Mind'],
  'vote_count': [3087],
  'vote_average': [7.7]},
 {'overview': ['In an age when genius is a mere commodity, it is useful to look at a person who led a rich life without the traditional trappings of success. A man with no home and no j

In [27]:
search("paleontologists and mathematican go to a theme park island with dinosaurs where things go bad", 5, 500)

[{'overview': ['Carl and David, two boys flying a small aircraft over the ocean with their father, crash land near an uncharted island. The boys swim safely to shore, but their father unfortunately drowns in the crash. On their own, the brothers explore the island and soon discover it is not only inhabited by people, but by dinosaurs as well! While the place seems easy enough to get used to, the boys must find a way of returning to their home.'],
  'original_title': ['Dinotopia'],
  'vote_count': [71],
  'vote_average': [5.8]},
 {'overview': ['A wealthy entrepreneur secretly creates a theme park featuring living dinosaurs drawn from prehistoric DNA. Before opening day, he invites a team of experts and his two eager grandchildren to experience the park and help calm anxious investors. However, the park is anything but amusing as the security systems go off-line and the dinosaurs escape.'],
  'original_title': ['Jurassic Park'],
  'vote_count': [4956],
  'vote_average': [7.6]},
 {'overvi

In [28]:
search("two magicians compete while mysterious scientist creates machine and warns about obsessions", 5, 500)

[{'overview': ['A mysterious story of two magicians whose intense rivalry leads them on a life-long battle for supremacy -- full of obsession, deceit and jealousy with dangerous and deadly consequences.'],
  'original_title': ['The Prestige'],
  'vote_count': [4510],
  'vote_average': [8.0]},
 {'overview': ['A pair of rivaling stage magicians are forced to confront their falling out over a guillotine mishap when they compete in a magic competition.'],
  'original_title': ['Magicians'],
  'vote_count': [25],
  'vote_average': [6.2]},
 {'overview': ['An average magician can entertain but a world-class artist can reawaken your faith in the impossible. In this utterly charming showbiz chronicle, four stellar magicians will amaze even the staunchest of skeptics. But for each of these virtuosos, true success seems illusory. Among them: Brian Gillis was Johnny Carson’s favourite close-up magician and a regular on The Tonight Show; David Minkin can levitate almost anything with his mind; and J

## Sources

[1] Movies Dataset: [Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?resource=download&select=movies_metadata.csv)

[2] Elastic: [Installation](https://www.elastic.co/guide/en/elasticsearch/reference/current/zip-windows.html), [KNN Search](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html), [YouTube 1](https://www.youtube.com/watch?v=u-ivLEDC_74), [YouTube 2](https://www.youtube.com/watch?v=obTK8dAaOkc), [YouTube 3](https://www.youtube.com/watch?v=KSwPR9eig7w)

[3] Transformers: [Pretrained Models](https://sbert.net/docs/sentence_transformer/pretrained_models.html)