## Import libraries

In [11]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from sentence_transformers import SentenceTransformer
import pandas as pd
import ast

## Setup Elastic search configuration

In [8]:
es = Elasticsearch(
    "http://localhost:9200"
)

In [9]:
es.ping()

True

## Prepare the data

In [12]:
def merge_details(dataframe):
    dataframe['merge_details'] = dataframe['title'] + " " + dataframe['updated_genres'] + " " + dataframe['overview'] + " " +dataframe['tagline'] + " " +dataframe['updated_spoken_languages']
    return dataframe

def retireve_genres(items):
    return [item['name'] for item in items] 

dataframe1 = pd.read_csv('docs/movies_metadata.csv',sep = ',')
dataframe2 = pd.read_csv('docs/ratings_small.csv',sep=',')
dataframe1['bool_id'] = dataframe1['id'].str.isdigit()
dataframe1 = dataframe1.drop(dataframe1[dataframe1['bool_id'] == False].index,axis=0)
dataframe1 = dataframe1.reset_index(drop=True)
dataframe1['id'] = dataframe1['id'].apply(lambda x: int(x))
dataframe2.rename(columns={'movieId':'id'}, inplace=True)
merge_dataframe = pd.merge(
    dataframe1, 
    dataframe2, 
    on='id'
)[['id','title','rating','genres','overview','tagline','spoken_languages']].dropna().reset_index(drop=True)
merge_dataframe['genres'] = merge_dataframe['genres'].apply(lambda x: ast.literal_eval(x))
merge_dataframe['spoken_languages'] = merge_dataframe['spoken_languages'].apply(lambda x: ast.literal_eval(x))
merge_dataframe['genres'] = merge_dataframe['genres'].apply(retireve_genres)
merge_dataframe['spoken_languages'] = merge_dataframe['spoken_languages'].apply(retireve_genres)
merge_dataframe['updated_genres'] =  merge_dataframe['genres'].apply(lambda x : ' '.join(x))
merge_dataframe['updated_spoken_languages'] = merge_dataframe['spoken_languages'].apply(lambda x : ' '.join(x))
final_data = merge_details(merge_dataframe)

  dataframe1 = pd.read_csv('docs/movies_metadata.csv',sep = ',')


In [47]:
df = final_data.loc[final_data['merge_details'].drop_duplicates(keep='first').index].reset_index(drop=True)
df.head()

Unnamed: 0,id,title,rating,genres,overview,tagline,spoken_languages,updated_genres,updated_spoken_languages,merge_details
0,949,Heat,3.5,"[Action, Crime, Drama, Thriller]","Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,"[English, Español]",Action Crime Drama Thriller,English Español,Heat Action Crime Drama Thriller Obsessive mas...
1,710,GoldenEye,1.0,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"[English, Pусский, Español]",Adventure Action Thriller,English Pусский Español,GoldenEye Adventure Action Thriller James Bond...
2,1408,Cutthroat Island,1.0,"[Action, Adventure]","Morgan Adams and her slave, William Shaw, are ...",The Course Has Been Set. There Is No Turning B...,"[English, Latin]",Action Adventure,English Latin,Cutthroat Island Action Adventure Morgan Adams...
3,524,Casino,2.0,"[Drama, Crime]",The life of the gambling paradise – Las Vegas ...,No one stays at the top forever.,[English],Drama Crime,English,Casino Drama Crime The life of the gambling pa...
4,4584,Sense and Sensibility,5.0,"[Drama, Romance]","Rich Mr. Dashwood dies, leaving his second wif...",Lose your heart and come to your senses.,[English],Drama Romance,English,Sense and Sensibility Drama Romance Rich Mr. D...


## Convert the embeddings of the fields

In [15]:
model = SentenceTransformer("all-mpnet-base-v2")

.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 588kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 190kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 6.75MB/s]
config.json: 100%|██████████| 571/571 [00:00<00:00, 285kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 116kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 188kB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [00:24<00:00, 17.5MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 52.2kB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 239kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 655kB/s]
tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 181kB/s]
train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 13.1MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 560kB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 175kB/s]


In [54]:
df['embeddings'] = df['merge_details'].apply(lambda x: model.encode(x))

In [96]:
dataframe = df[['id','title','rating','genres','overview','tagline','spoken_languages','embeddings']]

In [97]:
dataframe

Unnamed: 0,id,title,rating,genres,overview,tagline,spoken_languages,embeddings
0,949,Heat,3.5,"[Action, Crime, Drama, Thriller]","Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,"[English, Español]","[-0.042521507, 0.020714978, 0.005733385, 0.023..."
1,710,GoldenEye,1.0,"[Adventure, Action, Thriller]",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,"[English, Pусский, Español]","[0.0058759227, -0.025645023, 0.04358933, -0.00..."
2,1408,Cutthroat Island,1.0,"[Action, Adventure]","Morgan Adams and her slave, William Shaw, are ...",The Course Has Been Set. There Is No Turning B...,"[English, Latin]","[-0.04299491, 0.03970336, 0.012585637, -0.0166..."
3,524,Casino,2.0,"[Drama, Crime]",The life of the gambling paradise – Las Vegas ...,No one stays at the top forever.,[English],"[-0.02989307, 0.06692077, 0.009279962, 0.02446..."
4,4584,Sense and Sensibility,5.0,"[Drama, Romance]","Rich Mr. Dashwood dies, leaving his second wif...",Lose your heart and come to your senses.,[English],"[-0.023869524, -0.047999404, -0.009230304, 0.0..."
...,...,...,...,...,...,...,...,...
1781,86190,Adventures Of A Taxi Driver,1.5,[Comedy],Joe North is a London taxi driver who manages ...,He gets more than his fare share...!,[English],"[0.019077772, 0.0903764, 0.023905175, 0.007013..."
1782,2154,The Dark Side of The Moon,4.0,"[Horror, Action, Thriller, Science Fiction]",It is the year 2022. A mysterious systems fail...,Beyond Rescue ... Beyond Reason ... Beyond Sal...,[English],"[0.029375672, -0.044158503, 0.03268308, -0.025..."
1783,8453,Fanaa,4.0,"[Action, Drama, Romance, Thriller]",Zooni Ali Beg (Kajol) is a blind Kashmiri girl...,Destroyed in love...,"[हिन्दी, Italiano, اردو]","[0.039156586, -0.0106035005, -0.0042984663, 0...."
1784,3178,Beat,4.0,[Drama],The story of writer William Seward Burroughs a...,Sex and Drugs before Rock and Roll,[English],"[-0.00040901752, 0.001449527, 0.00467245, -0.0..."


## Create Index with custom mapping

In [74]:
from indexMapping import indexMapping
es.indices.create(index = "elastic-demo", mappings = indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elastic-demo'})

## Injest data into the index

In [98]:
records = dataframe.to_dict(orient='records')

In [101]:
[es.index(index = "elastic-demo", document = record, id = record['id']) for record in records] 

[ObjectApiResponse({'_index': 'elastic-demo', '_id': '949', '_version': 2, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}),
 ObjectApiResponse({'_index': 'elastic-demo', '_id': '710', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}),
 ObjectApiResponse({'_index': 'elastic-demo', '_id': '1408', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}),
 ObjectApiResponse({'_index': 'elastic-demo', '_id': '524', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 4, '_primary_term': 1}),
 ObjectApiResponse({'_index': 'elastic-demo', '_id': '4584', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 5, '_primary_term': 1}),
 ObjectApiResponse({'_index': 'elastic-demo', '_id': '5', '_ver

In [119]:
query = 'Action movies'
query_vector = model.encode(query)
es.search(index = 'elastic-demo',body={
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
        "params": {
          "query_vector": query_vector
        }
      }
    }
  }
})['hits']['hits']

[{'_index': 'elastic-demo',
  '_id': '6474',
  '_score': 1.5070573,
  '_source': {'id': 6474,
   'title': 'Navajo Joe',
   'rating': 2.0,
   'genres': ['Action', 'Western'],
   'overview': 'The sole survivor of a bloody massacre vows revenge on his attackers and on the men who killed his wife.',
   'tagline': 'Navajo revenge slashes ...burns ...ravages the screen!',
   'spoken_languages': ['Italiano'],
   'embeddings': [-0.05125097185373306,
    0.05712582916021347,
    0.010602284222841263,
    -0.0077367243357002735,
    -0.003970255143940449,
    0.031582992523908615,
    -0.01254508551210165,
    0.06344683468341827,
    0.015127033926546574,
    -0.010670589283108711,
    -0.012268530204892159,
    0.04718364030122757,
    0.03762334957718849,
    0.02239859290421009,
    -0.019393058493733406,
    -0.007170577067881823,
    -0.02554761990904808,
    -0.0009041107259690762,
    0.03644527867436409,
    -0.016203202307224274,
    0.020630095154047012,
    -0.02164549194276333,
    