In [None]:
! pip install faiss-gpu ##Installing GPU version of faiss

In [None]:
! pip install sentence_transformers ## For textual similarity, using pretrained models

In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import faiss
from sentence_transformers import SentenceTransformer, util
import math

In [7]:
model = SentenceTransformer('paraphrase-distilroberta-base-v1',device="cuda") ## On GPU Loads the distil roberta model,whcih was trained on millions of data

HBox(children=(FloatProgress(value=0.0, max=305584576.0), HTML(value='')))




### Load the movies data

In [50]:
imdb_movies=pd.read_json('./imdb.json', orient = 'split')
imdb_movies.shape

(10000, 9)

In [52]:
imdb_movies.head(2)

Unnamed: 0,titre,année,certificate,genre,durée,metascore,stars,votes,description
0,Godzilla vs. Kong,2021.0,PG-13,"[Action, Sci-Fi, Thriller]",113.0,59.0,6.5,,The epic next chapter in the cinematic Monster...
1,Black Widow,2021.0,PG-13,"[Action, Adventure, Sci-Fi]",133.0,,,,A film about Natasha Romanoff in her quests be...


In [53]:
## Dropping off rows where Movie Description is NULL
imdb_movies=imdb_movies[pd.notnull(imdb_movies['description'])]
imdb_movies=imdb_movies.reset_index(drop=True)

In [55]:
imdb_movies['id']=imdb_movies.index

In [56]:
imdb_movies.head(2)

Unnamed: 0,titre,année,certificate,genre,durée,metascore,stars,votes,description,id
0,Godzilla vs. Kong,2021.0,PG-13,"[Action, Sci-Fi, Thriller]",113.0,59.0,6.5,,The epic next chapter in the cinematic Monster...,0
1,Black Widow,2021.0,PG-13,"[Action, Adventure, Sci-Fi]",133.0,,,,A film about Natasha Romanoff in her quests be...,1


## Extract the Embeddings for movie description

In [57]:
sentences=imdb_movies['description'].tolist()
print("Number of Sentences in Movie Description ",len(sentences))

Number of Sentences in Movie Description  10000


In [58]:
embeddings=model.encode(sentences)
faiss.normalize_L2(embeddings) 

In [59]:
print("Shape of the EMbeddings is ",embeddings.shape)

Shape of the EMbeddings is  (10000, 768)


In [60]:
## We get a 768 dimension vector using Roberta. So we will create FAISS index with dimaensions - 768

dim=768
ncentroids=50 ## This is a hyperparameter, and indicates number of clusters to be split into
m=16 ## This is also a hyper parameter
quantiser = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFPQ (quantiser, dim,ncentroids, m , 8)
index.train(embeddings) ## This step, will do the clustering and create the clusters
print(index.is_trained)
faiss.write_index(index, "trained.index")

True


In [61]:
### We have to add the embeddings to the Trained Index.
ids=imdb_movies['id'].tolist()
ids=np.array(ids)
index.add_with_ids(embeddings,ids)
print(index.ntotal)


10000


In [62]:
faiss.write_index(index,"block.index")

## Let us use the FAISS index to search for similar movie plots

In [63]:
def searchFAISSIndex(data,id_col_name,query,index,nprobe,model,topk=20):
    ## Convert the query into embeddings
    query_embedding=model.encode([query])[0]
    dim=query_embedding.shape[0]
    query_embedding=query_embedding.reshape(1,dim)
    faiss.normalize_L2(query_embedding)
  
    
    index.nprobe=nprobe
    
    D,I=index.search(query_embedding,topk) 
    ids=[i for i in I][0]
    L2_score=[d for d in D][0]
    inner_product=[calculateInnerProduct(l2) for l2 in L2_score]
    search_result=pd.DataFrame()
    search_result[id_col_name]=ids
    search_result['cosine_sim']=inner_product
    search_result['L2_score']=L2_score
    dat=data[data[id_col_name].isin(ids)]
    dat=pd.merge(dat,search_result,on=id_col_name)
    dat=dat.sort_values('cosine_sim',ascending=False)
    return dat

In [64]:
def calculateInnerProduct(L2_score):
    return (2-math.pow(L2_score,2))/2

In [66]:
query="Princess in castle"
search_result=searchFAISSIndex(imdb_movies,"id",query,index,nprobe=10,model=model,topk=20)
search_result=search_result[['id','description','titre','cosine_sim','L2_score']]

In [67]:
search_result

Unnamed: 0,id,description,titre,cosine_sim,L2_score
8,3512,The early years of the reign of Elizabeth I of...,Elizabeth,0.585967,0.909981
3,1268,Reluctantly designated as the heir to the land...,Shrek the Third,0.532843,0.966599
1,672,When an unconfident young woman is cursed with...,Howl's Moving Castle,0.532425,0.967031
18,8799,"Set in medieval Rajasthan, Queen Padmavati is ...",Padmaavat,0.526439,0.973202
6,1678,A prince cursed to spend his days as a hideous...,Beauty and the Beast,0.523838,0.97587
15,6675,An Albanian castle with bloodthirsty creatures...,Castle Freak,0.522766,0.976969
0,541,On an isolated island in Brittany at the end o...,Portrait of a Lady on Fire,0.5223,0.977446
9,3969,A Scottish lord becomes convinced by a trio of...,The Tragedy of Macbeth,0.519349,0.98046
7,2235,A bored and sheltered princess escapes her gua...,Roman Holiday,0.507908,0.99206
4,1289,"A young maiden in a land called Andalasia, who...",Enchanted,0.500384,0.999616


In [69]:
query="Former Football player  to train an  football team"
search_result=searchFAISSIndex(imdb_movies,"id",query,index,nprobe=10,model=model,topk=20)
search_result=search_result[['id','description','titre','cosine_sim','L2_score']]
search_result

Unnamed: 0,id,description,titre,cosine_sim,L2_score
8,3811,A rookie high school football coach has a hard...,Wildcats,0.594527,0.900526
16,7173,Jim White moves his family after losing his la...,"McFarland, USA",0.573775,0.923283
18,8569,A failed businessman is hired by the army to t...,Renaissance Man,0.560202,0.937868
2,762,"During a pro football strike, the owners hire ...",The Replacements,0.553604,0.944876
11,5349,A young Shaolin follower reunites with his dis...,Shaolin Soccer,0.548904,0.949838
7,3736,Dave Buznik is a businessman who is wrongly se...,Anger Management,0.547416,0.951403
1,432,Oakland A's general manager Billy Beane's succ...,Moneyball,0.532101,0.967367
17,8451,A college basketball coach is forced to break ...,Blue Chips,0.529386,0.970169
3,1405,The true story of a newly appointed African-Am...,Remember the Titans,0.521888,0.977867
19,9430,"Haunted by his mysterious past, a devoted high...",12 Mighty Orphans,0.51953,0.980276
