In [None]:
! pip install faiss-gpu ##Installing GPU version of faiss

In [None]:
! pip install sentence_transformers ## For textual similarity, using pretrained models

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import math
import re

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

In [5]:
model = SentenceTransformer('paraphrase-distilroberta-base-v1',device="cuda") ## On GPU Loads the distil roberta model,whcih was trained on millions of data

HBox(children=(FloatProgress(value=0.0, max=305584576.0), HTML(value='')))




# Load the movies data

In [34]:
imdb_movies=pd.read_json('./imdb.json', orient = 'split')
imdb_movies.shape

(10000, 9)

In [15]:
imdb_movies.head(2)

Unnamed: 0,titre,année,certificate,genre,durée,metascore,stars,votes,description
0,Godzilla vs. Kong,2021.0,PG-13,"[Action, Sci-Fi, Thriller]",113.0,59.0,6.5,,The epic next chapter in the cinematic Monster...
1,Black Widow,2021.0,PG-13,"[Action, Adventure, Sci-Fi]",133.0,,,,A film about Natasha Romanoff in her quests be...


In [16]:
## Dropping off rows where Movie Description is NULL
imdb_movies=imdb_movies[pd.notnull(imdb_movies['description'])]
imdb_movies=imdb_movies.reset_index(drop=True)

In [17]:
imdb_movies['id']=imdb_movies.index

In [18]:
imdb_movies.head(2)

Unnamed: 0,titre,année,certificate,genre,durée,metascore,stars,votes,description,id
0,Godzilla vs. Kong,2021.0,PG-13,"[Action, Sci-Fi, Thriller]",113.0,59.0,6.5,,The epic next chapter in the cinematic Monster...,0
1,Black Widow,2021.0,PG-13,"[Action, Adventure, Sci-Fi]",133.0,,,,A film about Natasha Romanoff in her quests be...,1


# FAISS GPU

## Extract the Embeddings for movie description

In [8]:
sentences=imdb_movies['description'].tolist()
print("Number of Sentences in Movie Description ",len(sentences))

Number of Sentences in Movie Description  10000


In [9]:
embeddings=model.encode(sentences)
faiss.normalize_L2(embeddings) 

In [10]:
print("Shape of the EMbeddings is ",embeddings.shape)

Shape of the EMbeddings is  (10000, 768)


In [11]:
## We get a 768 dimension vector using Roberta. So we will create FAISS index with dimaensions - 768

dim=768
ncentroids=50 ## This is a hyperparameter, and indicates number of clusters to be split into
m=16 ## This is also a hyper parameter
quantiser = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFPQ (quantiser, dim,ncentroids, m , 8)
index.train(embeddings) ## This step, will do the clustering and create the clusters
print(index.is_trained)
faiss.write_index(index, "trained.index")

True


In [12]:
### We have to add the embeddings to the Trained Index.
ids=imdb_movies['id'].tolist()
ids=np.array(ids)
index.add_with_ids(embeddings,ids)
print(index.ntotal)


KeyError: ignored

In [None]:
faiss.write_index(index,"block.index")

## Let us use the FAISS index to search for similar movie plots

In [None]:
def searchFAISSIndex(data,id_col_name,query,index,nprobe,model,topk=20):
    ## Convert the query into embeddings
    query_embedding=model.encode([query])[0]
    dim=query_embedding.shape[0]
    query_embedding=query_embedding.reshape(1,dim)
    faiss.normalize_L2(query_embedding)
  
    
    index.nprobe=nprobe
    
    D,I=index.search(query_embedding,topk) 
    ids=[i for i in I][0]
    L2_score=[d for d in D][0]
    inner_product=[calculateInnerProduct(l2) for l2 in L2_score]
    search_result=pd.DataFrame()
    search_result[id_col_name]=ids
    search_result['cosine_sim']=inner_product
    search_result['L2_score']=L2_score
    dat=data[data[id_col_name].isin(ids)]
    dat=pd.merge(dat,search_result,on=id_col_name)
    dat=dat.sort_values('cosine_sim',ascending=False)
    return dat

In [None]:
def calculateInnerProduct(L2_score):
    return (2-math.pow(L2_score,2))/2

In [None]:
query="Princess in castle"
search_result=searchFAISSIndex(imdb_movies,"id",query,index,nprobe=10,model=model,topk=20)
search_result=search_result[['id','description','titre','cosine_sim','L2_score']]

In [None]:
search_result

In [None]:
query="war"
search_result=searchFAISSIndex(imdb_movies,"id",query,index,nprobe=10,model=model,topk=20)
search_result=search_result[['id','description','titre','cosine_sim','L2_score']]
search_result

# KNN MNIST

## Load Data

In [111]:
imdb_movies_knn = imdb_movies 
imdb_movies_knn.head()

Unnamed: 0,titre,année,certificate,genre,durée,metascore,stars,votes,description
20,Voyagers,2021.0,PG-13,"[Adventure, Sci-Fi, Thriller]",108,44,5.5,536.0,A crew of astronauts on a multi-generational m...
31,The Unholy,2021.0,PG-13,[Horror],99,36,5.1,947.0,A hearing-impaired girl is visited by the Virg...
97,Every Breath You Take,2021.0,R,[Thriller],105,32,5.0,736.0,"A psychiatrist, whose client commits suicide, ..."
234,Mainstream,2020.0,R,"[Comedy, Drama]",94,37,5.3,162.0,"In this cautionary tale, three people struggle..."
330,The Night House,2020.0,R,"[Horror, Thriller]",108,62,6.5,240.0,A widow begins to uncover her recently decease...


In [78]:
imdb_movies_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   titre        10000 non-null  object 
 1   année        9844 non-null   float64
 2   certificate  9184 non-null   object 
 3   genre        10000 non-null  object 
 4   durée        10000 non-null  float64
 5   metascore    10000 non-null  float64
 6   stars        9598 non-null   float64
 7   votes        497 non-null    float64
 8   description  10000 non-null  object 
dtypes: float64(5), object(4)
memory usage: 703.2+ KB


In [79]:
imdb_movies_knn.dropna(subset=["metascore"])

Unnamed: 0,titre,année,certificate,genre,durée,metascore,stars,votes,description
0,Godzilla vs Kong,2021.0,PG-13,"[Action, Sci-Fi, Thriller]",113.0,59.0,6.5,,The epic next chapter in the cinematic Monster...
1,Black Widow,2021.0,PG-13,"[Action, Adventure, Sci-Fi]",133.0,57.0,,,A film about Natasha Romanoff in her quests be...
2,Zack Snyder s Justice League,2021.0,R,"[Action, Adventure, Fantasy]",242.0,54.0,8.2,,Determined to ensure Superman's ultimate sacri...
3,Mortal Kombat,2021.0,R,"[Action, Adventure, Fantasy]",110.0,57.0,7.1,,MMA fighter Cole Young seeks out Earth's great...
4,Space Jam A New Legacy,2021.0,Tous Public,"[Animation, Adventure, Comedy]",104.0,57.0,,,NBA superstar LeBron James teams up with Bugs ...
...,...,...,...,...,...,...,...,...,...
9995,Triggered,2020.0,R,"[Action, Comedy, Horror]",94.0,48.0,5.0,,"9 old high school ""friends"" have a 5 year reun..."
9996,The Bear,1988.0,PG,"[Adventure, Drama, Family]",96.0,60.0,7.7,,An orphan bear cub hooks up with an adult male...
9997,The In Laws,1979.0,PG,"[Action, Adventure, Comedy]",103.0,57.0,7.3,,"On the eve of their children's marriage, NYC i..."
9998,Missing in Action,1984.0,R,"[Action, Adventure, Drama]",101.0,43.0,5.5,,Colonel Braddock launches a mission deep into ...


In [80]:
imdb_movies_knn.columns

Index(['titre', 'année', 'certificate', 'genre', 'durée', 'metascore', 'stars',
       'votes', 'description'],
      dtype='object')

## Data preprocessing

### Supression des NaN

In [92]:
imdb_movies_knn.isnull().any()

titre          False
année           True
certificate     True
genre          False
durée          False
metascore      False
stars           True
votes           True
description    False
dtype: bool

In [None]:
imdb_movies_knn.dropna(subset=["metascore", "durée", "année"])

In [95]:
imdb_movies_knn.dropna(inplace=True)

In [96]:
imdb_movies_knn.isnull().any()

titre          False
année          False
certificate    False
genre          False
durée          False
metascore      False
stars          False
votes          False
description    False
dtype: bool

### Changement de type 

In [97]:
imdb_movies_knn["durée"] = imdb_movies_knn["durée"].astype(int)
imdb_movies_knn["durée"].fillna(imdb_movies_knn["durée"].median(),inplace = True)
imdb_movies_knn["metascore"] = imdb_movies_knn["metascore"].astype(int)
imdb_movies_knn["metascore"].fillna(imdb_movies_knn["metascore"].median(),inplace = True)

### Encoding

In [98]:
# Scaling
imdb_features = pd.concat([imdb_movies_knn["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(imdb_movies_knn[["certificate"]]),
                            imdb_movies_knn[["durée"]],imdb_movies_knn[["metascore"]],imdb_movies_knn["année"]],axis=1)
imdb_movies_knn["titre"] = imdb_movies_knn["titre"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))
imdb_features.head()

Unnamed: 0,'Action','Adventure','Adventure'],'Biography','Comedy','Comedy'],'Crime','Crime'],'Drama','Drama'],'Family','Family'],'Fantasy','Fantasy'],'History'],'Horror','Horror'],'Music','Music'],'Mystery','Mystery'],'Romance','Romance'],'Sci-Fi','Sci-Fi'],'Sport'],'Thriller','Thriller'],'War'],'Western'],['Action',['Action'],['Adventure',['Adventure'],['Animation',['Animation'],['Biography',['Comedy',['Comedy'],['Crime',['Drama',['Drama'],['Horror',['Horror'],['Mystery',['Mystery'],['Romance',['Romance'],['Sport'],['Thriller'],['Western'],certificate_12,certificate_18,certificate_Approved,certificate_Not Rated,certificate_PG,certificate_PG-13,certificate_Passed,certificate_R,certificate_TV-14,certificate_TV-G,certificate_TV-MA,certificate_TV-PG,certificate_Tous Public,certificate_Tous publics,certificate_Tous publics avec avertissement,certificate_Unrated,certificate_X,durée,metascore,année
20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,108,44,2021.0
31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,99,36,2021.0
97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,105,32,2021.0
234,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,94,37,2020.0
330,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,108,62,2020.0


In [99]:
imdb_features.dropna(subset=["metascore", "durée"])

Unnamed: 0,'Action','Adventure','Adventure'],'Biography','Comedy','Comedy'],'Crime','Crime'],'Drama','Drama'],'Family','Family'],'Fantasy','Fantasy'],'History'],'Horror','Horror'],'Music','Music'],'Mystery','Mystery'],'Romance','Romance'],'Sci-Fi','Sci-Fi'],'Sport'],'Thriller','Thriller'],'War'],'Western'],['Action',['Action'],['Adventure',['Adventure'],['Animation',['Animation'],['Biography',['Comedy',['Comedy'],['Crime',['Drama',['Drama'],['Horror',['Horror'],['Mystery',['Mystery'],['Romance',['Romance'],['Sport'],['Thriller'],['Western'],certificate_12,certificate_18,certificate_Approved,certificate_Not Rated,certificate_PG,certificate_PG-13,certificate_Passed,certificate_R,certificate_TV-14,certificate_TV-G,certificate_TV-MA,certificate_TV-PG,certificate_Tous Public,certificate_Tous publics,certificate_Tous publics avec avertissement,certificate_Unrated,certificate_X,durée,metascore,année
20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,108,44,2021.0
31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,99,36,2021.0
97,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,105,32,2021.0
234,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,94,37,2020.0
330,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,108,62,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9953,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,83,57,2021.0
9958,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,99,57,2019.0
9976,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,106,57,2019.0
9991,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,85,82,2019.0


## Fit Nearest Neighbor To Data

In [100]:
min_max_scaler = MinMaxScaler()
imdb_features = min_max_scaler.fit_transform(imdb_features)

In [101]:
np.round(imdb_features,2)

array([[0.  , 0.  , 0.  , ..., 0.28, 0.44, 1.  ],
       [0.  , 0.  , 0.  , ..., 0.21, 0.34, 1.  ],
       [0.  , 0.  , 0.  , ..., 0.26, 0.29, 1.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.26, 0.59, 0.97],
       [0.  , 0.  , 0.  , ..., 0.1 , 0.88, 0.97],
       [0.  , 0.  , 0.  , ..., 0.22, 0.59, 0.97]])

In [102]:
np.any(np.isnan(imdb_features))

False

In [103]:
imdb_movies_knn.isnull().any()

titre          False
année          False
certificate    False
genre          False
durée          False
metascore      False
stars          False
votes          False
description    False
dtype: bool

In [104]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(imdb_features)
distances, indices = nbrs.kneighbors(imdb_features)

##Query examples and helper functions

In [132]:
def name_movies(name):
    return imdb_features[imdb_features["titre"]==name].index.tolist()[0]

In [133]:
all_movies_names = list(imdb_movies_knn.titre.values)

In [165]:
def recommandation_partial_movies(partial):
    for name in all_movies_names:
        if partial in name:
            print(name,all_movies_names.index(name))

In [162]:
def recommandation_movies(query=None or id=None):
    if id:
        for id in indices[id][1:]:
            print(imdb_movies_knn.ix[id]["titre"])
    if query:
        found_id = name_movies(query)
        for id in indices[found_id][1:]:
            print(imdb_movies_knn.ix[id]["titre"])

## Query Examples

In [166]:
recommandation_partial_movies("Mainstream")

Mainstream 3


In [164]:
recommandation_movies(query="Mainstream")

IndexError: ignored

In [2]:
import joblib

In [4]:
model_recommandation = joblib.load('model_recommandation.pkl')

AttributeError: module '__main__' has no attribute 'searchFAISSIndex'