In [3]:
from sentence_transformers import SentenceTransformer
import pickle
import numpy as np


In [10]:
import pandas as pd

In [11]:
# Load the pre-cleaned movie table; the path is relative to the notebook's working directory.
movies_clean = pd.read_csv("../data/movies_cleaned.csv")

In [12]:
# Instantiate the sentence-embedding model by name.
# NOTE(review): presumably downloads the weights on first use — confirm offline behaviour.
model = SentenceTransformer("all-MiniLM-L6-v2")


In [13]:
# Pull the 'content' column out as a plain Python list, the input handed to model.encode below.
content_list = movies_clean["content"].to_list()

In [24]:
# Embed every content string in a single call, showing a progress bar while it runs.
embeddings = model.encode(content_list, show_progress_bar=True)

Batches: 100%|████████████████████████████████| 151/151 [00:17<00:00,  8.47it/s]


In [25]:
# Sanity check: which container type did model.encode return?
print(type(embeddings))

<class 'numpy.ndarray'>


In [26]:
# Sanity check: array shape as (number of embedded rows, embedding dimension).
print(embeddings.shape)

(4803, 384)


In [27]:
# Number of embedding rows; should equal the number of rows in movies_clean.
print(len(embeddings))

4803


In [28]:
# Number of movies in the source table, for comparison with the embedding row count.
print(len(movies_clean))

4803


In [29]:
# Length of the first vector, i.e. the embedding dimensionality.
print(len(embeddings[0]))

384


In [30]:
# Eyeball the first embedding vector.
# NOTE(review): this prints every component of the vector; a slice such as
# embeddings[0][:5] would keep the notebook output short.
print(embeddings[0])

[-2.57971194e-02 -3.32936086e-02  7.60778878e-03 -5.29247634e-02
 -7.71878585e-02  2.50345841e-02 -5.00053866e-03 -8.51928815e-02
  2.70986073e-02  1.50026577e-02  2.92691849e-02 -5.49177863e-02
 -1.10469852e-02  3.20897065e-02 -3.22754867e-02  6.44825175e-02
 -1.27119962e-02  1.00140469e-02  1.05573609e-02  6.07281625e-02
 -4.21410613e-02  3.16623077e-02 -5.25368452e-02  3.63436639e-02
 -3.60980816e-02  3.88724171e-02  6.25832528e-02 -6.65085576e-03
 -8.43190402e-02  3.35503034e-02 -6.40532672e-02  1.55829534e-01
 -8.40990841e-02  3.36101055e-02  1.39775835e-02  1.13686644e-01
 -2.96652298e-02 -5.57777025e-02  2.82413010e-02 -1.51874153e-02
 -8.74765217e-02 -1.10513069e-01  3.83488052e-02 -2.28110850e-02
 -5.61962910e-02 -1.35707617e-01 -5.92456479e-03 -7.86024258e-02
  8.56946036e-02 -1.11344969e-02 -9.91751552e-02 -3.06938514e-02
 -6.39107600e-02  2.19990537e-02  3.18171717e-02 -1.21638991e-01
 -4.76740375e-02  4.71041016e-02 -7.53745111e-03 -1.43682778e-01
 -3.87970805e-02 -1.10247

In [31]:
# Persist the embedding matrix to the working directory so it can be reloaded without re-encoding.
with open("movies_embeddings.pkl", "wb") as fh:
    pickle.dump(embeddings, fh)

In [43]:
# Persist only the title column (as a one-column DataFrame); its row order is what
# later maps a title to a row position.
with open("movie_index.pkl", "wb") as fh:
    pickle.dump(movies_clean[['original_title']], fh)


In [44]:
# Reload the saved embedding matrix under the name `embedding` (singular), which
# recommend() reads.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("movies_embeddings.pkl", "rb") as fh:
    embedding = pickle.load(fh)

In [45]:
# Verify the reloaded matrix has the same shape as the one that was saved.
print(embedding.shape)

(4803, 384)


In [46]:
# Reload the saved title frame.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open("movie_index.pkl", "rb") as fh:
    movie_index = pickle.load(fh)

In [59]:
# Map each lower-cased title to its row position. When two movies share a title,
# the later row overwrites the earlier one.
title_to_index = {
    name.lower(): position
    for position, name in enumerate(movie_index['original_title'])
}

In [60]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Final recommendation function

In [67]:
def recommend(movie_name,top_n=5):
    """Return the titles of the movies most similar to ``movie_name``.

    Similarity is the cosine similarity between the query movie's row in the
    module-level ``embedding`` matrix and every row of that matrix. Row
    positions are resolved through ``title_to_index`` and mapped back to
    titles through ``movie_index``.

    Parameters
    ----------
    movie_name : str
        Title to look up; matching is case-insensitive (the name is
        lower-cased before the lookup).
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty list.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first, never including the
        query row itself.
    str
        An error message when ``movie_name`` is not a string or is not
        present in ``title_to_index``.
    """
    if not isinstance(movie_name,str):
        return "Input Invalid , Movie name must be a string..."

    movie_name = movie_name.lower()

    if movie_name not in title_to_index:
        return "Movie not Found"

    # Nothing to recommend for a non-positive request. Without this guard a
    # negative top_n would make the slice end count from the back and return
    # almost the whole catalogue.
    if top_n <= 0:
        return []

    idx = title_to_index[movie_name]

    # cosine_similarity expects 2-D inputs, so present the query as a 1-row matrix.
    query_vector = embedding[idx].reshape(1,-1)

    similarity_scores = cosine_similarity(query_vector,embedding)[0]

    # Rank all rows from most to least similar.
    ranked_indices = np.argsort(similarity_scores)[::-1]

    # Drop the query row explicitly rather than assuming it is ranked first:
    # a movie with an identical (or numerically tied) embedding can outrank
    # it, in which case skipping position 0 would discard a real neighbour
    # and return the query movie itself.
    similar_indices = ranked_indices[ranked_indices != idx][:top_n]

    titles = movie_index['original_title']
    return [titles.iloc[i] for i in similar_indices]

In [68]:
# Smoke test: mixed-case title, default top_n of 5.
recommend("Avatar")

['Serenity',
 'Aliens',
 'Star Trek Beyond',
 'Barbarella',
 'Star Trek Into Darkness']

In [69]:
# Smoke test: an all-lower-case query still resolves because lookup is case-insensitive.
recommend("interstellar")

['Prometheus',
 'Mission to Mars',
 'Red Planet',
 'Tomorrowland',
 'Obitaemyy Ostrov']

In [70]:
# Time a single recommendation query.
%time recommend("Avatar")

CPU times: user 8.12 ms, sys: 3.13 ms, total: 11.2 ms
Wall time: 8.98 ms


['Serenity',
 'Aliens',
 'Star Trek Beyond',
 'Barbarella',
 'Star Trek Into Darkness']

In [71]:
# An unknown title returns the "Movie not Found" message instead of raising.
recommend("random junk")

'Movie not Found'