In [11]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the preprocessed dataset CSV (path is relative to this notebook) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/processed/preprocessed_dataset.csv')
df.head(n=5)

Unnamed: 0,title,year,image_link,imdb_id,rating_count,rating,genre,keywords,trailer_link,actors,director,synopsis,processed_plot
0,Gladiator,2000,https://m.media-amazon.com/images/M/MV5BMDliMm...,tt0172495,1617220,8.5,"[Action, Adventure, Drama]","[roman empire, gladiator, slavery, combat, bat...",https://www.imdb.com/video/vi2628367897/,"[Russell Crowe, Joaquin Phoenix, Connie Nielsen]",[Ridley Scott],"Shouting ""Roma Invicta!"" as his forces attack,...",shouting rom invicta force attack general maxi...
1,Unbreakable,2000,https://m.media-amazon.com/images/M/MV5BMDIwMj...,tt0217869,440860,7.3,"[Drama, Mystery, Sci-Fi]","[supernatural power, train crash, superhero, c...",https://www.imdb.com/video/vi1103346201/,"[Bruce Willis, Samuel L. Jackson, Robin Wright]",[M. Night Shyamalan],The film opens as we see a baby being born in ...,film open see baby born west philadelphia elij...
2,Snatch,2000,https://m.media-amazon.com/images/M/MV5BMTA2ND...,tt0208092,909299,8.2,"[Comedy, Crime]","[diamond, boxer, narrated by character, cockne...",https://www.imdb.com/video/vi1558577433/,"[Jason Statham, Brad Pitt, Stephen Graham]",[Guy Ritchie],After stealing an 86-carat (17 g) diamond in a...,stealing carat g diamond heist antwerp franki...
3,AmericanPsycho,2000,https://m.media-amazon.com/images/M/MV5BZTM2ZG...,tt0144084,713635,7.6,"[Crime, Drama, Horror]","[narcissism, materialism, serial murder, socio...",https://www.imdb.com/video/vi4060743449/,"[Christian Bale, Justin Theroux, Josh Lucas]",[Mary Harron],A white background. Red drops begin to fall pa...,white background red drop begin fall past open...
4,RequiemforaDream,2000,https://m.media-amazon.com/images/M/MV5BOTdiNz...,tt0180093,897414,8.3,[Drama],"[drug addiction, sex show, sex scene, prostitu...",https://www.imdb.com/video/vi2156069145/,"[Ellen Burstyn, Jared Leto, Jennifer Connelly]",[Darren Aronofsky],Harry Goldfarb (Jared Leto) and Tyrone Love (M...,harry goldfarb jared leto tyrone love marlon w...


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['title', 'year', 'image_link', 'imdb_id', 'rating_count', 'rating',
       'genre', 'keywords', 'trailer_link', 'actors', 'director', 'synopsis',
       'processed_plot'],
      dtype='object')

In [4]:
# Columns selected as the descriptive features of each row; previewed below.
features = [
    'genre',
    'actors',
    'director',
    'keywords',
    'processed_plot',
]
df.loc[:, features].head()

Unnamed: 0,genre,actors,director,keywords,processed_plot
0,"[Action, Adventure, Drama]","[Russell Crowe, Joaquin Phoenix, Connie Nielsen]",[Ridley Scott],"[roman empire, gladiator, slavery, combat, bat...",shouting rom invicta force attack general maxi...
1,"[Drama, Mystery, Sci-Fi]","[Bruce Willis, Samuel L. Jackson, Robin Wright]",[M. Night Shyamalan],"[supernatural power, train crash, superhero, c...",film open see baby born west philadelphia elij...
2,"[Comedy, Crime]","[Jason Statham, Brad Pitt, Stephen Graham]",[Guy Ritchie],"[diamond, boxer, narrated by character, cockne...",stealing carat g diamond heist antwerp franki...
3,"[Crime, Drama, Horror]","[Christian Bale, Justin Theroux, Josh Lucas]",[Mary Harron],"[narcissism, materialism, serial murder, socio...",white background red drop begin fall past open...
4,[Drama],"[Ellen Burstyn, Jared Leto, Jennifer Connelly]",[Darren Aronofsky],"[drug addiction, sex show, sex scene, prostitu...",harry goldfarb jared leto tyrone love marlon w...


In [5]:
# Convert every feature column to lowercase.
# The vectorised `.str.lower()` accessor replaces the per-row Python lambda and
# passes missing values through unchanged, whereas `x.lower()` raised
# AttributeError on the first missing (float NaN) entry.
for feature in features:
    df[feature] = df[feature].str.lower()

In [6]:
# Preview the feature columns again to confirm the lowercase conversion.
df.head()[features]

Unnamed: 0,genre,actors,director,keywords,processed_plot
0,"[action, adventure, drama]","[russell crowe, joaquin phoenix, connie nielsen]",[ridley scott],"[roman empire, gladiator, slavery, combat, bat...",shouting rom invicta force attack general maxi...
1,"[drama, mystery, sci-fi]","[bruce willis, samuel l. jackson, robin wright]",[m. night shyamalan],"[supernatural power, train crash, superhero, c...",film open see baby born west philadelphia elij...
2,"[comedy, crime]","[jason statham, brad pitt, stephen graham]",[guy ritchie],"[diamond, boxer, narrated by character, cockne...",stealing carat g diamond heist antwerp franki...
3,"[crime, drama, horror]","[christian bale, justin theroux, josh lucas]",[mary harron],"[narcissism, materialism, serial murder, socio...",white background red drop begin fall past open...
4,[drama],"[ellen burstyn, jared leto, jennifer connelly]",[darren aronofsky],"[drug addiction, sex show, sex scene, prostitu...",harry goldfarb jared leto tyrone love marlon w...


In [9]:
# Count the missing values in each column.
df.isna().sum()

title               0
year                0
image_link          0
imdb_id             0
rating_count        0
rating              0
genre               0
keywords            0
trailer_link      775
actors              0
director            0
synopsis            0
processed_plot      0
dtype: int64

In [8]:
# Extract the title and processed-plot columns as plain Python lists. Both keep
# the frame's row order, so position i in each list refers to the same row.
titles, descriptions = (df[column].tolist() for column in ('title', 'processed_plot'))

In [19]:
# Path of the GloVe vectors text file; being relative, it is resolved against the
# kernel's current working directory when opened.
# NOTE(review): the file name suggests 100-dimensional vectors, which has to agree
# with `embedding_size` used when the file is parsed — confirm if the file changes.
glove_file = 'glove.6B.100d.txt'

In [12]:
# Number of components expected in every vector of the file named by `glove_file`.
embedding_size = 100

# Token -> float32 vector table, filled from the text file one row at a time.
word_embeddings = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # A usable row is "<token> <v1> ... <vN>" with N == embedding_size.
        # Rows with any other field count (blank lines, truncated rows) are
        # skipped: they would otherwise raise on `values[0]` or store a vector
        # of the wrong length that only fails later, when vectors are summed.
        if len(values) != embedding_size + 1:
            continue
        word = values[0]
        embedding = np.array(values[1:], dtype='float32')
        word_embeddings[word] = embedding

# If no row had the expected width, the file and `embedding_size` disagree (or the
# file is empty). Fail here instead of silently embedding every text as zeros.
if not word_embeddings:
    raise ValueError(
        f"No {embedding_size}-dimensional vectors found in {glove_file!r}; "
        "check that embedding_size matches the file."
    )

In [13]:
def text_to_embeddings(text, vocabulary=None, size=None):
    """Return the average embedding of the known words in ``text``.

    The text is split on whitespace and each token is lowercased before lookup.
    The movie descriptions are lowercased before they are embedded, so without
    this step capitalised words in free-form user input would never match and
    would be silently dropped from the average.
    NOTE(review): this assumes the embedding table's keys are lowercase — confirm
    if a cased embedding file is ever used.

    Parameters
    ----------
    text : str
        Whitespace-separated text to embed.
    vocabulary : mapping of str -> array-like, optional
        Token-to-vector table. Defaults to the notebook-level ``word_embeddings``.
    size : int, optional
        Length of the returned vector. Defaults to the notebook-level
        ``embedding_size``.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``size``: the mean of the vectors of all tokens
        found in the table, or all zeros when no token is found (including
        empty text).
    """
    vocab = word_embeddings if vocabulary is None else vocabulary
    dim = embedding_size if size is None else size

    total = np.zeros(dim)
    count = 0
    for word in text.lower().split():
        vector = vocab.get(word)
        if vector is not None:
            total += vector
            count += 1

    # Only divide when at least one token matched; otherwise keep the zero vector.
    if count:
        total /= count
    return total

In [20]:
# Embed every description; row i of the resulting array corresponds to descriptions[i].
movie_embeddings = np.array(list(map(text_to_embeddings, descriptions)))

In [24]:
# Persist the token -> vector table under ../models.
# NOTE(review): `word_embeddings` is a dict, so NumPy has to store it as a pickled
# object array; reading it back presumably needs
# np.load(path, allow_pickle=True).item() — confirm in whatever consumes this file.
np.save(file='../models/glove_embeddings.npy', arr=word_embeddings, allow_pickle=True)

In [22]:
# Persist the array of per-movie description embeddings under ../models.
np.save(file='../models/movie_embeddings.npy', arr=movie_embeddings)

In [17]:
# Ask for a free-text description and embed it with the same function used for
# the movie descriptions.
user_description = input('Enter movie description:')
user_embeddings = text_to_embeddings(user_description)

# text_to_embeddings returns an all-zero vector when none of the input words is
# in the embedding table (this includes empty input). A zero vector has no
# direction, so a cosine-similarity ranking against it is not meaningful; say so
# instead of letting an arbitrary ranking look like a real result.
if not np.any(user_embeddings):
    print('Warning: no known words found in the description; recommendations will not be meaningful.')

In [18]:
# Score every movie by the cosine similarity between its embedding and the
# user's description embedding (one query row against all movie rows).
similarities = cosine_similarity(user_embeddings.reshape(1, -1), movie_embeddings).ravel()

# argsort is ascending, so reverse it to get movie positions from best to worst match.
similar_indices = np.argsort(similarities)[::-1]

# Pair each title with its score, in ranked order.
recommended_movies = list(
    zip((titles[position] for position in similar_indices), similarities[similar_indices])
)

print("Recommended Movies:")
for movie_title, similarity_score in recommended_movies:
    print(f"{movie_title}: Similarity Score = {similarity_score}")

Recommended Movies:
Batman:TheLongHalloween: Similarity Score = 0.8466227772351715
LegoDCBatman:FamilyMatters: Similarity Score = 0.8330902669500433
TheInterrupters: Similarity Score = 0.830051928307808
Noise: Similarity Score = 0.8239761345333814
TurnedOut: Similarity Score = 0.8208310125372642
TheLand: Similarity Score = 0.8200217912915713
Urbania: Similarity Score = 0.8198467819863395
RubbleKings: Similarity Score = 0.8180028152548493
BatmanBegins: Similarity Score = 0.8159558010224857
LifeofCrime-: Similarity Score = 0.815868286892454
PAWPatrol:TheMovie: Similarity Score = 0.8149807230203875
Limelight: Similarity Score = 0.8149061169766239
SinnersandSaints: Similarity Score = 0.8135088884851983
Studio: Similarity Score = 0.8121539935097917
Batman:GothambyGaslight: Similarity Score = 0.8110127282507098
TheLegoBatmanMovie: Similarity Score = 0.8094287554391006
MerryLittleBatman: Similarity Score = 0.8073745379349527
Batman:TheLongHalloween,PartOne: Similarity Score = 0.80654660709834