In [1]:
import numpy as np
import sklearn
import tensorflow as tf
import pandas as pd

from keras.src.layers import Dense
from tensorflow import keras
from keras import layers
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dot, Input, Dropout, Multiply, Concatenate
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import regularizers

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise    import cosine_similarity
import json


# Load The Data

In [3]:
users = pd.read_csv("/kaggle/input/the-movies-dataset/ratings.csv")
users.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [4]:
from sklearn.preprocessing import LabelEncoder

# 1. Fit LabelEncoders on your raw IDs
u_enc = LabelEncoder()
m_enc = LabelEncoder()

users['u_idx'] = u_enc.fit_transform(users['userId'])
users['m_idx'] = m_enc.fit_transform(users['movieId'])

# 2. Recompute the number of unique users/items
num_users = users['u_idx'].nunique()
num_items = users['m_idx'].nunique()

# 3. Prepare inputs for the model (as 2D int32 arrays)
user_ids  = users['u_idx'].values.reshape(-1, 1).astype('int32')
movie_ids = users['m_idx'].values.reshape(-1, 1).astype('int32')
ratings = users['rating'].values.astype('float32')

# 4. (Optional) Inspect the first few rows
print(users[['userId','u_idx','movieId','m_idx']].head())

print(f"num_users={num_users}, num_items={num_items}")


   userId  u_idx  movieId  m_idx
0       1      0      110    108
1       1      0      147    145
2       1      0      858    843
3       1      0     1221   1195
4       1      0     1246   1218
num_users=270896, num_items=45115


# Collaborative Filtering Model

### When you think collaborative filtering, think of statements like:
- Users who liked similar items also liked...
- Items similar to this item

### 🧠 “Behavioral Similarity”
The system learns from what users did, not what items are about.

> It doesn’t care what genre the item is — it just learns from the pattern of user behavior.

### 🔍 How It Works:
- Looks at user-item interactions (ratings, likes, views)
- Learns latent similarities between users or items
- Powered by embeddings, matrix factorization, or neural models

finds patterns in behavior



In [7]:
gmf_dim      = 32 #defines dimensions of gmf
mlp_dim      = 32 # defines dimensions of mlp
mlp_layers   = [64] # mlp neuron layers

# Input layer takes item of vector size 1
user_input = Input(shape=(1,), name='userId')
item_input = Input(shape=(1,), name='movieId')

gmf_user_emb = Embedding(num_users, gmf_dim, embeddings_regularizer=regularizers.l2(1e-5))(user_input) #embedding for userID
gmf_item_emb = Embedding(num_items, gmf_dim, embeddings_regularizer=regularizers.l2(1e-5))(item_input) # embedding for movieID
gmf_user_vec = Flatten()(gmf_user_emb) # flatten them
gmf_user_vec = Dropout(0.4)(gmf_user_vec)
gmf_item_vec = Flatten()(gmf_item_emb) # flatten them
gmf_item_vec = Dropout(0.4)(gmf_item_vec)
gmf_vector   = Multiply()([gmf_user_vec, gmf_item_vec]) # multiply to check how much the movie aligns with the user in training

# 4. MLP branch
mlp_user_emb = Embedding(num_users, mlp_dim, embeddings_regularizer=regularizers.l2(1e-5))(user_input) # embedding for userID
mlp_item_emb = Embedding(num_items, mlp_dim, embeddings_regularizer=regularizers.l2(1e-5))(item_input) # embedding for movieID
mlp_user_vec = Flatten()(mlp_user_emb) # flatten them
mlp_user_vec = Dropout(0.4)(mlp_user_vec)
mlp_item_vec = Flatten()(mlp_item_emb) # flatten them
mlp_item_vec = Dropout(0.4)(mlp_item_vec)
mlp_vector   = Concatenate()([mlp_user_vec, mlp_item_vec]) # stacks them both together
for units in mlp_layers:
    mlp_vector = Dense(units, activation='relu')(mlp_vector) # goes through neurons, activation relu to allow complexity
    mlp_vector = Dropout(0.4)(mlp_vector)




In [8]:
# The dot product of the two vectors, which gives a single number that determines how similar the two vectors are.
fusion = Concatenate()([gmf_vector, mlp_vector]) #then we concatenate both mlp and gmf
output = Dense(1, activation='linear', name='prediction')(fusion) # and add them all together to make the prediction


model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mse'
)
model.summary()

In [None]:
# configure the callback
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',    # watch validation loss
    factor=0.5,            # multiply LR by this factor on plateau
    patience=2,            # wait this many epochs with no improvement
    min_lr=1e-6,           # don’t go below this LR
    verbose=1              # print messages when LR is reduced
)

# now include it in your fit call
model.fit(
    x=[user_ids, movie_ids],
    y=users['rating'].astype('float32'),
    epochs=10,
    batch_size=2048,
    validation_split=0.2,
    callbacks=[reduce_lr]
)


# Content-Based Filtering Model

When you think of content-based filtering, think of statements like:
- Because you liked horror
- Because you searched laptops

### 🧠 “Attribute Similarity”
The system uses the metadata or features of items (or users) directly.

> It recommends items with similar features to what you liked, not because other users liked them.

### 📦 How It Works:
- Uses item (or user) attributes: genres, categories, descriptions
- Builds a user profile from liked item features
- Compares feature vectors (e.g., via cosine similarity)

finds patterns in features

Collaborative filtering learns from who likes what, no matter what it is.

Content-based filtering learns from what the thing is, no matter who liked it.

In [3]:
# Read the movies meta-data(we will be using the feature genre, overview & title from this)
movie_md = pd.read_csv("../input/the-movies-dataset/movies_metadata.csv")

# Read the keywords
movie_keywords = pd.read_csv("../input/the-movies-dataset/keywords.csv")

# Read the credits
movie_credits = pd.read_csv("../input/the-movies-dataset/credits.csv")

  movie_md = pd.read_csv("../input/the-movies-dataset/movies_metadata.csv")


In [4]:
movie_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
#we are going to select movies with atleast more than 50 votes
movie_md = movie_md[movie_md['vote_count']>50]

  return op(a, b)


In [6]:
movie_md = movie_md[['id','original_title','overview','genres']]

In [7]:
# Creating a duplicate column for title so that once can be used to search later and one for creating features
movie_md['title'] = movie_md['original_title'].copy()

In [8]:
movie_md.reset_index(inplace=True, drop=True)
movie_md.head()

Unnamed: 0,id,original_title,overview,genres,title
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat


In [9]:
movie_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [10]:
movie_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [16]:
movie_credits = movie_credits[['id','cast']]

In [17]:
# Removing the records for which the id is not available
movie_md = movie_md[movie_md['id'].astype(str).str.isnumeric()]


In [26]:
# Merge all dataframe as a single entity
# To merge the ids must be of same datatype
movie_md['id'] = movie_md['id'].astype(int)

# Merge
df = pd.merge(movie_md, movie_keywords, on='id', how='left')

# Reset the index
df.reset_index(inplace=True, drop=True)

In [27]:
# Merge with movie credits
df = pd.merge(df, movie_credits, on='id', how='left')

# Reset the index
df.reset_index(inplace=True, drop=True)

In [28]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c..."
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '..."
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[{'cast_id': 25, 'character': 'Lt. Vincent Han..."


In [29]:
# Lets first start with cleaning the movies metadata
# Fetchin the genre list from the column
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in eval(x)])

# Replaces spaces in between genre(ex - sci fi to scifi) and make it a string
df['genres'] = df['genres'].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [30]:
# Filling the numm values as []
df['keywords'].fillna('[]', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['keywords'].fillna('[]', inplace=True)


In [31]:
# Let's clean the keywords dataframe to extract the keywords
# Fetchin the keyword list from the column     
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])

# Remove the expty spaces and join all the keyword wwwith spaces
df['keywords'] = df['keywords'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [32]:
# Filling the numm values as []
df['cast'].fillna('[]', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('[]', inplace=True)


In [33]:
# Let's clean the cast dataframe to extract the name of aactors from cast column
# Fetchin the cast list from the column
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in eval(x)])

# Remove the expty spaces and join all the cast with spaces
df['cast'] = df['cast'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [34]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,Toy Story,jealousy toy boy friendship friends rivalry bo...,TomHanks TimAllen DonRickles JimVarney Wallace...
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Jumanji,boardgame disappearance basedonchildren'sbook ...,RobinWilliams JonathanHyde KirstenDunst Bradle...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen,WalterMatthau JackLemmon Ann-Margret SophiaLor...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Father of the Bride Part II,baby midlifecrisis confidence aging daughter m...,SteveMartin DianeKeaton MartinShort KimberlyWi...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Action Crime Drama Thriller,Heat,robbery detective bank obsession chase shootin...,AlPacino RobertDeNiro ValKilmer JonVoight TomS...


In [35]:
df['tags'] = df['overview'] + ' ' + df['genres'] +  ' ' + df['original_title'] + ' ' + df['keywords'] + ' ' + df['cast']

In [36]:
# Delete useless columns
df.drop(columns=['genres','overview','original_title','keywords','cast'], inplace=True)

In [37]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."


In [38]:
df.isnull().sum()

id        0
title     0
tags     37
dtype: int64

In [39]:
df.drop(df[df['tags'].isnull()].index, inplace=True)

In [40]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."


In [41]:
df.shape

(9150, 3)

In [43]:
df.drop_duplicates(inplace=True)

In [44]:
df.shape

(9005, 3)

In [45]:
# Initialize a tfidf object
tfidf = TfidfVectorizer(max_features=5000)

# Transform the data
vectorized_data = tfidf.fit_transform(df['tags'].values)

In [46]:
vectorized_data

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 392422 stored elements and shape (9005, 5000)>

In [47]:
vectorized_dataframe = pd.DataFrame(vectorized_data.toarray(), index=df['tags'].index.tolist())

In [48]:
from sklearn.decomposition import TruncatedSVD

# Initialize a PCA object
svd = TruncatedSVD(n_components=3000)

# Fit transform the data
reduced_data = svd.fit_transform(vectorized_dataframe)

# Print the shape
reduced_data.shape

(9005, 3000)

In [49]:
svd.explained_variance_ratio_.cumsum()

array([0.00474059, 0.0116974 , 0.01738288, ..., 0.91885673, 0.91893071,
       0.91900466])

In [50]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(reduced_data)

In [51]:
def recommendation(movie_title):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = similarity[id_of_movie]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:10]
    
    for i in movie_list:
        print(df.iloc[i[0]].title)

In [52]:
recommendation('The Matrix')

The Matrix Revisited
The Matrix Revolutions
The Matrix Reloaded
The Animatrix
Commando
Hackers
Terminator 3: Rise of the Machines
GHOST IN THE SHELL
Tron


In [9]:
user_NN = Sequential([
    layers.Input(shape = num_user_features),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(32)
])

item_NN = Sequential([
    layers.Input(shape= num_item_features),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(32)
])

vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

output = layers.Dot(axes=1)([vu, vm])

model = Model([input_user, input_item], output)

cost_fn = keras.losses.MeanSquaredError()

NameError: name 'num_user_features' is not defined

# Training

# Testing