# Content Based Filtering

By: Nandakishore Vinayakrishnan - 23070854

In [89]:
import pandas as pd
import ast
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Model
from keras.layers import Input, Dense, Dropout

# Load the dataset
df = pd.read_csv('../tmdb_5000_movies.csv')
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [90]:
column_info = df.dtypes
print(column_info)

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
dtype: object


# Data Preprocessing

## Steps:

1. Identify the columns to drop
    * Drop 'homepage', 'original_language', 'popularity', 'release_date', 'status', 'original_title', 'vote_average', 'vote_count'
2. Extract the genres, keywords, and production companies from their respective columns
3. Feature Engineering
4. Implementing Neural Network model
5. Model Evaluation

### Step 1: Identify the columns to drop

The columns to drop are:
1. 'homepage'
2. 'original_language'
3. 'popularity'
4. 'release_date'
5. 'status' 
6. 'original_title'
7. 'vote_average'
8. 'vote_count'
9. 'revenue'

These columns were removed because we didn't see a reason to implement them in the model since they have very little important metadata.

In [91]:
drop_cols = ['homepage', 'original_language', 'popularity', 'release_date', 'spoken_languages', 'status', 'original_title', 'revenue', 'vote_average', 'vote_count']
df = df.drop(drop_cols, axis=1)

### Step 2: Extract metadata from columns

In [92]:
def extract_features(x):
    return [i['name'] for i in ast.literal_eval(x)] if pd.notnull(x) else []

df['genres_list'] = df['genres'].apply(extract_features)
df['keywords_list'] = df['keywords'].apply(extract_features)
df['production_companies'] = df['production_companies'].apply(extract_features)
df['production_countries'] = df['production_countries'].apply(extract_features)

### Step 3: Feature Engineering

Encode the genres using MultiLabelBinarizer, and vectorize movie overview using TfidfVectorizer.

In [93]:
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(df['genres_list'])

tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
overview_features = tfidf.fit_transform(df['overview'].fillna(''))

Stack the encoded genres and vectorized movie overview into a single array, X, which is the model input.

In [94]:
X = np.hstack([genre_features, overview_features.toarray()])

### Step 4: Implementing the Neural Network Model

The model is implemented using a simple neural network with 3 hidden layers and a dropout layer to prevent overfitting.

The network architecture is as follows:
1. Input Layer
2. Dense layer with 256 units and ReLU activation
3. Dropout layer with 0.3 dropout rate
4. Dense layer with 128 units and ReLU activation
5. Dense layer with 64 units and ReLU activation
6. Embedding layer with 64 units to capture compressed representation of the input
7. Output layer with same number of units as input dimensions + linear activation function

In [95]:
input_dim = X.shape[1]
embedding_dim = 64

input_layer = Input(shape=(input_dim,))
dense_1 = Dense(256, activation='relu')(input_layer)
dropout_1 = Dropout(0.3)(dense_1)
dense_2 = Dense(128, activation='relu')(dropout_1)
dense_3 = Dense(64, activation='relu')(dense_2)
embedding = Dense(embedding_dim, name='embedding')(dense_3)
output_layer = Dense(input_dim, activation='linear')(embedding)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mse')

model.fit(X, X, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.0031 - val_loss: 0.0019
Epoch 2/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0019 - val_loss: 0.0015
Epoch 3/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0015 - val_loss: 0.0013
Epoch 4/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0013 - val_loss: 0.0012
Epoch 5/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0012 - val_loss: 0.0012
Epoch 6/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0012 - val_loss: 0.0012
Epoch 7/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0011 - val_loss: 0.0011
Epoch 8/50
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0011 - val_loss: 0.0011
Epoch 9/50
[1m121/121[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x25f66d1e5a0>

Pulls activations from embedding layer and uses them as the output of the model.

In [96]:
embedding_model = Model(inputs=model.input, outputs=model.get_layer('embedding').output)
embeddings = embedding_model.predict(X)

[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


### Step 5: Model Evaluation

Function takes a TMDB ID as input and returns the top 10 recommendations based on cosine simularity between the **embeddings** of the input movie & **embeddings** of all other movies in the dataset.

Simularity scores are sorted in descending order and the top 10 movies with highest sim scores are returned.

In [97]:
def nn_recommend(tmdb_id, n=10):
    try:
        idx = df[df['id'] == tmdb_id].index[0]
    except:
        return ["Movie not found"]
    
    sim_scores = cosine_similarity([embeddings[idx]], embeddings)[0]
    top_indices = sim_scores.argsort()[-n-1:-1][::-1]
    
    return [f"{row['id']} - {row['title']}" for _, row in df.iloc[top_indices].iterrows()]

nn_recommend(19995) # Avatar, just to test the function

['127585 - X-Men: Days of Future Past',
 '1924 - Superman',
 '27549 - Beastmaster 2: Through the Portal of Time',
 '8536 - Superman II',
 '76170 - The Wolverine',
 '1452 - Superman Returns',
 '76757 - Jupiter Ascending',
 '49521 - Man of Steel',
 '297761 - Suicide Squad',
 '9824 - Mystery Men']