In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
final_movie_data = pd.read_csv('data/movies_with_tmdb_features_and_tags.csv')

In [3]:
movie_titles = final_movie_data[['movieId', 'title']]

In [4]:
# drop the title, movieId, revenue and budget column
final_movie_data = final_movie_data.drop([ 'spoken_languages', 'revenue', 'budget'], axis=1)

In [5]:
unique_genres = final_movie_data['genres'].str.split('|').explode().unique()
print(unique_genres)

['Adventure' 'Animation' 'Children' 'Comedy' 'Fantasy' 'Romance' 'Drama'
 'Action' 'Crime' 'Thriller' 'Horror' 'Mystery' 'Sci-Fi' 'IMAX'
 'Documentary' 'War' 'Musical' 'Western' 'Film-Noir' '(no genres listed)']


In [6]:
empty_rows_count = final_movie_data[final_movie_data['production_countries'] == '[]'].shape[0]
print(f"Number of rows where countries is an empty list: {empty_rows_count}")


final_movie_data = final_movie_data[final_movie_data['production_countries'].apply(lambda x: x != '[]')]

# Reset index if needed
final_movie_data.reset_index(drop=True, inplace=True)

empty_rows_count = final_movie_data[final_movie_data['production_countries'] == '[]'].shape[0]
print(f"Number of rows where countries is an empty list: {empty_rows_count}")


Number of rows where countries is an empty list: 834
Number of rows where countries is an empty list: 0


In [7]:
nan_count = final_movie_data['title'].isna().sum()
print(f"The 'tag' column contains {nan_count} NaN values.")

The 'tag' column contains 0 NaN values.


In [8]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight and efficient


embeddings = final_movie_data['title'].apply(lambda x: model.encode(x))

In [9]:
embed_columns = [f'embeddings_{i+1}' for i in range(384)]

embeddings_df = pd.DataFrame(embeddings.tolist(), index=final_movie_data.index, columns=embed_columns)

print(embeddings_df.head())


   embeddings_1  embeddings_2  embeddings_3  embeddings_4  embeddings_5  \
0     -0.082835      0.053031      0.053576     -0.027935      0.016134   
1     -0.105293      0.150841     -0.026398     -0.065596      0.006964   
2     -0.098787      0.017650     -0.052744     -0.038677      0.069102   
3     -0.087231      0.036612     -0.021703     -0.012105      0.062955   
4     -0.069206      0.038752      0.014738      0.012141      0.050471   

   embeddings_6  embeddings_7  embeddings_8  embeddings_9  embeddings_10  ...  \
0      0.012132      0.024147      0.020295     -0.005547       0.013974  ...   
1      0.054954      0.052583      0.009236      0.014830      -0.011859  ...   
2      0.000289      0.051787     -0.058244     -0.011713      -0.107212  ...   
3      0.043525      0.013925     -0.055051      0.072511      -0.101182  ...   
4      0.014589     -0.031408     -0.000894      0.046883      -0.006560  ...   

   embeddings_375  embeddings_376  embeddings_377  embeddings_

In [10]:
from sklearn.decomposition import PCA

pca = PCA(n_components=20)

pca_embeddings = pca.fit_transform(embeddings_df)

print(pca_embeddings.shape)


(25901, 20)


In [11]:
pca_embeddings_df = pd.DataFrame(pca_embeddings, columns=[f'PC{i+1}' for i in range(pca_embeddings.shape[1])])

In [12]:
final_movie_data = pd.concat([final_movie_data, pca_embeddings_df], axis=1)

In [13]:

country_to_continent = {
    # North America
    'United States of America': 'North America', 'Canada': 'North America', 'Mexico': 'North America', 'Bahamas': 'North America',
    'Dominican Republic': 'North America', 'Cuba': 'North America', 'Puerto Rico': 'North America', 
    # South America
    'Brazil': 'South America', 'Argentina': 'South America', 'Chile': 'South America', 'Peru': 'South America', 
    'Colombia': 'South America', 'Uruguay': 'South America', 'Venezuela': 'South America', 'Paraguay': 'South America', 
    # Europe
    'Germany': 'Europe', 'United Kingdom': 'Europe', 'France': 'Europe', 'Italy': 'Europe', 'Spain': 'Europe', 
    'Netherlands': 'Europe', 'Belgium': 'Europe', 'Switzerland': 'Europe', 'Sweden': 'Europe', 'Portugal': 'Europe',
    'Austria': 'Europe', 'Denmark': 'Europe', 'Ireland': 'Europe', 'Norway': 'Europe', 'Poland': 'Europe', 
    'Finland': 'Europe', 'Greece': 'Europe', 'Hungary': 'Europe', 'Czech Republic': 'Europe', 'Slovakia': 'Europe', 
    'Iceland': 'Europe', 'Russia': 'Europe', 'Serbia and Montenegro': 'Europe', 'Macedonia': 'Europe',
    'Bulgaria': 'Europe', 'Yugoslavia': 'Europe', 'Luxembourg': 'Europe', 'Ukraine': 'Europe', 'Romania': 'Europe', 
    'Estonia': 'Europe', 'Lithuania': 'Europe', 'Albania': 'Europe', 'Latvia': 'Europe', 'Slovenia': 'Europe', 
    # Asia
    'China': 'Asia', 'Japan': 'Asia', 'India': 'Asia', 'South Korea': 'Asia', 'Iran': 'Asia', 'Thailand': 'Asia', 
    'Hong Kong': 'Asia', 'Malaysia': 'Asia', 'Taiwan': 'Asia', 'Philippines': 'Asia', 'Turkey': 'Asia', 'Israel': 'Asia',
    'Vietnam': 'Asia', 'Indonesia': 'Asia', 'Singapore': 'Asia', 'Jordan': 'Asia', 'Lebanon': 'Asia', 
    'Bangladesh': 'Asia', 'Pakistan': 'Asia', 'Sri Lanka': 'Asia', 'Saudi Arabia': 'Asia', 'Afghanistan': 'Asia', 
    'Kuwait': 'Asia', 'United Arab Emirates': 'Asia', 'Qatar': 'Asia', 'Myanmar': 'Asia', 'Kazakhstan': 'Asia', 
    # Africa
    'South Africa': 'Africa', 'Egypt': 'Africa', 'Morocco': 'Africa', 'Tunisia': 'Africa', 'Algeria': 'Africa', 
    'Cameroon': 'Africa', 'Senegal': 'Africa', 'Burkina Faso': 'Africa', 'Zimbabwe': 'Africa', 'Ivory Coast': 'Africa', 
    'Libya': 'Africa', 'Nigeria': 'Africa', 'Kenya': 'Africa', 'Uganda': 'Africa', 'Angola': 'Africa', 
    'Mali': 'Africa', 'Niger': 'Africa', 'Ghana': 'Africa', 'Rwanda': 'Africa', 'Ethiopia': 'Africa', 
    # Oceania
    'Australia': 'Oceania', 'New Zealand': 'Oceania', 'Papua New Guinea': 'Oceania', 'Solomon Islands': 'Oceania', 
    # For other countries not listed, include as needed
}




In [14]:
import ast


continents = ['North America', 'South America', 'Europe', 'Asia', 'Africa', 'Oceania']
for continets in continents:
    final_movie_data[continets] = 0

for i, row in final_movie_data.iterrows():
    countries = row['production_countries']
    if isinstance(countries, str):
        countries = ast.literal_eval(countries)
    for country in countries:
        continent = country_to_continent.get(country, None)

        if continent: 
            final_movie_data.at[i, continent] = 1

In [15]:
final_movie_data = final_movie_data.drop(['production_countries'], axis=1)

In [16]:
# converting the release_date to a numerical feature
# We choose to convert the release date to a numerical feature represented by the months from the first movie's release date.
# We made this decision instead of saving the dates as year and month column. 
# The reason is that we would like to capture the time difference between the movies rather than seasonal or yearly patterns.
final_movie_data['release_date'] = pd.to_datetime(final_movie_data['release_date'])
first_release_date = final_movie_data['release_date'].min()
final_movie_data['release_date'] = (
    (final_movie_data['release_date'].dt.year - first_release_date.year) * 12 +
    (final_movie_data['release_date'].dt.month - first_release_date.month)
)


# hot-k encoding for 'genres' column
# We felt that the genre feature was so important that we transformed it into a one-hot encoding format.
# This way, we can use the genre information in the clustering process.
# Step 1: Create a separate column for each genre
for genre in unique_genres:
    # Check if each genre is in the 'genre' column and create a binary column
    final_movie_data[genre] = final_movie_data['genres'].apply(lambda x: 1 if genre in x.split('|') else 0)

# Drop the original 'genre' column if needed
final_movie_data = final_movie_data.drop(columns=['genres'])


# We transformed the 'adult' column into a binary column.

final_movie_data['adult'] = final_movie_data['adult'].apply(lambda x: 1 if x == 'True' else 0)



In [17]:
# We will use the following features for clustering

features = ['popularity', 'vote_average', 'vote_count', 'runtime',
       'release_date', 'adult', 'PC1', 'PC2', 'PC3', 'PC4',
       'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13',
       'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'North America',
       'South America', 'Europe', 'Asia', 'Africa', 'Oceania', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama',
       'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'IMAX',
       'Documentary', 'War', 'Musical', 'Western', 'Film-Noir',
       '(no genres listed)']


In [18]:
# We will apply scaling to the features
scaler = StandardScaler()
final_movie_data_scaled = final_movie_data.copy()
final_movie_data_scaled[features] = scaler.fit_transform(final_movie_data[features])

In [29]:
import numpy as np
import faiss

# Generate randomhow  data (100,000 vectors of dimension 128)
print(final_movie_data_scaled[features].shape)
# Build a flat index (exact nearest neighbors)
d = final_movie_data_scaled[features].shape[1]
index = faiss.IndexHNSWFlat(d, 5)
index.hnsw.efConstruction = final_movie_data_scaled[features].shape[0]
index.add(final_movie_data_scaled[features])  # Add vectors to the index



(25901, 52)


In [30]:
# Query the index (find 5 nearest neighbors of the first vector)
query = final_movie_data_scaled[final_movie_data_scaled['movieId'] == 2959][features]
distances, indices = index.search(query, k=6)

print("Indices of nearest neighbors:", indices)
print("Distances of nearest neighbors:", distances)

Indices of nearest neighbors: [[ 2772   311   289 21207  1601   820]]
Distances of nearest neighbors: [[ 0.       63.743866 67.50018  70.59146  81.06602  84.34573 ]]


In [32]:
movieIds = final_movie_data.loc[indices[0]]["movieId"].values
recommend_titles = movie_titles[movie_titles['movieId'].isin(movieIds[1:])]["title"]
print(recommend_titles)


292                   Pulp Fiction (1994)
314      Shawshank Redemption, The (1994)
833                 Godfather, The (1972)
1636                       Titanic (1997)
21813     Wolf of Wall Street, The (2013)
Name: title, dtype: object
