In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import time

# Notebook-wide pandas display settings: show every column and allow wide
# rows so printed frames are not truncated or wrapped early.
_DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.width': 1000,
}
for _opt_name, _opt_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_opt_name, _opt_value)

print("libraries loaded")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


libraries loaded


In [3]:
# Load the two raw tables used by every later cell. Paths are relative to
# the notebook's working directory.
movies = pd.read_csv(filepath_or_buffer="data/movies.csv")
ratings = pd.read_csv(filepath_or_buffer="data/ratings.csv")

# Quick sanity check on how many rows were read from each file.
print(f"loaded {len(movies)} movies and {len(ratings)} ratings")

loaded 9742 movies and 100836 ratings


In [5]:
# Peek at the first five rows of the movies table (movieId, title, genres).
print(movies.head(n=5))

   movieId                               title                                       genres
0        1                    Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2                      Jumanji (1995)                   Adventure|Children|Fantasy
2        3             Grumpier Old Men (1995)                               Comedy|Romance
3        4            Waiting to Exhale (1995)                         Comedy|Drama|Romance
4        5  Father of the Bride Part II (1995)                                       Comedy


In [6]:
# Peek at the first five rows of the ratings table
# (userId, movieId, rating, timestamp).
print(ratings.head(n=5))

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [7]:
# Build one free-text field per movie: the title followed by its genres,
# with the '|' genre separator turned into spaces.
# regex=False is passed explicitly because '|' is a regex metacharacter
# (alternation) and the default of `regex` differs between pandas versions;
# pinning it guarantees a literal replacement everywhere.
movies['combined_features'] = (
    movies['title'] + " " + movies['genres'].str.replace('|', " ", regex=False)
)

print(movies[['title','combined_features']].head())

                                title                                  combined_features
0                    Toy Story (1995)  Toy Story (1995) Adventure Animation Children ...
1                      Jumanji (1995)          Jumanji (1995) Adventure Children Fantasy
2             Grumpier Old Men (1995)             Grumpier Old Men (1995) Comedy Romance
3            Waiting to Exhale (1995)      Waiting to Exhale (1995) Comedy Drama Romance
4  Father of the Bride Part II (1995)          Father of the Bride Part II (1995) Comedy


In [9]:
# User x movie rating table. Cells with no rating are filled with 0, so the
# model below treats "unrated" as a rating of 0.
user_movie_matrix = ratings.pivot_table(
    values='rating',
    columns='movieId',
    index='userId',
).fillna(0)

# movie_index_map[i] is the movieId of row i in the transposed
# (movie x user) array, and therefore of row/column i in corr_matrix.
movie_index_map = list(user_movie_matrix.columns)
movie_user_matrix = user_movie_matrix.to_numpy().T

# Compress each movie's rating vector down to 12 latent components.
SVD = TruncatedSVD(random_state=42, n_components=12)
matrix_reduced = SVD.fit_transform(movie_user_matrix)

# Movie-to-movie correlation computed on the reduced representation
# (one row per movie, so the result is movies x movies).
corr_matrix = np.corrcoef(matrix_reduced)

print(f"SVD Model Trained. Correlation Matrix Shape: {corr_matrix.shape}")

SVD Model Trained. Correlation Matrix Shape: (9724, 9724)


In [10]:
def recommend_svd(movie_title, top_n=5):
    """Print the movies most correlated with ``movie_title`` in the SVD space.

    The first row of ``movies`` whose title contains ``movie_title``
    (case-insensitive, literal substring) is used as the query movie. Scores
    are read from ``corr_matrix``, whose rows are aligned with
    ``movie_index_map``.

    Args:
        movie_title: Text to look for inside the movie titles.
        top_n: Maximum number of recommendations to print (default 5).

    Returns:
        None when recommendations were printed; otherwise a human-readable
        message string explaining why nothing could be recommended.
    """
    # A blank query is a substring of every title; treat it as "not found"
    # instead of silently recommending for the first movie in the table.
    if not movie_title.strip():
        return f"Movie '{movie_title}' not found."

    # regex=False: titles contain regex metacharacters such as "(1995)", so a
    # query copied from an exact title must be matched literally.
    # na=False: a missing title never matches.
    match = movies[movies['title'].str.contains(movie_title, case=False, regex=False, na=False)]
    if match.empty:
        return f"Movie '{movie_title}' not found."

    movie_id = match.iloc[0]['movieId']
    exact_title = match.iloc[0]['title']

    try:
        idx = movie_index_map.index(movie_id)
    except ValueError:
        # The movie exists in `movies` but has no column in the rating matrix.
        return "Not enough ratings to recommend for this movie."

    similarity_scores = corr_matrix[idx]

    # Rank on a copy where undefined (NaN) correlations and the query movie
    # itself are pushed to the bottom, so exactly `top_n` real neighbours are
    # selected. A reversed argsort on the raw scores would put NaN first.
    ranked = np.where(np.isnan(similarity_scores), -np.inf, similarity_scores)
    ranked[idx] = -np.inf
    order = np.argsort(ranked)[::-1][:top_n]
    top_indices = [i for i in order if np.isfinite(ranked[i])]

    print(f"\n--- SVD Recommendations for '{exact_title}' ---")
    print("(Based on what other users liked)")
    for i in top_indices:
        rec_id = movie_index_map[i]
        rec_title = movies.loc[movies['movieId'] == rec_id, 'title'].values[0]
        print(f"- {rec_title} (Score: {similarity_scores[i]:.2f})")

# Demo: run the collaborative-filtering recommender for two sample queries.
for _query_title in ("Toy Story", "Matrix, The"):
    recommend_svd(_query_title)


--- SVD Recommendations for 'Toy Story (1995)' ---
(Based on what other users liked)
- Home Alone (1990) (Score: 0.95)
- Jurassic Park (1993) (Score: 0.95)
- Mrs. Doubtfire (1993) (Score: 0.94)
- Babe (1995) (Score: 0.93)
- Aladdin (1992) (Score: 0.93)

--- SVD Recommendations for 'Matrix, The (1999)' ---
(Based on what other users liked)
- Green Mile, The (1999) (Score: 0.95)
- Gladiator (2000) (Score: 0.95)
- Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000) (Score: 0.95)
- Sixth Sense, The (1999) (Score: 0.94)
- Saving Private Ryan (1998) (Score: 0.94)


In [11]:
print("Loading GenAI Model (Sentence Transformers)...")
# The timer starts before the model is loaded, so the reported duration
# covers model loading, encoding and the similarity computation.
start_time = time.time()

# Pretrained sentence-embedding model, looked up by name.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')


print("Generating Vector Embeddings for all movies...")
# NOTE: despite its name, `tfidf_embeddings` holds sentence-transformer
# embeddings, not TF-IDF vectors. The name is kept because later cells
# reference it. Row i corresponds to row i of `movies`.
tfidf_embeddings = bert_model.encode(list(movies['combined_features']))

# NOTE(review): this is the full movies x movies similarity matrix; the
# recommender cell further down recomputes similarities per query instead of
# reading `cosine_sim` - confirm whether this matrix is still needed.
cosine_sim = cosine_similarity(X=tfidf_embeddings)

print(f"GenAI Model Ready. Time taken: {time.time() - start_time:.2f}s")

Loading GenAI Model (Sentence Transformers)...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Generating Vector Embeddings for all movies...
GenAI Model Ready. Time taken: 38.25s


In [17]:
def recommend_genai(movie_title, top_n=5):
    """Print the movies whose text embeddings are closest to ``movie_title``.

    The first row of ``movies`` whose title contains ``movie_title``
    (case-insensitive, literal substring) is used as the query movie. Its
    embedding is compared by cosine similarity against every row of
    ``tfidf_embeddings``, which was encoded from ``movies`` in row order.

    Args:
        movie_title: Text to look for inside the movie titles.
        top_n: Maximum number of recommendations to print (default 5).

    Returns:
        None when recommendations were printed; otherwise a message string
        saying the movie was not found.
    """
    # A blank query is a substring of every title; treat it as "not found".
    if not movie_title.strip():
        return f"Movie '{movie_title}' not found."

    # regex=False: titles contain regex metacharacters such as "(2009)", so
    # the query must be matched literally. na=False: missing titles never match.
    mask = movies['title'].str.contains(movie_title, case=False, regex=False, na=False)

    # Work with row *positions*, not index labels: the embeddings array and
    # `movies.iloc` are both positional, and labels only coincide with
    # positions while `movies` keeps its default index.
    positions = np.flatnonzero(mask.to_numpy())
    if positions.size == 0:
        return f"Movie '{movie_title}' not found."

    idx = positions[0]
    exact_title = movies.iloc[idx]['title']

    print(f"\n--- GenAI Recommendations for '{exact_title}' ---")

    target_vector = tfidf_embeddings[idx].reshape(1, -1)
    sim_scores = cosine_similarity(target_vector, tfidf_embeddings)[0]

    # Exclude the query movie by position before ranking. Comparing titles
    # would also drop a different movie that happens to share the title, and
    # filtering after slicing could print one result too many.
    ranked = sim_scores.copy()
    ranked[idx] = -np.inf
    sorted_indices = np.argsort(ranked)[::-1][:top_n]

    for i in sorted_indices:
        if i == idx:
            # Only reachable when there are fewer than top_n other movies.
            continue
        rec_title = movies.iloc[i]['title']
        score = sim_scores[i]
        print(f"- {rec_title} (Score: {score:.2f})")


# Demo: run the embedding-based recommender for one sample query.
recommend_genai(movie_title="Avatar")


--- GenAI Recommendations for 'Avatar (2009)' ---
- Star Trek (2009) (Score: 0.82)
- After Earth (2013) (Score: 0.80)
- Oblivion (2013) (Score: 0.75)
- Pacific Rim (2013) (Score: 0.75)
- Tron: Legacy (2010) (Score: 0.75)


In [18]:
import os
import pickle

print("Saving models to disk...")

# Make sure the output directory exists; on a fresh checkout open() would
# otherwise fail with FileNotFoundError before anything is written.
os.makedirs('models', exist_ok=True)

# Output path -> object to serialize. The SVD artifact is the full
# movie x movie correlation matrix, and movie_map gives the movieId for
# each of its rows.
# NOTE: pickle files must only ever be loaded from a trusted location,
# since unpickling can execute arbitrary code.
_artifacts = {
    'models/movies.pkl': movies,
    'models/svd_matrix.pkl': corr_matrix,
    'models/genai_embeddings.pkl': tfidf_embeddings,
    'models/movie_map.pkl': movie_index_map,
}

for _artifact_path, _artifact in _artifacts.items():
    with open(_artifact_path, 'wb') as f:
        pickle.dump(_artifact, f)

print("SUCCESS: Models saved to /models/ folder!")

Saving models to disk...
SUCCESS: Models saved to /models/ folder!
