In [1]:
import os

# Name of the local checkout directory, relative to the current working directory.
# It matches the directory name `git clone` derives from the repository URL below.
repo_dir = "Movie-Recommender-System"

# Start from a clean slate: delete any previous checkout before cloning again.
if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

# Clone the repository from GitHub into the current working directory.
# NOTE(review): later cells read from "/content/Movie-Recommender-System/...",
# so this presumably expects the working directory to be /content — confirm.
!git clone https://github.com/Goshmar/Movie-Recommender-System

Cloning into 'Movie-Recommender-System'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 44 (delta 12), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (44/44), 6.68 MiB | 11.31 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [2]:
import requests
import zipfile
import pandas as pd
import numpy as np

# Define the paths
zip_file_path = "/content/Movie-Recommender-System/data/raw/ml-100k.zip"

# Extract only while the archive is still on disk. The archive is deleted right
# after extraction, so without this guard re-running the cell would raise
# FileNotFoundError when trying to open the already-removed zip.
if os.path.exists(zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Contents are unpacked into the current working directory.
        zip_ref.extractall(".")

    # ZIP cleaning up
    os.remove(zip_file_path)
else:
    print(f"{zip_file_path} not found; assuming it was already extracted. Skipping.")

In [5]:
from scipy.sparse import load_npz

# Load the interim artefacts shipped with the cloned repository (local files only)
data_matrix = load_npz(
    '/content/Movie-Recommender-System/data/interim/data_matrix.npz'
)
rating_matrix = pd.read_csv(
    "/content/Movie-Recommender-System/data/interim/rating_matrix.csv"
)
feature_matrix = pd.read_csv(
    "/content/Movie-Recommender-System/data/interim/feature_matrix.csv"
)

# ml-100k item file: pipe-separated, Latin-1 encoded; column names are supplied
# explicitly (id, title, dates, URL, then one column per genre).
item_info = pd.read_csv(
    '/content/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    names=[
        'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDB_URL',
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)

In [7]:
# ========== SVD Model ==========
from scipy.sparse.linalg import svds

# Truncated SVD keeping 50 singular triplets of the sparse data matrix.
# With return_singular_vectors='vh', SciPy may skip the left singular vectors
# and return None for U (when the matrix has more rows than columns); only Vt
# is used by the recommendation cell that follows.
U, S, Vt = svds(
    data_matrix,
    k=50,
    return_singular_vectors='vh',
)

In [28]:
# Evaluating title
val_title = "Lord of Illusions"

# Literal, case-insensitive substring match. The title is NOT treated as a
# regular expression, so titles containing characters such as "(" or "." match
# as written; rows with a missing title simply do not match.
title_mask = item_info.movie_title.str.contains(
    val_title, case=False, regex=False, na=False
)
matching_rows = item_info.loc[title_mask]

# Fail early with a clear message instead of letting a None index reach the
# matrix product below, where it would surface as an obscure shape error.
if matching_rows.empty:
    raise ValueError(f'No movie title contains {val_title!r}')

# Row *position* of the first match (not its index label), because it is used
# positionally both for Vt columns and for item_info.iloc.
# Assumes column j of Vt corresponds to row j of item_info — TODO confirm.
matching_index = int(np.flatnonzero(title_mask.to_numpy())[0])
print(f'Matching index: {matching_index} for {val_title}')

# Dot-product similarity between the query item's latent vector and all items.
item_scores = Vt.T @ Vt[:, matching_index]
ranked_items = np.argsort(-item_scores)

# Drop the query item explicitly rather than assuming it is ranked first:
# with unnormalised dot products another item can outrank it.
top_items = ranked_items[ranked_items != matching_index][:10]
rec_svd = item_info.iloc[top_items][['movie_id', 'movie_title']]

# rec_svd already holds exactly the two output columns; keep a separate copy
# under the name used for the saved result.
rec_svd_df = rec_svd.copy()

# Save the DataFrame to a CSV file
rec_svd_df.to_csv(f'rec_svd_df_{val_title}.csv', index=False)

Matching index: 550 for Lord of Illusions


In [29]:
from scipy.sparse.linalg import spsolve_triangular
from scipy.sparse import csr_matrix

# ========== Hybrid Model ==========

# Reduced SVD of the dense rating matrix. Singular values come back in
# descending order, so the first 50 rows of vt are the leading right singular
# vectors; they are stored as columns of v.
u, s, vt = np.linalg.svd(rating_matrix.values, full_matrices=False)
v = vt[:50].T

# Similarity matrix over the columns of feature_matrix: identity, plus one
# symmetric off-diagonal pair linking the last and third-to-last columns.
d = 0.5  # off-diagonal similarity factor
item_similarity_matrix = np.eye(feature_matrix.shape[1])
item_similarity_matrix[[-1, -3], [-3, -1]] = d

# Lower-triangular Cholesky factor of the similarity matrix
L = np.linalg.cholesky(item_similarity_matrix)

# Reduced SVD of the features weighted by the Cholesky factor; v2 is then
# replaced by (at most) the first 50 right singular vectors as columns.
u2, s2, v2 = np.linalg.svd(feature_matrix.values @ L, full_matrices=False)
v2 = v2[:50].T

In [30]:
# Recommendations for the user with the hybrid model
# Solve the upper-triangular system L.T @ rv = v2 for rv.
rv = spsolve_triangular(csr_matrix(L.T), v2, lower=False)

# Score matrix: features projected through L @ v2, then mapped back with rv.T.
projected_features = np.dot(feature_matrix.values, np.dot(L, v2))
hybrid_scores = projected_features.dot(rv.T)

# Extract top N recommendations: the 10 best-scoring columns of row 0.
# NOTE(review): row 0 is hard-coded here, while the output file name below
# embeds val_title — confirm which row is actually intended.
top_n_hybrid = np.argsort(-hybrid_scores)[0][:10]

# Look up id and title for the selected positions in one step
rec_hybrid_svd = (
    item_info.iloc[top_n_hybrid][['movie_id', 'movie_title']]
    .reset_index(drop=True)
)

# Save the DataFrame to a CSV file
rec_hybrid_svd.to_csv(f'rec_hybrid_svd_df_{val_title}.csv', index=False)