In [1]:
import os


repo_dir = "Movie-Recommender-System"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

# Clone the repository from GitHub
!git clone https://github.com/Goshmar/Movie-Recommender-System

Cloning into 'Movie-Recommender-System'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 14 (delta 1), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (14/14), 5.63 MiB | 14.46 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [2]:
import requests
import zipfile
import pandas as pd
import numpy as np

# Define the paths
zip_file_path = "/content/Movie-Recommender-System/data/raw/ml-100k.zip"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(".")

# ZIP cleaning up
os.remove(zip_file_path)

In [3]:
!pip install surprise -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/772.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/772.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m491.5/772.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone


In [4]:
from surprise import Dataset
from surprise.model_selection import train_test_split

# Load the MovieLens 100K dataset
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [5]:
# Load additional feature information
rating_info = pd.read_csv("/content/ml-100k/u.data",sep = '\t', names = ['user_id', 'movie_id', 'rating', 'unix_timestamp'])
user_info = pd.read_csv('/content/ml-100k/u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
item_info = pd.read_csv('/content/ml-100k/u.item', sep='|', encoding='latin-1', names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDB_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

In [6]:
# Convert the Surprise trainset to a DataFrame
trainset_df = pd.DataFrame(trainset.all_ratings(), columns=['user_id', 'item_id', 'rating'])

# Merge additional feature information with trainset_df
trainset_df = pd.merge(trainset_df, user_info, on='user_id')
trainset_df = pd.merge(trainset_df, item_info[['movie_id', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']], left_on='item_id', right_on='movie_id')

# Create a user-item rating matrix with additional features
rating_matrix = trainset_df.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)
feature_matrix = trainset_df.pivot_table(index='user_id', values=['age', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'], fill_value=0)

In [7]:
trainset_df.head()

Unnamed: 0,user_id,item_id,rating,age,gender,occupation,zip_code,movie_id,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,24,M,technician,85711,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,6,1,1.0,42,M,executive,98101,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,1,2.0,57,M,administrator,91344,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10,1,4.0,53,M,lawyer,90703,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,1,2.0,39,F,other,30329,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
rating_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1634,1635,1636,1637,1638,1639,1640,1641,1642,1643
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4,0,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,3,4,0,1,4,3,0,5,...,0,0,0,0,0,0,0,0,0,0
3,0,0,4,0,0,0,0,0,0,5,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,5,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
feature_matrix.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0.201613,0.129032,0.056452,0.120968,0.362903,0.072581,0.024194,0.41129,0.0,0.008065,0.024194,0.08871,0.056452,0.137097,0.048387,0.193548,0.064516,0.008065,24
2,0.198052,0.116883,0.022727,0.081169,0.301948,0.061688,0.022727,0.425325,0.012987,0.006494,0.068182,0.051948,0.025974,0.146104,0.094156,0.181818,0.064935,0.016234,53
3,0.229167,0.15625,0.041667,0.083333,0.197917,0.0625,0.010417,0.46875,0.020833,0.010417,0.083333,0.0625,0.020833,0.09375,0.09375,0.208333,0.072917,0.020833,23
4,0.226415,0.119497,0.037736,0.100629,0.289308,0.100629,0.031447,0.383648,0.006289,0.025157,0.056604,0.081761,0.056604,0.157233,0.081761,0.176101,0.075472,0.031447,24
5,0.227273,0.159091,0.022727,0.090909,0.227273,0.068182,0.022727,0.431818,0.0,0.022727,0.068182,0.045455,0.068182,0.181818,0.113636,0.159091,0.136364,0.022727,33


In [14]:
from scipy.sparse import csr_matrix, save_npz

# Convert 'user_id' and 'movie_id' to integers
user_ids = rating_info['user_id'].astype('int')
movie_ids = rating_info['movie_id'].astype('int')

# Create a CSR matrix
data_matrix = csr_matrix(
    (rating_info['rating'].astype('f8'), (user_ids, movie_ids)),
    shape=(user_ids.max() + 1, movie_ids.max() + 1)
)

In [15]:
data_matrix

<944x1683 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [16]:
# Downloading dataframes
trainset_df.to_csv('trainset.csv', index=False)
rating_matrix.to_csv('rating_matrix.csv', index=False)
feature_matrix.to_csv('feature_matrix.csv', index=False)

# Downloading CSR matrix
save_npz('data_matrix.npz', data_matrix)