# Final Project Recommender Systems

## Title: Recommender Systems for the MovieLens 1M Dataset

- Dataset: MovieLens 1M Dataset (https://grouplens.org/datasets/movielens/1m/)
- Project Owner: Masoumeh Ghorbani

Tasks:
1. Load dataset
2. EDA
3. Create the user-item matrix
4. Content-based model
5. User-user collaborative
6. Item-item collaborative
7. SVD model
8. ALS model
9. Combine models

## 1. Load dataset

In [1]:
# Get data from GroupLens (MovieLens 1M). -nc (no-clobber) skips the download when
# ml-1m.zip already exists, so re-running this cell does not pile up ml-1m.zip.1, .2, ...
!wget -nc https://files.grouplens.org/datasets/movielens/ml-1m.zip

--2024-10-07 12:47:51--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2024-10-07 12:47:53 (4.17 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [2]:
# Extract the archive. -n never overwrites files that already exist, so a re-run
# does not stop at unzip's interactive "replace ...?" prompt.
!unzip -n ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler,MultiLabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD

## 2. EDA

In [4]:
# Define file paths relative to the working directory: the wget/unzip cells above
# write the `ml-1m/` folder there, so this works outside of /content as well.
DATA_DIR = 'ml-1m'
ratings_file = f'{DATA_DIR}/ratings.dat'
movies_file = f'{DATA_DIR}/movies.dat'
users_file = f'{DATA_DIR}/users.dat'

# Reader options shared by all three files. The '::' separator is longer than one
# character, which requires pandas' python parsing engine.
READ_OPTS = dict(sep='::', engine='python', encoding='ISO-8859-1')

# Load ratings data
ratings = pd.read_csv(ratings_file, names=['user_id', 'movie_id', 'rating', 'timestamp'], **READ_OPTS)

# Load movie metadata
movies = pd.read_csv(movies_file, names=['movie_id', 'title', 'genres'], **READ_OPTS)

# Load user demographics
users = pd.read_csv(users_file, names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], **READ_OPTS)



In [5]:
# Preview the first five rows of the ratings table
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Preview the first five rows of the movies table
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first five rows of the users table
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [8]:
# Build one analysis frame: ratings joined to movie metadata on movie_id,
# then to user demographics on user_id (default inner joins)
data = ratings.merge(movies, on='movie_id').merge(users, on='user_id')

In [9]:
# Preview the first five rows of the merged ratings/movies/users frame
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [10]:
# Missing-data check: count the null entries in every column of the merged frame
missing_per_column = data.isna().sum()
print(missing_per_column)

user_id       0
movie_id      0
rating        0
timestamp     0
title         0
genres        0
gender        0
age           0
occupation    0
zip_code      0
dtype: int64


In [11]:
# Convert the epoch-seconds timestamp to a datetime, then split out calendar parts
data['datetime'] = pd.to_datetime(data['timestamp'], unit='s')
for part in ('year', 'month', 'day', 'hour'):
    # each part is read from the .dt accessor and stored under the same name
    data[part] = getattr(data['datetime'].dt, part)

In [12]:
# Preview the frame with the new datetime/year/month/day/hour columns
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip_code,datetime,year,month,day,hour
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067,2000-12-31 22:12:40,2000,12,31,22
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067,2000-12-31 22:35:09,2000,12,31,22
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067,2000-12-31 22:32:48,2000,12,31,22
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067,2000-12-31 22:04:35,2000,12,31,22
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067,2001-01-06 23:38:11,2001,1,6,23


In [13]:
# The raw timestamp and the intermediate datetime are no longer needed once the
# calendar parts have been extracted
data = data.drop(columns=['timestamp', 'datetime'])

In [14]:
# Encode the categorical features as numbers

# Gender: string labels -> integer codes
le_gender = LabelEncoder()
data['gender'] = le_gender.fit_transform(data['gender'])

# Genres: split each pipe-delimited string into a list, then expand to one
# 0/1 indicator column per genre, replacing the original 'genres' column
mlb = MultiLabelBinarizer()
genre_lists = [genre_string.split('|') for genre_string in data['genres']]
genres_encoded = pd.DataFrame(mlb.fit_transform(genre_lists), columns=mlb.classes_, index=data.index)
data = pd.concat([data.drop(columns='genres'), genres_encoded], axis=1)

# Occupation: values -> integer codes
le_occupation = LabelEncoder()
data['occupation'] = le_occupation.fit_transform(data['occupation'])

In [15]:
# Preview the frame after encoding (gender/occupation codes plus genre indicator columns)
data.head()

Unnamed: 0,user_id,movie_id,rating,title,gender,age,occupation,zip_code,year,month,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),0,1,10,48067,2000,12,...,0,0,0,0,0,0,0,0,0,0
1,1,661,3,James and the Giant Peach (1996),0,1,10,48067,2000,12,...,0,0,0,1,0,0,0,0,0,0
2,1,914,3,My Fair Lady (1964),0,1,10,48067,2000,12,...,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,Erin Brockovich (2000),0,1,10,48067,2000,12,...,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,"Bug's Life, A (1998)",0,1,10,48067,2001,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Standardize the numeric columns to zero mean and unit variance.
# NOTE: 'rating' is in this list, so every later use of data['rating']
# (including the user-item matrix) sees z-scores rather than raw 1-5 stars.
numeric_cols = ['rating', 'age', 'year', 'month', 'day', 'hour']
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])


## 3. Create the user-item matrix

In [17]:
# One row per user, one column per movie; cells hold the (standardized) rating,
# averaged if a user/movie pair occurs more than once, and 0 where there is no rating
user_item_matrix = data.pivot_table(
    values='rating',
    index='user_id',
    columns='movie_id',
    aggfunc='mean',
    fill_value=0,
)


In [18]:
# Show the first rows of the matrix, then its (rows, columns) shape
print(user_item_matrix.head())
matrix_shape = user_item_matrix.shape
print(f"Matrix dimensions: {matrix_shape}")

movie_id      1     2     3     4     5         6     7     8     9     10    \
user_id                                                                        
1         1.269747   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0   
2         0.000000   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0   
3         0.000000   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0   
4         0.000000   0.0   0.0   0.0   0.0  0.000000   0.0   0.0   0.0   0.0   
5         0.000000   0.0   0.0   0.0   0.0 -1.415775   0.0   0.0   0.0   0.0   

movie_id  ...  3943  3944  3945  3946  3947  3948  3949  3950  3951  3952  
user_id   ...                                                              
1         ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2         ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3         ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4         ...   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0

In [19]:
# Dimensionality reduction: compress each user's row of the user-item matrix
# into 100 components with a truncated SVD (seeded for reproducibility)
svd = TruncatedSVD(random_state=42, n_components=100)
user_factors = svd.fit_transform(user_item_matrix)
reduced_shape = user_factors.shape
print(f"Reduced matrix dimensions: {reduced_shape}")


Reduced matrix dimensions: (6040, 100)


## 4. Content-based model


## 5. User-user collaborative

## 6. Item-item collaborative

In [140]:

def get_movie_titles_with_ids(movie_ids, movies_df=None):
    """Look up titles for ``movie_ids``, keeping the order in which they were given.

    Parameters
    ----------
    movie_ids : iterable
        Movie ids to look up. Any iterable is accepted (list, array, generator).
    movies_df : pandas.DataFrame, optional
        Lookup table with at least ``movie_id`` and ``title`` columns.
        Defaults to the notebook-level ``movies`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id`` and ``title``, ordered like ``movie_ids``. Ids that
        are absent from the lookup table are dropped by the inner merge.
    """
    if movies_df is None:
        # Backward-compatible default: the frame loaded at the top of the notebook
        movies_df = movies

    # Materialize once so len() works even for generators
    movie_ids = list(movie_ids)

    # Remember each id's input position so the order can be restored after the merge
    requested = pd.DataFrame({'movie_id': movie_ids, '_position': range(len(movie_ids))})
    titled = requested.merge(movies_df[['movie_id', 'title']], on='movie_id')

    return titled.sort_values('_position').drop(columns='_position')

# recommend_movies_colab_item is not defined in the visible part of the notebook;
# presumably it takes a user id and returns recommended movie ids (TODO confirm)
collaborative_recommendations_item = recommend_movies_colab_item(1)
# Attach titles to the recommended ids and print the resulting table
recommended_movies_df = get_movie_titles_with_ids(collaborative_recommendations_item)
print(recommended_movies_df)

   movie_id                                              title
0      1198                     Raiders of the Lost Ark (1981)
1       318                   Shawshank Redemption, The (1994)
2      1196  Star Wars: Episode V - The Empire Strikes Back...
3       593                   Silence of the Lambs, The (1991)
4      1259                                 Stand by Me (1986)


## 7. SVD model

## 8. ALS model

## 9. Combine models