# Movie Recommender System

## Dataset

The MovieLens Dataset is most often used for the purpose of recommender systems, which aim to predict user movie ratings based on other users’ ratings.

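The dataset used was extracted from MovieLens and contains 100836 ratings and 3683 tag applications across 9742 movies. This data was created by 610 users between March 29, 1996 and September 24, 2018. Note that the code cells in this notebook load the older MovieLens 100k release (`fetch_ml_ratings(variant='100k')` and the `ml-100k/u.data`, `ml-100k/u.item` and `ml-100k/u.user` files), so the counts printed by the code differ from the figures above.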

The data are contained in the files:
- Movies.csv: movieId, title, genres.
- Ratings.csv: userId, movieId, rating, timestamp.
- Tags.csv: userId, movieId, tag, timestamp.

In [None]:
# Import all necessary libraries
 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from funk_svd.dataset import fetch_ml_ratings
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error

## Funk SVD

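Fetches the MovieLens 100k ratings dataset.

Splits the data into training (80%), validation (10%) and test (10%) sets.

Trains Funk SVD with learning rate 0.001, regularization 0.005 and 15 latent factors for up to 100 epochs with early stopping, then reports the mean absolute error on the test set.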

In [None]:
# Fetch the 100k variant of the MovieLens ratings.
df = fetch_ml_ratings(variant='100k')

# 80% of the rows go to training; the remaining rows are split in half
# between validation and test.
train = df.sample(frac=0.8, random_state=7)
holdout = df.drop(train.index.tolist())
val = holdout.sample(frac=0.5, random_state=8)
test = holdout.drop(val.index.tolist())

# Hyper-parameters of the Funk SVD model.
svd_params = dict(
    lr=0.001,
    reg=0.005,
    n_epochs=100,
    n_factors=15,
    early_stopping=True,
    shuffle=False,
    min_rating=1,
    max_rating=5,
)
svd = SVD(**svd_params)

# The validation set is passed to fit() alongside the training set.
svd.fit(X=train, X_val=val)

# Score the held-out test ratings with the mean absolute error.
pred = svd.predict(test)
mae = mean_absolute_error(test['rating'], pred)

print(f'Test MAE: {mae:.2f}')

In [None]:
# Scatter the first two columns of the Funk SVD user factors against the first
# two columns of U from the plain SVD, for the first 100 rows of each.
# NOTE(review): `U` is assigned by the np.linalg.svd cell further down in this
# notebook, so that cell has to be executed before this one.
plt.plot(svd.pu_[:100, 0], svd.pu_[:100, 1], 'o', label="Funk SVD")
plt.plot(U[:100, 0], U[:100, 1], 'o', label="Normal SVD")

# Column 0 is drawn on the x-axis and column 1 on the y-axis, so the first
# factor labels x and the second labels y.
plt.xlabel("Latent Factor 1")
plt.ylabel("Latent Factor 2")
plt.legend()

In [None]:
def get_key(val):
    """Return the first key of ``svd.item_mapping_`` whose value equals ``val``.

    The mapping is scanned in its iteration order. When no value matches,
    the string "key doesn't exist" is returned instead of a key.
    """
    matching_keys = (
        key for key, value in svd.item_mapping_.items() if val == value
    )
    return next(matching_keys, "key doesn't exist")

# Plot a few movies in the plane of their first two Funk SVD item factors and
# label every point with the movie title.

# Titles keyed by movie id. `get_key(i)` returns a movie id, which is a label
# and not a row position, so the lookup must be label-based rather than
# positional (`.iloc` would be off by one if ids start at 1 - presumably the
# case for ml-100k; verify against u.item).
titles = movies.set_index('movie_id')['title']

for i in range(5, 10):
    # `item_mapping_` maps movie ids to rows of the item-factor matrix `qi_`;
    # `pu_` holds the user factors, which movie titles do not describe.
    x, y = svd.qi_[i, 0], svd.qi_[i, 1]
    plt.plot(x, y, 'o')
    plt.annotate(titles.loc[get_key(i)], (x, y))

# Column 0 is drawn on the x-axis and column 1 on the y-axis. The labels are
# loop-invariant, so they are set once. No legend is drawn because none of the
# plotted points carries a label; the annotations identify them.
plt.xlabel("Latent Factor 1")
plt.ylabel("Latent Factor 2")

In [None]:
# Column names are supplied explicitly for the tab-separated ratings file.
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

ratings = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    names=r_cols,
    encoding='latin-1',
)

# Preview of the first rows, followed by summary statistics per column.
ratings.head()

ratings.describe()

In [None]:
# This cell repeats the ratings load: same file, same column names.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

ratings = pd.read_csv(
    "ml-100k/u.data", sep="\t", encoding='latin-1', names=r_cols
)

# First rows of the table.
ratings.head()

# Summary statistics per column.
ratings.describe()

In [None]:
# Descriptive columns of the movie file, followed by the genre columns.
info_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = info_cols + genre_cols

# The movie file is pipe-separated; column names are supplied explicitly.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

movies.head()

In [None]:
# Column names are supplied explicitly for the pipe-separated user file.
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Preview of the first rows.
users.head()

In [None]:
# Row counts of the users and movies tables.
num_users = len(users)
num_movies = len(movies)
print(num_users, num_movies, sep="\n")

# Distinct user and movie ids that appear in the ratings table.
print(len(ratings["user_id"].unique()))
print(len(ratings["movie_id"].unique()))

# Sparsity: share of all (user, movie) pairs that have no rating row.
n_possible_pairs = num_users * num_movies
sparsity = 1 - len(ratings) / n_possible_pairs

print(f"Sparsity: {sparsity:.3f}")

In [None]:
# Bin edges 0.5, 1.5, ..., 5.5 put each integer rating 1..5 in its own bar.
bin_edges = [0.5 + k for k in range(6)]

plt.hist(ratings["rating"], bins=bin_edges, ec='black')
plt.title("Distribution of Ratings")
plt.xlabel("Rating")
plt.ylabel("Number of Ratings")
plt.xticks(list(range(1, 6)))
plt.show()

In [None]:
# Users-by-movies table of ratings; pairs without a rating are NaN here.
rating_matrix = ratings.pivot(index="user_id", columns="movie_id", values="rating")
matrix = pd.DataFrame(rating_matrix.values)

# Shape plus a look at the top-left 5x5 corner and its summary statistics.
print(matrix.shape)
top_left = matrix.iloc[:5, :5]
print(top_left)
print(top_left.describe())

# Same table with the missing ratings replaced by 0.
rating_matrix = rating_matrix.fillna(0)
matrix = pd.DataFrame(rating_matrix.values)

# Sparsity: share of (user, movie) pairs whose entry is zero, relative to
# num_users * num_movies.
n_rated = np.count_nonzero(matrix)
sparsity = 1 - n_rated / (num_users * num_movies)
print(f"Sparsity: {sparsity:.3f}")

In [None]:
# Users-by-movies table of ratings, with missing ratings replaced by 0.
pivoted = ratings.pivot(index="user_id", columns="movie_id", values="rating")
rating_matrix = pivoted.fillna(0)
matrix = pd.DataFrame(rating_matrix.values)

# Shape plus a look at the top-left 5x5 corner and its summary statistics.
print(matrix.shape)
corner = matrix.iloc[:5, :5]
print(corner)
print(corner.describe())

# Sparsity: one minus the share of non-zero entries, relative to
# num_users * num_movies.
n_rated = np.count_nonzero(matrix)
sparsity = 1 - n_rated / (num_users * num_movies)
print(f"Sparsity: {sparsity:.3f}")

In [None]:
# Full singular value decomposition of the rating matrix.
# NOTE: np.linalg.svd returns the right singular vectors already transposed,
# so the third result, kept under the name `V`, holds V^T. Transposing it
# again would print V, which would not match the "VT" label.
U, S, V = np.linalg.svd(matrix)


# Show the top-left corner of each factor: the first five singular values
# and a 5x5 slice of U and of V^T.
print(f"U: {pd.DataFrame(U).iloc[:5, :5]}")
print(f"S: {pd.DataFrame(S).iloc[:5, :]}")
print(f"VT: {pd.DataFrame(V).iloc[:5, :5]}")