In [7]:
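# Data info: MovieLens "ml-latest-small" dataset (GroupLens Research)
# Citation: F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets:
# History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19.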
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
 12  955k   12  119k    0     0  56252      0  0:00:17  0:00:02  0:00:15 56300
100  955k  100  955k    0     0   332k      0  0:00:02  0:00:02 --:--:--  333k


Recommendation System: Collaborative Filtering Using Matrix Factorization and K-Means

Start working with the dataset

In [9]:
import zipfile
# Unpack the downloaded MovieLens archive into the local 'data' directory.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [1]:
import pandas as pd
# Load the movie catalogue and the user ratings from the extracted archive.
movies_df, ratings_df = map(
    pd.read_csv,
    ('data/ml-latest-small/movies.csv', 'data/ml-latest-small/ratings.csv'),
)

In [2]:
# Report the (rows, columns) shape of both tables.
print(
    'The dimension of movies dataframe are: ',
    movies_df.shape,
    '\nThe dimensions of ratings dataframe are:',
    ratings_df.shape,
)

The dimension of movies dataframe are:  (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users / rated movies. `unique()` is used (not `nunique()`) so the
# counts match the ID -> index mappings built from `unique()` in the Loader.
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())

print("Number of unique users:", n_users)
# This line reports the movie count; it was previously mislabelled as users.
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements. ')
print("-------------------------------------------------")
print("Number of ratings:", len(ratings_df))
# Density of the user x movie rating matrix, as a percentage.
print("This means: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')

Number of uniqe users: 610
Number of uniqe users: 9724
The full rating matrix will have: 5931640 elements. 
-------------------------------------------------
Number of ratings: 100836
This means:  1.6999683055613624 % of the matrix is filled.


In [6]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm.notebook import tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization model for collaborative filtering.

    Each user and each item owns a learned vector of ``n_factors`` values;
    the predicted rating is the dot product of the two vectors.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        """Create the embedding tables.

        Args:
            n_users: number of distinct user indices (rows of the user table).
            n_items: number of distinct item indices (rows of the item table).
            n_factors: length of every latent vector.
        """
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        # Small positive initial weights, drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user, item) index pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        # Row-wise dot product of the selected user and item vectors.
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings for the given user and item indices.

        ``forward`` takes a single two-column tensor, so the two arguments are
        packed into that layout here; passing them through separately raised a
        ``TypeError``.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices (int, sequence or tensor).
                A single index on one side is broadcast against the other.

        Returns:
            1-D tensor of predicted ratings, one per (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        users, items = torch.broadcast_tensors(users, items)
        return self.forward(torch.stack((users, items), dim=1))

In [7]:
# Creating the dataloader (for pytorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader #this helps transfrom your data to ML readiness

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw user and movie IDs are remapped to contiguous zero-based indices so
    they can be used directly as embedding row numbers.
    """

    def __init__(self, ratings=None):
        """Build the index mappings and the feature/target tensors.

        Args:
            ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
                columns. Defaults to the notebook-level ``ratings_df`` so
                existing ``Loader()`` calls keep working.
        """
        if ratings is None:
            ratings = ratings_df
        # Work on a copy so the caller's frame keeps its raw IDs.
        self.ratings = ratings.copy()

        # Extract all user IDs and movie IDs (in order of first appearance)
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # Producing new continuous IDs for users and movies
        # Unique values : index
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}

        # Inverse mappings: continuous index -> original ID
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}

        # Replace the raw IDs in the copy with their continuous indices
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the columns explicitly: the model reads column 0 as the user
        # and column 1 as the movie, so the order must not depend on the
        # column order of the input file.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating samples."""
        return len(self.ratings)

In [8]:
import torch

num_epochs = 128

# Report the CUDA version PyTorch exposes and whether a GPU can be used.
print(torch.version.cuda)
cuda = torch.cuda.is_available()

print("Is running on GPU: ", cuda)

# Eight latent factors per user and per movie.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the freshly initialised trainable weights.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# GPU enable if available
model = model.cuda() if cuda else model

# Mean-squared-error objective, optimised with Adam.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Serve the ratings as shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

11.8
Is running on GPU:  True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0393, 0.0485, 0.0473,  ..., 0.0099, 0.0455, 0.0435],
        [0.0445, 0.0073, 0.0313,  ..., 0.0192, 0.0036, 0.0277],
        [0.0249, 0.0258, 0.0230,  ..., 0.0009, 0.0261, 0.0283],
        ...,
        [0.0045, 0.0489, 0.0490,  ..., 0.0167, 0.0297, 0.0319],
        [0.0267, 0.0430, 0.0371,  ..., 0.0056, 0.0223, 0.0064],
        [0.0012, 0.0453, 0.0346,  ..., 0.0434, 0.0061, 0.0011]])
item_factors.weight tensor([[0.0181, 0.0393, 0.0018,  ..., 0.0355, 0.0004, 0.0389],
        [0.0352, 0.0416, 0.0482,  ..., 0.0490, 0.0323, 0.0259],
        [0.0065, 0.0159, 0.0253,  ..., 0.0342, 0.0054, 0.0243],
        ...,
        [0.0303, 0.0052, 0.0035,  ..., 0.0262, 0.0382, 0.0200],
        [0.0167, 0.0147, 0.0276,  ..., 0.0117, 0.0002, 0.0248],
        [0.0233, 0.0083, 0.0081,  ..., 0.0133, 0.0480, 0.0131]])


In [9]:
# Train for `num_epochs` passes over the data, printing the mean batch loss
# of every epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x,y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below must run on CPU as well; when it was nested under this `if`,
        # a CPU run trained nothing and the average below divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # The model already returns one value per sample, so no squeeze is
        # needed; ratings are cast to float32 to match the model output.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.0675803997795
iter #1 Loss: 4.74774285349144
iter #2 Loss: 2.4771450882030623
iter #3 Loss: 1.722094149002569
iter #4 Loss: 1.346322514305865
iter #5 Loss: 1.1290845287027698
iter #6 Loss: 0.9917883684641213
iter #7 Loss: 0.9007227372397021
iter #8 Loss: 0.8372564085092641
iter #9 Loss: 0.7924526236232767
iter #10 Loss: 0.7595862663412457
iter #11 Loss: 0.7348819890527556
iter #12 Loss: 0.7161125317594121
iter #13 Loss: 0.701716947184904
iter #14 Loss: 0.69064312242917
iter #15 Loss: 0.68174049054003
iter #16 Loss: 0.6751544304608088
iter #17 Loss: 0.6697232748575622
iter #18 Loss: 0.6658261372958343
iter #19 Loss: 0.6631252764899114
iter #20 Loss: 0.6605720669031143
iter #21 Loss: 0.6589147372899322
iter #22 Loss: 0.6579325071673103
iter #23 Loss: 0.656866823772186
iter #24 Loss: 0.6560199491414928
iter #25 Loss: 0.655037562518858
iter #26 Loss: 0.6546409914865712
iter #27 Loss: 0.6536765035033831
iter #28 Loss: 0.6526153637899965
iter #29 Loss: 0.6519604122199988
ite

In [10]:
# By training model , we will have turned latent factors for movies and users
# After training, the model holds tuned latent factors for users and movies.
# Print each trainable tensor; keep the first one in `uw` and the last of the
# remaining ones in `iw`.
uw, iw = 0, 0
c = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.3404,  1.1545,  0.9744,  ...,  0.9171,  1.4148,  1.4991],
        [-0.3588,  0.8034,  1.2650,  ...,  1.9001,  1.3248,  0.6369],
        [-0.7991, -0.3714,  0.9428,  ...,  3.0603,  1.9288,  0.9640],
        ...,
        [-0.3642,  1.1918,  2.9811,  ...,  1.0466,  0.4813,  1.1835],
        [ 0.8912,  0.4304,  0.8548,  ...,  1.2422,  1.2165,  1.0027],
        [ 0.8427,  2.0820,  1.2852,  ...,  1.3544,  1.2635,  0.4195]],
       device='cuda:0')
item_factors.weight tensor([[ 0.4935,  0.9306,  0.1590,  ...,  0.3412,  0.3901,  0.4233],
        [ 0.6151,  0.4735,  0.2595,  ...,  0.7328,  0.1241, -0.1935],
        [ 0.3260,  0.7434,  0.6546,  ...,  0.5516,  0.2806,  0.4545],
        ...,
        [ 0.3696,  0.3486,  0.3474,  ...,  0.3690,  0.3798,  0.3527],
        [ 0.4124,  0.4114,  0.4240,  ...,  0.4078,  0.3970,  0.4028],
        [ 0.4033,  0.3871,  0.3869,  ...,  0.3908,  0.4260,  0.3680]],
       device='cuda:0')


In [11]:
# Copy the learned movie factors to host memory as a NumPy array.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [12]:
trained_movie_embeddings # unique movie factor weights

array([[ 0.49352616,  0.9306135 ,  0.15904601, ...,  0.34116322,
         0.39008784,  0.42332396],
       [ 0.615147  ,  0.47346   ,  0.25952616, ...,  0.7327716 ,
         0.12405194, -0.1935458 ],
       [ 0.3260084 ,  0.7434126 ,  0.6545562 , ...,  0.5515972 ,
         0.2805826 ,  0.45453426],
       ...,
       [ 0.3696093 ,  0.34859645,  0.34743902, ...,  0.36900786,
         0.37982777,  0.35272756],
       [ 0.4124302 ,  0.41137502,  0.42395863, ...,  0.4077892 ,
         0.39695445,  0.40281463],
       [ 0.4033077 ,  0.38707095,  0.3868785 , ...,  0.3907948 ,
         0.426049  ,  0.3679608 ]], dtype=float32)

In [13]:
from sklearn.cluster import KMeans
# Fit the clusters on the learned movie factor weights.
# `n_init` is pinned because its default differs between scikit-learn
# releases; an explicit value keeps the clustering reproducible and avoids
# the FutureWarning about the changing default.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(trained_movie_embeddings)

In [16]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
'''
It can be seen here that movies in the same cluster tend to have similar genres.
Also note that the algorithm is unfamiliar with the movie names and only obtained the relationship
by looking at the numbers representing how users have responded to the movie selections.
'''
def display_clusters(kmeans, train_set, movie_names, ratings_df, top_n=10):
    """Print the most-rated movie titles of every cluster.

    Args:
        kmeans: fitted clustering model exposing ``labels_`` (one label per
            movie index) and ``n_clusters``.
        train_set: object whose ``idx2movieid`` maps a movie index back to
            its original movie ID.
        movie_names: mapping from movie ID to title.
        ratings_df: DataFrame with a ``movieId`` column, one row per rating.
        top_n: number of titles printed per cluster (default 10).
    """
    # Count the ratings per movie once instead of filtering the whole frame
    # for every movie.
    rating_counts = ratings_df['movieId'].value_counts().to_dict()

    # Iterate over the clusters the model actually has rather than a fixed 10.
    for cluster in range(kmeans.n_clusters):
        print("Cluster #{}".format(cluster))
        movs = []
        for movidx in np.flatnonzero(kmeans.labels_ == cluster):
            movid = train_set.idx2movieid[movidx]
            movs.append((movie_names[movid], rating_counts.get(movid, 0)))
        # Most-rated first; the stable sort keeps index order for ties.
        for title, _ in sorted(movs, key=lambda tup: tup[1], reverse=True)[:top_n]:
            print("\t", title)

In [19]:
display_clusters(kmeans, train_set, movie_names, ratings_df)

Cluster #0
	 Mask, The (1994)
	 Cliffhanger (1993)
	 Mummy, The (1999)
	 Mars Attacks! (1996)
	 Santa Clause, The (1994)
	 Demolition Man (1993)
	 Starship Troopers (1997)
	 Face/Off (1997)
	 Desperado (1995)
	 Blair Witch Project, The (1999)
Cluster #1
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Jurassic Park (1993)
	 Braveheart (1995)
	 Terminator 2: Judgment Day (1991)
	 Schindler's List (1993)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Seven (a.k.a. Se7en) (1995)
Cluster #2
	 Honey, I Shrunk the Kids (1989)
	 Sabrina (1995)
	 Last Action Hero (1993)
	 Superman II (1980)
	 Nine Months (1995)
	 Batman & Robin (1997)
	 Hellboy (2004)
	 Junior (1994)
	 Godzilla (1998)
	 Toys (1992)
Cluster #3
	 Ace Ventura: Pet Detective (1994)
	 Austin Powers: The Spy Who Shagged Me (1999)
	 Bourne Identity, The (2002)
	 Big Lebowski, The (1998)
	 WALLÂ·E (2008)
	 Austin Powers: International Man of Mystery (1997)
	 