<a href="https://colab.research.google.com/github/Malaika26/movie-recommendation-system/blob/main/movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   507k      0  0:00:01  0:00:01 --:--:--  507k


In [3]:
import zipfile
# Unpack the downloaded archive into ./data (read mode is ZipFile's default).
with zipfile.ZipFile('ml-latest-small.zip') as archive:
    archive.extractall('data')

In [4]:
#import the dataset
import pandas as pd
# Read the movie metadata and the user ratings in one pass over the two paths.
movies_df, ratings_df = map(
    pd.read_csv,
    ('data/ml-latest-small/movies.csv', 'data/ml-latest-small/ratings.csv'),
)

In [6]:
# Report (rows, columns) for both frames in a single print call.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [7]:
# Preview the first rows of the movies table (movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Lookup table: movieId -> title.
movie_names = movies_df.set_index('movieId').loc[:, 'title'].to_dict()

# Distinct users and movies that actually appear in the ratings.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
# Size of the dense user x movie matrix the factorization approximates.
print("The full rating matrix will have:", n_users*n_items, 'elements.')

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.


In [10]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: width of every embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights drawn uniformly from [0, 0.05).
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row: the dot product of the
            selected user and item embedding vectors.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as separate arguments.

        The original implementation forwarded both arguments to ``forward``,
        which accepts a single tensor, so every call raised ``TypeError``.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices (int, sequence or tensor).
                ``user`` and ``item`` are broadcast against each other, so a
                single user can be scored against many items and vice versa.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        # Build the indices on the same device as the embedding tables.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        users, items = torch.broadcast_tensors(users, items)
        # forward() expects one (N, 2) tensor of [user, item] rows.
        return self.forward(torch.stack((users, items), dim=1))

In [11]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous indices
    (in order of first appearance) so they can address embedding tables.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is
            used, so the original ``Loader()`` call keeps working.

    Attributes set by ``__init__``:
        ratings: copy of the input frame with ids replaced by indices.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: the inverse mappings.
        x: integer tensor of [user index, movie index] rows.
        y: tensor of ratings aligned with ``x``.
    """

    def __init__(self, ratings=None):
        # Fall back to the global frame for backward compatibility.
        source = ratings_df if ratings is None else ratings
        # Work on a copy so the caller's frame is left untouched.
        self.ratings = source.copy()

        users = source['userId'].unique()
        movies = source['movieId'].unique()

        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        self.ratings['movieId'] = source['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = source['userId'].map(self.userid2idx)

        # Select the feature columns by name so the [user, movie] column
        # order does not depend on which other columns the frame carries.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [12]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the model and dump its freshly initialised weights (before any
# device move, so the tensors are printed from the CPU copy).
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
model = model.cuda() if cuda else model

# Mean-squared-error objective
loss_fn = torch.nn.MSELoss()

# Adam optimizer
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[8.5600e-03, 2.9135e-05, 3.1032e-02,  ..., 6.1806e-04, 2.7018e-02,
         3.7860e-02],
        [4.9255e-02, 1.1644e-02, 1.1662e-03,  ..., 4.0109e-04, 4.5099e-03,
         4.2438e-02],
        [4.1229e-02, 2.2791e-02, 4.8271e-02,  ..., 1.7442e-02, 3.1924e-02,
         2.5822e-03],
        ...,
        [4.0001e-02, 1.1105e-02, 4.3241e-02,  ..., 7.4050e-03, 4.4071e-02,
         4.7915e-02],
        [4.2861e-02, 1.0328e-02, 1.1344e-02,  ..., 4.9703e-02, 4.7668e-03,
         1.2845e-02],
        [1.4646e-02, 4.7488e-02, 3.6694e-02,  ..., 3.3619e-02, 4.1538e-02,
         2.5148e-02]])
item_factors.weight tensor([[0.0012, 0.0430, 0.0423,  ..., 0.0011, 0.0352, 0.0371],
        [0.0166, 0.0301, 0.0365,  ..., 0.0381, 0.0292, 0.0465],
        [0.0469, 0.0185, 0.0486,  ..., 0.0373, 0.0376, 0.0303],
        ...,
        [0.0454, 0.0488, 0.0147,  ..., 

In [13]:
# Train for `num_epochs` passes over the ratings and report the mean batch
# loss after each pass.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below used to sit inside this `if`, so on a CPU runtime nothing was
        # trained and the mean below divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # model(x) already returns one score per row, so no squeeze is needed
        # (squeezing would turn a batch of one into a 0-d tensor).
        outputs = model(x)
        loss = loss_fn(outputs, y.float())
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.061313822184722
iter #1 Loss: 4.7460910475193545
iter #2 Loss: 2.47576242503781
iter #3 Loss: 1.7218536959989421
iter #4 Loss: 1.3457928515479045
iter #5 Loss: 1.128357599078096
iter #6 Loss: 0.9911559031245673
iter #7 Loss: 0.8999758598768166
iter #8 Loss: 0.836989780443574
iter #9 Loss: 0.7919213948440431
iter #10 Loss: 0.7590427289003043
iter #11 Loss: 0.7344174968258379
iter #12 Loss: 0.7156996442581797
iter #13 Loss: 0.7014278417008782
iter #14 Loss: 0.6902890609681304
iter #15 Loss: 0.6813951155104613
iter #16 Loss: 0.6751055425874473
iter #17 Loss: 0.6695939749101092
iter #18 Loss: 0.6658809633575721
iter #19 Loss: 0.6626847949503037
iter #20 Loss: 0.6607002174566845
iter #21 Loss: 0.658775194382607
iter #22 Loss: 0.6577076611633833
iter #23 Loss: 0.6563682274770011
iter #24 Loss: 0.65564006688026
iter #25 Loss: 0.6549259732444275
iter #26 Loss: 0.6540443130631737
iter #27 Loss: 0.6534855046550635
iter #28 Loss: 0.6520507135593951
iter #29 Loss: 0.65041670880190

In [14]:
# Print every trainable parameter and keep references to the weights:
# the first trainable parameter goes to `uw`, any later one overwrites `iw`.
c = 0   # becomes 1 once the first trainable parameter has been stored
uw = 0
iw = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.2459,  1.1232,  1.4300,  ...,  1.1449,  1.5499,  1.3191],
        [ 1.4546,  1.2802,  2.0479,  ...,  1.3086,  0.3807,  0.6889],
        [-0.5438, -0.8424,  2.4784,  ...,  1.9573, -0.7638,  1.0072],
        ...,
        [ 1.5884,  1.2368,  0.0896,  ...,  0.6913,  1.2328,  1.4558],
        [ 0.7398,  0.8090,  1.4944,  ...,  0.6572,  1.1300,  0.8062],
        [ 0.7186,  1.1873,  1.1978,  ...,  2.0175,  0.8291,  0.2539]],
       device='cuda:0')
item_factors.weight tensor([[0.5831, 0.5954, 0.4327,  ..., 0.4397, 0.4754, 0.3774],
        [0.2164, 0.3190, 0.4262,  ..., 0.3857, 0.4230, 0.9080],
        [0.6267, 0.5526, 0.1194,  ..., 0.6461, 0.6044, 0.4543],
        ...,
        [0.3738, 0.3769, 0.3438,  ..., 0.3342, 0.3546, 0.3467],
        [0.4130, 0.3958, 0.4118,  ..., 0.4183, 0.4155, 0.4096],
        [0.3862, 0.3792, 0.4045,  ..., 0.3937, 0.3905, 0.4293]],
       device='cuda:0')


In [15]:
# Copy the learned item-embedding table to host memory as a NumPy array.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [16]:
trained_movie_embeddings.shape[0]  # number of rows: one factor vector per movie index

9724

In [18]:
from sklearn.cluster import KMeans
# Group the learned movie embeddings into 10 clusters (fixed random_state).
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings);  # trailing ';' keeps the cell output empty

In [19]:
# Count ratings per movie once up front. The previous version filtered the
# whole ratings frame for every single movie, i.e. O(movies x ratings) work.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# For every cluster, list its 10 most-rated movies.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Find movie indices belonging to the current cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding row index back to the raw movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts.get(movid, 0)))
    # Sort movies by rating count in descending order and print top 10
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Net, The (1995)
	 Cliffhanger (1993)
	 Natural Born Killers (1994)
	 Meet the Parents (2000)
	 Mars Attacks! (1996)
	 Broken Arrow (1996)
	 Demolition Man (1993)
	 Face/Off (1997)
	 Blair Witch Project, The (1999)
	 Congo (1995)
Cluster #1
	 Patton (1970)
	 Battlestar Galactica (2003)
	 His Girl Friday (1940)
	 Brotherhood of the Wolf (Pacte des loups, Le) (2001)
	 Howards End (1992)
	 From Here to Eternity (1953)
	 Mary and Max (2009)
	 Rope (1948)
	 Killing, The (1956)
	 Gettysburg (1993)
Cluster #2
	 Ace Ventura: Pet Detective (1994)
	 Stargate (1994)
	 Fifth Element, The (1997)
	 Waterworld (1995)
	 Interview with the Vampire: The Vampire Chronicles (1994)
	 Ace Ventura: When Nature Calls (1995)
	 Casino (1995)
	 Starship Troopers (1997)
	 Dogma (1999)
	 Django Unchained (2012)
Cluster #3
	 Independence Day (a.k.a. ID4) (1996)
	 True Lies (1994)
	 Speed (1994)
	 Men in Black (a.k.a. MIB) (1997)
	 Mission: Impossible (1996)
	 Pretty Woman (1990)
	 Dumb & Dumber (Dumb an