In [1]:
import pandas as pd 
movies_df=pd.read_csv(r'/kaggle/input/movies/movies.csv')
ratings_df=pd.read_csv(r'/kaggle/input/movies/ratings.csv')

In [2]:
print('The dimension of movies dataframe are:'),movies_df.shape,'The dimension of movies dataframe are:',ratings_df.shape

The dimension of movies dataframe are:


(None, (9742, 3), 'The dimension of movies dataframe are:', (100836, 4))

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


In [6]:
movie_names=movies_df.set_index('movieId')['title'].to_dict()
n_users=len(ratings_df.userId.unique())
n_items=len(ratings_df.movieId.unique())
print("Number of unique users",n_users)
print('The number of unique movies',n_items)
print('The full rating matrix will have:',n_users*n_items,'elements')
print('Therefore', len(ratings_df)/(n_users*n_items)*100 , 'percent of the matrix will be filled')

Number of unique users 610
The number of unique movies 9724
The full rating matrix will have: 5931640 elements
Therefore 1.6999683055613624 percent of the matrix will be filled


In [7]:
import torch
import numpy as np
from torch.autograd import variable
from tqdm import tqdm_notebook as tqdm
#basic format of the matrix factorization where we initilize the embedding
class MaxFactorization(torch.nn.Module):
    def __init__(self,n_users,n_items,n_factors=20):
        super().__init__()
        self.user_factors=torch.nn.Embedding(n_users,n_factors)#a lookup table for the input
        self.item_factors=torch.nn.Embedding(n_items,n_factors)#lookup table for the input
        self.user_factors.weight.data.uniform_(0,0.05)
        self.item_factors.weight.data.uniform_(0,0.05)

    def forward(self,data):
        users, items =data[:,0], data[:,1]
        return(self.user_factors(users)*self.item_factors(items)).sum(1)
    def predict(self,user,item):
        return self.forward(user,item)

In [8]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    def __init__(self):
        self.ratings=ratings_df.copy()
        
        users = ratings_df.userId.unique()
        movies=ratings_df.movieId.unique()

        self.userid2idx= {o:i for i,o in enumerate(users)}
        self.movieid2idx= {o:i for i,o in enumerate(movies)}
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        self.idx2movieid={i:o for o,i in self.movieid2idx.items()}
        
        self.ratings.movieId= ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId= ratings_df.userId.apply(lambda x: self.userid2idx[x])
        
        self.x=self.ratings.drop(['rating','timestamp'],axis=1).values
        self.y=self.ratings['rating'].values
        self.x,self.y= torch.tensor(self.x),torch.tensor(self.y)
        
        
    def __getitem__(self,index):
        return (self.x[index],self.y[index])
    def __len__(self):
        return len(self.ratings)

print('done')

done


In [9]:
num_epochs = 128
cuda = torch.cuda.is_available()

print('is running on GPU', cuda)

model= MaxFactorization(n_users,n_items,n_factors=8)
print(model)
for name,param in model.named_parameters():
    if param.requires_grad:
        print(name,param.data)

if cuda:
    model=model.cuda()
    
loss_fn=torch.nn.MSELoss()

optimizer=torch.optim.Adam(model.parameters(),lr=1e-3)

train_set= Loader()
train_loader= DataLoader(train_set,128,shuffle=True)


is running on GPU True
MaxFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0132, 0.0238, 0.0460,  ..., 0.0042, 0.0160, 0.0199],
        [0.0410, 0.0448, 0.0291,  ..., 0.0320, 0.0408, 0.0044],
        [0.0079, 0.0231, 0.0054,  ..., 0.0438, 0.0161, 0.0338],
        ...,
        [0.0475, 0.0050, 0.0224,  ..., 0.0316, 0.0246, 0.0188],
        [0.0164, 0.0437, 0.0104,  ..., 0.0009, 0.0414, 0.0030],
        [0.0483, 0.0114, 0.0263,  ..., 0.0071, 0.0226, 0.0275]])
item_factors.weight tensor([[0.0098, 0.0007, 0.0459,  ..., 0.0140, 0.0333, 0.0352],
        [0.0406, 0.0282, 0.0411,  ..., 0.0097, 0.0328, 0.0299],
        [0.0151, 0.0224, 0.0027,  ..., 0.0394, 0.0125, 0.0471],
        ...,
        [0.0307, 0.0142, 0.0437,  ..., 0.0073, 0.0472, 0.0252],
        [0.0445, 0.0453, 0.0213,  ..., 0.0014, 0.0132, 0.0159],
        [0.0068, 0.0338, 0.0110,  ..., 0.0005, 0.0466, 0.0122]])


In [10]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x,y in train_loader:
        if cuda:
            x,y=x.cuda(),y.cuda()
            optimizer.zero_grad()
            outputs=model(x)
            loss = loss_fn(outputs.squeeze(),y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print('iter #{}'.format(it),'Loss:',sum(losses)/len(losses))
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.071891653961336
iter #1 Loss: 4.751887458532595
iter #2 Loss: 2.4783561961905
iter #3 Loss: 1.7213862833335314
iter #4 Loss: 1.3460121818151571
iter #5 Loss: 1.1286294325656698
iter #6 Loss: 0.9910669817688501
iter #7 Loss: 0.9001742805623766
iter #8 Loss: 0.8372699086587441
iter #9 Loss: 0.7922346216773019
iter #10 Loss: 0.759384210982601
iter #11 Loss: 0.7347322258776819
iter #12 Loss: 0.71584734585382
iter #13 Loss: 0.7015986476982305
iter #14 Loss: 0.6905528691214353
iter #15 Loss: 0.6819392146192832
iter #16 Loss: 0.6749572255435934
iter #17 Loss: 0.6695353003020214
iter #18 Loss: 0.665907263150675
iter #19 Loss: 0.6629372952023739
iter #20 Loss: 0.6604591210557119
iter #21 Loss: 0.6586626301032638
iter #22 Loss: 0.6575523369430285
iter #23 Loss: 0.6567973127673725
iter #24 Loss: 0.6558209972333182
iter #25 Loss: 0.6549199617226716
iter #26 Loss: 0.654080422547868
iter #27 Loss: 0.6535242947225038
iter #28 Loss: 0.6520977712373444
iter #29 Loss: 0.6510404731870303

In [11]:
c=0
uw=0
iw=0
for name,param in model.named_parameters():
    if param.requires_grad:
        print(name,param.data)
        if c == 0:
            uw = param.data
            c+=1
        else:
            iw=param.data
            

user_factors.weight tensor([[ 1.5080,  1.3075,  1.4231,  ...,  0.6003,  1.2367,  1.8915],
        [ 1.9910,  1.3721,  0.2084,  ...,  0.5512,  0.9778,  0.9047],
        [-1.6259,  1.4564,  0.5056,  ...,  0.4476, -0.5610,  0.0945],
        ...,
        [ 1.3315, -1.1745,  1.1458,  ...,  1.4891,  0.1635,  1.0942],
        [ 1.0873,  0.7844,  0.9002,  ...,  1.7296,  0.5573,  0.2509],
        [ 0.2308,  1.2090,  0.6518,  ...,  1.2877,  0.9109,  0.8331]],
       device='cuda:0')
item_factors.weight tensor([[0.4327, 0.7222, 0.4932,  ..., 0.5978, 0.7145, 0.2517],
        [0.7540, 0.5026, 0.2385,  ..., 0.2425, 0.7110, 0.6596],
        [0.3186, 0.2377, 0.4034,  ..., 0.4666, 0.5506, 0.5420],
        ...,
        [0.3897, 0.3518, 0.3828,  ..., 0.3466, 0.3859, 0.3647],
        [0.4763, 0.4342, 0.4111,  ..., 0.3908, 0.4002, 0.4058],
        [0.4011, 0.4120, 0.3887,  ..., 0.3779, 0.4234, 0.3901]],
       device='cuda:0')


In [12]:
trained_movie_embeddings=model.item_factors.weight.data.cpu().numpy()

In [13]:
trained_movie_embeddings

array([[0.43268162, 0.7221894 , 0.49317485, ..., 0.59776694, 0.71447337,
        0.2517133 ],
       [0.75398797, 0.50258625, 0.23853864, ..., 0.242501  , 0.71096987,
        0.659569  ],
       [0.31863937, 0.23769757, 0.40336943, ..., 0.46662107, 0.55060434,
        0.5419912 ],
       ...,
       [0.38965592, 0.3517531 , 0.38275376, ..., 0.34655687, 0.3858927 ,
        0.3647114 ],
       [0.47629598, 0.43420237, 0.4110809 , ..., 0.3908073 , 0.400188  ,
        0.40581518],
       [0.40105006, 0.41203392, 0.3887285 , ..., 0.37793687, 0.42339626,
        0.39014566]], dtype=float32)

In [14]:
from sklearn.cluster import KMeans
kmeans=KMeans (n_clusters=10, random_state=0).fit(trained_movie_embeddings)



In [18]:
for cluster in range(11):
    print("Cluster #{}".format(cluster))
    movs=[]
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        movid=train_set.idx2movieid[movidx]
        #rat_count=ratings_df.loc[ratings_df['movieId']== movid].count()[0]
        rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count().iloc[0]
        movs.append((movie_names[movid],rat_count))
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print('\t', mov[0])


Cluster #0
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Usual Suspects, The (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Star Wars: Episode VI - Return of the Jedi (1983)
Cluster #1
	 Jurassic Park (1993)
	 Terminator 2: Judgment Day (1991)
	 Toy Story (1995)
	 Independence Day (a.k.a. ID4) (1996)
	 Apollo 13 (1995)
	 Fugitive, The (1993)
	 Batman (1989)
	 Aladdin (1992)
	 True Lies (1994)
	 Lion King, The (1994)
Cluster #2
	 Star Wars: Episode I - The Phantom Menace (1999)
	 Twister (1996)
	 Crimson Tide (1995)
	 American Pie (1999)
	 Happy Gilmore (1996)
	 Avatar (2009)
	 Matrix Reloaded, The (2003)
	 Armageddon (1998)
	 Star Wars: Episode II - Attack of the Clones (2002)
	 Contact (1997)
Cluster #3
	 Mask, The (1