In [1]:
from surprise import Dataset
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

In [2]:
from datetime import datetime

# Capture the wall-clock time at this point of the run and print it as HH:MM:SS.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print("Current Time =", current_time)

Current Time = 02:11:52


In [3]:
# Load the built-in 'ml-1m' dataset through surprise and expose its raw rating
# tuples as a DataFrame. `df` and `user_movie_rating_df` are two names for the
# very same DataFrame object.
data = Dataset.load_builtin(name='ml-1m')
df = user_movie_rating_df = pd.DataFrame(
    data=data.raw_ratings,
    columns=['UserId', 'MovieId', 'Rating', 'Timestamp'],
)

In [4]:
# Distinct user ids and distinct movie ids, in order of first appearance in `df`.
users, items = (df[column].unique() for column in ('UserId', 'MovieId'))

In [5]:
# Integer views of the distinct user ids and movie ids.
np_users = np.array([int(user) for user in users], dtype=np.int32)
np_items = np.array([int(item) for item in items], dtype=np.int32)
# Binary label per *distinct* rating value: 1 when the rating is above 3, else 0.
# NOTE(review): this iterates over df['Rating'].unique(), so the array has one
# entry per distinct rating value, not one per rating row - confirm that is intended.
np_labels = np.array(
    [1 if int(rating) > 3 else 0 for rating in df['Rating'].unique()],
    dtype=np.int32,
)

In [6]:
# Seed every random number generator this notebook draws from (Python's
# `random`, NumPy and PyTorch) so that the randomly chosen batch size, the
# sampled batches and the weight initialisation are repeatable across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Width of the user and movie embedding vectors.
EMBEDDING_SIZE = 100

# Number of passes of the training loop.
MAX_EPOCH = 128

# The training loop multiplies INIT_USER_BATCH_SIZE by (epoch + 1) and caps the
# result at FINAL_USER_BATCH_SIZE. Each constant used to be assigned twice; only
# the later assignment ever took effect, so the dead first pair was removed.
INIT_USER_BATCH_SIZE = random.randint(64, 2048)
FINAL_USER_BATCH_SIZE = 4096

In [7]:
# Largest and smallest value found in np_labels; UserMovieEmbedding.forward
# reads these module-level names to rescale its sigmoid output.
max_rating, min_rating = np_labels.max(), np_labels.min()

class UserMovieEmbedding(nn.Module):
    """Small feed-forward scorer on top of user and movie embeddings.

    A user id and a movie id are each looked up in their own embedding table.
    The two vectors are concatenated (not dot-multiplied) and pushed through
    dropout -> linear -> ReLU -> dropout -> linear -> sigmoid. The sigmoid
    output is rescaled with the module-level ``max_rating`` / ``min_rating``.
    """

    def __init__(self, n_users, n_movies, n_factors = 100, nh = 20, p1 = 0.05, p2= 0.5):
        """Create the embedding tables, the two linear layers and the dropouts.

        Args:
            n_users: number of rows of the user embedding table.
            n_movies: number of rows of the movie embedding table.
            n_factors: width of every embedding vector.
            nh: number of units of the hidden linear layer.
            p1: dropout probability applied to the concatenated embeddings.
            p2: dropout probability applied to the hidden activations.
        """
        super().__init__()
        # Construction order is kept as: user table, movie table, lin1, lin2.
        self.u = self._make_embedding(n_users, n_factors)
        self.m = self._make_embedding(n_movies, n_factors)
        # The hidden layer consumes both embeddings side by side, hence 2 * n_factors inputs.
        self.lin1 = nn.Linear(n_factors*2, nh)
        self.lin2 = nn.Linear(nh, 1)
        self.drop1 = nn.Dropout(p = p1)
        self.drop2 = nn.Dropout(p = p2)

    @staticmethod
    def _make_embedding(n_rows, n_factors):
        """Build an embedding table whose weights are re-drawn uniformly from [-0.01, 0.01)."""
        table = nn.Embedding(n_rows, n_factors)
        table.weight.data.uniform_(-0.01,0.01)
        return table

    def forward(self, users, movies):
        """Score each (user, movie) index pair.

        Args:
            users: index tensor used to look up rows of the user table
                (presumably 1-D, one id per sample - confirm with the caller).
            movies: index tensor used to look up rows of the movie table,
                aligned element-wise with ``users``.

        Returns:
            The output of the last linear layer passed through a sigmoid,
            multiplied by ``max_rating - min_rating`` and shifted by ``min_rating``.
        """
        user_vectors = self.u(users)
        movie_vectors = self.m(movies)

        # Join the two embeddings along dimension 1, then regularise with dropout.
        joined = self.drop1(torch.cat([user_vectors, movie_vectors], 1))
        # Hidden layer: linear -> ReLU -> dropout.
        hidden = self.drop2(F.relu(self.lin1(joined)))
        squashed = torch.sigmoid(self.lin2(hidden))
        return squashed * (max_rating - min_rating) + min_rating

    def get_user_embedding_layer(self, user):
        """Return the user ``nn.Embedding`` module; the ``user`` argument is not used."""
        return self.u

    def get_movie_embedding_layer(self, user):
        """Return the movie ``nn.Embedding`` module; the ``user`` argument is not used."""
        return self.m
# class UserMovieEmbedding(nn.Module):
#     def __init__(self, users_count, items_count, embedding_dim):
#         super().__init__()
#         self.user_embeddings = nn.Embedding(users_count, embedding_dim)
#         self.item_embeddings = nn.Embedding(items_count, embedding_dim)
#         self.output_layer = nn.Linear(FINAL_USER_BATCH_SIZE, 1)
        
#     def forward(self, users, items):
#         user_embedding = self.user_embeddings(users)
#         item_embeddings = self.item_embeddings(items)
#         mat_mul = torch.matmul(user_embedding, item_embeddings.T)
# #         pad = FINAL_USER_BATCH_SIZE - mat_mul.shape[1]
# #         if pad != 0:
# #             print('MATEMATICA')
# #             mat_mul =  torch.cat((mat_mul, torch.zeros(mat_mul.shape[0], pad)), 1)

# #         output_padded = torch.cat((mat_mul, torch.zeros()), 1)
#         output = torch.sigmoid(self.output_layer(mat_mul))
#         return output
    
#     def get_user_embedding_layer(user):
#         return self.user_embeddings
    
#     def get_movie_embedding_layer(user):
#         return self.user_embeddings

In [8]:
# n_factors = 50
# max_rating = max(np_labels)
# min_rating = min(np_labels)

# def get_emb(ni,nf):
#     e = nn.Embedding(ni, nf)
#     e.weight.data.uniform_(-0.01,0.01)
#     #e.weight.data.normal_(0,0.003)

#     return e

# class UserMovieEmbedding(nn.Module):
#     def __init__(self, n_users, n_movies, nh = 10, p1 = 0.05, p2= 0.5):
#         super().__init__()
#         (self.u, self.m, self.ub, self.mb) = [get_emb(*o) for o in [
#             (n_users, n_factors), (n_movies, n_factors),
#             (n_users,1), (n_movies,1)
#         ]]
        
#         self.lin1 = nn.Linear(n_factors*2, nh)  # bias is True by default
#         self.lin2 = nn.Linear(nh, 1)
#         self.drop1 = nn.Dropout(p = p1)
#         self.drop2 = nn.Dropout(p = p2)
    
#     def forward(self, users, movies): # forward pass i.e.  dot product of vector from movie embedding matrixx
#                                     # and vector from user embeddings matrix
        
#         # torch.cat : concatenates both embedding matrix to make more columns, same rows i.e. n_factors*2, n : rows
#         # u(users) is doing lookup for indexed mentioned in users
#         # users has indexes to lookup in embedding matrix. 
        
# #         users,movies = cats[:,0],cats[:,1]
#         u2,m2 = self.u(users) , self.m(movies)
       
#         x = self.drop1(torch.cat((u2,m2), 1)) # drop initialized weights
#         x = self.drop2(F.relu(self.lin1(x))) # drop 1st linear + nonlinear wt
#         r = torch.sigmoid(self.lin2(x)) * (max_rating - min_rating) + min_rating               
#         return r

In [9]:
# Ids are used directly as embedding row indices, so each table needs
# (largest id + 1) rows.
model = UserMovieEmbedding(
    n_users=np_users.max() + 1,
    n_movies=np_items.max() + 1,
    n_factors=EMBEDDING_SIZE,
)
# model.load_state_dict(torch.load('saved_model_torch'))

In [10]:
# Cast every column to int32, then keep only the "positive" interactions:
# rows rated below 4 are dropped, after which the Rating column is removed.
modified_user_movie_rating_df = user_movie_rating_df.apply(np.int32)
index_names = modified_user_movie_rating_df[modified_user_movie_rating_df['Rating']<4].index
modified_user_movie_rating_df = modified_user_movie_rating_df.drop(index_names)
modified_user_movie_rating_df = modified_user_movie_rating_df.drop('Rating', axis=1)
# One row per positive interaction; the first two columns are UserId and MovieId.
u_m_pairs = modified_user_movie_rating_df.to_numpy()

# user id -> list of movie ids that user rated positively. Every id from 1 up to
# the largest user id gets an entry, so users without positives map to [].
positive_user_movie_dict = {
    u: [] for u in range(1, int(modified_user_movie_rating_df['UserId'].max()) + 1)
}
# Walk the two id columns directly instead of iterrows(): this avoids building
# a Series per row, avoids positional integer indexing into a label-indexed
# Series (deprecated in pandas 2.x), and no longer rebinds the name `data`,
# which holds the loaded Dataset.
for user_id, movie_id in zip(
    modified_user_movie_rating_df['UserId'].to_numpy(),
    modified_user_movie_rating_df['MovieId'].to_numpy(),
):
    positive_user_movie_dict[user_id].append(movie_id)

In [11]:
def generate_user_movie_batch(positive_pairs, batch_size, negative_ratio=0.5,
                              positive_lookup=None, user_id_bound=None, movie_id_bound=None):
    """Endlessly yield shuffled (users, movies, labels) training batches.

    Every batch holds ``batch_size - int(batch_size * negative_ratio)`` rows
    sampled with replacement from ``positive_pairs`` (label 1). The remaining
    rows are random (user, movie) pairs that are absent from
    ``positive_lookup`` (label 0), found by rejection sampling.

    Args:
        positive_pairs: 2-D array whose first two columns are user id and
            movie id; any further columns are ignored.
        batch_size: number of rows in every yielded batch.
        negative_ratio: share of each batch filled with negatives, in [0, 1].
        positive_lookup: mapping user id -> container of that user's positive
            movie ids. Defaults to the module-level ``positive_user_movie_dict``.
        user_id_bound: exclusive upper bound for random user ids. Defaults to
            the largest UserId in ``modified_user_movie_rating_df`` plus one.
        movie_id_bound: exclusive upper bound for random movie ids. Defaults to
            the largest MovieId in ``modified_user_movie_rating_df`` plus one.

    Yields:
        Three 1-D int64 tensors of length ``batch_size``: user ids, movie ids
        and 0/1 labels, row-aligned.

    Raises:
        ValueError: if ``negative_ratio`` lies outside [0, 1].

    Note:
        Random ids start at 1. The negative search never terminates if every
        (user, movie) combination in range is a positive.
    """
    if not 0 <= negative_ratio <= 1:
        raise ValueError(f"negative_ratio must be within [0, 1], got {negative_ratio!r}")

    # Fall back to the notebook-level structures when the caller passes nothing,
    # which is exactly what the original implementation always used.
    if positive_lookup is None:
        positive_lookup = positive_user_movie_dict
    if user_id_bound is None:
        user_id_bound = int(modified_user_movie_rating_df['UserId'].max()) + 1
    if movie_id_bound is None:
        movie_id_bound = int(modified_user_movie_rating_df['MovieId'].max()) + 1

    positive_pairs = np.asarray(positive_pairs)
    positive_batch_size = batch_size - int(batch_size*negative_ratio)

    while True:
        # Integer buffer: the ids end up as embedding indices, so there is no
        # reason to round-trip them through floats.
        batch = np.zeros((batch_size, 3), dtype=np.int64)

        # Positives: sample rows with replacement and label them 1.
        idx = np.random.choice(len(positive_pairs), positive_batch_size)
        batch[:positive_batch_size, :2] = positive_pairs[idx][:, :2]
        batch[:positive_batch_size, 2] = 1

        # Negatives: keep drawing random pairs until the batch is full. The fill
        # position is tracked explicitly, so a batch with zero positives works
        # (the old code relied on a loop variable that was then unbound).
        filled = positive_batch_size
        while filled < batch_size:
            u = np.random.randint(1, user_id_bound)
            m = np.random.randint(1, movie_id_bound)
            if m not in positive_lookup.get(u, ()):
                batch[filled] = (u, m, 0)
                filled += 1

        np.random.shuffle(batch)
        # torch.tensor copies, so each yielded tensor owns contiguous storage.
        yield (torch.tensor(batch[:, 0], dtype=torch.long),
               torch.tensor(batch[:, 1], dtype=torch.long),
               torch.tensor(batch[:, 2], dtype=torch.long))

In [12]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam at
# its default settings over all model parameters.
bce = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters())

def train_step(users, items, labels):
    """Run one optimisation step on a single batch and return its loss.

    Uses the module-level ``model``, ``optimizer`` and ``bce`` objects.

    Args:
        users: index tensor of user ids for the batch.
        items: index tensor of movie ids, aligned with ``users``.
        labels: target tensor aligned with ``users``; it is cast to float
            before the loss is computed.

    Returns:
        The batch loss as a Python float.
    """
    optimizer.zero_grad()
    # The model output is flattened so it lines up with the 1-D targets.
    scores = model(users, items).flatten()
    targets = labels.float()
    batch_loss = bce(scores, targets)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item()

In [13]:
# Mean training loss of every completed epoch (previously created but never filled).
losses = []
us, mo, _ = next(generate_user_movie_batch(u_m_pairs, 100))

for epoch in range(MAX_EPOCH):
    running_loss = 0.0
    # The batch size grows linearly with the epoch number until it reaches the cap.
    batch_size = min(INIT_USER_BATCH_SIZE * (epoch+1), FINAL_USER_BATCH_SIZE)
    # A fresh generator per epoch, because the batch size may have changed.
    test_generator = generate_user_movie_batch(u_m_pairs, batch_size)
    # Number of optimisation steps is derived from the size of the full rating table.
    steps_count = len(user_movie_rating_df)//batch_size
    for step in range(steps_count):
        # embedding layer update
        u_batch, m_batch, u_m_label_batch = next(test_generator)
        loss = train_step(u_batch, m_batch, u_m_label_batch)
        running_loss += loss
    epoch_loss = running_loss/steps_count
    losses.append(epoch_loss)
    print(f'{epoch} epoch, Loss: {epoch_loss}')
    running_loss = 0.0
    # Checkpoint after every epoch so an interrupted run keeps its progress.
    torch.save(model.state_dict(), 'saved_concat_model')

#     test_losses.append(test_train_loss.result())

0 epoch, Loss: 0.4140145425212207
1 epoch, Loss: 0.3672398751592401
2 epoch, Loss: 0.3623041728028545
3 epoch, Loss: 0.3609416764534888
4 epoch, Loss: 0.3602478338069603
5 epoch, Loss: 0.3580698673842383
6 epoch, Loss: 0.35705869632666226
7 epoch, Loss: 0.3577936437042033
8 epoch, Loss: 0.3576170276911532
9 epoch, Loss: 0.3569568296680685
10 epoch, Loss: 0.3574908880181
11 epoch, Loss: 0.35739025070530467
12 epoch, Loss: 0.3555609909970252
13 epoch, Loss: 0.35650846484254617
14 epoch, Loss: 0.3556248411291935
15 epoch, Loss: 0.35582328013709336
16 epoch, Loss: 0.3547584352190377
17 epoch, Loss: 0.35494713729522265
18 epoch, Loss: 0.35361618414276935
19 epoch, Loss: 0.35364954271277443
20 epoch, Loss: 0.3523416558250052
21 epoch, Loss: 0.35402903461554014
22 epoch, Loss: 0.35138327078741105
23 epoch, Loss: 0.3509424330025423
24 epoch, Loss: 0.3495912255078066
25 epoch, Loss: 0.34875556276958497
26 epoch, Loss: 0.34688813630186144
27 epoch, Loss: 0.3462115843765071
28 epoch, Loss: 0.3459

In [14]:
from datetime import datetime

# Print the wall-clock time again (HH:MM:SS) now that training has finished.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 03:17:39


In [15]:
# Rebuild an identically shaped model and restore the weights saved by the
# training loop. The load call stays the last expression so its result is shown.
model = UserMovieEmbedding(
    n_users=np_users.max() + 1,
    n_movies=np_items.max() + 1,
    n_factors=EMBEDDING_SIZE,
)
model.load_state_dict(state_dict=torch.load('saved_concat_model'))

<All keys matched successfully>