### Loading The Dataset

In [1]:
import pandas as pd

In [2]:
# Column names for ratings.dat; passing `names=` also tells pandas that the
# file has no header row.
ratings_columns = ['userId', 'movieId', 'rating', 'timestamp']
# A multi-character delimiter such as '::' is interpreted as a regular
# expression, which is only supported by the (slower) Python parsing engine.
ratings_df = pd.read_csv('./ml-1m/ratings.dat', names=ratings_columns, delimiter='::', engine='python')

In [3]:
# Keep only userId, movieId and rating; the timestamp column is discarded.
ratings_df = ratings_df.drop('timestamp', axis=1)

In [4]:
# Shift both id columns down by one so they can be used directly as 0-based
# array / embedding indices (presumably the raw ids start at 1 -- verify
# against the dataset README).
# NOTE: this cell is not idempotent -- re-running it shifts the ids again.
ratings_df.userId = ratings_df.userId - 1
ratings_df.movieId = ratings_df.movieId - 1

In [5]:
# Preview the first rows: three columns remain after the timestamp was dropped.
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,0,1192,5
1,0,660,3
2,0,913,3
3,0,3407,4
4,0,2354,5


### Stats

In [108]:
from collections import Counter
import numpy as np

In [109]:
# Sizes of the user and movie id spaces. They size the genre matrix built by
# genres_to_onehot and the embedding tables of the model.
# NOTE(review): hard-coded -- presumably the largest raw ids in the dataset;
# update them if a different dataset is used.
num_users = 6040
num_movies = 3952

In [110]:
# Histogram of rating values. Series.tolist() yields plain Python ints, so the
# Counter prints as {4: ..., 3: ...} regardless of the installed NumPy version.
labels = Counter(ratings_df['rating'].tolist())
print(labels)

Counter({4: 348971, 3: 261197, 5: 226310, 2: 107557, 1: 56174})


### Loading Movie Genres

In [111]:
# Column names for movies.dat; passing `names=` also tells pandas that the
# file has no header row.
movies_columns = ['movieId', 'title', 'genres']
# The '::' delimiter is treated as a regular expression and therefore needs
# the Python engine. The encoding is given explicitly because some titles
# contain accented characters stored as Latin-1, which do not decode as UTF-8.
movies_df = pd.read_csv('./ml-1m/movies.dat', names=movies_columns, delimiter='::',
                        engine='python', encoding='latin-1')

In [112]:
# Same shift-by-one as applied to ratings_df.movieId, so both frames use the
# same 0-based movie index.
# NOTE: not idempotent -- re-running this cell shifts the ids again.
movies_df.movieId = movies_df.movieId - 1

In [113]:
# Fixed genre vocabulary. The position of a genre in this list is its column
# index in the matrix produced by genres_to_onehot.
all_genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
              "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
              "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

In [114]:
def genres_to_onehot(x):
    """Build a multi-hot genre matrix from the movies frame.

    Args:
        x: DataFrame whose first column holds the 0-based movie id and whose
            third column holds a '|'-separated genre string.

    Returns:
        Float array of shape (num_movies, len(all_genres)). Entry [m, g] is 1
        when movie ``m`` is tagged with ``all_genres[g]`` and 0 otherwise.
        Rows for ids that never appear in ``x`` stay all-zero.
    """
    onehot = np.zeros((num_movies, len(all_genres)))

    for row in x.to_numpy():
        tagged = row[2].split('|')
        # Fill the whole row at once, one flag per genre in vocabulary order.
        onehot[row[0]] = [int(name in tagged) for name in all_genres]

    return onehot

In [115]:
# Multi-hot genre matrix with one row per (0-based) movie id.
movies_genres = genres_to_onehot(movies_df)

### Loading Users' Gender & Age

In [116]:
from sklearn.preprocessing import OneHotEncoder

In [117]:
# Column names for users.dat; passing `names=` also tells pandas that the
# file has no header row.
users_columns = ['userId', 'gender', 'age', 'occupation', 'zip_code']
# '::' is a multi-character (regex) delimiter, hence the Python engine.
users_df = pd.read_csv('./ml-1m/users.dat', names=users_columns, delimiter='::', engine='python')

In [118]:
# Same shift-by-one as applied to ratings_df.userId.
# NOTE: not idempotent -- re-running this cell shifts the ids again.
users_df.userId = users_df.userId - 1

In [119]:
# Encode gender as integer category codes, one entry per row of users_df.
# NOTE(review): the array is later indexed by userId (see MovieLensDataset),
# which assumes row i of users_df belongs to user i -- TODO confirm ordering.
users_gender = users_df.gender.astype('category').cat.codes.to_numpy()

In [120]:
# One-hot encode the age column: one output column per distinct age value.
age_encoder = OneHotEncoder()
age_encoder.fit(users_df[['age']])
# .toarray() converts the encoder's sparse result into a dense NumPy array.
users_age = age_encoder.transform(users_df[['age']]).toarray()

### Train-Validation Split

In [121]:
from sklearn.model_selection import train_test_split

In [122]:
# Split the ratings frame into three position-aligned arrays.
data = ratings_df.to_numpy()
# Columns 0 and 1 are the 0-based user and movie indices.
users = data[:, 0].astype(np.int32)
movies = data[:, 1].astype(np.int32)
# Column 2 is the rating value (the regression target).
ratings = data[:, 2]

In [123]:
# Hold out 10% of the ratings for validation. All three arrays are split with
# the same shuffled indices, so they stay aligned; random_state fixes the split.
users_train, users_val, movies_train, movies_val, ratings_train, ratings_val = train_test_split(
    users, movies, ratings, test_size=0.1, random_state=0)

### Creating PyTorch Datasets

In [29]:
import torch as T
from torch.utils.data import DataLoader, Dataset

In [30]:
class MovieLensDataset(Dataset):
    """Dataset yielding one (user, movie, rating) sample with side features.

    ``users``, ``movies`` and ``ratings`` are position-aligned per-rating
    sequences. ``genders`` and ``ages`` are lookup tables indexed by user id,
    and ``genres`` is a lookup table indexed by movie id.
    """

    def __init__(self, users, genders, ages, movies, genres, ratings):
        # Per-rating sequences.
        self.users, self.movies, self.ratings = users, movies, ratings
        # Lookup tables keyed by user id / movie id.
        self.genders, self.ages = genders, ages
        self.genres = genres

    def __len__(self):
        """Number of rating samples."""
        return len(self.users)

    def __getitem__(self, index):
        """Return (user, gender, age, movie, genre, rating) tensors for one sample.

        All tensors are integer typed except the rating, which is float32.
        The gender is wrapped into a length-1 vector so that it can be
        concatenated with the other feature vectors.
        """
        user_id = self.users[index]
        movie_id = self.movies[index]
        fields = (
            (user_id, T.int),
            (np.expand_dims(self.genders[user_id], axis=0), T.int),
            (self.ages[user_id], T.int),
            (movie_id, T.int),
            (self.genres[movie_id], T.int),
            (self.ratings[index], T.float32),
        )
        return tuple(T.tensor(value, dtype=kind) for value, kind in fields)

In [31]:
def get_loader(users, genders, ages, movies, genres, ratings, batch_size=32, shuffle=True):
    """Wrap the given arrays in a MovieLensDataset and return a DataLoader over it.

    Args:
        users, movies, ratings: position-aligned per-rating sequences.
        genders, ages: lookup tables indexed by user id.
        genres: lookup table indexed by movie id.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the samples on each pass.
    """
    return DataLoader(
        dataset=MovieLensDataset(users, genders, ages, movies, genres, ratings),
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [32]:
# Training batches of 2000 samples; shuffle is left at get_loader's default (True).
train_loader = get_loader(users_train, users_gender, users_age, movies_train,
                          movies_genres, ratings_train, batch_size=2000)

In [33]:
# Validation batches are NOT shuffled: predictions collected from this loader
# are later indexed with positions computed from users_val (see recom_user),
# so the loader order must match the order of the *_val arrays.
val_loader = get_loader(users_val, users_gender, users_age, movies_val,
                        movies_genres, ratings_val, batch_size=2000, shuffle=False)

### Model

In [34]:
import torch.nn as nn

In [35]:
class RatingPredictor(nn.Module):
    """Rating regressor built from user/movie embeddings plus side features.

    The user and movie embeddings are concatenated with the gender, age and
    genre feature vectors and passed through a three-layer MLP whose sigmoid
    output is scaled by 6, so predictions lie between 0 and 6.
    """

    def __init__(self, n_users, n_genders, n_ages, n_movies, user_emb_dim, movie_emb_dim, n_genres):
        super().__init__()

        # Registration order matters: user_emb, movie_emb, then fc.
        self.user_emb = nn.Embedding(n_users, user_emb_dim)
        self.movie_emb = nn.Embedding(n_movies, movie_emb_dim)

        # Width of the concatenated feature vector fed to the MLP.
        in_features = user_emb_dim + n_genders + n_ages + movie_emb_dim + n_genres
        layers = [
            nn.Linear(in_features, 200),
            nn.ReLU(),
            nn.Linear(200, 100),
            nn.ReLU(),
            nn.Linear(100, 1),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, users, genders, ages, movies, genres):
        """Return one predicted rating per sample, shape (batch, 1)."""
        features = T.cat(
            (self.user_emb(users), genders, ages, self.movie_emb(movies), genres),
            dim=1,
        )
        return 6 * self.fc(features)

In [77]:
# Instantiate the network. Gender is a single integer code per user, hence
# n_genders=1; the age and genre widths are taken from the feature tables.
# NOTE(review): no random seed is set before this point, so the initial
# weights (and therefore the training results) differ from run to run.
model = RatingPredictor(
    n_users=num_users,
    n_genders=1,
    n_ages=users_age.shape[1],
    n_movies=num_movies,
    user_emb_dim=100,
    movie_emb_dim=100,
    n_genres=movies_genres.shape[1]
)

In [39]:
# Sanity check: the values passed to the RatingPredictor constructor.
print((num_users, 1, users_age.shape[1], num_movies, 100, 100, movies_genres.shape[1]))

(6040, 1, 7, 3952, 100, 100, 18)


### Training

In [78]:
import torch.optim as optim
import time

In [79]:
# Separate optimizers (and schedulers) for the embedding tables and for the
# fully connected head. The parameters are selected by submodule rather than
# by position in model.parameters(), so the grouping stays correct even if
# layers are added to or reordered inside the model.
emb_optimizer = optim.AdamW(
    [*model.user_emb.parameters(), *model.movie_emb.parameters()], weight_decay=1)
fc_optimizer = optim.AdamW(model.fc.parameters(), weight_decay=1)
# Multiply each learning rate by 0.1 once the scheduler has been stepped 8 times.
emb_scheduler = optim.lr_scheduler.MultiStepLR(emb_optimizer, milestones=[8], gamma=0.1)
fc_scheduler = optim.lr_scheduler.MultiStepLR(fc_optimizer, milestones=[8], gamma=0.1)
# Mean squared error between predicted and true ratings.
criterion = nn.MSELoss()

In [80]:
# Prefer the GPU when one is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without CUDA.
device = 'cuda' if T.cuda.is_available() else 'cpu'
criterion = criterion.to(device)
model = model.to(device)

In [81]:
def train(model, iterator, emb_optimizer, fc_optimizer, criterion):
    """Run one training epoch and return the average loss per sample.

    Args:
        model: module called as ``model(users, genders, ages, movies, genres)``
            and expected to return predictions of shape (batch, 1). It must
            have at least one parameter; batches are moved to that
            parameter's device.
        iterator: iterable of 6-tuples ``(users, genders, ages, movies,
            genres, ratings)`` exposing a ``dataset`` attribute with a length.
        emb_optimizer, fc_optimizer: optimizers stepped after every batch.
        criterion: loss taking ``(predictions, ratings)``.

    Returns:
        Sum of ``batch_loss * batch_size`` divided by ``len(iterator.dataset)``
        (the per-sample mean when the criterion averages over the batch).
    """
    # Use the device the model lives on instead of a notebook-global variable.
    device = next(model.parameters()).device

    epoch_loss = 0

    model.train()

    for users, genders, ages, movies, genres, ratings in iterator:

        emb_optimizer.zero_grad()
        fc_optimizer.zero_grad()

        users = users.to(device)
        genders = genders.to(device)
        ages = ages.to(device)
        movies = movies.to(device)
        genres = genres.to(device)
        ratings = ratings.to(device)

        # Drop only the trailing size-1 axis. A bare squeeze() would also
        # remove the batch axis when a batch holds a single sample, leaving a
        # 0-d prediction that no longer matches the shape of `ratings`.
        predictions = model(users, genders, ages, movies, genres).squeeze(-1)
        loss = criterion(predictions, ratings)

        loss.backward()

        emb_optimizer.step()
        fc_optimizer.step()

        # Weight by batch size so a smaller final batch is not over-counted.
        epoch_loss += loss.item() * len(users)

    return epoch_loss / len(iterator.dataset)

In [82]:
def evaluate(model, iterator, criterion):
    """Evaluate the model without gradient tracking.

    Predictions are clipped to the range [1, 5] before the loss and the RMSE
    are computed.

    Args:
        model: module called as ``model(users, genders, ages, movies, genres)``
            and expected to return predictions of shape (batch, 1). It must
            have at least one parameter; batches are moved to that
            parameter's device.
        iterator: iterable of 6-tuples ``(users, genders, ages, movies,
            genres, ratings)`` exposing a ``dataset`` attribute with a length.
        criterion: loss taking ``(predictions, ratings)``.

    Returns:
        Tuple ``(loss, rmse)``: the batch-size-weighted loss divided by
        ``len(iterator.dataset)``, and the root mean squared error over all
        clipped predictions.
    """
    # Use the device the model lives on instead of a notebook-global variable.
    device = next(model.parameters()).device

    epoch_loss = 0

    model.eval()

    labels = []
    preds = []

    with T.no_grad():

        for users, genders, ages, movies, genres, ratings in iterator:

            users = users.to(device)
            genders = genders.to(device)
            ages = ages.to(device)
            movies = movies.to(device)
            genres = genres.to(device)
            ratings = ratings.to(device)

            # Drop only the trailing size-1 axis. With a bare squeeze() a
            # one-sample batch becomes 0-d, tolist() then returns a float and
            # `preds += <float>` raises TypeError.
            predictions = model(users, genders, ages, movies, genres).squeeze(-1)
            predictions = T.clip(predictions, min=1.0, max=5.0)
            loss = criterion(predictions, ratings)

            preds += predictions.tolist()
            labels += ratings.tolist()

            # Weight by batch size so a smaller final batch is not over-counted.
            epoch_loss += loss.item() * len(users)

    labels = np.asarray(labels).ravel()
    preds = np.asarray(preds).ravel()
    rmse = np.sqrt(np.mean((preds - labels)**2))

    return epoch_loss / len(iterator.dataset), rmse

In [83]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated toward zero.
    """
    total = end_time - start_time
    minutes = int(total / 60)
    # Whatever is left after removing the whole minutes, truncated to an int.
    seconds = int(total - (minutes * 60))
    return minutes, seconds

In [84]:
# Number of full passes over the training set.
N_EPOCHS = 10

for epoch in range(N_EPOCHS):

    print(f'Epoch: {epoch+1}')
    
    start_time = time.time()
    
    # One optimisation pass over the training data, then a validation pass.
    train_loss = train(model, train_loader, emb_optimizer, fc_optimizer, criterion)
    valid_loss, valid_rmse = evaluate(model, val_loader, criterion)
    
    end_time = time.time()
    
    # The schedulers are stepped once per epoch, after the optimizers have run.
    emb_scheduler.step()
    fc_scheduler.step()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
       
    print(f'Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'    Train Loss: {train_loss:.3f}')
    print(f'     Val. Loss: {valid_loss:.3f} | Val. RMSE: {valid_rmse:.3f}')
    print()

Epoch: 1
Epoch Time: 0m 49s
    Train Loss: 0.980
     Val. Loss: 0.874 | Val. RMSE: 0.935

Epoch: 2
Epoch Time: 0m 50s
    Train Loss: 0.836
     Val. Loss: 0.831 | Val. RMSE: 0.912

Epoch: 3
Epoch Time: 0m 53s
    Train Loss: 0.805
     Val. Loss: 0.812 | Val. RMSE: 0.901

Epoch: 4
Epoch Time: 0m 51s
    Train Loss: 0.782
     Val. Loss: 0.795 | Val. RMSE: 0.891

Epoch: 5
Epoch Time: 0m 49s
    Train Loss: 0.753
     Val. Loss: 0.768 | Val. RMSE: 0.877

Epoch: 6
Epoch Time: 0m 52s
    Train Loss: 0.724
     Val. Loss: 0.757 | Val. RMSE: 0.870

Epoch: 7
Epoch Time: 0m 50s
    Train Loss: 0.705
     Val. Loss: 0.746 | Val. RMSE: 0.864

Epoch: 8
Epoch Time: 0m 50s
    Train Loss: 0.691
     Val. Loss: 0.745 | Val. RMSE: 0.863

Epoch: 9
Epoch Time: 0m 49s
    Train Loss: 0.626
     Val. Loss: 0.722 | Val. RMSE: 0.850

Epoch: 10
Epoch Time: 0m 51s
    Train Loss: 0.604
     Val. Loss: 0.720 | Val. RMSE: 0.848



### Saving The Model

In [45]:
# Persist only the learned weights (the state dict), not the module object.
# NOTE(review): assumes the ./model directory already exists.
T.save(model.state_dict(), './model/recom.pth')

In [126]:
# Store the feature lookup tables next to the weights so that they can be
# reloaded without re-reading the raw .dat files.
np.save('./model/movies_genres.npy', movies_genres)
np.save('./model/users_gender.npy', users_gender)
np.save('./model/users_age.npy', users_age)

### Loading The Model

In [48]:
# Restore the saved weights into the existing model. map_location remaps the
# stored tensors onto the current device, so a checkpoint written on a GPU
# can also be loaded on a CPU-only machine.
model.load_state_dict(T.load('./model/recom.pth', map_location=device))

<All keys matched successfully>

In [128]:
# Reload the feature lookup tables saved alongside the model weights.
movies_genres = np.load('./model/movies_genres.npy')
users_gender = np.load('./model/users_gender.npy')
users_age = np.load('./model/users_age.npy')

### Finding Similar Movies (Those Who Liked X also Liked Y)

In [87]:
def similar_movies(movie_name, k=5):
    """Return the ``k`` movies whose learned embeddings are closest to a title.

    Closeness is the squared Euclidean distance between rows of the model's
    movie embedding table. The query movie itself is part of the result (at
    distance 0).

    Args:
        movie_name: exact value of the ``title`` column in ``movies_df``.
        k: number of neighbours to return.

    Returns:
        List of ``(title, squared_distance)`` tuples ordered from nearest to
        farthest. ``title`` is the string form of the array of matching
        titles, e.g. ``"['Toy Story (1995)']"``.

    Raises:
        ValueError: if ``movie_name`` does not match exactly one movie.
    """
    matches = movies_df.loc[movies_df['title'] == movie_name, 'movieId']
    if len(matches) != 1:
        raise ValueError(
            f'Expected exactly one movie titled {movie_name!r}, found {len(matches)}')
    movie_id = int(matches.iloc[0])

    embs = model.movie_emb.weight.detach().cpu().numpy()
    dists = np.sum((embs - embs[movie_id])**2, axis=1)

    # A stable sort keeps tied distances in ascending id order.
    top_movie_ids = np.argsort(dists, kind='stable')[:k]

    # NOTE(review): embedding rows whose id has no entry in movies_df would
    # show up with an empty title ("[]") -- confirm whether such ids exist.
    return [(str(movies_df.loc[movies_df['movieId'] == top_id, 'title'].values), dists[top_id])
            for top_id in top_movie_ids]

In [90]:
# Example query: the nearest neighbours of one title in embedding space.
similar_movies('Good, The Bad and The Ugly, The (1966)')

[("['Good, The Bad and The Ugly, The (1966)']", 0.0),
 ("['Fistful of Dollars, A (1964)']", 0.122113965),
 ("['Four Days in September (1997)']", 0.12961872),
 ("['For a Few Dollars More (1965)']", 0.13064696),
 ("['Glengarry Glen Ross (1992)']", 0.13305692)]

### Recommending Movies to a Certain User (Based on Our Prediction for Rating)

In [98]:
def predict(model, iterator):
    """Return the model's predictions, clipped to [1, 5], for every sample.

    Args:
        model: module called as ``model(users, genders, ages, movies, genres)``
            and expected to return predictions of shape (batch, 1). It must
            have at least one parameter; batches are moved to that
            parameter's device.
        iterator: iterable of 6-tuples ``(users, genders, ages, movies,
            genres, ratings)``; the ratings are ignored.

    Returns:
        1-D NumPy array of predictions in the order the iterator yields them.
    """
    # Use the device the model lives on instead of a notebook-global variable.
    device = next(model.parameters()).device

    model.eval()
    preds = []

    with T.no_grad():

        for users, genders, ages, movies, genres, _ratings in iterator:

            users = users.to(device)
            genders = genders.to(device)
            ages = ages.to(device)
            movies = movies.to(device)
            genres = genres.to(device)

            # Drop only the trailing size-1 axis. With a bare squeeze() a
            # one-sample batch becomes 0-d, tolist() then returns a float and
            # `preds += <float>` raises TypeError.
            predictions = model(users, genders, ages, movies, genres).squeeze(-1)
            predictions = T.clip(predictions, min=1.0, max=5.0)
            preds += predictions.tolist()

    return np.asarray(preds)

In [101]:
# Predictions for the whole validation set. val_loader is not shuffled, so
# preds[i] lines up with users_val[i], movies_val[i] and ratings_val[i].
# (predict already returns an ndarray; np.array just makes a copy.)
preds = np.array(predict(model, val_loader))

In [134]:
def recom_user(user_id):
    """Rank one user's validation-set movies by predicted rating.

    Args:
        user_id: 0-based user index to look up in ``users_val``.

    Returns:
        List of ``(title, predicted, actual)`` tuples ordered from highest to
        lowest predicted rating. ``title`` is the string form of the array of
        matching titles, ``predicted`` is truncated (not rounded) to two
        decimals and ``actual`` is the rating from ``ratings_val``.
    """
    rows = np.where(users_val == user_id)
    user_preds, real_ratings, movies_id = preds[rows], ratings_val[rows], movies_val[rows]

    # Positions sorted by descending prediction.
    order = sorted(range(len(user_preds)), key=lambda pos: user_preds[pos], reverse=True)
    ranked_preds = user_preds[order]
    ranked_movies = movies_id[order]
    ranked_ratings = real_ratings[order]

    ranking = []
    for pos, top_id in enumerate(ranked_movies):
        title = str(movies_df.loc[movies_df['movieId'] == top_id]['title'].values)
        # Truncate to two decimals for display.
        shown_pred = float(int(ranked_preds[pos]*100))/100
        ranking.append((title, shown_pred, ranked_ratings[pos]))
    return ranking

In [135]:
# Example: ranked validation-set predictions for user index 7, shown next to
# the ratings that user actually gave.
recom_user(7)

[("['Braveheart (1995)']", 4.74, 5),
 ("['Taxi Driver (1976)']", 4.65, 5),
 ("['Shine (1996)']", 4.58, 4),
 ("['Good Will Hunting (1997)']", 4.52, 5),
 ("['Ghost in the Shell (Kokaku kidotai) (1995)']", 4.35, 5),
 ("['Truman Show, The (1998)']", 4.25, 4),
 ("['High Fidelity (2000)']", 4.2, 4),
 ("['Like Water for Chocolate (Como agua para chocolate) (1992)']", 4.14, 4),
 ("['Few Good Men, A (1992)']", 3.95, 3),
 ("['Mask of Zorro, The (1998)']", 3.72, 3),
 ("['Primary Colors (1998)']", 3.66, 2),
 ("['Firm, The (1993)']", 3.35, 3),
 ("['Waiting to Exhale (1995)']", 2.69, 3),
 ("['Street Fighter (1994)']", 2.15, 2)]

### Helpful Sources:
- [Recommendation System Implementation With Deep Learning and PyTorch](https://medium.com/swlh/recommendation-system-implementation-with-deep-learning-and-pytorch-a03ee84a96f4)