In [1]:
import copy
import pickle
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

In [3]:
# Load the ratings table plus the two pickled lookup tables that the
# re-indexing cell uses to translate raw movie/user ids.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index; `index_col=0` may be
# what is wanted here — confirm.
ratings = pd.read_csv('data/ratings_new.csv')
# pickle.load can execute arbitrary code: only load mapping files that come
# from a trusted source.
with open('./pkl/movie_to_index.pkl', 'rb') as movie_mapping:
    movie_to_index = pickle.load(movie_mapping)
with open('./pkl/user_to_index.pkl', 'rb') as user_mapping:
    user_to_index = pickle.load(user_mapping)

In [4]:
# Movie metadata table. No other cell shown in this notebook reads this frame.
movies = pd.read_csv('data/movies_new.csv')

In [5]:
# Preview the ratings as loaded, before userId/movieId are re-indexed.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,3826,2.0,1256677210
1,1,3893,3.5,1256677486
2,4,3190,3.5,1113766176
3,4,3298,4.5,1113766820
4,4,3300,3.5,1113766824


In [6]:
# Translate raw movie/user ids into the indices stored in the pickled lookup
# tables; an id missing from a table raises KeyError.
# The columns are overwritten in place, so re-running this cell on the already
# mapped frame would look the index values up as if they were raw ids.
ratings['movieId'] = ratings['movieId'].map(lambda raw_id: movie_to_index[raw_id])
ratings['userId'] = ratings['userId'].map(lambda raw_id: user_to_index[raw_id])
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.0,1256677210
1,0,1,3.5,1256677486
2,1,2,3.5,1113766176
3,1,3,4.5,1113766820
4,1,4,3.5,1113766824


In [14]:
# Counts of distinct users/movies (passed to EmbeddingModel as table sizes)
# and the observed rating range used to scale the target.
# NOTE(review): sizing embedding tables with nunique() presumes the mapped ids
# are contiguous 0..n-1 — confirm against how the lookup tables were built.
n_users = int(ratings['userId'].nunique())
n_movies = int(ratings['movieId'].nunique())
# Not passed to EmbeddingModel where it is instantiated; the model falls back
# on its own default of the same value.
n_factors = 100
min_rating = ratings['rating'].min()
max_rating = ratings['rating'].max()

In [15]:
# One-line report of the table sizes and rating bounds.
print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}"
    .format(n_users, n_movies, min_rating, max_rating)
)

Number of users: 85758, Number of Movies: 14418, Min rating: 0.5, Max rating: 5.0


In [22]:
# Shuffle the ratings, scale the target, and carve out train/val/test splits.
df = ratings.sample(frac=1, random_state=42)
X = df[['userId', 'movieId']]
# Min-max scale ratings to [0, 1] so they match the model's sigmoid output.
# Vectorised Series arithmetic replaces the row-by-row `.apply(lambda ...)`.
y = (df["rating"] - min_rating) / (max_rating - min_rating)
# 20% is held out for test; 25% of the remaining 80% becomes validation,
# i.e. a 60/20/20 train/val/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
# Sample counts per phase; train_model divides the accumulated loss by these.
dataset_sizes = {'train': len(X_train), 'val': len(X_val)}

In [26]:
class MovieDataset(Dataset):
    """Index-aligned view over three parallel sequences: users, movies, ratings.

    Item ``i`` is the tuple ``(users[i], movies[i], ratings[i])``; the dataset
    length is the length of ``users``.
    """

    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        # Only `users` is measured; the other two sequences are not checked.
        return len(self.users)

    def __getitem__(self, idx):
        user = self.users[idx]
        movie = self.movies[idx]
        rating = self.ratings[idx]
        return user, movie, rating

In [33]:
# Wrap each split's id columns and scaled targets in a MovieDataset.
train_data = MovieDataset(X_train['userId'].values, X_train['movieId'].values, y_train.values)
valid_data = MovieDataset(X_val['userId'].values, X_val['movieId'].values, y_val.values)
test_data = MovieDataset(X_test['userId'].values, X_test['movieId'].values, y_test.values)
# Only the train and validation splits get loaders; train_model looks them up
# by phase name.
# NOTE(review): batch_size=4 and shuffle=True apply to both loaders, including
# validation — confirm that is intended.
datasets = dict(train=train_data, val=valid_data)
dataloaders = {
    split: DataLoader(datasets[split], batch_size=4, shuffle=True, num_workers=2)
    for split in ('train', 'val')
}


In [37]:
class EmbeddingModel(nn.Module):
    """Feed-forward rating predictor over concatenated user/movie embeddings.

    Each user id and movie id is looked up in its own embedding table. The two
    vectors are concatenated and passed through two ReLU-activated linear
    layers (each followed by dropout) and a final single-unit linear layer
    with a sigmoid, so the last output dimension has size 1.

    Args:
        num_users: number of rows in the user embedding table.
        num_movies: number of rows in the movie embedding table.
        n_factors: width of each embedding vector.
        embedding_dropout: dropout probability applied to the concatenated
            embeddings.
        hidden: accepted by the constructor but not used by this class.
        dropouts: dropout probability applied after each hidden layer.
    """

    def __init__(self, num_users, num_movies, n_factors=100,
                 embedding_dropout=0.02, hidden=10, dropouts=0.2):
        super().__init__()
        # Lookup tables, one per id type.
        self.user_embed = nn.Embedding(num_users, n_factors)
        self.movie_embed = nn.Embedding(num_movies, n_factors)
        # Regularisation layers.
        self.drop_embedding = nn.Dropout(embedding_dropout)
        self.drop = nn.Dropout(dropouts)
        # Dense head: (2 * n_factors) -> 512 -> 64 -> 1.
        self.fc1 = nn.Linear(2 * n_factors, 512)
        self.fc2 = nn.Linear(512, 64)
        self.output = nn.Linear(64, 1)

    def forward(self, users, movies):
        """Return sigmoid-activated predictions for index tensors `users` and `movies`."""
        features = torch.cat((self.user_embed(users), self.movie_embed(movies)), dim=-1)
        features = self.drop_embedding(features)
        hidden_1 = self.drop(torch.relu(self.fc1(features)))
        hidden_2 = self.drop(torch.relu(self.fc2(hidden_1)))
        return torch.sigmoid(self.output(hidden_2))

In [38]:
# Use the first CUDA device when one is available, otherwise the CPU, then
# build the model with default hyper-parameters and move it to that device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = EmbeddingModel(n_users, n_movies).to(device)
print(model)

In [39]:
# Show the layer summary (repeats the print that ends the previous cell).
print(model)

EmbeddingModel(
  (user_embed): Embedding(85758, 100)
  (movie_embed): Embedding(14418, 100)
  (drop_embedding): Dropout(p=0.02, inplace=False)
  (drop): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=200, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=64, bias=True)
  (output): Linear(in_features=64, out_features=1, bias=True)
)


In [40]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25, patience=10):
    """Train `model` with per-epoch validation, early stopping and best-weight restore.

    Every epoch runs a training pass followed by a validation pass over the
    notebook-level `dataloaders` dict, moving each batch to the notebook-level
    `device`. Epoch losses are normalised with the notebook-level
    `dataset_sizes`. The weights with the lowest validation loss are kept and
    loaded back into `model` before it is returned.

    Args:
        model: module called as ``model(user, movie)``.
        criterion: loss callable taking ``(prediction, target)``.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the
            training pass.
        num_epochs: maximum number of epochs to run.
        patience: number of consecutive epochs without a lower validation
            loss after which training stops early.

    Returns:
        `model`, with the best-validation-loss weights loaded.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    no_improvements = 0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()  # enables dropout
            else:
                model.eval()   # disables dropout

            running_loss = 0.0

            for user, movie, target in dataloaders[phase]:
                user = user.to(device)
                movie = movie.to(device)
                target = target.to(device).float()

                optimizer.zero_grad()

                # Track gradients only during the training phase.
                with torch.set_grad_enabled(is_train):
                    output = model(user, movie).float()
                    # A (B, 1) prediction against a (B,) target would broadcast
                    # to (B, B) inside an element-wise loss; align the target
                    # with the prediction when both hold the same number of
                    # elements.
                    if output.shape != target.shape and output.numel() == target.numel():
                        target = target.reshape(output.shape)
                    loss = criterion(output, target)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Weight the batch loss by the batch size so that dividing by
                # the sample count below gives a per-sample mean. Assumes
                # `criterion` returns the batch mean (the default
                # reduction='mean' of torch.nn losses) — confirm for custom
                # criteria.
                running_loss += loss.item() * target.size(0)
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]

            print('{} Loss: {:.4f} '.format(
                phase, epoch_loss))

            # Early-stopping bookkeeping happens only on validation results;
            # the training phase must not consume patience.
            if not is_train:
                if epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
                    no_improvements = 0
                else:
                    no_improvements += 1

        print()
        if no_improvements >= patience:
            print('early stopping after epoch {epoch:03d}'.format(epoch=epoch))
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val loss: {:4f}'.format(best_loss))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
# NOTE(review): `criterion` and `optimizer` are not defined in any cell visible
# in this notebook, so this cell raised NameError on a fresh kernel. Mean
# squared error (the targets are min-max scaled and the model ends in a
# sigmoid) and Adam with its default learning rate are assumed here — confirm
# against the settings used for the original run.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters())
# Multiply the learning rate by 0.1 after every 7 scheduler steps
# (train_model steps the scheduler once per epoch).
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
model_ft = train_model(model, criterion, optimizer, exp_lr_scheduler,
                       num_epochs=10)