In [1]:
import os
import torch
import pickle
import warnings
import numpy as np
import pandas as pd

from torch import nn
from tqdm.notebook import tqdm
from transformers import pipeline, get_scheduler
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from learningLoop import learning_loop
from dataset import MovieDataset, collate_fn
from model import movieModel, create_model_and_optimizer, load_glove_weights

In [3]:
dt = pd.read_csv("data.csv")

In [4]:
userId = dt.userId.unique()

In [5]:
np.random.seed(777)
proportion = 0.8
users_for_train = np.random.choice([1, 0], size=len(userId), p=[proportion, 1 - proportion]).astype(bool)
users_for_train = userId[users_for_train]

In [6]:
users_for_train = set(users_for_train)
train_ind = np.ones((len(dt)))
for i in tqdm(range(len(dt))):
    if dt.userId.to_numpy()[i] not in users_for_train:
        train_ind[i] = -1

  0%|          | 0/7414489 [00:00<?, ?it/s]

In [7]:
dt_for_train = dt.iloc[train_ind > 0]
dt_for_test = dt.iloc[train_ind < 0]

In [8]:
with open('tok_to_ind.pkl', 'rb') as f:
    tok_to_ind = pickle.load(f)

In [9]:
ds_train = MovieDataset(
    dt_for_train, 
    tok_to_ind,
    count_of_tokens = 18,
    size = 16,
    count_with_rating = 4
)
ds_val = MovieDataset(
    dt_for_test, 
    tok_to_ind,
    count_of_tokens = 18,
    size = 16,
    count_with_rating = 4
)

In [10]:
batch_size = 8

dataloader_train = DataLoader(
    dataset=ds_train,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)

dataloader_val = DataLoader(
    dataset=ds_val,
    batch_size=batch_size,
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=False,
)

In [11]:
glove_path = "./glove.840B.300d.txt"
glove_weights, mask_found = load_glove_weights(glove_path, tok_to_ind, "[PAD]")

Loading Glove Weights


  0%|          | 0/2196018 [00:00<?, ?it/s]

515 words from vocab of size 536 loaded!


In [12]:
user_encoder_info = {
    "embedding_word_size": 300,
    "output_embedding_size": 100,
    "token_count": 18,
    "movie_count": 16,
    "num_layers": 3
}
movie_encoder_info = {
    "input_embedding_size": 100,
    "output_embedding_size": 100,
    "user_count": 8,
    "num_layers": 3
}
params = {
    "user_enc_info": user_encoder_info,
    "movie_enc_info": movie_encoder_info,
    "tok_to_ind": tok_to_ind,
    "glove_weights": glove_weights
}

In [14]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
model, optimizer = create_model_and_optimizer(
    model_class = movieModel,
    model_params = params,
    device = device,
)
    
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.25, patience=4, threshold=0.001, verbose=True)
   

model, optimizer, losses, metrics = learning_loop(
    model = model,
    optimizer = optimizer,
    train_loader = dataloader_train,
    val_loader = dataloader_val,
    criterion = None,
    device = device,
    scheduler = scheduler,
    epochs = 100,
    min_lr = 1e-7,
    val_every = 1,
    draw_every = 1,
    separate_show = False,
    metric_names = {
        "accuracy": {"plot_id": 1},
    },
    chkp_folder = "./chkp",
    model_name = "movielV1"
)