# Import libraries

In [1]:
import os

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence # это средство борьбы с 
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm  # For nice progress bar!
import matplotlib.pyplot as plt

# Download data

In [2]:
class CustomDataset(Dataset):
    def __init__(self, root_dir, caption_file, preprocessing=True):
        
        self.root_dir = root_dir

        path_to_val_data = 'validation_results_CNN.parquet'
        df_ids = pd.read_parquet(path_to_val_data)[['userId','movieId']]
        df = pd.read_parquet(caption_file)
        df = df.merge(df_ids, on=['userId','movieId'])
        
        self.df = df
            
        
        self.rating = self.df['rating']
        self.id_movie = self.df['movieId']
        self.id_user = self.df['userId']
        self.captions = np.array(self.df[['user_encode','movie_encode','user_mean_encode','movie_mean_encode','user_std_encode', 'movie_std_encode']])
        
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        encode_text = pad_sequence([torch.tensor(x, dtype=torch.float) for x in self.captions[index]])
        rating_value = torch.tensor(self.rating[index], dtype=torch.float)
        index_movie = self.id_movie[index]
        index_user = self.id_user[index]
  
        return encode_text, rating_value, int(index_movie), int(index_user)

In [3]:
dataset = CustomDataset(".", "BERT_encoded_and_stat_features.parquet")
for idx, (encode_text, rating_value, id_movie, id_user) in enumerate(dataset):
    print(encode_text.shape)
    print(rating_value)
    print(id_movie)
    print(id_user)

torch.Size([947, 6])
tensor(4.)
1
302
torch.Size([947, 6])
tensor(4.5000)
1
719
torch.Size([947, 6])
tensor(4.5000)
1
781
torch.Size([947, 6])
tensor(3.5000)
1
798
torch.Size([947, 6])
tensor(4.)
1
930
torch.Size([947, 6])
tensor(5.)
1
1464
torch.Size([947, 6])
tensor(3.5000)
1
1563
torch.Size([947, 6])
tensor(3.5000)
1
2121
torch.Size([947, 6])
tensor(4.)
1
3017
torch.Size([947, 6])
tensor(4.5000)
1
3236
torch.Size([947, 6])
tensor(3.5000)
1
3369
torch.Size([947, 6])
tensor(3.5000)
1
3488
torch.Size([947, 6])
tensor(4.)
1
3544
torch.Size([947, 6])
tensor(5.)
1
4137
torch.Size([947, 6])
tensor(3.5000)
1
4330
torch.Size([947, 6])
tensor(3.5000)
1
4748
torch.Size([947, 6])
tensor(4.)
1
5012
torch.Size([947, 6])
tensor(5.)
1
5437
torch.Size([947, 6])
tensor(4.5000)
1
5948
torch.Size([947, 6])
tensor(3.)
1
5991
torch.Size([947, 6])
tensor(3.)
1
6080
torch.Size([947, 6])
tensor(4.)
1
6275
torch.Size([947, 6])
tensor(3.)
2
1028
torch.Size([947, 6])
tensor(3.)
2
1179
torch.Size([947, 6])
tens

In [4]:
data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=4, shuffle=True, num_workers=2)

# Define Neural Network

In [5]:
class CNN(nn.Module): 
    def __init__(self): 
        super().__init__()
        self.conv1 = nn.Conv2d(4, 4, 2)
        self.pool = nn.AvgPool2d(kernel_size=(2, 2))
        self.conv2 = nn.Conv2d(4,4,2) 
        self.fc1 = nn.Linear(472, 512)
        self.drop_0 = nn.Dropout(0.1)
        self.fc2 = nn.Linear(512, 512)
        self.drop_1 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(512, 248)
        self.fc4 = nn.Linear(248,64)
        self.fc5 = nn.Linear(64, 1)
        
    def forward(self, x): 
        x = self.pool(F.relu(self.conv1(x)))
        x = F.relu(self.conv2(x))
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = self.drop_0(x)
        x = F.relu(self.fc2(x))
        x = self.drop_1(x)
        x = F.relu(self.fc3(x))
        x = self.drop_1(x)
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_CNN = CNN()
model_CNN.load_state_dict(torch.load('CNN_rating_prediction_raw.tar', map_location = device))

criterion = nn.MSELoss()

  return torch._C._cuda_getDeviceCount() > 0


# Check model

In [7]:
def inference_check(loader, model):
    df = pd.DataFrame(columns=['movieId','userId','rating','predicted'])

    list_1 = []
    list_2 = []
    list_y = []
    list_scores = []
    loss_list = []
    with torch.no_grad():
        for x, y, _1, _2 in loader:
            list_test = []
            x = x.to(device=device)
            y = y.unsqueeze(1).to(device=device)

            scores = model(x)
            loss = criterion(scores, y)

            list_1.extend([np.array(_1)[i] for i in range(4)])
            list_2.extend([np.array(_2)[i] for i in range(4)])
            list_y.extend([np.array(y)[i][0] for i in range(4)])
            list_scores.extend([np.array(scores)[i][0] for i in range(4)])
            loss_list.append(loss)

    df['movieId'] = list_1
    df['userId'] = list_2
    df['rating'] = list_y
    df['predicted'] = list_scores
    return df

In [8]:
model_CNN.eval()
data = inference_check(data_loader, model_CNN)

# Check recommendation results

In [10]:
data['predicted'] = data['predicted'].apply(lambda x: 5 if x >= 4.7 else 4.5 if x >= 4.3 else 4 if x >= 4 else 3.5 if x >= 3.5 else\
                         round(x))

In [16]:
data.head()

Unnamed: 0,movieId,userId,rating,predicted
0,318,6869,5.0,3.5
1,65,1471,3.0,3.0
2,520,5219,3.0,3.5
3,260,6274,4.5,3.5
4,290,4796,5.0,3.5


Что нужно рекомендовать пользователю? Наверное это все фильмы с рейтингом выше 3.5?

In [18]:
data[data['rating'] >= 3.5]

Unnamed: 0,movieId,userId,rating,predicted
0,318,6869,5.0,3.5
3,260,6274,4.5,3.5
4,290,4796,5.0,3.5
6,520,2629,3.5,3.0
10,300,5219,3.5,3.0
...,...,...,...,...
590,110,1832,4.0,3.5
592,62,1513,4.0,3.0
594,260,1972,4.0,3.5
595,47,2374,3.5,3.5


Допустим что 1 это рекомендовать фильм, а 0 не показывать пользователю

In [26]:
data['recommendation_original'] = [1 if data['rating'][i] >= 3.5 else 0 for i in range(len(data))]

In [27]:
data['recommendation_predicted'] = [1 if data['predicted'][i] >= 3.5 else 0 for i in range(len(data))]

In [44]:
tp = sum([1 if data['recommendation_predicted'][i] ==1 and data['recommendation_original'][i] ==1 else 0 for i in range(len(data))])

In [45]:
fp = sum([1 if data['recommendation_predicted'][i] == 0 and data['recommendation_original'][i] == 0 else 0 for i in range(len(data))])

In [48]:
fn = sum([1 if data['recommendation_predicted'][i] == 0 and data['recommendation_original'][i] == 1 else 0 for i in range(len(data))])

In [49]:
tn = sum([1 if data['recommendation_predicted'][i] == 1 and data['recommendation_original'][i] == 0 else 0 for i in range(len(data))])

In [51]:
(tp+tn) / 600

0.4683333333333333