# A Transformer-based recommendation system


In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import pandas as pd
from torch.utils.data import DataLoader

In [3]:
# Read the training frame from the parquet artifact under ./artifacts.
df_train = pd.read_parquet("./artifacts/train_data.parquet")

In [4]:
df_train.head()

Unnamed: 0,user_id_index,movie_sequence,rating_sequence,sex,occupation_index,age_group_index,target_movie,target_rating
0,1,"[3118, 1010, 1673, 1251]","[0.75, 1.0, 0.75, 1.0]",1.0,11,1,1251,1.0
1,1,"[1673, 1251, 2272, 1769]","[0.75, 1.0, 0.5, 1.0]",1.0,11,1,1769,1.0
2,1,"[2272, 1769, 3340, 1190]","[0.5, 1.0, 0.75, 1.0]",1.0,11,1,1190,0.75
3,1,"[3340, 1190, 2736, 258]","[0.75, 0.75, 1.0, 1.0]",1.0,11,1,258,0.75
4,1,"[2736, 258, 1177, 712]","[1.0, 0.75, 1.0, 1.0]",1.0,11,1,712,0.5


In [5]:
item = df_train.iloc[0]

In [6]:
item.to_dict()

{'user_id_index': 1,
 'movie_sequence': array([3118, 1010, 1673, 1251]),
 'rating_sequence': array([0.75, 1.  , 0.75, 1.  ]),
 'sex': 1.0,
 'occupation_index': 11,
 'age_group_index': 1,
 'target_movie': 1251,
 'target_rating': 1.0}

In [7]:
%%writefile ./src/dataset.py
import torch
from torch.utils.data import Dataset


class RatingDataset(Dataset):
    """Map-style dataset that serves one row of a frame at a time as tensors.

    Every row is converted into a dict keyed by column name. The columns
    ``rating_sequence``, ``target_rating`` and ``sex`` are emitted as
    ``torch.float32`` tensors; every other column is emitted as ``torch.long``.
    """

    # Columns emitted as float32; anything not listed here is cast to long.
    _FLOAT_COLUMNS = frozenset({"rating_sequence", "target_rating", "sex"})

    def __init__(self, data):
        # Rows are fetched positionally through ``data.iloc`` in __getitem__,
        # so ``data`` must support that accessor (e.g. a pandas DataFrame).
        self.data = data

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the row at position ``index`` as a ``{column: tensor}`` dict."""
        row = self.data.iloc[index].to_dict()
        return {
            column: torch.tensor(
                value,
                dtype=torch.float32 if column in self._FLOAT_COLUMNS else torch.long,
            )
            for column, value in row.items()
        }

Overwriting ./src/dataset.py


In [8]:
from src.dataset import RatingDataset

In [9]:
train_dataset = RatingDataset(data=df_train) 

In [10]:
train_dataset[0]

{'user_id_index': tensor(1),
 'movie_sequence': tensor([3118, 1010, 1673, 1251]),
 'rating_sequence': tensor([0.7500, 1.0000, 0.7500, 1.0000]),
 'sex': tensor(1.),
 'occupation_index': tensor(11),
 'age_group_index': tensor(1),
 'target_movie': tensor(1251),
 'target_rating': tensor(1.)}

In [11]:
# Batch the dataset four samples at a time; shuffle=True reshuffles the order on every pass.
loader = DataLoader(train_dataset,batch_size=4,shuffle=True)

In [12]:
# Pull a single batch to inspect what the loader yields. next(iter(...)) raises
# StopIteration right here if the loader is empty, rather than leaving `sample`
# undefined for the next cell.
sample = next(iter(loader))


In [13]:
sample

{'user_id_index': tensor([3913,  534, 1449, 4216]),
 'movie_sequence': tensor([[2503,  471, 2848, 1990],
         [3510, 3725, 1576,  538],
         [ 572, 1974, 2016,  452],
         [3725, 3677, 3555, 2287]]),
 'rating_sequence': tensor([[1.0000, 0.5000, 0.7500, 1.0000],
         [1.0000, 1.0000, 1.0000, 1.0000],
         [0.2500, 0.0000, 0.0000, 1.0000],
         [1.0000, 0.7500, 0.7500, 1.0000]]),
 'sex': tensor([0., 0., 0., 0.]),
 'occupation_index': tensor([18, 16, 21, 18]),
 'age_group_index': tensor([2, 3, 4, 3]),
 'target_movie': tensor([1990,  538,  452, 2287]),
 'target_rating': tensor([1.0000, 1.0000, 0.0000, 0.7500])}