In [87]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

In [88]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
device

device(type='mps')

# Load Data

In [89]:
def load_data(file, sep='\t'):
    return pd.read_csv(f'./lfm-challenge-data/{file}', delimiter=sep)

In [90]:
users = load_data('lfm-challenge.user')
items = load_data('lfm-challenge.item')
inter_train = load_data('lfm-challenge.inter_train')
inter_test = load_data('lfm-challenge.inter_test')
test_users = pd.read_csv(f'./lfm-challenge-data/test_indices.txt')['users'].values

n_users = users['user_id'].values.size
n_items = items.index.values.size

In [91]:
inter_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138772 entries, 0 to 138771
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   user_id           138772 non-null  int64
 1   item_id           138772 non-null  int64
 2   listening_events  138772 non-null  int64
dtypes: int64(3)
memory usage: 3.2 MB


### Training Dataset Class Wrapper

In [92]:
class TrackDataset:
    def __init__(self, users, items, ratings):
        self.users = users
        self.items = items
        self.ratings = ratings
        
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, item):
        users = self.users[item]
        items = self.items[item]
        ratings = self.ratings[item]
        
        return {
            "users": torch.tensor(users, dtype=torch.long),
            "items": torch.tensor(items, dtype=torch.long),
            "ratings": torch.tensor(ratings, dtype=torch.long)
        }

### Create the model

In [93]:
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_items):
        super().__init__()
        
        self.user_embed = nn.Embedding(n_users, 32)
        self.item_embed = nn.Embedding(n_items, 32)
        
        self.out = nn.Linear(64, 1)
        
    def forward(self, users, items, ratings=None):
        user_embeds = self.user_embed(users)
        item_embeds = self.item_embed(items)
        output = torch.cat([user_embeds, item_embeds], dim=1)
        
        output = self.out(output)
        
        return output
        

In [94]:
lbl_user = preprocessing.LabelEncoder()
lbl_item = preprocessing.LabelEncoder()
inter_train['user_id'] = lbl_user.fit_transform(inter_train['user_id'].values)
inter_train['item_id'] = lbl_item.fit_transform(inter_train['item_id'].values)

In [95]:
df_train, df_valid = model_selection.train_test_split(
    inter_train, test_size=0.1, random_state=42
)

In [96]:
train_dataset = TrackDataset(
    users=df_train['user_id'].values,
    items=df_train['item_id'].values,
    ratings=df_train['listening_events'].values
)

valid_dataset = TrackDataset(
    users=df_valid['user_id'].values,
    items=df_valid['item_id'].values,
    ratings=df_valid['listening_events'].values
)

In [97]:
train_dataset[2]

{'users': tensor(6745), 'items': tensor(2016), 'ratings': tensor(3)}

In [99]:
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
validation_loader = DataLoader(dataset=valid_dataset, batch_size=4, shuffle=True)

# dataiter = iter(train_loader)
# dataloader_data = dataiter.next()
# print(dataloader_data)

In [100]:
model = RecSysModel(
    n_users=len(lbl_user.classes_),
    n_items=len(lbl_item.classes_)
).to(device)

optimizer = torch.optim.Adam(model.parameters())
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

loss_func = nn.MSELoss()

### Training loop

In [None]:
epochs = 1
total_loss = 0
plot_steps, print_steps = 5000, 5000
step_cnt = 0
all_losses_list = []

model.train()
for epoch_i in range(epochs):
    for i, train_data in enumerate(train_loader):
        output = model(train_data["users"].to(device),
                       train_data["items"].to(device)
                      ).to(device)
        
        rating = train_data["ratings"].view(4, -1).to(device)
        
        loss = loss_func(output, rating)
        total_loss = total_loss + loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        step_cnt = step_cnt + len(train_data["users"])
        
        if (step_cnt % plot_steps == 0):
            avg_loss = total_loss/(len(train_data["users"]) * plot_steps)
            print(f"epch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0

In [21]:
print(dataloader_data['users'])

NameError: name 'dataloader_data' is not defined