# (1) Preprocessing Dataset

Make sure dataset file are in correct location.
run preprocess.py

In [1]:
from data.LDA import preprocess_text
from train.saving import save_model_results
from model.mf import MF_Bias, LDANet
from model.utility import RMSELoss

import pickle
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam
import gc
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from train.saving import save_model_results
from skorch import NeuralNetRegressor

In [2]:
subsets = {
        "Toys_and_Games_5.json",
        "Apps_for_Android_5.json",
        "Health_and_Personal_Care_5.json",
    }
pth = os.getcwd()[:-4]
NUM_TOPICS = 10
#preprocess_text(dataset=subsets, pth=pth, n_topics=NUM_TOPICS)
print(pth)

/Users/jb/Documents/GitHub/Intro ML (Nick Pang) Repository/rl-recommender


In [3]:
TG5_df = pd.read_json(os.path.join(pth, "datasets", "raw","Toys_and_Games_5.json"), lines=True)
AA5_df = pd.read_json(os.path.join(pth, "datasets", "raw", "Apps_for_Android_5.json"), lines=True)
HPC_df = pd.read_json(os.path.join(pth, "datasets", "raw", "Health_and_Personal_Care_5.json"), lines=True)

df = pd.concat([TG5_df, AA5_df, HPC_df], axis=0)
del TG5_df, AA5_df, HPC_df


In [4]:
# Keep essentials only
df = df[['reviewerID', 'asin', 'overall']]

# Map values
user_dict = pd.read_csv(os.path.join(pth, "datasets", "processed", "lda", "user_mappings.csv"))
item_dict = pd.read_csv(os.path.join(pth, "datasets", "processed", "lda", "item_mappings.csv"))
user_dict = dict(zip(user_dict.iloc[:, 1], user_dict.index))
item_dict = dict(zip(item_dict.iloc[:, 1], item_dict.index))

df['reviewerID'] = df['reviewerID'].map(user_dict).fillna(df['reviewerID'])
df['asin'] = df['asin'].map(item_dict).fillna(df['asin'])

df[['reviewerID', 'asin']] = df[['reviewerID', 'asin']].astype(int)
df['overall'] = df['overall'].astype(float)

print(df)

        reviewerID   asin  overall
0           124394  31743      5.0
1           124395  31743      4.0
2           124396  31743      5.0
3           124397  31743      5.0
4           124398  31743      4.0
...            ...    ...      ...
346350       29878  18533      5.0
346351       37951  18533      5.0
346352       36501  18533      5.0
346353       36040  18533      5.0
346354       36504  18533      5.0

[1266889 rows x 3 columns]


In [5]:
m = df['overall'].mean()
np.savez_compressed(os.path.join(pth, "datasets", "processed", "Subset_5core_PreprocessLDA.npz"),
                    x = df[['reviewerID', 'asin']],
                    y = df['overall'],
                    u_size = len(user_dict),
                    i_size = len(item_dict),
                    m = m)

# (2) Create Model

This model doesn't use internal LDA YET!

### I wanted to test running on Apple Silicon Chips

In [6]:
device = torch.device('mps' if torch.has_mps else 'cpu')
f_type = torch.float32 if device.type == 'mps' else torch.float64
print(f"device_type: {device}")
print(f"float_type: {f_type}")

device_type: mps
float_type: torch.float32


In [7]:
BATCH_SIZE = 64
LEARNING_RATE = 0.0005
EPOCHS = 15
DECAY = 1e-3
DROPOUT = 0.4

In [8]:
gc.collect()

data = np.load(os.path.join(pth, "datasets", "processed", "Subset_5core_PreprocessLDA.npz"))
x = data['x']
y = data['y']
U_size = data['u_size']
I_size = data['i_size']
G_b = data['m']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
y_train, y_test = torch.tensor(y_train, dtype = f_type).to(device), torch.tensor(y_test, dtype = f_type).to(device)

loss_fn = RMSELoss()
optimizer = Adam

### I am adding the class here only so I append **.to(*device*)** for each tensor

In [9]:
class MF(nn.Module):
    def __init__(self, n_users, n_items, K, dropout=0):
        super().__init__()
        self.user_m = nn.Embedding(
            n_users, K, dtype=f_type
        ).to(device)  # can include option sparse = True for memory
        self.item_m = nn.Embedding(n_items, K, dtype=f_type).to(device)
        self.drop_u = nn.Dropout(dropout)
        self.drop_i = nn.Dropout(dropout)

    def forward(self, x):
        user_ids = x[:, 0]
        item_ids = x[:, 1]
        user_embeds = self.drop_u(self.user_m(user_ids))
        item_embeds = self.drop_i(self.item_m(item_ids))
        prod = user_embeds * item_embeds

        out = torch.sum(prod, 1)

        return out


# Matrix factorization with user/item biases
class MF_Bias(MF):
    def __init__(self, n_users, n_items, K, G_b, dropout=0):
        super().__init__(n_users, n_items, K, dropout)

        self.user_b = nn.Embedding(n_users, 1, dtype=f_type).to(device)
        self.item_b = nn.Embedding(n_items, 1, dtype=f_type).to(device)
        nn.init.zeros_(self.user_b.weight)
        nn.init.zeros_(self.item_b.weight)

        self.G_b = torch.from_numpy(G_b)

    def forward(self, x):
        user_ids = x[:, 0]
        item_ids = x[:, 1]
        out = super().forward(x)

        user_biases = self.user_b(user_ids).squeeze()
        item_biases = self.item_b(item_ids).squeeze()

        out += user_biases + item_biases + self.G_b

        return out

In [10]:
model = MF_Bias(U_size, I_size, NUM_TOPICS, G_b, DROPOUT)

In [11]:
user_map = pd.read_csv(os.path.join(pth, "datasets", "processed", "lda", "user_topics.csv"))
for idx,row  in user_map.iterrows():
    row = row[1:] # to remove 'User_ID' from row
    model.user_m.weight.data[idx] = torch.tensor(row.values, dtype=f_type).to(device)

item_map = pd.read_csv(os.path.join(pth, "datasets", "processed", "lda", "item_topics.csv"))
for idx,row  in item_map.iterrows():
    row = row[1:] # to remove 'Item_ID' from row
    model.item_m.weight.data[idx] = torch.tensor(row.values, dtype=f_type).to(device)

# (3) Run Model

In [12]:
regressor = NeuralNetRegressor(
    model,
    criterion = loss_fn,
    optimizer = optimizer,
    optimizer__param_groups = [
        ('user_m.weight', {'weight_decay': DECAY}),
        ('item_m.weight', {'weight_decay': DECAY})
    ],
    optimizer__lr = LEARNING_RATE,
    batch_size = BATCH_SIZE,
    max_epochs = EPOCHS
)
gc.collect()

0

### Borrowed from *save_model_results()*

In [13]:
results = regressor.fit(x_train, y_train)
history = results.history

tr_losses = [i['train_loss'] for i in history]
t_losses = [i['valid_loss'] for i in history]

with open(os.path.join(pth, "results", f"model_PreprocessingLDA_{device}_{f_type}.pkl"), 'wb') as f:
    pickle.dump(model, f)

np.savez_compressed(os.path.join(pth, "results", f"loss_PreprocessingLDA_{device}_{f_type}.npz"),
                    tr_loss = tr_losses,
                    t_loss = t_losses)

  epoch    train_loss    valid_loss       dur
-------  ------------  ------------  --------
      1        [36m1.2336[0m        [32m1.1960[0m  100.0761
      2        [36m1.1683[0m        [32m1.1693[0m  106.0843
      3        [36m1.1339[0m        [32m1.1529[0m  100.7080
      4        [36m1.1083[0m        [32m1.1421[0m  100.5575
      5        [36m1.0884[0m        [32m1.1347[0m  95.8565
      6        [36m1.0722[0m        [32m1.1295[0m  110.8181
      7        [36m1.0588[0m        [32m1.1259[0m  97.7337
      8        [36m1.0475[0m        [32m1.1233[0m  98.0689
      9        [36m1.0379[0m        [32m1.1215[0m  92.8086
     10        [36m1.0295[0m        [32m1.1203[0m  90.6047
     11        [36m1.0221[0m        [32m1.1195[0m  89.3275
     12        [36m1.0156[0m        [32m1.1190[0m  88.0651
     13        [36m1.0099[0m        [32m1.1189[0m  91.5750
     14        [36m1.0047[0m        1.1190  92.5040
     15        [36m1.0001[0m

# (4) Graphs