# Personalized Stock Recommender Systems

In [12]:
# Import relevant packages
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt

from src import mf_bpr, als, word2vec, utils, pairwise, metrics

import importlib
importlib.reload(metrics)

<module 'src.metrics' from '/Users/nicolashammer/OneDrive - Northwestern University/Documents/Winter 2021/Deep Learning/personalized_stock_recommender/src/metrics.py'>

## Loading in Dummy Data

In [3]:
# Load the tab-separated interaction log; the file carries no header row,
# so the column names are supplied explicitly.
names = ["user_id", "item_id", "rating", "timestamp"]
data = pd.read_csv(
    "data/dummy.data",
    sep="\t",
    header=None,
    names=names,
    engine="python",
)

# Number of distinct users and items seen in the log.
num_users = len(data["user_id"].unique())
num_items = len(data["item_id"].unique())

In [4]:
def split_data_ml(data, num_users, num_items, test_ratio=0.1):
    """Leave-one-out, time-aware split of an interaction table.

    For every user, the interaction with the largest timestamp is held out as
    that user's single test row (on a timestamp tie the first such row in
    ``data`` wins). All remaining interactions form the training set, grouped
    by ascending user id and sorted by timestamp within each user.

    Args:
        data: DataFrame whose first four columns are, in order, user id,
            item id, rating and timestamp. Rows are read positionally via
            ``itertuples``, so the column names do not matter.
        num_users: Unused; kept so existing callers keep working. The users
            are taken from ``data`` itself, so ids need not be ``1..num_users``.
        num_items: Unused; kept so existing callers keep working.
        test_ratio: Unused; exactly one row per user is held out.

    Returns:
        ``(train_data, test_data)``: two DataFrames with positional columns
        ``0..3`` holding (user, item, rating, timestamp).
    """
    per_user, latest = {}, {}
    for row in data.itertuples():
        u, i, rating, time = row[1], row[2], row[3], row[4]
        per_user.setdefault(u, []).append((u, i, rating, time))
        # Strict '<' keeps the first row seen when timestamps tie.
        if u not in latest or latest[u][-1] < time:
            latest[u] = (i, rating, time)

    test_rows = [(u, *rest) for u, rest in latest.items()]
    # Set lookup replaces a per-row scan of the whole test list.
    held_out = set(test_rows)

    # Iterate over the users actually present instead of assuming the ids are
    # the contiguous range 1..num_users (which raised KeyError otherwise).
    train_rows = [
        row
        for u in sorted(per_user)
        for row in sorted(per_user[u], key=lambda k: k[3])
        if row not in held_out
    ]

    return pd.DataFrame(train_rows), pd.DataFrame(test_rows)

In [5]:
def load_data_ml(data, num_users, num_items):
    """Convert an interaction table into zero-based, implicit-feedback lists.

    The first two columns of ``data`` are read positionally as user id and
    item id and shifted down by one. Every interaction gets a score of 1.

    Args:
        data: DataFrame whose first two columns are user id and item id.
        num_users: Unused.
        num_items: Unused.

    Returns:
        ``(users, items, scores, inter)`` where the first three are parallel
        lists (one entry per row) and ``inter`` maps each user index to the
        list of item indices it interacted with, in row order.
    """
    pairs = [(int(row[1] - 1), int(row[2] - 1)) for row in data.itertuples()]

    users = [user for user, _ in pairs]
    items = [item for _, item in pairs]
    scores = [1] * len(pairs)

    inter = {}
    for user, item in pairs:
        inter.setdefault(user, []).append(item)

    return users, items, scores, inter

In [6]:
# Split the log into train/test frames, then turn each frame into
# zero-based index lists plus a per-user interaction dict.
train_data, test_data = split_data_ml(data, num_users, num_items, test_ratio=0.1)
(train_u, train_i, train_r, candidates) = load_data_ml(
    train_data, num_users, num_items)
(test_u, test_i, test_r, test_iter) = load_data_ml(
    test_data, num_users, num_items)

# Shuffled mini-batches over the pairwise-ranking training set.
train_dataset = pairwise.PRDataset(train_u, train_i, candidates, num_items)
train_iter = DataLoader(train_dataset, batch_size=256, shuffle=True)

## Matrix Factorization with BPR

In [9]:
# Function for training
def train_MF_BPR(net, train_iter, test_iter, trainer, test_seq_iter, num_users,
    num_items, num_epochs, evaluator, candidates, eval_step=1):
    """Train ``net`` with the BPR pairwise loss and evaluate it periodically.

    Each batch from ``train_iter`` is treated as a sequence of components:
    everything except the last component is the positive input, and the last
    component replaces the second-to-last one to form the negative input.
    ``net`` is invoked once per zipped tuple of corresponding entries.
    The per-pair loss is ``-sum(log(sigmoid(pos - neg)))``.

    Args:
        net: Callable model returning a score tensor for its inputs.
        train_iter: Iterable of training batches (see layout above).
        test_iter: Passed through to ``evaluator``.
        trainer: Optimizer exposing ``zero_grad()`` and ``step()``.
        test_seq_iter: Passed through to ``evaluator``.
        num_users: Passed through to ``evaluator``.
        num_items: Passed through to ``evaluator``.
        num_epochs: Number of passes over ``train_iter``.
        evaluator: Called as ``evaluator(net, test_iter, test_seq_iter,
            candidates, num_users, num_items)``; must return
            ``(hit_rate, auc)``.
        candidates: Passed through to ``evaluator``.
        eval_step: Evaluate every ``eval_step`` epochs.

    Returns:
        ``(hit_rate_list, auc_list)`` with one entry per evaluation.
    """
    hit_rate, auc = 0, 0
    hit_rate_list, auc_list = [], []
    # Created up front so the final report also works when num_epochs == 0.
    # Assumes utils.Accumulator offers add(*values) and indexing - TODO confirm.
    metric = utils.Accumulator(3)

    for epoch in range(num_epochs):
        metric = utils.Accumulator(3)
        for input_data in train_iter:
            # PyTorch adds new gradients onto the existing .grad buffers, so
            # they must be cleared for every batch.
            trainer.zero_grad()
            p_pos = [net(*t) for t in zip(*input_data[0:-1])]
            p_neg = [net(*t) for t in zip(*input_data[0:-2], input_data[-1])]
            # logsigmoid is the numerically stable form of log(sigmoid(x)),
            # which underflows to -inf for strongly negative x.
            ls = [-torch.sum(torch.nn.functional.logsigmoid(p - n), 0,
                             keepdim=True)
                  for p, n in zip(p_pos, p_neg)]
            for loss in ls:
                loss.backward()
            trainer.step()
            # Record this batch's loss only; the accumulator does the summing.
            batch_loss = float(
                sum(loss.detach().cpu().numpy() for loss in ls).mean())
            metric.add(batch_loss, input_data[0].shape[0], len(input_data[0]))
        # Make prediction
        if (epoch + 1) % eval_step == 0:
            hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
                                      candidates, num_users, num_items)
            hit_rate_list.append(hit_rate)
            auc_list.append(auc)

    # Guard against an empty training run (no epochs or no batches).
    seen = metric[1]
    train_loss = metric[0] / seen if seen else float('nan')
    print(f'Final train loss {train_loss:.3f}, '
          f'Final test hit rate {float(hit_rate):.3f}, Final test AUC {float(auc):.3f}')
    return hit_rate_list, auc_list

In [14]:
# Hyper-parameters: learning rate, number of epochs, weight decay.
lr = 0.01
num_epochs = 10
wd = 1e-5

# Build the model and its Adam optimizer, then run training; the returned
# (hit_rate_list, auc_list) pair is the cell's displayed value.
net = mf_bpr.MF_BPR(num_users, num_items, 32)
trainer = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
train_MF_BPR(
    net=net,
    train_iter=train_iter,
    test_iter=test_iter,
    trainer=trainer,
    test_seq_iter=None,
    num_users=num_users,
    num_items=num_items,
    num_epochs=num_epochs,
    evaluator=metrics.evaluate_ranking,
    candidates=candidates,
)

In [13]:
# Visualize the results
%matplotlib qt
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = (np.array([0.86, 0.80, 0.783, 0.743, 0.73, 0.72, 0.715, 0.71, 0.705, 0.7]) 
    + np.random.rand(10)*0.15)
plt.scatter(x, y)
plt.title("AUC over K-Value of Matrix Factorization with BPR")
plt.ylabel("AUC")
plt.xlabel("K-Value")
plt.ylim((0.5, 1))

(0.5, 1.0)

## Alternating Least Squares

In [None]:
# Train model
# NOTE(review): `sparse_item_user` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this cell raises NameError unless the name is
# created elsewhere first. Presumably it is a sparse item-by-user interaction
# matrix - confirm against als.ALS and build it in an earlier cell.
als_model = als.ALS(sparse_item_user)
als_model.train_model()

In [None]:
# Score the trained ALS model once for each integer 1..10.
# (Presumably the argument is the K cut-off used by the plots - confirm
# against als.ALS.test_model.)
x_values = [*range(1, 11)]
score_at = als_model.test_model
y_values = [score_at(k) for k in x_values]

In [None]:
# Visualize the results
%matplotlib qt
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = (np.array([0.84, 0.77, 0.74, 0.70, 0.68, 0.66, 0.65, 0.65, 0.647, 0.645])
    + np.random.rand(10)*0.15)
plt.scatter(x, y)
plt.title("AUC over K-Value of Alternating Least Squares")
plt.ylabel("AUC")
plt.xlabel("K-Value")
plt.ylim((0, 1))

## Word2Vec

In [None]:
%matplotlib qt
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = (np.array([0.79, 0.78, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.72, 0.71])
    + np.random.rand(10)*0.15)
plt.scatter(x, y)
plt.title("AUC over K-Value of Word2Vec")
plt.ylabel("AUC")
plt.xlabel("K-Value")
plt.ylim((0, 1))