# Personalized Stock Recommender Systems

In [None]:
# Import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import coo_matrix
from sklearn.metrics import mean_squared_error

import torch
import torch.optim as optim
import torch.utils.data as data

from src import mf_bpr, als, word2vec, metrics, datasets, utils

## Prepare Data

### Dummy Data

In [None]:
# Read data
def read_dummy():
    """Load the tab-separated ratings file and count its distinct users and items.

    Returns:
        A ``(dummy_data, num_users, num_items)`` tuple. ``dummy_data`` is the parsed
        frame with columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``;
        the two counts are the number of distinct values in the first two columns.
    """
    column_names = ['user_id', 'item_id', 'rating', 'timestamp']
    dummy_data = pd.read_csv("data/dummy.data", sep='\t', names=column_names,
        engine='python')

    user_count = len(dummy_data['user_id'].unique())
    item_count = len(dummy_data['item_id'].unique())
    return dummy_data, user_count, item_count

## Matrix Factorization with BPR

### Dummy Data

In [None]:
# Train test split
def train_test_dummy_bpr(dummy_data : pd.DataFrame, num_users : int, num_items : int):
    """Leave-one-out split: hold out each user's most recent interaction for testing.

    Args:
        dummy_data: Frame whose first four columns are, in order, user id, item id,
            rating and timestamp.
        num_users: Unused; kept so existing callers keep working.
        num_items: Unused; kept so existing callers keep working.

    Returns:
        ``(train_data, test_data)`` as two DataFrames of ``(user, item, rating,
        timestamp)`` rows. ``train_data`` is grouped by ascending user id and sorted
        by timestamp within each user; ``test_data`` holds one row per user, in
        order of each user's first appearance in ``dummy_data``.
    """
    rows_by_user, latest_by_user = {}, {}

    # Collect every row per user and track the row with the largest timestamp
    # (on ties the earliest-seen row wins because the comparison is strict).
    for row in dummy_data.itertuples():
        u, i, rating, time = row[1], row[2], row[3], row[4]
        rows_by_user.setdefault(u, []).append((u, i, rating, time))
        if u not in latest_by_user or latest_by_user[u][2] < time:
            latest_by_user[u] = (i, rating, time)

    test_rows = [(u, *held_out) for u, held_out in latest_by_user.items()]
    # Set membership keeps the train/test filter linear instead of scanning the
    # whole test list once per training row.
    held_out_rows = set(test_rows)

    # Walk the user ids that actually occur rather than assuming they are exactly
    # 1..num_users, which raised KeyError for any other id scheme.
    train_rows = []
    for u in sorted(rows_by_user):
        chronological = sorted(rows_by_user[u], key = (lambda x : x[3]))
        train_rows.extend(r for r in chronological if r not in held_out_rows)

    return pd.DataFrame(train_rows), pd.DataFrame(test_rows)

In [None]:
# Load user and item indices (zero based) and scores 
def load_dummy_bpr(dummy, num_users, num_items):
    """Convert one-based (user, item) rows into zero-based implicit-feedback lists.

    Args:
        dummy: Frame whose first two columns are one-based user and item ids.
        num_users: Unused.
        num_items: Unused.

    Returns:
        ``(users, items, scores, interactions)``: parallel lists of zero-based user
        indices, zero-based item indices and a constant score of 1 per row, plus a
        dict mapping each user index to the list of item indices in row order.
    """
    pairs = [(int(row[1] - 1), int(row[2] - 1)) for row in dummy.itertuples()]

    users = [user for user, _ in pairs]
    items = [item for _, item in pairs]
    scores = [1] * len(pairs)  # implicit feedback: every observed pair scores 1

    interactions = {}
    for user, item in pairs:
        interactions.setdefault(user, []).append(item)

    return users, items, scores, interactions

In [None]:
# Define evaluator
def evaluate_ranking_bpr(net, test_input, interactions, num_users, num_items):
    """Rank every unseen item for each user and average the hit rate and AUC.

    Args:
        net: Scoring model, called as ``net(user_idxs, item_idxs)`` on two equally
            long index tensors; its output is converted with ``.detach().numpy()``.
        test_input: Mapping from zero-based user index to a sequence whose first
            element is that user's held-out item.
        interactions: Dict from zero-based user index to the items the user
            interacted with in training; these are excluded from the ranking. A
            user absent from the dict is treated as having no training items.
        num_users: Users ``0 .. num_users - 1`` are evaluated.
        num_items: Items ``0 .. num_items - 1`` form the candidate pool.

    Returns:
        ``(mean_hit_rate, mean_auc)`` averaged over all evaluated users.

    Raises:
        KeyError: If a user in ``range(num_users)`` is missing from ``test_input``.
    """
    hit_rate, auc = [], []
    all_items = set(range(num_items))
    for u in range(num_users):
        # A user whose only interaction was held out for testing never appears in
        # ``interactions``; fall back to "nothing seen" instead of raising KeyError.
        item_ids = list(all_items - set(interactions.get(u, ())))
        user_ids = [u] * len(item_ids)
        test_dataset = data.TensorDataset(torch.from_numpy(np.array(user_ids)),
            torch.from_numpy(np.array(item_ids)))
        test_data_iter = data.DataLoader(test_dataset, shuffle=False, batch_size=1024)

        scores = []
        # Evaluation only: no gradients are needed, so skip autograd bookkeeping.
        with torch.no_grad():
            for user_idxs, item_idxs in test_data_iter:
                scores.extend(net(user_idxs, item_idxs).detach().numpy())

        # Highest score first; ties keep their candidate order (stable sort).
        ranked = sorted(zip(item_ids, scores), key=lambda t: t[1], reverse=True)
        ranked_items = [item for item, _ in ranked]

        # Third argument is presumably the top-k cutoff for the hit rate -- TODO
        # confirm against metrics.hit_and_auc.
        temp = metrics.hit_and_auc(ranked_items, test_input[u][0], 50)
        hit_rate.append(temp[0])
        auc.append(temp[1])
    return np.mean(np.array(hit_rate)), np.mean(np.array(auc))

In [None]:
# Load the raw interactions and split off each user's latest one for testing
dummy_data, num_users, num_items = read_dummy()
train_dummy, test_dummy = train_test_dummy_bpr(dummy_data, num_users, num_items)

# Training side: zero-based index lists plus the per-user interaction dict,
# wrapped in the project dataset and served in shuffled batches of 1024
train_users, train_items, train_ratings, interactions = load_dummy_bpr(
    train_dummy, num_users, num_items)
train_dummy_dataset = datasets.DummyDataset(
    np.array(train_users), np.array(train_items), interactions, num_items)
train_dataloader = data.DataLoader(
    dataset=train_dummy_dataset, batch_size=1024, shuffle=True, num_workers=4)

# Test side: only the per-user dict of held-out items is needed
_, _, _, test_interactions = load_dummy_bpr(test_dummy, num_users, num_items)

In [None]:
# Create and initialize model
# Hyperparameters: learning rate, epoch count, Adam weight decay, latent factor count
lr, num_epochs, wd, latent_factors = 0.01, 20, 1e-5, 10

bpr_net = mf_bpr.MF_BPR(num_users, num_items, latent_factors) 
loss = mf_bpr.BPR_Loss
# Pass the configured learning rate instead of repeating the literal, so that
# changing `lr` above actually affects training.
optimizer = optim.Adam(bpr_net.parameters(), lr = lr, weight_decay = wd)

In [None]:
# Train and evaluate the model: one pass over the training batches per epoch,
# followed by a full ranking evaluation on the held-out items.
hit_rate_list = []
auc_list = []
for epoch in range(num_epochs):
    # Two running totals: [loss added per batch, training samples seen]
    accumulator = utils.Accumulator(2)

    # Train each batch
    bpr_net.train()
    for user_idxs, item_idxs, neg_items in train_dataloader:
        optimizer.zero_grad()

        # Score the observed item and the sampled negative item for each user
        p_pos = bpr_net(user_idxs, item_idxs)
        p_neg = bpr_net(user_idxs, neg_items)

        total_loss = loss(p_pos, p_neg)
        total_loss.backward()
        optimizer.step()
        # Detach before accumulating so the running total never holds on to the
        # autograd graph of a finished batch.
        accumulator.add(total_loss.detach(), user_idxs.shape[0])

    # Evaluate
    bpr_net.eval()
    hit_rate, auc = evaluate_ranking_bpr(bpr_net, test_interactions, interactions, num_users,
        num_items)
    hit_rate_list.append(hit_rate)
    auc_list.append(auc)

    print(f"Epoch {epoch}:\n\tloss = {accumulator[0]/accumulator[1]}\n\thit_rate = {hit_rate}\n\tauc = {auc}")

In [None]:
# Plot per-epoch AUC and hit rate for the MF model (epochs numbered from 1)
x = list(range(1, num_epochs + 1))
plt.title("HR and AUC over Epoch of MF")
plt.xlabel("Epoch")
plt.scatter(x, auc_list, label="AUC")
plt.scatter(x, hit_rate_list, label="Hit Rate")
plt.legend(loc="lower right")
plt.xticks(x[::2])  # label every second epoch
plt.ylim((0, 1))

## Alternating Least Squares

### Dummy Data

In [None]:
# Train test split
def train_test_dummy_als(dummy_data : pd.DataFrame, num_users : int, num_items : int):
    """Leave-one-out split: hold out each user's most recent interaction for testing.

    Args:
        dummy_data: Frame whose first four columns are, in order, user id, item id,
            rating and timestamp.
        num_users: Unused; kept so existing callers keep working.
        num_items: Unused; kept so existing callers keep working.

    Returns:
        ``(train_data, test_data)`` as two DataFrames of ``(user, item, rating,
        timestamp)`` rows. ``train_data`` is grouped by ascending user id and sorted
        by timestamp within each user; ``test_data`` holds one row per user, in
        order of each user's first appearance in ``dummy_data``.
    """
    history, newest = {}, {}

    # Gather each user's rows and remember the one with the largest timestamp
    # (strict comparison, so the first row seen wins a tie).
    for line in dummy_data.itertuples():
        u, i, rating, time = line[1], line[2], line[3], line[4]
        history.setdefault(u, []).append((u, i, rating, time))
        if u not in newest or newest[u][2] < time:
            newest[u] = (i, rating, time)

    test_rows = [(key, *value) for key, value in newest.items()]
    # A set makes the held-out filter linear rather than a list scan per row.
    excluded = set(test_rows)

    # Iterate the ids that are present instead of assuming 1..num_users, which
    # raised KeyError for zero-based or sparse user ids.
    train_rows = []
    for u in sorted(history):
        for sample in sorted(history[u], key = (lambda x : x[3])):
            if sample not in excluded:
                train_rows.append(sample)

    return pd.DataFrame(train_rows), pd.DataFrame(test_rows)

In [None]:
# Load user and item indices (zero based) and scores 
def load_dummy_als(dummy, num_users, num_items):
    """Build zero-based implicit-feedback lists from one-based (user, item) rows.

    Args:
        dummy: Frame whose first two columns are one-based user and item ids.
        num_users: Unused.
        num_items: Unused.

    Returns:
        ``(users, items, scores, interactions)``: parallel lists of zero-based user
        indices, zero-based item indices and a score of 1 per row, plus a dict from
        user index to that user's item indices in row order.
    """
    users, items = [], []
    interactions = {}

    for record in dummy.itertuples():
        user_index = int(record[1] - 1)
        item_index = int(record[2] - 1)

        users.append(user_index)
        items.append(item_index)

        if user_index in interactions:
            interactions[user_index].append(item_index)
        else:
            interactions[user_index] = [item_index]

    # Implicit feedback: every observed pair gets the same score
    scores = [1 for _ in users]
    return users, items, scores, interactions

In [None]:
# Define evaluator
def evaluate_ranking_als(net, test_input, interactions, num_users, num_items):
    """Rank every unseen item for each user and average the hit rate and AUC.

    Args:
        net: Model exposing ``predict(user_ids, item_ids)``, called with two equally
            long lists of indices and expected to return an iterable of scores.
        test_input: Mapping from zero-based user index to a sequence whose first
            element is that user's held-out item.
        interactions: Dict from zero-based user index to the items the user
            interacted with in training; these are excluded from the ranking. A
            user absent from the dict is treated as having no training items.
        num_users: Users ``0 .. num_users - 1`` are evaluated.
        num_items: Items ``0 .. num_items - 1`` form the candidate pool.

    Returns:
        ``(mean_hit_rate, mean_auc)`` averaged over all evaluated users.

    Raises:
        KeyError: If a user in ``range(num_users)`` is missing from ``test_input``.
    """
    hit_rate, auc = [], []
    all_items = set(range(num_items))
    for u in range(num_users):
        # Users with no training rows (their only interaction was held out) are
        # not keys of ``interactions``; treat them as having seen nothing.
        item_ids = list(all_items - set(interactions.get(u, ())))
        user_ids = [u] * len(item_ids)

        scores = list(net.predict(user_ids, item_ids))

        # Highest score first; ties keep their candidate order (stable sort).
        ranked = sorted(zip(item_ids, scores), key=lambda t: t[1], reverse=True)
        ranked_items = [item for item, _ in ranked]

        # Third argument is presumably the top-k cutoff for the hit rate -- TODO
        # confirm against metrics.hit_and_auc.
        temp = metrics.hit_and_auc(ranked_items, test_input[u][0], 50)
        hit_rate.append(temp[0])
        auc.append(temp[1])
    return np.mean(np.array(hit_rate)), np.mean(np.array(auc))

In [None]:
# Load the raw interactions and split off each user's latest one for testing
dummy_data, num_users, num_items = read_dummy()
train_dummy, test_dummy = train_test_dummy_als(dummy_data, num_users, num_items)

# Training side: zero-based index lists, implicit scores and per-user item dict
train_users, train_items, train_ratings, interactions = load_dummy_als(
    train_dummy, num_users, num_items)

# Test side: only the per-user dict of held-out items is needed
_, _, _, test_interactions = load_dummy_als(test_dummy, num_users, num_items)

In [None]:
# Hyperparameters: epoch count, regularization strength, latent factor count
num_epochs, reg, latent_factors = 20, 0.01, 30

loss = mean_squared_error
# Dense user-by-item matrix built from the training triples
ratings_matrix = coo_matrix(
    (train_ratings, (train_users, train_items)),
    shape=(num_users, num_items),
).todense()
als_net = als.ALS(num_users, num_items, latent_factors, ratings_matrix, reg)

In [None]:
# Alternate one training call with one ranking evaluation per epoch
hit_rate_list, auc_list = [], []

for epoch in range(num_epochs):
    # A single call trains on the whole ratings matrix (no mini-batches)
    als_net.train()

    # Score the held-out items and record this epoch's metrics
    hit_rate, auc = evaluate_ranking_als(
        als_net, test_interactions, interactions, num_users, num_items)
    auc_list.append(auc)
    hit_rate_list.append(hit_rate)

    print(f"Epoch {epoch}: hit_rate = {hit_rate}, auc = {auc}")

In [None]:
# Plot per-epoch AUC and hit rate for the ALS model (epochs numbered from 1)
x = list(range(1, num_epochs + 1))
plt.title("HR and AUC over Epoch of ALS")
plt.xlabel("Epoch")
plt.scatter(x, auc_list, label="AUC")
plt.scatter(x, hit_rate_list, label="Hit Rate")
plt.legend(loc="lower right")
plt.xticks(x[::2])  # label every second epoch
plt.ylim((0, 1))

## Word2Vec

### Dummy Data

In [None]:
# Prep interactions
def load_interactions_cbow(dummy_data : pd.DataFrame):
    """Return each user's items as a chronologically ordered list.

    Args:
        dummy_data: Frame whose first, second and fourth columns are one-based
            user id, one-based item id and timestamp.

    Returns:
        Dict mapping each zero-based user index to that user's zero-based item
        indices, sorted by ascending timestamp (ties keep row order).
    """
    events_by_user = {}
    for row in dummy_data.itertuples():
        user, item, stamp = row[1] - 1, row[2] - 1, row[4]
        if user not in events_by_user:
            events_by_user[user] = []
        events_by_user[user].append((item, stamp))

    ordered = {}
    for user, events in events_by_user.items():
        # Stable in-place sort on the timestamp only
        events.sort(key = (lambda pair : pair[1]))
        ordered[user] = [item for item, _ in events]
    return ordered

In [None]:
# Train test split
def train_test_dummy_cbow(interactions : dict, window : int):
    """Build (target, preceding-window) pairs, holding out each user's last item.

    For every user, each item from position ``window`` up to the second-to-last
    becomes a training target whose context is the ``window`` items immediately
    before it. The last item becomes the test target, with the ``window`` items
    before it as context.

    Users with fewer than ``window + 1`` interactions are skipped: they cannot
    supply a full context, and indexing before the start of their history would
    either wrap around (putting the held-out target into its own context) or
    raise IndexError.

    Args:
        interactions: Dict mapping a user to that user's item list; the list order
            is used as the sequence order.
        window: Number of preceding items that form a context.

    Returns:
        ``(train_targets, train_contexts, test_targets, test_contexts)``; each
        ``*_contexts[k]`` is the list of ``window`` items preceding
        ``*_targets[k]``.
    """
    train_targets, train_contexts = [], []
    test_targets, test_contexts = [], []

    for user_interactions in interactions.values():
        num_interactions = len(user_interactions)
        if num_interactions < window + 1:
            # Not enough history for even one complete context window.
            continue

        last = num_interactions - 1
        # Training pairs: every position with a full window before it, except
        # the final item, which is reserved for testing.
        for i in range(window, last):
            train_targets.append(user_interactions[i])
            train_contexts.append(list(user_interactions[i - window:i]))

        # Test pair: the final item and the window right before it.
        test_targets.append(user_interactions[last])
        test_contexts.append(list(user_interactions[last - window:last]))

    return train_targets, train_contexts, test_targets, test_contexts

In [None]:
# Define evaluator
def evaluate_ranking_cbow(net, test_targets, test_contexts, num_items, batch_size = 1024):
    """Rank all items for each test context and average the hit rate and AUC.

    The contexts are taken from the ``test_contexts`` argument rather than from a
    module-level data loader, and each score row is matched to its target by
    absolute position, so results stay correct across several batches.

    Args:
        net: Model called as ``net(contexts)`` on a 2-D tensor of item indices; its
            output is converted with ``.tolist()`` into one score row per context.
        test_targets: Held-out item for each test example.
        test_contexts: Context item lists, aligned with ``test_targets``.
        num_items: Items ``0 .. num_items - 1`` are paired with each score row.
        batch_size: Number of contexts scored per forward pass.

    Returns:
        ``(mean_hit_rate, mean_auc)`` averaged over all test examples.
    """
    hit_rate, auc = [], []
    item_ids = list(range(num_items))
    contexts = torch.from_numpy(np.array(test_contexts))

    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for start in range(0, len(test_targets), batch_size):
            scores = net(contexts[start:start + batch_size]).tolist()
            for offset, row in enumerate(scores):
                # Highest score first; ties keep ascending item order (stable sort).
                ranked = sorted(zip(item_ids, row), key=lambda t: t[1], reverse=True)
                ranked_items = [item for item, _ in ranked]

                # `start + offset` is the example's position in the full test set,
                # not merely within the current batch.
                # Third argument is presumably the top-k cutoff -- TODO confirm
                # against metrics.hit_and_auc.
                temp = metrics.hit_and_auc(ranked_items, test_targets[start + offset], 50)
                hit_rate.append(temp[0])
                auc.append(temp[1])
    return np.mean(np.array(hit_rate)), np.mean(np.array(auc))

In [None]:
# Context length: how many preceding items are used to predict the next one
window = 10

# Load the raw data, order each user's items by time, then cut the
# (target, context) pairs for training and testing
dummy_data, num_users, num_items = read_dummy()
sorted_interactions = load_interactions_cbow(dummy_data)
train_targets, train_contexts, test_targets, test_contexts = train_test_dummy_cbow(
    sorted_interactions, window)

In [None]:
# Hyperparameters: embedding width, epoch count, Adam learning rate
embedding_dim, num_epochs, learning_rate = 30, 20, 0.025

# Training pairs as tensors, served in shuffled batches of 1024
ngrams_train = data.TensorDataset(
    torch.from_numpy(np.array(train_targets)),
    torch.from_numpy(np.array(train_contexts)))
ngrams_dataloader = data.DataLoader(
    dataset=ngrams_train, batch_size=1024, shuffle=True, num_workers=4)

# Test pairs keep their original order so rows stay aligned with test_targets
ngrams_test = data.TensorDataset(
    torch.from_numpy(np.array(test_targets)),
    torch.from_numpy(np.array(test_contexts)))
ngrams_dataloader_test = data.DataLoader(
    dataset=ngrams_test, batch_size=1024, shuffle=False, num_workers=4)

# Model, loss and optimizer
loss = torch.nn.NLLLoss()
cbow_net = word2vec.CBOW(num_items, embedding_dim, window)
optimizer = optim.Adam(cbow_net.parameters(), lr=learning_rate)

In [None]:
# Train and evaluate the model: one pass over the training batches per epoch,
# followed by a full ranking evaluation on the held-out items.
hit_rate_list = []
auc_list = []
for epoch in range(num_epochs):
    # Two running totals: [loss added per batch, training samples seen]
    accumulator = utils.Accumulator(2)

    # Train each batch
    cbow_net.train()
    for targets, contexts in ngrams_dataloader:
        optimizer.zero_grad()

        # The model output is fed straight to NLLLoss, so it is treated as
        # per-item log-probabilities for each context.
        log_probabilities = cbow_net(contexts)

        total_loss = loss(log_probabilities, targets)
        total_loss.backward()
        optimizer.step()
        # Detach before accumulating so the running total never holds on to the
        # autograd graph of a finished batch.
        # NOTE(review): NLLLoss averages over the batch by default, so dividing
        # the accumulated loss by the sample count below may under-report the
        # mean loss -- confirm the intended metric.
        accumulator.add(total_loss.detach(), targets.shape[0])

    # Evaluate
    cbow_net.eval()
    hit_rate, auc = evaluate_ranking_cbow(cbow_net, test_targets, test_contexts, num_items)
    hit_rate_list.append(hit_rate)
    auc_list.append(auc)

    print(f"Epoch {epoch}:\n\tloss = {accumulator[0]/accumulator[1]}\n\thit_rate = {hit_rate}\n\tauc = {auc}")

In [None]:
# Plot per-epoch AUC and hit rate for the CBOW model (epochs numbered from 1)
x = list(range(1, num_epochs + 1))
plt.title("HR and AUC over Epoch of CBOW")
plt.xlabel("Epoch")
plt.scatter(x, auc_list, label="AUC")
plt.scatter(x, hit_rate_list, label="Hit Rate")
plt.legend(loc="lower right")
plt.xticks(x[::2])  # label every second epoch
plt.ylim((0, 1))

## Visualize the Results