In [73]:
from collections import defaultdict
import csv
from sklearn import linear_model
from sklearn.decomposition import PCA
import scipy
import scipy.optimize
import random
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd 
from IPython.display import display

In [5]:
def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

def readCSV(path):
  f = open(path, 'rt')
  f.readline()
  for l in f:
    yield l.strip().split(',')
    
def calc_model_stats(pred,label):
    
    TP,FP,TN,FN = calc_metrics(pred,label)

    # print("Stats")
    # print(TP,FP,TN,FN)

    # print("Predict N: {} ({}%)".format(TN+FN,(TN+FN)/(TP+TN+FP+FN)))
    # print("Predict P: {} ({}%)".format(TP+FP,(TP+FP)/(TP+TN+FP+FN)))

    accuracy, TPR, TNR, BER = calc_error_rates(TP, FP, TN, FN)

    print("Accuracy: {}".format(accuracy))
    # print("TPR: {}".format(TPR))
    # print("TNR: {}".format(TNR))
    # print("BER: {}".format(BER))
    
    return
 
def calc_metrics(predictions, labels):
    # Calculate True positives, false positives, etc.

    TP_ = numpy.logical_and(predictions, labels)
    FP_ = numpy.logical_and(predictions, numpy.logical_not(labels))
    TN_ = numpy.logical_and(numpy.logical_not(predictions), numpy.logical_not(labels))
    FN_ = numpy.logical_and(numpy.logical_not(predictions), labels)

    TP=sum(TP_)
    FP=sum(FP_)
    TN=sum(TN_)
    FN=sum(FN_)
    
    return TP,FP,TN,FN

def calc_error_rates(TP, FP, TN, FN):
    # Calculate accuracy, TPR, TNR and BER
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    TPR = TP/(TP+FN)
    TNR = TN/(TN+FP)
    BER = 1.0 - (TPR+TNR)/2
    
    return accuracy, TPR, TNR, BER


In [69]:
class MF(nn.Module):
    itr = 0
    
    def __init__(self, n_user, n_item, k=1, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        # gammas (users and items)
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        
        # alpha and betas (users and items)
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))
    
    def forward(self, train_x):
        user_id = train_x[:, 0]
        item_id = train_x[:, 1]
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)
        
        # Pull out biases
        bias_user = self.bias_user(user_id).squeeze()
        bias_item = self.bias_item(item_id).squeeze()
        biases = (self.bias + bias_user + bias_item)
        
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        
        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):
        loss_mse = F.mse_loss(prediction, target.squeeze())
        
        # Add new regularization to the biases
        prior_bias_user =  l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        
        prior_user =  l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        total = loss_mse + prior_user + prior_item + prior_bias_user + prior_bias_item
        for name, var in locals().items():
            if type(var) is torch.Tensor and var.nelement() == 1 and self.writer is not None:
                self.writer.add_scalar(name, var, self.itr)
        return total

## user_id and item_id --> user_idx, item_idx

In [13]:
data = pd.read_csv("../datasets/cse258/assignment1/train_Interactions.csv")
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
userID    200000 non-null object
bookID    200000 non-null object
rating    200000 non-null int64
dtypes: int64(1), object(2)
memory usage: 4.6+ MB


Unnamed: 0,userID,bookID,rating
0,u79354815,b14275065,4
1,u56917948,b82152306,5
2,u97915914,b44882292,5
3,u49688858,b79927466,5
4,u08384938,b05683889,2


### Reformat Interactions based on (user_idx, book_idx, rating)

In [21]:
book_ids = list(data['bookID'].unique())
user_ids = list(data['userID'].unique())

print(len(book_ids), book_ids[:3])
print(len(user_ids), user_ids[:3])

7170 ['b14275065', 'b82152306', 'b44882292']
11357 ['u79354815', 'u56917948', 'u97915914']


In [25]:
user_ids.index('u08384938')

4

In [39]:
fname = "../datasets/cse258/assignment1/train_interact_reformatted.csv"

reviews = open(fname, 'w')
reviews.write('userIDX' + ',' + 'bookIDX' + ',' + 'rating'+ '\n')

i = 0
for index, row in data.iterrows():
    # print(row['userID'], row['bookID'], row['rating'])
    reviews.write(str(user_ids.index(row['userID'])) + ',' \
                  + str(book_ids.index(row['bookID'])) + ',' \
                  + str(row['rating'])+ '\n')
reviews.close()


In [40]:
fname = "../datasets/cse258/assignment1/user_reformatted.csv"

file = open(fname, 'w')
file.write('userID' + ',' + 'userIDX' '\n')

i = 0
for id in user_ids:
    # print(row['userID'], row['bookID'], row['rating'])
    file.write(str(user_ids.index(id)) + ',' + id + '\n')
file.close()

In [42]:
fname = "../datasets/cse258/assignment1/book_reformatted.csv"

file = open(fname, 'w')
file.write('bookID' + ',' + 'bookIDX' '\n')

i = 0
for id in book_ids:
    # print(row['userID'], row['bookID'], row['rating'])
    file.write(str(book_ids.index(id)) + ',' + id + '\n')
file.close()

## Import reformatted interactions

In [53]:
data = pd.read_csv("../datasets/cse258/assignment1/train_interact_reformatted.csv")
data.info()
data.head()

n_user = len(data['userIDX'].unique())
n_item = len(data['bookIDX'].unique())

print(n_user,n_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 3 columns):
userIDX    200000 non-null int64
bookIDX    200000 non-null int64
rating     200000 non-null int64
dtypes: int64(3)
memory usage: 4.6 MB
11357 7170


In [54]:
# Shuffle the data
shuffled_data = data.sample(frac=1).reset_index(drop=True)
shuffled_data.head()

Unnamed: 0,userIDX,bookIDX,rating
0,1921,391,4
1,3764,3057,5
2,1938,2782,3
3,1812,563,3
4,3661,6903,4


### Split Training Dataset into Train and Validation Datasets

In [80]:
# Split 

split = 190000

train_x = shuffled_data.loc[:split-1, 'userIDX':'bookIDX']
train_y = shuffled_data.loc[:split-1, 'rating':'rating']
test_x = shuffled_data.loc[split:, 'userIDX':'bookIDX']
test_y = shuffled_data.loc[split:, 'rating':'rating']

display(train_x.describe())
display(train_y.describe())
display(test_x.describe())
display(test_y.describe())

Unnamed: 0,userIDX,bookIDX
count,190000.0,190000.0
mean,5002.671689,2696.349316
std,3169.400599,1944.910851
min,0.0,0.0
25%,2244.0,1014.0
50%,4727.0,2343.0
75%,7599.0,4156.0
max,11356.0,7169.0


Unnamed: 0,rating
count,190000.0
mean,3.896
std,1.2149
min,0.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


Unnamed: 0,userIDX,bookIDX
count,10000.0,10000.0
mean,4970.5054,2691.0347
std,3166.930712,1935.087554
min,1.0,0.0
25%,2233.5,1021.0
50%,4685.5,2353.5
75%,7559.25,4149.0
max,11350.0,7165.0


Unnamed: 0,rating
count,10000.0
mean,3.9111
std,1.201974
min,0.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [133]:
# Hyperparameters
lr = 1e-3
k = 1
# New parameter for regularizing bias
c_bias = 1e-5
c_vector = 1e-5
batchsize = 1024

cuda = torch.cuda.is_available()

In [134]:
from random import shuffle

# This code utilizes ignite engine's create_supervised_trainer()
# But we need something more basic

model = MF(n_user, n_item, k=k, c_vector=c_vector)
if cuda:
    model.cuda()
    
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def chunks(X, Y, size):
    """Yield successive n-sized chunks from l."""
    starts = list(range(0, len(X), size))
    shuffle(starts)
    for i in starts:
        yield (X[i:i + size], Y[i:i + size])
        
batch_size = 1024
losses = []
for epoch in range(100+1):
    
    i = 0
    for feature, target in chunks(np.array(train_x), np.array(train_y), batch_size):
        # This zeros the gradients on every parameter. 
        # This is easy to miss and hard to troubleshoot.
        optimizer.zero_grad()
        # Convert 
        feature = Variable(torch.from_numpy(feature))
        target = Variable(torch.from_numpy(target).type(torch.FloatTensor))
        
        if cuda:
            feature = feature.cuda()
            target = target.cuda()
            
        # Compute a prediction for these features
        prediction = model.forward(feature)
        # Compute a loss given what the true target outcome was
        loss = model.loss(prediction, target)
        # break
        # Backpropagate: compute the direction / gradient every model parameter
        # defined in your __init__ should move in in order to minimize this loss
        # However, we're not actually changing these parameters, we're just storing
        # how they should change.

        loss.backward()
        # Now take a step & update the model parameters. The optimizer uses the gradient at 
        # defined on every parameter in our model and nudges it in that direction.
        optimizer.step()
        
        if i%100 == 0 and epoch%10 == 0:
            print("Epoch[{}] Iteration[{}] Training Loss: {:.2f}".format(epoch, i, loss.data))

        # Record the loss per example
        losses.append(loss.cpu().data.numpy() / len(feature))
        
        if i%100 == 0 and epoch%10 == 0:
            feature = torch.from_numpy(np.array(test_x))
            target = torch.from_numpy(np.array(test_y)).type(torch.FloatTensor)
            
            if cuda:
                feature = feature.cuda()
                target = target.cuda()

            prediction = model.forward(feature)
            loss = model.loss(prediction, target)
            print("Epoch[{}] Validation Loss: {:.2f} ".format(epoch, loss.data))

        i += 1
        


Epoch[0] Iteration[0] Training Loss: 18685.96
Epoch[0] Validation Loss: 18656.13 
Epoch[0] Iteration[100] Training Loss: 15946.17
Epoch[0] Validation Loss: 15921.08 
Epoch[10] Iteration[0] Training Loss: 1026.40
Epoch[10] Validation Loss: 1025.01 
Epoch[10] Iteration[100] Training Loss: 871.64
Epoch[10] Validation Loss: 870.23 
Epoch[20] Iteration[0] Training Loss: 37.45
Epoch[20] Validation Loss: 37.42 
Epoch[20] Iteration[100] Training Loss: 31.10
Epoch[20] Validation Loss: 30.98 
Epoch[30] Iteration[0] Training Loss: 2.43
Epoch[30] Validation Loss: 2.44 
Epoch[30] Iteration[100] Training Loss: 2.27
Epoch[30] Validation Loss: 2.28 
Epoch[40] Iteration[0] Training Loss: 1.52
Epoch[40] Validation Loss: 1.56 
Epoch[40] Iteration[100] Training Loss: 1.52
Epoch[40] Validation Loss: 1.55 
Epoch[50] Iteration[0] Training Loss: 1.40
Epoch[50] Validation Loss: 1.53 
Epoch[50] Iteration[100] Training Loss: 1.43
Epoch[50] Validation Loss: 1.53 
Epoch[60] Iteration[0] Training Loss: 1.53
Epoch[6

In [128]:
feature.type()

'torch.cuda.LongTensor'

In [130]:
model.cpu().bias_user.weight.data.numpy()

array([[ 5.4286665e-04],
       [ 9.0880181e-05],
       [ 1.2562422e-04],
       ...,
       [-3.9689295e-04],
       [-5.0739350e-06],
       [ 8.3241666e-06]], dtype=float32)

## Validation Prediction

In [116]:
feature = torch.from_numpy(np.array(test_x))
target = torch.from_numpy(np.array(test_y)).type(torch.FloatTensor)

prediction = model.forward(feature)
loss = model.loss(prediction, target)

In [117]:
loss.data

tensor(1.7856)

## Assignment 1 Kaggle Submission 
### Lambda=1.2e-5,  MSE=1.143, User_Name='Luke Liem'