In [22]:
class ProductEncoder:
    def __init__(self, id_list):
        self.product_idx = {}
        self.product_pid = {}
        for idx, pid in enumerate(id_list):
            self.product_idx[pid] = idx
            self.product_pid[idx] = pid

    def toIdx(self, x):
        if type(x) == int:
            pid = x
            return self.product_idx[pid]
        return [self.product_idx[pid] for pid in x]

    def toPid(self, x):
        if type(x) == int:
            idx = x
            return self.product_pid[idx]
        return [self.product_pid[idx] for idx in x]

    @property
    def num_products(self):
        return len(self.product_idx)

In [9]:
import torch
import pandas as pd
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

In [10]:
hist_data = pd.read_csv('data/hist_data.csv')
test = pd.read_csv('data/test.csv')
full_df = pd.concat([hist_data.iloc[:, :-1], test])
full_df = full_df.assign(sum_price = full_df['count'] * full_df.price_sold)

In [15]:
items_list = list(full_df.item_id.unique())
buyer_list = list(full_df.buyer_id.unique())
product_encoder = ProductEncoder(items_list)
user_encoder = ProductEncoder(buyer_list)
n_users = len(buyer_list)
n_items = len(items_list)

In [12]:
df = full_df.groupby(['buyer_id', 'item_id']).agg({'count': np.size})
data = [(user_encoder.toIdx(ui[0]), product_encoder.toIdx(ui[1]), c) for ui, c in zip(df.index, df['count'].to_list())]
new_df = pd.DataFrame(data, columns=['buyer_id', 'item_id', 'count']).sort_values('buyer_id')

In [13]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    # def forward(self, user, item):
    # 	# matrix multiplication
    #     return (self.user_factors(user)*self.item_factors(item)).sum(1)
    
    def predict(self, user, item):
        return self.forward(user, item)

In [25]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness
# ['buyer_id', 'item_id', 'count']

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    def __init__(self):
        self.ratings = new_df.copy()
        
        # Extract all user IDs and movie IDs
        users = buyer_list.copy()
        items = items_list.copy()
        
        #--- Producing new continuous IDs for users and movies ---

        # Unique values : index
        self.userid2idx = user_encoder.product_idx.copy()
        self.itemid2idx = product_encoder.product_idx.copy()
        
        # Obtained continuous ID for users and movies
        self.idx2userid = user_encoder.product_pid.copy()
        self.idx2itemid = product_encoder.product_pid.copy()
        
        # return the id from the indexed values as noted in the lambda function down below.
        # self.ratings.item_id = new_df.item_id.apply(lambda x: self.itemid2idx[x])
        # self.ratings.buyer_id = new_df.buyer_id.apply(lambda x: self.userid2idx[x])
        
        
        self.x = self.ratings.drop(['count'], axis=1).values
        self.y = self.ratings['count'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [None]:
num_epochs = 128
cuda = torch.cuda.is_available()

model = MatrixFactorization(n_users, n_items, n_factors=8)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

In [31]:
for it in range(8):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()


In [32]:
# By training the model, we will have tuned latent factors for movies and users.
c = 0
uw = 0
iw = 0 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[0.0400, 0.0409, 0.0167,  ..., 0.0388, 0.0229, 0.0383],
        [0.0365, 0.0457, 0.0500,  ..., 0.0330, 0.0010, 0.0237],
        [0.0164, 0.0459, 0.0431,  ..., 0.0388, 0.0076, 0.0367],
        ...,
        [0.0249, 0.0337, 0.0041,  ..., 0.0378, 0.0453, 0.0441],
        [0.0325, 0.0129, 0.0011,  ..., 0.0379, 0.0079, 0.0238],
        [0.0282, 0.0309, 0.0130,  ..., 0.0163, 0.0440, 0.0022]])
item_factors.weight tensor([[1.3456e-02, 4.7295e-02, 4.8008e-02,  ..., 1.8273e-02, 1.7509e-02,
         1.9464e-02],
        [1.2806e-03, 5.0721e-03, 4.6130e-02,  ..., 3.2279e-02, 3.1882e-02,
         4.9365e-02],
        [2.8263e-02, 2.7758e-02, 9.4749e-03,  ..., 8.6548e-03, 2.5004e-02,
         1.2371e-02],
        ...,
        [6.7294e-05, 1.7693e-02, 9.9693e-03,  ..., 2.0468e-02, 4.4567e-02,
         1.3423e-02],
        [1.9633e-02, 3.5446e-02, 2.4741e-02,  ..., 3.6165e-04, 4.5476e-02,
         8.6686e-03],
        [9.7256e-03, 9.0458e-03, 4.8052e-02,  ..., 1.5226e-02, 6

In [33]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

len(trained_movie_embeddings) # unique movie factor weights

54596

In [34]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [None]:

'''It can be seen here that the movies that are in the same cluster tend to have
similar genres. Also note that the algorithm is unfamiliar with the movie name
and only obtained the relationships by looking at the numbers representing how
users have responded to the movie selections.'''
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  item_ = []
  for itidx in np.where(kmeans.labels_ == cluster)[0]:
    item_id = product_encoder.toPid(int(itidx))
    rat_count = new_df.loc[new_df['item_id'] == item_id].count()[0]
    item_.append((item_id, rat_count))
  for mov in sorted(item_, key=lambda tup: tup[1], reverse=True)[:20]:
    print("\t", mov[0])

