In [1]:
import pandas as pd
import numpy as np
import time
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from scipy.optimize import minimize, fmin_cg
import pickle as pkl

# enable offline (in-notebook) plotting in plotly
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook file -- confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [2]:
# Read the three CSV inputs in one pass: user features, problem features and
# the training submissions (the table that carries `attempts_range`).
users, problems, submissions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/user_features.csv',
        'data/problem_features.csv',
        'data/train_submissions.csv',
    )
)

In [3]:
# Stage 1: hold out 25% of all submissions as the test set, stratified on
# attempts_range so each class keeps its proportion.
train_cv, S_test = train_test_split(
    submissions,
    test_size=0.25,
    stratify=submissions['attempts_range'],
    random_state=42,
)

# Stage 2: split the remaining rows 75/25 into training and cross-validation
# sets, again stratified on attempts_range.
S_train, S_cv = train_test_split(
    train_cv,
    test_size=0.25,
    stratify=train_cv['attempts_range'],
    random_state=42,
)

In [4]:
# Pivot each long-format split into a wide user x problem matrix; pairs that
# are absent from the split become NaN.
# NOTE: this cell is not idempotent -- after it runs, user_id / problem_id are
# no longer columns, so running it a second time raises.

# training split
S_train = S_train.set_index(['user_id', 'problem_id']).unstack(level=-1)
# unstack leaves a two-level column index; drop the outer level so the columns
# are just the problem ids
S_train.columns = S_train.columns.droplevel(0)

# cross-validation split (same two steps)
S_cv = S_cv.set_index(['user_id', 'problem_id']).unstack(level=-1)
S_cv.columns = S_cv.columns.droplevel(0)

In [6]:
# All-NaN grid with one row per distinct user id and one column per distinct
# problem id; used as the common shape for the split matrices.
empty_sub = pd.DataFrame(
    data=np.nan,
    index=users['user_id'].unique(),
    columns=problems['problem_id'].unique(),
)

# The same grid as a plain NumPy array.
empty_sub_ = np.array(empty_sub)

In [7]:
# Expand each split to the full users x problems grid: every NaN cell of the
# empty grid is filled from the cell with the same row/column label in the
# split, and cells the split does not cover stay NaN.
S_train = empty_sub.fillna(value=S_train)
S_cv = empty_sub.fillna(value=S_cv)

In [8]:
# preview the first rows of the training user x problem matrix
S_train.head()

Unnamed: 0,prob_1,prob_10,prob_100,prob_1000,prob_1001,prob_1002,prob_1003,prob_1004,prob_1005,prob_1006,...,prob_990,prob_991,prob_992,prob_993,prob_994,prob_995,prob_996,prob_997,prob_998,prob_999
user_1,,,,,,,,,,,...,,,,,,,,,,
user_10,,,,,,,,,,,...,,,,,,,,,,
user_100,,,,,,,,,,,...,,,,,,,,,,
user_1000,,,,,,,,,,,...,,,,,,,,,,
user_1001,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# preview the first rows of the cross-validation user x problem matrix
S_cv.head()

Unnamed: 0,prob_1,prob_10,prob_100,prob_1000,prob_1001,prob_1002,prob_1003,prob_1004,prob_1005,prob_1006,...,prob_990,prob_991,prob_992,prob_993,prob_994,prob_995,prob_996,prob_997,prob_998,prob_999
user_1,,,,,,,,,,,...,,,,,,,,,,
user_10,,,,,,,,,,,...,,,,,,,,,,
user_100,,,,,,,,,,,...,,,,,,,,,,
user_1000,,,,1.0,,,,,,,...,,,,,,,,,,
user_1001,,,,,,,,,,,...,,,,,,,,,,


In [11]:
import torch
import torch.autograd as ag
import torch.optim as optim

# Seed every RNG the notebook draws from. Seeding NumPy alone is not enough:
# the random initial factors and the embedding weights come from torch, which
# keeps its own generator.
np.random.seed(42)
torch.manual_seed(42)

In [None]:
def flatten_user_item_matrices(M_users, M_items):
    """Flatten the user and item matrices into one 1-D vector.

    Parameters
    ----------
    M_users, M_items : array-like
        Anything ``np.array`` accepts (nested lists, ndarrays, DataFrames).

    Returns
    -------
    numpy.ndarray
        1-D array holding all entries of ``M_users`` in row-major (C) order
        followed by all entries of ``M_items`` in row-major order.
    """
    # convert matrices to np arrays, then flatten each 2D array row by row
    x_users = np.array(M_users).flatten(order='C')
    x_items = np.array(M_items).flatten(order='C')

    # concatenate user and item 1D arrays (users first)
    return np.concatenate((x_users, x_items), axis=0)

In [13]:
%%time
# data parameters
n_users, n_items = S_train.shape

# hyperparameters
n_features = 20
Lambda=0.01
epochs=30
alpha=0.001
    
# intial random guess of user and item
# latent features
f_users = (torch.rand(n_users*n_features) - 0.5).type(torch.FloatTensor)
f_items = (torch.rand(n_items*n_features) - 0.5).type(torch.FloatTensor)

f_users

Wall time: 4 ms


tensor([ 0.3145,  0.3794,  0.2137,  ...,  0.4640,  0.3576, -0.3977])

In [14]:
import torch
from torch.autograd import Variable

class MatrixFactorization(torch.nn.Module):
    """Latent-factor model that scores a (user, item) pair by a dot product.

    Every user and every item owns a learnable ``n_factors``-dimensional
    embedding vector; the score of a pair is the dot product of the two.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        """Create one embedding table for users and one for items."""
        super().__init__()
        make_embedding = torch.nn.Embedding
        # sparse=True is passed through to both tables, users first
        self.user_factors = make_embedding(n_users, n_factors, sparse=True)
        self.item_factors = make_embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        """Return one score per (user, item) index pair.

        The looked-up user and item vectors are multiplied element-wise and
        summed over dimension 1, i.e. a row-wise dot product.
        """
        user_vecs = self.user_factors(user)
        item_vecs = self.item_factors(item)
        return torch.sum(user_vecs * item_vecs, dim=1)

    def predict(self, user, item):
        """Alias for :meth:`forward`."""
        return self.forward(user, item)

In [None]:
# Build the model, the squared-error loss and a plain SGD optimiser.
model = MatrixFactorization(n_users, n_items, n_factors=20)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),
                            lr=1e-6)

# Training targets: the training matrix as a float32 array. Only cells that
# actually hold a value (non-NaN) are trained on; their (row, column)
# positions double as the embedding indices of the user and the item.
ratings = np.asarray(S_train, dtype=np.float32)
user_idx, item_idx = np.nonzero(~np.isnan(ratings))

# One SGD step per observed (user, item) cell.
for user, item in zip(user_idx, item_idx):
    # get user, item and rating data as 1-element tensors
    rating = torch.tensor([ratings[user, item]], dtype=torch.float32)
    user_t = torch.tensor([int(user)], dtype=torch.long)
    item_t = torch.tensor([int(item)], dtype=torch.long)

    # clear the gradients left over from the previous step; without this
    # backward() keeps adding to them and every update is contaminated
    optimizer.zero_grad()

    # predict
    prediction = model(user_t, item_t)
    loss = loss_fn(prediction, rating)

    # backpropagate
    loss.backward()

    # update weights
    optimizer.step()