In [1]:
import sys
from time import time

import torch
from torch import nn
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader

import numpy as np

In [2]:
from torch.nn.functional import ctc_loss

In [4]:
import inspect

In [27]:
# Directory prefixes used throughout the notebook (relative to the working
# directory; each ends with '/' because file names are appended by plain
# string concatenation).
# Input text files (train.txt / test.txt) are read from here.
DATA_PATH = 'data/'
# Whole-model checkpoints are saved to and loaded from here.
MODEL_PATH = 'models/'
# write_answer() creates its CSV files here.
SUBMISSION_PATH = 'submissions/'
# FactorizationMachine.fit() writes log.txt here.
LOG_DIR = 'logs/'

In [28]:
class Primitive(Module):
    """Baseline scorer: a perceptron with one 100-unit hidden layer.

    Parameters
    ----------
    dim : int
        Number of input features.
    activation : callable
        Zero-argument factory (e.g. ``nn.ReLU``) producing the non-linearity
        applied between the two linear layers.
    """

    def __init__(self, dim, activation):
        super().__init__()
        self.l1 = nn.Linear(in_features=dim, out_features=100)
        self.l2 = nn.Linear(in_features=100, out_features=1)
        self.activation = activation()

    def forward(self, x):
        """Return one score per row of ``x`` (last dimension becomes 1)."""
        hidden = self.activation(self.l1(x))
        return self.l2(hidden)

In [29]:
class FactorizationModule(Module):
    """Second-order factorization machine.

    The output is ``W(x) + 0.5 * sum_f[(x V)_f^2 - (x^2 V^2)_f]``: a linear
    term plus all pairwise feature interactions expressed through ``k``
    latent factors per feature.

    Parameters
    ----------
    dim : int
        Number of input features.
    k : int
        Number of latent factors per feature.
    activation : callable
        Zero-argument factory; the instance is stored on ``self.activation``
        but is not applied in ``forward``.
    """

    def __init__(self, dim, k, activation):
        super().__init__()
        self.W = nn.Linear(dim, 1)
        self.V = nn.Parameter(torch.randn(dim, k))
        self.activation = activation()

    def forward(self, x):
        """Score a 2-D batch ``x`` of shape (batch, dim); returns (batch, 1)."""
        linear_term = self.W(x)
        # (sum_i v_if x_i)^2 summed over the factors f
        square_of_sum = ((x @ self.V) ** 2).sum(dim=1, keepdim=True)
        # sum_i (v_if x_i)^2 summed over the factors f
        sum_of_squares = ((x ** 2) @ (self.V ** 2)).sum(dim=1, keepdim=True)
        pairwise_term = 0.5 * (square_of_sum - sum_of_squares)
        return linear_term + pairwise_term

In [30]:
class factorization_set(Dataset):
    """Rating dataset with one-hot ids, mean-rating features and an unlabeled pool.

    Every row of ``self.X`` has ``maxuid + maxmid + 2`` columns:
    ``[one_hot(user) | one_hot(movie) | movie mean rating | user mean rating]``.
    The first ``len(X)`` rows encode the given pairs; the last
    ``unlabeled_num`` rows encode randomly drawn (user, movie) pairs that are
    not yet marked in ``self.sample_matrix``.  When ``y`` is given, the
    unlabeled rows get target 0 and weight 1, while labelled rows get weight
    ``sqrt(20)``.

    Parameters
    ----------
    X : numpy.ndarray
        Integer array whose columns 0 and 1 hold 1-based user and movie ids.
    y : numpy.ndarray, optional
        One target per row of ``X``.  When omitted the dataset yields features
        only and the statistics below are taken from the keyword arguments.
    maxuid, maxmid : int
        Number of distinct user / movie ids (width of the one-hot blocks).
    unlabeled_num : int
        Size of the unlabeled pool appended after the labelled rows.
    movie_means, user_means : numpy.ndarray, optional
        Per-movie / per-user mean ratings.  They are all taken from the
        arguments when ``user_means`` is given, otherwise they start at zero.
        When ``y`` is given they are filled in place from the labels.
    user_dif_rates, movie_dif_rates : numpy.ndarray, optional
        ``(maxuid, maxmid)`` and ``(maxmid, maxuid)`` matrices holding, for
        every user (movie), the summed rating given to (by) each movie (user).

    Attributes
    ----------
    sample_matrix : numpy.ndarray of bool, shape (maxuid, maxmid)
        True for pairs that are labelled or were selected by ``set_samples``.
    idxs : torch.Tensor
        0-based (user, movie) pairs of the most recent ``get_samples`` draw.
    samples : torch.Tensor
        Feature rows of the most recent ``get_samples`` draw.
    shown_idxs : torch.Tensor
        0-based (user, movie) pairs currently stored in the unlabeled pool.
    """

    def __init__(self, X, y = None, maxuid = 943, maxmid = 1682, unlabeled_num = 10000,
                 movie_means = None, user_means = None,
                 user_dif_rates = None, movie_dif_rates = None):
        self.unlabeled_num = unlabeled_num
        self.maxuid = maxuid
        self.maxmid = maxmid
        labeled_num = len(X)

        # 0-based ids; int64 is what one_hot and fancy indexing expect.
        user_idx = np.asarray(X[:, 0] - 1, dtype=np.int64)
        movie_idx = np.asarray(X[:, 1] - 1, dtype=np.int64)

        if y is None:
            self.y = None
        else:
            self.y = torch.from_numpy(y).view((-1, 1)).float()

        users = torch.nn.functional.one_hot(torch.from_numpy(user_idx), maxuid).float()
        movies = torch.nn.functional.one_hot(torch.from_numpy(movie_idx), maxmid).float()

        # The builtin bool replaces the np.bool alias, which NumPy has removed.
        self.sample_matrix = np.zeros((maxuid, maxmid), dtype=bool)
        self.sample_matrix[user_idx, movie_idx] = True

        if user_means is None:
            self.user_means = np.zeros(maxuid)
            self.movie_means = np.zeros(maxmid)
            self.user_dif_rates = np.zeros((maxuid, maxmid))
            self.movie_dif_rates = np.zeros((maxmid, maxuid))
        else:
            self.user_means = user_means
            self.movie_means = movie_means
            self.user_dif_rates = user_dif_rates
            self.movie_dif_rates = movie_dif_rates

        # Labelled rows weigh alpha, pool rows weigh 1.  The square root is
        # stored because the trainer multiplies both prediction and target by
        # the weight before a squared-error loss.  Slicing by the labelled
        # count (not by -unlabeled_num) keeps this right for an empty pool.
        alpha = 20.
        weights = torch.ones(labeled_num + unlabeled_num, 1)
        weights[:labeled_num] = alpha
        self.weights = torch.sqrt(weights)

        if y is not None:
            self._update_rating_stats(user_idx, movie_idx, y)

        movie_mean_all = torch.from_numpy(
            np.asarray(self.movie_means)[movie_idx]).view(-1, 1).float()
        user_mean_all = torch.from_numpy(
            np.asarray(self.user_means)[user_idx]).view(-1, 1).float()

        self.X = torch.cat((users, movies, movie_mean_all, user_mean_all), 1).float()

        unlabeled_samples = self.get_samples(unlabeled_num)
        self.X = torch.cat((self.X, unlabeled_samples), 0)
        self.shown_idxs = self.idxs.clone()

        if self.y is not None:
            unlabeled_targets = torch.zeros(unlabeled_num, 1)
            self.y = torch.cat((self.y, unlabeled_targets), 0)

        self.n_dims = self.X.size()[1]

    def _update_rating_stats(self, user_idx, movie_idx, y):
        """Fill the mean-rating vectors and rating matrices from the labels.

        Only users / movies that occur in the data are overwritten, so
        statistics passed to the constructor for other ids are preserved.
        """
        ratings = np.asarray(y, dtype=np.float64)

        # rates[u, m] = sum of the ratings user u gave movie m.
        rates = np.zeros((self.maxuid, self.maxmid))
        np.add.at(rates, (user_idx, movie_idx), ratings)

        seen_users = np.unique(user_idx)
        user_sum = np.bincount(user_idx, weights=ratings, minlength=self.maxuid)
        user_cnt = np.bincount(user_idx, minlength=self.maxuid)
        self.user_means[seen_users] = user_sum[seen_users] / user_cnt[seen_users]
        self.user_dif_rates[seen_users] = rates[seen_users]

        seen_movies = np.unique(movie_idx)
        movie_sum = np.bincount(movie_idx, weights=ratings, minlength=self.maxmid)
        movie_cnt = np.bincount(movie_idx, minlength=self.maxmid)
        self.movie_means[seen_movies] = movie_sum[seen_movies] / movie_cnt[seen_movies]
        self.movie_dif_rates[seen_movies] = rates.T[seen_movies]

    def __len__(self):
        return len(self.X)

    def get_samples(self, num = 100000):
        """Draw ``num`` random (user, movie) pairs not marked in ``sample_matrix``.

        The drawn pairs are stored in ``self.idxs`` and their feature rows in
        ``self.samples``; the feature rows are also returned.  The draw uses
        NumPy's global random state.

        Raises
        ------
        ValueError
            If fewer than ``num`` unmarked pairs remain.
        """
        unseen = np.column_stack(np.nonzero(~self.sample_matrix)).astype(np.int64)
        if len(unseen) < num:
            raise ValueError(
                f'requested {num} unlabeled pairs but only {len(unseen)} remain')
        np.random.shuffle(unseen)
        # Copy so the tensor below does not keep the full shuffled array alive.
        chosen = unseen[:num].copy()

        self.idxs = torch.from_numpy(chosen)
        users = torch.nn.functional.one_hot(self.idxs[:, 0], self.maxuid).float()
        movies = torch.nn.functional.one_hot(self.idxs[:, 1], self.maxmid).float()

        # Only the mean ratings are used as extra features; the full rating
        # vectors stay available in user_dif_rates / movie_dif_rates.
        movie_mean = torch.from_numpy(
            np.asarray(self.movie_means)[chosen[:, 1]]).float().view(-1, 1)
        user_mean = torch.from_numpy(
            np.asarray(self.user_means)[chosen[:, 0]]).float().view(-1, 1)

        generated_samples = torch.cat((users, movies, movie_mean, user_mean), 1)
        self.samples = generated_samples
        return generated_samples

    def set_samples(self, idxs, pos1, pool = False):
        """Write selected candidates into the unlabeled pool.

        Parameters
        ----------
        idxs : torch.Tensor
            Integer positions of the selected candidates (any shape, any
            device).  With ``pool=True`` they index the current pool,
            otherwise the most recent ``get_samples`` draw.
        pos1 : int
            Negative offset from the end of the dataset at which the selected
            rows are written.  It is applied to both ``self.X`` and
            ``self.shown_idxs``, which only line up when counted from the end.
        pool : bool
            Whether the candidates come from the current pool.

        The written pairs are marked in ``sample_matrix`` so later draws skip
        them.
        """
        # Indices may come from a GPU argsort; everything stored here is on CPU.
        idxs = idxs.reshape(-1).cpu()
        stop = pos1 + len(idxs)
        # A stop of 0 would mean "up to index 0", i.e. nothing; use an open end.
        target = slice(pos1, stop if stop != 0 else None)

        if pool:
            # Take rows and pairs from the same place (the current pool) so
            # features and indices stay consistent.
            pool_start = len(self.X) - self.unlabeled_num
            source_rows = self.X[pool_start:][idxs]
            source_pairs = self.shown_idxs[idxs]
        else:
            source_rows = self.samples[idxs]
            source_pairs = self.idxs[idxs]

        self.X[target] = source_rows
        self.shown_idxs[target] = source_pairs
        self.sample_matrix[source_pairs[:, 0].numpy(), source_pairs[:, 1].numpy()] = True

    def __getitem__(self, idx):
        """Return ``(features, target, weight)``, or only features without targets."""
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx], self.weights[idx]

In [31]:
class FactorizationMachine:
    """Training / prediction wrapper around ``FactorizationModule`` or ``Primitive``.

    Parameters
    ----------
    dim : int
        Number of input features of the model.
    activation : callable
        Zero-argument factory for the model's activation (e.g. ``nn.ReLU``).
    loss_fn : callable
        Zero-argument factory for the loss (e.g. ``nn.MSELoss``).
    k : int
        Number of latent factors (used only for ``FactorizationModule``).
    primitive : bool
        Build ``Primitive`` instead of ``FactorizationModule``.
    load : str or falsy
        File name under ``MODEL_PATH`` of a whole-model checkpoint to load.
        Any falsy value (``False``, ``None``, ``''``) builds a fresh model.
    """

    def __init__(self, dim, activation, loss_fn, k = 100, primitive = False, load = False):
        # Prefer the GPU but stay usable on CPU-only machines.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if load:
            # NOTE(security): torch.load unpickles arbitrary objects; only
            # load checkpoints that this notebook produced.
            self.model = torch.load(MODEL_PATH + load, map_location=self.device)
        elif primitive:
            self.model = Primitive(dim, activation)
        else:
            self.model = FactorizationModule(dim, k, activation)
        self.model.to(self.device)
        self.loss_fn = loss_fn()

    def fit(self, X, y, n_epoch = 10, batchsize = 50000, ler = 1e-1, log = True,
            update_freq = 10, save_freq = 100, eps=1e-03, start_update = 1000):
        """Train the model on ``(X, y)``.

        Parameters
        ----------
        X, y : numpy.ndarray
            Passed to ``factorization_set``; an unlabeled pool is appended.
        n_epoch : int
            Number of passes over the data.
        batchsize : int
            Batch size; incomplete trailing batches are dropped.
        ler : float
            Initial Adam learning rate.
        log : bool
            When True, progress is printed to ``LOG_DIR + 'log.txt'`` instead
            of the current stdout.
        update_freq, start_update : int
            After epoch ``start_update`` the unlabeled pool is refreshed
            every ``update_freq`` epochs.
        save_freq : int
            The whole model is saved under ``MODEL_PATH`` every ``save_freq``
            epochs, named after the epoch number.
        eps : float
            Unused; kept for interface compatibility.

        Raises
        ------
        ValueError
            If the dataset is smaller than one batch.
        """
        self.model.train()
        self.batchsize = batchsize
        self.print_each = 1
        self.dataset = factorization_set(X, y)
        self.loader = DataLoader(
            self.dataset, batch_size=batchsize, shuffle = True, drop_last=True)
        if len(self.loader) == 0:
            raise ValueError(
                f'batchsize {batchsize} exceeds the dataset size {len(self.dataset)}')

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=ler, weight_decay=1e-3)
        # The scheduler is stepped once per batch, so its period of 1000 is
        # counted in batches.
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
                            self.optimizer, 1000, eta_min=1e-5)

        # Reusable device buffers; every batch has exactly `batchsize` rows.
        self.X_batch = torch.empty(batchsize, self.dataset.n_dims,
                                   dtype=torch.float32, device=self.device)
        self.y_batch = torch.empty(batchsize, 1, dtype=torch.float32, device=self.device)
        self.w_batch = torch.empty(batchsize, 1, dtype=torch.float32, device=self.device)

        losses_per_epoch = [0. for _ in range(self.print_each)]

        self.original_stdout = sys.stdout
        log_file = open(LOG_DIR + 'log.txt', 'w') if log else None
        try:
            if log:
                sys.stdout = log_file

            print('fit begins...')
            sys.stdout.flush()

            for epoch in range(n_epoch):
                epoch_no = epoch + 1
                losses_per_epoch[epoch % self.print_each] = self._train_epoch()

                if epoch_no % self.print_each == 0:
                    print(epoch_no, sum(losses_per_epoch)/len(losses_per_epoch))
                    sys.stdout.flush()

                if epoch_no % save_freq == 0:
                    torch.save(self.model, MODEL_PATH + f'{epoch_no}')

                if epoch_no > start_update and epoch_no % update_freq == 0:
                    self._refresh_unlabeled()
                    print('updated unlabeled data')
        finally:
            # Always give stdout back, even if training raised.
            sys.stdout = self.original_stdout
            if log_file is not None:
                log_file.close()

    def _train_epoch(self):
        """Run one pass over ``self.loader`` and return the mean batch loss."""
        batch_losses = []
        for batch in self.loader:
            X_batch = self.X_batch.copy_(batch[0])
            y_batch = self.y_batch.copy_(batch[1])
            w_batch = self.w_batch.copy_(batch[2])

            # Prediction and target are both scaled by the per-row weight.
            h = w_batch * self.model(X_batch)
            loss = self.loss_fn(h, y_batch * w_batch)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()

            batch_losses.append(loss.item())
        return sum(batch_losses)/len(batch_losses)

    def _rank_descending(self, features):
        """Return row positions of ``features`` ordered by decreasing model score."""
        scores = self.model(features.to(self.device))
        # The model returns shape (N, 1); flatten so the sort runs over rows
        # rather than over the size-1 last dimension.
        return torch.argsort(scores.reshape(-1), descending=True).cpu()

    def _refresh_unlabeled(self):
        """Rebuild the unlabeled pool from the highest-scoring candidates.

        One tenth of the pool is kept from the current pool; the other nine
        tenths are each filled from a fresh random draw.
        """
        pool_size = self.dataset.unlabeled_num
        keep = pool_size // 10
        if keep == 0:
            return

        self.model.eval()
        with torch.no_grad():
            pool_start = len(self.dataset) - pool_size
            order = self._rank_descending(self.dataset.X[pool_start:])
            self.dataset.set_samples(order[:keep], -pool_size, pool = True)
            for i in range(9, 0, -1):
                candidates = self.dataset.get_samples(pool_size)
                order = self._rank_descending(candidates)
                self.dataset.set_samples(order[:keep], -i*keep)
        self.model.train()

    def predict(self, X):
        """Return one prediction per row of ``X`` as a flat NumPy array.

        Features are built with the statistics collected by ``fit``, which
        therefore has to be called first.
        """
        #todo prediction modification
        self.model.eval()
        test_set = factorization_set(X, movie_means = self.dataset.movie_means,
                                     user_means = self.dataset.user_means,
                                     user_dif_rates = self.dataset.user_dif_rates,
                                     movie_dif_rates = self.dataset.movie_dif_rates)
        # The dataset appends random unlabeled rows after the requested ones;
        # only the first len(X) rows belong to the caller.
        features = test_set.X[:len(X)]

        answers = []
        with torch.no_grad():
            for start in range(0, len(features), self.batchsize):
                chunk = features[start:start + self.batchsize].to(self.device)
                answers.append(self.model(chunk).cpu().numpy().flatten())

        if not answers:
            return np.empty(0, dtype=np.float32)
        return np.concatenate(answers)

In [32]:
def read_data(fname):
    """Read a whitespace-separated table of integers from a text file.

    Blank lines are skipped, so the result does not depend on whether the
    file ends with a newline.

    Parameters
    ----------
    fname : str
        Path of the text file; every non-blank line holds the same number of
        integer fields.

    Returns
    -------
    numpy.ndarray
        ``int64`` array with one row per non-blank line.

    Raises
    ------
    ValueError
        If a field is not an integer.
    """
    with open(fname, 'r') as f:
        rows = [[int(number) for number in line.split()]
                for line in f if line.strip()]
    return np.array(rows, dtype = np.int64)

In [33]:
# Load the training table. Later cells treat every column except the last as
# 1-based (user id, movie id) and the last column as the target.
train_data = read_data(DATA_PATH + 'train.txt')

In [34]:
# Print every id in 1..max that never occurs in column 1 (movie ids).
some = np.unique(train_data[:,1])
for candidate in range(1, some.max() + 1):
    if candidate not in some:
        print(candidate)

In [35]:
# Print every id in 1..max that never occurs in column 0 (user ids).
some = np.unique(train_data[:,0])
for candidate in range(1, some.max() + 1):
    if candidate not in some:
        print(candidate)

In [36]:
# Column-wise maximum over every column except the last: the largest user id
# and the largest movie id, i.e. the widths of the two one-hot blocks.
dimensions = train_data[:,:-1].max(axis = 0)
dimensions

array([ 943, 1682], dtype=int64)

In [37]:
# Combined width of the user and movie one-hot encodings.
dimension = dimensions.sum()
dimension

2625

In [38]:
# Two extra input columns: factorization_set appends the movie mean rating and
# the user mean rating after the one-hot blocks. Computed from `dimensions`
# rather than from `dimension` itself so re-running this cell cannot add 2 twice.
dimension = dimensions.sum() + 2

In [39]:
# Show the model input width that is passed to FactorizationMachine below.
dimension

2627

In [41]:
# Resume from the checkpoint file MODEL_PATH + '5000' (it must already exist)
# and train for 500 more epochs, printing to the notebook (log = False).
# NOTE(review): the pool refresh in fit only runs once epoch + 1 > start_update;
# with start_update = 2000 and n_epoch = 500 it never triggers, so update_freq
# has no effect in this run - confirm this is intended.
machine = FactorizationMachine(dimension, torch.nn.ReLU, torch.nn.MSELoss, k = 100, primitive = False, load = '5000')
machine.fit(train_data[:,:-1],train_data[:,-1], n_epoch = 500, batchsize = 20000, 
            ler = 1e-2, log = False, update_freq = 100, save_freq = 100, start_update = 2000)

fit begins...
1 844.3066003084183
2 371.677507019043
3 210.09121055603026
4 122.96328353881836
5 74.367631149292
6 47.81370086669922
7 31.2848762512207
8 20.167290115356444
9 13.334589004516602
10 9.048922491073608
11 6.1200361251831055
12 4.38610692024231
13 3.4207558631896973
14 2.902648401260376
15 2.580278158187866
16 2.329804301261902
17 2.096774935722351
18 1.9431039094924927
19 1.886708927154541
20 1.840683937072754
21 1.789383554458618
22 1.7517578840255736
23 1.7299987077713013
24 1.7115173101425172
25 1.6906277656555175
26 1.6777133464813232
27 1.6637107133865356
28 1.6515342473983765
29 1.6387967348098755
30 1.6294005155563354
31 1.6210793256759644
32 1.612468671798706
33 1.6049748420715333
34 1.5963948965072632
35 1.5894833564758302
36 1.5828024387359618
37 1.5756308078765868
38 1.5706130981445312
39 1.5644574880599975
40 1.5582899093627929
41 1.5543992280960084
42 1.5477132081985474
43 1.5461216688156127
44 1.5393113613128662
45 1.534599208831787
46 1.5329376459121704
47 1

366 1.2770514488220215
367 1.2769563436508178
368 1.277229404449463
369 1.2774723529815675
370 1.274705672264099
371 1.2744000196456908
372 1.2746735095977784
373 1.2741244554519653
374 1.2745604991912842
375 1.2735772848129272
376 1.2745623350143434
377 1.2742133855819702
378 1.2733174562454224
379 1.2717703104019165
380 1.2731369972229003
381 1.2728600978851319
382 1.2728051900863648
383 1.273056697845459
384 1.272372341156006
385 1.2720932245254517
386 1.2720498085021972
387 1.2715240240097045
388 1.2712956666946411
389 1.2710339784622193
390 1.2712174654006958
391 1.2706684589385986
392 1.2708056449890137
393 1.270956325531006
394 1.2702241897583009
395 1.271064782142639
396 1.2700422286987305
397 1.2712478160858154
398 1.270595908164978
399 1.2701725721359254
400 1.2707071781158448
401 1.39321870803833
402 34.242176222801206
403 688.9356872558594
404 294.287557220459
405 211.40211181640626
406 125.5978775024414
407 82.7266944885254
408 50.62587776184082
409 37.795912170410155
410 

In [None]:
def write_answer(answer, fname):
    """Write predictions as a two-column CSV under ``SUBMISSION_PATH``.

    The file starts with the header ``Id,Score``; each following line holds
    the 1-based position of a prediction and its value.

    Parameters
    ----------
    answer : sequence
        Indexable collection of predictions supporting ``len``.
    fname : str
        File name appended to ``SUBMISSION_PATH``.
    """
    with open(SUBMISSION_PATH + fname, 'w') as fout:
        fout.write('Id,Score\n')
        fout.writelines(
            f'{pos + 1},{answer[pos]}\n' for pos in range(len(answer))
        )

In [None]:
# Load the rows to predict; predict() reads columns 0 and 1 as user and movie ids.
test_data = read_data(DATA_PATH + 'test.txt')

In [None]:
# Score the test rows with the trained model (requires machine.fit to have run,
# because predict reuses the statistics stored on machine.dataset).
answer = machine.predict(test_data)

In [None]:
# Save the predictions as a submission CSV.
# NOTE(review): the machine above was built with primitive = False, so the
# 'Primitive_' prefix of this file name may be stale - confirm before submitting.
write_answer(answer, 'Primitive_submission.txt')

In [None]:
# Inspect the last labelled training item as (features, target, weight): fit
# builds the dataset with the default pool of 10000 unlabeled rows at the end,
# so index -10001 is the row just before that pool.
machine.dataset[-10001]