In [9]:
import torch
import h5py
import numpy as np
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pickle
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, hstack

In [10]:
class RecDataset(Dataset):
    def __init__(self, recs_list, answer_dict, query_num, item_num, transform=None, target_transform=None):
        # rec_matrix = [query num, model_num, item_num]
        self.rec_matrix = []
        for i in query_num:
            self.rec_matrix.append(csr_matrix((len(rec_file_list), item_num)))
        for i, recs in enumerate(recs_list):
            for query in recs.keys():
                rec = recs[query]
                rec_items, rec_scores = [rec_ for rec_, score in rec], [score for rec_, score in rec]
                rec_scores = normalize(np.array(rec_scores)[:,np.newaxis], axis=0).ravel()
                for item, score in zip(rec_items, rec_scores):
                    self.rec_matrix[query][i, item] = score
        self.labels = answer_dict
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return self.rec_matrix.shape[0]

    def __getitem__(self, idx):
        rec_matrix = self.rec_matrix[idx].toarray()
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return rec_matrix, label

In [11]:
class Network(nn.Module):
    def __init__(self, model_len, k=10):
        super(Network, self).__init__()
        self.w1 = torch.nn.Parameter(torch.randn(k, model_len))
        self.w2 = torch.nn.Parameter(torch.randn(1, k))
        
    def forward(self, x):
        #import ipdb; ipdb.set_trace()
        x = x.float()
        x = torch.einsum('nm, bmp -> bnp', self.w1, x)
        x = torch.einsum('nm, bmp -> bnp', self.w2, x).squeeze(1)
        return x

In [12]:
rec_file_list = ["./train_recs/CF_rec_cpl_dim_64.pickle",
                "./train_recs/Graph_rec_cpl_1_2_depth_5.pickle",
                "./train_recs/Graph_rec_cpl_1_4_depth_3.pickle",
                "./train_recs/Graph_rec_cpl_1_8_depth_3.pickle",
                "./train_recs/Graph_rec_cpl_1_8_depth_1.pickle",]

recs_list = []
for rec_file in rec_file_list:
    with open(rec_file, 'rb') as f:
        recs = pickle.load(f)
        recs_list.append(recs)

query_num = len(recs_list[0])
item_num = 6714

h5f_valid = h5py.File('../Hanseul/Container/train_cpl', 'r')
answer = h5f_valid['labels_id'][:].astype(np.int64)
h5f_valid.close()

answer_dict = {}
for i, ans in enumerate(answer):
    answer_dict[i] = ans

train_data = RecDataset(recs_list, answer_dict, query_num, item_num)
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)
###############
rec_file_list = ["./valid_recs/CF_rec_cpl_dim_64.pickle",
                "./valid_recs/Graph_rec_cpl_1_2_depth_5.pickle",
                "./valid_recs/Graph_rec_cpl_1_4_depth_3.pickle",
                "./valid_recs/Graph_rec_cpl_1_8_depth_3.pickle",
                "./valid_recs/Graph_rec_cpl_1_8_depth_1.pickle",]
recs_list = []
for rec_file in rec_file_list:
    with open(rec_file, 'rb') as f:
        recs = pickle.load(f)
        recs_list.append(recs)

query_num = len(recs_list[0])
item_num = 6714

h5f_valid = h5py.File('./Container/valid_cpl', 'r')
answer = h5f_valid['labels_id'][:].astype(np.int64)
h5f_valid.close()

answer_dict = {}
for i, ans in enumerate(answer):
    answer_dict[i] = ans

test_data = RecDataset(recs_list, answer_dict, query_num, item_num)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

ValueError: unable to infer matrix dimensions

In [None]:
model = Network(len(rec_file_list), k=10)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    total_loss = 0
    for batch, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / (batch + 1)
    print(f"Train loss: {loss:>7f}")
            
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
""" 
###구현할 거###
3. 모델 학습마다 앙상블된 결과 제작 -> metric 측정
3. train set / valid set 따로 앙상블
4. WanDB 적용
"""
        
epochs = 1000
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

In [15]:
print(model.parameters())

<generator object Module.parameters at 0x0000021908C45580>


In [None]:
nm, bmp -> bnp (10x3) X 64x(3x6714) = 64x(10x6714)