In [1]:
import torch
from torch import nn
from torch import optim
import numpy as np
from tqdm.notebook import tqdm

In [2]:
def get_subset(dataset, indices):
    subset = []
    for i in indices:
        subset.append(dataset[i])
    return subset

def slash_train_val(dataset, percentage):
    total = len(dataset)
    val_data = np.random.choice(list(range(total)),int(total*percentage), replace=False).tolist()
    train_data = list(set(range(total)) - set(val_data))
    return val_data, train_data

In [14]:
# dataSetFile: 2D numpy array, [# of moives, # of attributes]
# dataSetFile[0] --> 211 elements, [0:50] plot embeddings, [50:209] attributes, [209] IMDB rating, [210] Douban rating; Missing data = -1
dataSetFile = np.load("npAttrEmbOvwDoubanR.npy")

isMethod1 = True

if isMethod1:
    dataset = []
    for i in range(len(dataSetFile)):
        dataset.append((np.concatenate([dataSetFile[i,0:50]/1000, dataSetFile[i,50:209],dataSetFile[i,209:210]]), dataSetFile[i,210])) # data normalization
else:
    dataset = []
    for i in range(len(dataSetFile)):
        dataset.append((np.concatenate([dataSetFile[i,0:50]/1000, dataSetFile[i,50:209]]), dataSetFile[i,210])) # data normalization

# IMDBintersectDouban: 1D array, movie indices in dataSetFile that IMDB $\cap$ Douban
IMDBintersectDouban = np.load("IMDBIntersectDouban.npy")
dataset_intersect = get_subset(dataset, IMDBintersectDouban)

# IMDBintersectDouban: 1D array, movie indices in dataSetFile that IMDB $-$ Douban
IMDBDifferenceDouban = np.load("IMDBDifferenceDouban.npy")
dataset_difference = get_subset(dataset, IMDBDifferenceDouban)

percentage = 0.15 # percentage of the evaluation set

# slash the three datasets into training and evaluation set
train_total, val_total = slash_train_val(dataset, percentage)
train_intersect, val_intersect = slash_train_val(dataset_intersect, percentage)
train_difference, val_difference = slash_train_val(dataset_difference, percentage)

# sanity check
print(len(train_total))
print(len(val_total))

1276
7234


In [15]:
# The model we use
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        exp = 128
        emb1 = 10
        emb2 = 10
        self.fc11 = nn.Linear(50, emb1)
        self.fc12 = nn.Linear(159, emb2)
        self.fc2 = nn.Linear(emb1+emb2, exp)
        self.fc3 = nn.Linear(exp, exp)
        self.fc4 = nn.Linear(exp,10)
        self.relu = nn.LeakyReLU()
        self.tail = nn.Softmax(dim=1)
        self.drop = torch.nn.Dropout(0.0)
    def forward(self, x):
        x1 = self.drop(self.relu(self.fc11(x[:,:50])))
        x2 = self.drop(self.relu(self.fc12(x[:,50:])))
        x = torch.cat([x1,x2],dim=1)
        x = self.drop(self.relu(self.fc2(x)))
        x = self.drop(self.relu(self.fc3(x)))
        return (self.tail(self.fc4(x)) * (torch.Tensor(list(range(1,11))).to(x.device))).sum(dim=1)

In [22]:
# The model we use
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(210, 32)
        self.fc2 = torch.nn.Linear(32, 32)
        self.fc3 = torch.nn.Linear(32, 10)
        self.drop = torch.nn.Dropout(0.0)
        self.tail = torch.nn.Softmax(dim=1)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x = self.drop(self.relu(self.fc1(x)))
        x = self.drop(self.relu(self.fc2(x)))
        return (self.tail(self.fc3(x)) * (torch.Tensor(list(range(1,11))).to(x.device))).sum(dim=1)

In [23]:
# Training and testing functions for pytorch
def train(epochs):
    loss_min = 10000
    iter_loader = tqdm(range(epochs))
    for _ in iter_loader:
        running_loss = 0.
        i = 0
        for data, labels in train_loader:
            # data = expand_data(data)
            data, labels = data.float().to(device), labels.to(device)
            # print(data[0], labels[0])
            # break
            optimizer.zero_grad()
            output = model(data)
            # print(output.size())
            loss = criteria(output, labels.view(output.size()).float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            i += 1
        # print(running_loss/i)
        scheduler.step()
        val_loss = val()
        iter_loader.set_description(f"{running_loss/i:.4f}, {val_loss:.4f}, {loss_min:.4f}")
        if val_loss < loss_min:
            loss_min = val_loss

def val():
    running_loss = 0.
    i = 0
    for data, labels in val_loader:
        # data = expand_data(data)
        data, labels = data.float().to(device), labels.to(device)
        output = model(data)
        loss = criteria(output, labels.view(output.size()).float())
        running_loss += loss.item()
        i+=1
    return running_loss/i

In [25]:
# training and validation loader definition
train_loader = torch.utils.data.DataLoader(torch.utils.data.Subset(dataset_intersect, train_intersect), batch_size=1024, shuffle=True)
val_loader = torch.utils.data.DataLoader(torch.utils.data.Subset(dataset_intersect, val_intersect), batch_size=1024, shuffle=False)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = Net()
model = model.to(device)
optimizer = optim.SGD(model.parameters(), 0.1, momentum=0.9, weight_decay=0.)
# optimizer = optim.Adam(model.parameters(), 0.1)
# scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [40, 60, 80], 0.8)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [300, 600, 900], 0.1)
criteria = nn.MSELoss()
train(1000)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [45]:
from sklearn import svm
TS = torch.utils.data.Subset(dataset, train_data)
VS = torch.utils.data.Subset(dataset, val_data)
train_loader = torch.utils.data.DataLoader(TS, batch_size=len(TS), shuffle=True)
val_loader = torch.utils.data.DataLoader(VS, batch_size=len(VS), shuffle=False)
for X,y in train_loader:
    pass
regr = svm.SVR(kernel="poly", degree=10, gamma="scale", tol=1e-4, verbose=True)
regr.fit(X, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=10, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.0001, verbose=False)

In [48]:
for X,y in val_loader:
    pass
T = regr.predict(X).astype("float32")
criteria(torch.Tensor(T), y)

tensor(287.7985, dtype=torch.float64)

In [31]:

for i in dataset_intersect:
    a = i[-1]
    
    break

7.8
