In [8]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import numpy as np
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/feature/')
import data_utils as du
import perf_utils as pu
import gc
from scipy.sparse import find

In [9]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [10]:
# Hyper-parameters
LR = 5e-5          # learning rate passed to the optimizer
BATCH_SIZE = 256   # number of samples per mini-batch
EPOCH = 100        # placeholder value, not tuned yet

# Fix the global torch RNG seed so runs are reproducible.
torch.manual_seed(20180429)

In [11]:
def csr_to_tensor(csr_matrix, size, target_device=None):
    """Convert a SciPy sparse matrix into a dense float32 torch tensor.

    Args:
        csr_matrix: SciPy sparse matrix (any format exposing ``tocoo()``).
            It may hold a single row or several rows.
        size: Shape of the resulting tensor (``torch.Size`` or tuple); it
            must be large enough to hold every non-zero coordinate.
        target_device: Device to build the tensor on. When ``None`` the
            module-level ``device`` is used.

    Returns:
        A dense ``torch.float32`` tensor of shape ``size``.
    """
    # COO gives explicit (row, col) coordinates, so multi-row inputs and rows
    # without any non-zero entry are handled without a per-element Python loop.
    coo = csr_matrix.tocoo()
    coords = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    # `np.float` no longer exists in recent NumPy; use an explicit float32.
    values = torch.from_numpy(coo.data.astype(np.float32))

    # `device` is only looked up when the caller did not choose one.
    dev = device if target_device is None else target_device

    # The legacy typed constructors (torch.LongTensor, torch.sparse.FloatTensor)
    # are CPU-typed and cannot be given a CUDA device; the factory function can.
    sparse = torch.sparse_coo_tensor(coords, values, size, device=dev)
    return sparse.to_dense()
        

In [12]:
# Wrap the feature matrix and the labels in a torch Dataset so that a
# DataLoader can batch them.
class MyDataset(Data.Dataset):
    """Dataset yielding one densified feature row and its label per index.

    Args:
        x: 2-D feature container. ``x[index]`` is passed to ``csr_to_tensor``,
            so it is presumably a SciPy CSR matrix (verify against the caller).
        y: Labels; ``len(y)`` defines the dataset length and ``y[index]`` is
            the label of row ``index``.
    """

    def __init__(self, x, y):
        # Sparse matrices and NumPy arrays have no `.to()` method, so only
        # objects that expose one (e.g. torch tensors) are moved to `device`;
        # everything else is stored unchanged.
        self.input = x.to(device) if hasattr(x, "to") else x
        self.label = y.to(device) if hasattr(y, "to") else y

    def __getitem__(self, index):
        """Return ``(x, y)`` as tensors for the sample at ``index``."""
        # Densify the selected row into a tensor of shape [1, n_features].
        input_item = self.input[index]
        x = csr_to_tensor(input_item, torch.Size([1, self.input.shape[1]]))

        # Wrap the matching label in a tensor.
        y = torch.tensor(self.label[index])
        return x, y

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.label)

In [13]:
# Fully-connected auto-encoder: input -> 128 -> 32 -> 128 -> input
class AutoEncoder(nn.Module):
    """Auto-encoder with a 32-dimensional code and one Tanh hidden layer per side.

    Args:
        input_size: Number of features of each input vector; the decoder
            output has the same width.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_width, code_width = 128, 32
        # The encoder is built before the decoder, so parameters are created
        # in the order: encoder layers first, decoder layers second.
        self.encoder = self._stack(input_size, hidden_width, code_width)
        self.decoder = self._stack(code_width, hidden_width, input_size)

    @staticmethod
    def _stack(n_in, n_mid, n_out):
        """Build Linear(n_in, n_mid) -> Tanh -> Linear(n_mid, n_out)."""
        return nn.Sequential(
            nn.Linear(n_in, n_mid),
            nn.Tanh(),
            nn.Linear(n_mid, n_out),
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input ``x``."""
        latent = self.encoder(x)
        return latent, self.decoder(latent)


In [14]:
# Load the raw training table, then build the feature matrix and label vector.
df_train = du.load_raw_data("train")
gc.collect()

# Shapes noted by the original author:
# train_X: (8798814, 374251)
# train_y: (8798814,)
train_X, train_y = du.get_set(df_train, test = False, features_u_want = ['house', 'interest2', 'kw1', 'kw2',  'appIdInstall'], a_features_u_want = ['aid', 'productId'])
# Inspect the first row. This used to read `X[0]`, but no `X` is defined in
# the notebook; the matrix returned above is `train_X`.
print(train_X[0]) # eg. (0, 1) is the position of non-zero data whose value is 1
# The raw frame is no longer needed once the matrices exist; free its memory.
del df_train
gc.collect()

FileNotFoundError: File b'/home/lilylee/TencentAlgo2018/code/utils/../../data/raw/preliminary_contest_data/train.csv' does not exist

In [203]:
# Build the model, optimizer and loss inside the profiler context.
with pu.profiler("Setting up autoencoder"):
    # Input tensors are created on `device` by csr_to_tensor, so the model's
    # parameters must live there too; otherwise the forward pass fails with a
    # device mismatch whenever `device` is a GPU.
    autoencoder = AutoEncoder(train_X.shape[1]).to(device)
    # The optimizer is created after the move so it tracks the moved parameters.
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LR, weight_decay = 1e-5)
    criterion = nn.MSELoss()

[08:57:56] Finish Setting up autoencoder. △M: -522.59MB. △T: 1.3 seconds.


In [204]:
# Pair the feature matrix with its labels in a Dataset.
train_dataset = MyDataset(x=train_X, y=train_y)

In [205]:
# Shuffled mini-batch iterator over the training dataset.
train_loader = Data.DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [206]:
# Overrides the EPOCH value assigned earlier in the notebook; the training
# loop below reads this value.
EPOCH = 1

In [None]:
# Train the auto-encoder to reconstruct its own input; the labels in `y` are
# not used by the reconstruction loss.
# Batches are moved to whichever device currently holds the model parameters,
# so the loop works for both CPU and GPU models.
model_device = next(autoencoder.parameters()).device
for epoch in range(EPOCH):
    # step: batch index; x.shape: [BATCH_SIZE, 1, 374251]; y.shape: [BATCH_SIZE]
    for step, (x, y) in enumerate(train_loader):
        # `x` is already a tensor: use it directly instead of copying it with
        # torch.tensor(), which is redundant and emits a warning.
        b_x = x.to(model_device)            # batch x
        b_y = b_x                           # reconstruction target is the input itself

        encoded, decoded = autoencoder(b_x)

        loss = criterion(decoded, b_y)      # mean square error
        optimizer.zero_grad()               # clear gradients accumulated by the previous step
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply the gradients computed by backward()

        # Currently the training speed is really slow, change the numbers below according to what you need
        if step % 10 == 0:
            # The total sample count comes from `train_X`; the `X` used here
            # before is not defined anywhere in the notebook.
            print('epoch [{}/{}], samples[{}/{}], loss:{:.8f}'
                  .format(epoch + 1, EPOCH, (step + 1) * BATCH_SIZE, train_X.shape[0], loss.item()))
        # Stop after batch index 100 to keep this trial run short.
        if step == 100:
            break

epoch [1/1], samples[256/8798814], loss:0.00487959
epoch [1/1], samples[2816/8798814], loss:0.00456661
epoch [1/1], samples[5376/8798814], loss:0.00426460
epoch [1/1], samples[7936/8798814], loss:0.00400074
epoch [1/1], samples[10496/8798814], loss:0.00377028
epoch [1/1], samples[13056/8798814], loss:0.00356811
epoch [1/1], samples[15616/8798814], loss:0.00339412
epoch [1/1], samples[18176/8798814], loss:0.00324184
epoch [1/1], samples[20736/8798814], loss:0.00310849


In [172]:
# Densify the first training row so it can be passed through the model.
first_input = train_dataset.input[0]
# This used to pass `view_input`, which is not defined in the notebook; the
# row extracted on the previous line is `first_input`.
first_data = csr_to_tensor(first_input, torch.Size([1,first_input.shape[1]]))

In [173]:
# Run the first sample through the model; keep the encoder output and discard
# the reconstruction (second element of the returned pair).
encoded_data, _ = autoencoder(first_data)

In [174]:
# Display the shape of the encoder output for the first sample.
encoded_data.shape

torch.Size([1, 32])

In [175]:
# Display the encoder output values for the first sample.
encoded_data

tensor([[ 0.0013,  0.0130, -0.0125, -0.0441,  0.0347,  0.0522,  0.0297,
          0.0638, -0.0641,  0.0720,  0.1001, -0.0168, -0.0378, -0.0615,
          0.0375, -0.0068,  0.0060, -0.0434,  0.0877, -0.0646,  0.0087,
          0.0025,  0.0305,  0.0745,  0.0341,  0.0336,  0.0147, -0.0297,
         -0.0067,  0.0187, -0.0785,  0.0901]])