In [1]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import torchvision
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import numpy as np
from sklearn.externals import joblib
import sys
sys.path.append('../../../code/utils/')
sys.path.append('../../../code/feature/')
import data_utils as du
import perf_utils as pu
import gc
from scipy.sparse import find

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
t = torch.FloatTensor([1,2,3]).to(device)
print(t.get_device())
non_cuda = torch.FloatTensor([1,2,3])
print(non_cuda.get_device())

0


RuntimeError: get_device is not implemented for type torch.FloatTensor

In [3]:
torch.manual_seed(20180429)    # reproducible
# Hyper Parameters
EPOCH = 100                                       #Haven't figured out, let these just be here
BATCH_SIZE = 256
LR = 5e-5         # learning rate

In [4]:
def csr_to_tensor(csr_matrix, size):
    data = csr_matrix.data
    indices = csr_matrix.indices
    
    # http://pytorch.org/docs/stable/sparse.html
    i = torch.LongTensor([[0, num] for num in indices], device=device)
    v = torch.FloatTensor(data.astype(np.float), device=device)
    result_tensor = torch.sparse.FloatTensor(i.t(), v, size, device=device).to_dense()
    return result_tensor
        

In [5]:
# Here you should convert data into formats that torch accepts
# Make a dataloader
class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        self.input = x
        self.label = y

    def __getitem__(self, index):#返回的是tensor
        # convert self.input[index] to tensor
        input_item = self.input[index]
        x = csr_to_tensor(input_item, torch.Size([1,self.input.shape[1]]))
        
        # convert self.label[index] to tensor
        y = torch.tensor(self.label[index])
        return x, y

    def __len__(self):
        return len(self.label)

In [6]:
# Here we should probably define the auto-encoder
class AutoEncoder(nn.Module):
    def __init__(self, input_size):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.Tanh(),
            nn.Linear(128, 32),
        ).to(device)
        self.decoder = nn.Sequential(
            nn.Linear(32, 128),
            nn.Tanh(),
            nn.Linear(128, input_size),
        ).to(device)

    def forward(self, x):
        encoded = self.encoder(x.to(device))
        decoded = self.decoder(encoded)
        return encoded, decoded


In [7]:
# load data
df_train = du.load_raw_data("train")
gc.collect()

# train_X: (8798814, 374251)
# train_y: (8798814,)

train_X = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/xxx.pkl') # on Server
train_y = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/yyy.pkl')
#train_X, train_y = du.get_set(df_train, test = False, features_u_want = ['house', 'interest2', 'kw1', 'kw2',  'appIdInstall'], a_features_u_want = ['aid', 'productId'])

print(train_X[0]) # eg. (0, 1) is the position of non-zero data whose value is 1
del df_train
gc.collect()

  (0, 1)	1
  (0, 7)	1
  (0, 10)	1
  (0, 23)	1
  (0, 31)	1
  (0, 728)	1
  (0, 893)	1
  (0, 908)	1
  (0, 923)	1
  (0, 941)	1
  (0, 985)	1
  (0, 986)	1
  (0, 1004)	1
  (0, 1092)	1
  (0, 1103)	1
  (0, 1114)	1
  (0, 1251)	1
  (0, 28269)	1
  (0, 114175)	1
  (0, 122177)	1
  (0, 177443)	1
  (0, 177584)	1
  (0, 310357)	1
  (0, 322279)	1
  (0, 323861)	1
  (0, 324111)	1
  (0, 324797)	1
  (0, 329213)	1
  (0, 330004)	1
  (0, 342260)	1
  (0, 348133)	1
  (0, 412989)	1
  (0, 419204)	1
  (0, 419206)	1
  (0, 419260)	1
  (0, 419395)	1
  (0, 419567)	1
  (0, 419658)	1
  (0, 419775)	1
  (0, 419790)	1
  (0, 419825)	1
  (0, 419859)	1


7

In [8]:
with pu.profiler("Setting up autoencoder"):
    autoencoder = AutoEncoder(train_X.shape[1]).to(device)
    optimizer = torch.optim.Adam(autoencoder.parameters(), lr=LR, weight_decay = 1e-5)
    criterion = nn.MSELoss()

[14:01:43] Finish Setting up autoencoder. △M: +1.48GB. △T: 4.9 seconds.


In [9]:
train_dataset = MyDataset(train_X, train_y)

In [10]:
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [11]:
EPOCH = 1

In [12]:
for epoch in range(EPOCH):
    for step, (x, y) in enumerate(train_loader): # step: batch index; x.shape: [BATCH_SIZE, 1, 374251]; y.shape: [BATCH_SIZE]   
        b_x = torch.tensor(x).to(device)                   # batch x
        b_y = torch.tensor(x).to(device)
        b_label = torch.tensor(y).to(device)               # batch label

        encoded, decoded = autoencoder(b_x)

        loss = criterion(decoded, b_y)      # mean square error
        optimizer.zero_grad()               # clear gradients for this training step since gradients are accumulated in the process
        loss.backward()                     # backpropagation, compute gradients
        optimizer.step()                    # apply gradients. "The function can be called once the gradients are computed using eg. backward()"

        # Currently the training speed is really slow, change the numbers below according to what you need
        if step % 10 == 0:
            print('epoch [{}/{}], samples[{}/{}], loss:{:.8f}'
                  .format(epoch + 1, EPOCH, (step + 1) * BATCH_SIZE, train_X.shape[0], loss.item()))
            print('end time: {}'.format(pu.get_time_str()))
            print('-' * 80)
        if step == 100:
            break;

epoch [1/1], samples[256/8798814], loss:0.00593919
end time: 14:01:46
--------------------------------------------------------------------------------
epoch [1/1], samples[2816/8798814], loss:0.00531732
end time: 14:01:57
--------------------------------------------------------------------------------
epoch [1/1], samples[5376/8798814], loss:0.00492823
end time: 14:02:09
--------------------------------------------------------------------------------
epoch [1/1], samples[7936/8798814], loss:0.00466196
end time: 14:02:20
--------------------------------------------------------------------------------
epoch [1/1], samples[10496/8798814], loss:0.00445479
end time: 14:02:31
--------------------------------------------------------------------------------
epoch [1/1], samples[13056/8798814], loss:0.00426609
end time: 14:02:42
--------------------------------------------------------------------------------
epoch [1/1], samples[15616/8798814], loss:0.00410123
end time: 14:02:54
---------------

In [13]:
first_input = train_dataset.input[0]
first_data = csr_to_tensor(first_input, torch.Size([1,first_input.shape[1]]))

In [14]:
encoded_data, _ = autoencoder(first_data)

In [15]:
encoded_data.shape

torch.Size([1, 32])

In [16]:
encoded_data

tensor([[ 0.1276,  0.1587,  0.0816, -0.0512,  0.1101,  0.0090,  0.1257,
         -0.0423, -0.1263, -0.0075, -0.0329, -0.0233, -0.1533, -0.0071,
          0.0448,  0.0120, -0.0661,  0.0033, -0.1367,  0.0635,  0.0082,
          0.0133, -0.0374,  0.0835,  0.0618,  0.0555, -0.0762, -0.1035,
          0.0736,  0.1615,  0.1884, -0.0919]], device='cuda:0')