In [1]:
import torch
import torch.nn as nn
import torch.utils.data as Data
from sklearn.externals import joblib
import numpy as np
import sys
sys.path.append('../../../code/utils/')
import perf_utils as pu
import gc

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
torch.manual_seed(20180429)    # reproducible
# Hyper Parameters                                      #Haven't figured out, let these just be here
BATCH_SIZE = 256

In [4]:
def csr_to_tensor(csr_matrix, size):
    data = csr_matrix.data
    indices = csr_matrix.indices
    
    # http://pytorch.org/docs/stable/sparse.html
    i = torch.LongTensor([[0, num] for num in indices], device=device)
    v = torch.FloatTensor(data.astype(np.float), device=device)
    result_tensor = torch.sparse.FloatTensor(i.t(), v, size, device=device).to_dense()
    return result_tensor
        

In [5]:
# Here you should convert data into formats that torch accepts
# Make a dataloader
class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        self.input = x
        self.label = y

    def __getitem__(self, index):#返回的是tensor
        # convert self.input[index] to tensor
        input_item = self.input[index]
        x = csr_to_tensor(input_item, torch.Size([1,self.input.shape[1]]))
        
        # convert self.label[index] to tensor
        y = torch.tensor(self.label[index])
        return x, y

    def __len__(self):
        return len(self.label)

In [6]:
class AutoEncoder(nn.Module):
    def __init__(self, input_size):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.Tanh(),
            nn.Linear(128, 32),
        ).to(device)
        self.decoder = nn.Sequential(
            nn.Linear(32, 128),
            nn.Tanh(),
            nn.Linear(128, input_size),
        ).to(device)

    def forward(self, x):
        encoded = self.encoder(x.to(device))
        decoded = self.decoder(encoded)
        return encoded, decoded

In [7]:
ae = torch.load('./useall_step16000_ae_full_model_checkpoint.pth')

In [8]:
train_X = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/xxx.pkl') # on Server
train_y = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/yyy.pkl')

In [9]:
train_X.shape

(8798814, 419862)

In [10]:
train_dataset = MyDataset(train_X, train_y)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
del train_X
del train_y
gc.collect()

0

In [11]:
results = []
for step, (x, y) in enumerate(train_loader): # step: batch index; x.shape: [BATCH_SIZE, 1, 374251]; y.shape: [BATCH_SIZE]   
    b_x = torch.tensor(x).to(device)                   # batch x
    b_y = torch.tensor(x).to(device)
    b_label = torch.tensor(y).to(device)               # batch label

    encoded, _ = ae(b_x)
    results.append(np.asarray(encoded.detach().cpu().numpy(), dtype=np.float32))
    if step % 300 == 0:
        print('Step {}, Current time: {}'.format(step, pu.get_time_str()))
    if step % 1500 == 1499:
        joblib.dump(results, './encoded_data_step{}.pkl'.format(step))
joblib.dump(results, './encoded_data.pkl')

Step 0, Current time: 15:34:22
Step 300, Current time: 15:39:44
Step 600, Current time: 15:45:05
Step 900, Current time: 15:50:27
Step 1200, Current time: 15:55:51
Step 1500, Current time: 16:01:14
Step 1800, Current time: 16:06:38
Step 2100, Current time: 16:12:04
Step 2400, Current time: 16:17:34
Step 2700, Current time: 16:23:00
Step 3000, Current time: 16:28:29
Step 3300, Current time: 16:33:56
Step 3600, Current time: 16:39:22
Step 3900, Current time: 16:44:48
Step 4200, Current time: 16:50:19
Step 4500, Current time: 16:55:42
Step 4800, Current time: 17:01:10
Step 5100, Current time: 17:06:42
Step 5400, Current time: 17:12:21
Step 5700, Current time: 17:17:50
Step 6000, Current time: 17:23:21
Step 6300, Current time: 17:28:52
Step 6600, Current time: 17:34:22
Step 6900, Current time: 17:39:48
Step 7200, Current time: 17:45:12
Step 7500, Current time: 17:50:40
Step 7800, Current time: 17:56:09
Step 8100, Current time: 18:01:34
Step 8400, Current time: 18:07:02
Step 8700, Current t

OSError: [Errno 28] No space left on device