In [1]:
import torch
import torch.nn as nn
import torch.utils.data as Data
from sklearn.externals import joblib
import numpy as np
import sys
sys.path.append('../../../code/utils/')
import perf_utils as pu
import gc

In [2]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [3]:
# Seed torch's random number generator so repeated runs are reproducible.
torch.manual_seed(20180429)
# Hyper parameters (not tuned yet; kept here as placeholders).
BATCH_SIZE = 256

In [4]:
def csr_to_tensor(csr_matrix, size):
    """Convert a SciPy sparse matrix into a dense float32 tensor on ``device``.

    The matrix is first converted to COO form so that the real (row, col)
    coordinates are used. A single-row CSR slice therefore behaves as before
    (every entry lands in row 0), and multi-row input is placed correctly
    instead of being collapsed into the first row.

    Args:
        csr_matrix: SciPy sparse matrix exposing ``tocoo()``, e.g. one row
            sliced out of a CSR matrix.
        size: Shape of the resulting tensor (``torch.Size`` or tuple). It must
            be large enough to hold every stored (row, col) entry.

    Returns:
        Dense ``torch.float32`` tensor of shape ``size`` on the module-level
        ``device``. A matrix without stored entries yields an all-zero tensor.
    """
    coo = csr_matrix.tocoo()

    # sparse_coo_tensor expects a (2, nnz) int64 index matrix: the first row
    # holds the row ids, the second row the column ids.
    index_array = np.vstack((coo.row, coo.col)).astype(np.int64)
    # np.float was removed from NumPy; float32 is what the tensor stores anyway.
    value_array = coo.data.astype(np.float32)

    # http://pytorch.org/docs/stable/sparse.html
    i = torch.from_numpy(index_array).to(device)
    v = torch.from_numpy(value_array).to(device)
    result_tensor = torch.sparse_coo_tensor(i, v, size, device=device).to_dense()
    return result_tensor
        

In [5]:
# Here you should convert data into formats that torch accepts
# Make a dataloader
class MyDataset(Data.Dataset):
    """Dataset pairing rows of a feature matrix with their labels.

    Each item is produced lazily: the requested row is densified through
    ``csr_to_tensor`` only when it is accessed.
    """

    def __init__(self, x, y):
        """Store the feature matrix ``x`` and the label sequence ``y``."""
        self.input, self.label = x, y

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.label)

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index``; both are tensors."""
        # Densify the selected row as a tensor of shape [1, n_columns].
        row = self.input[index]
        row_shape = torch.Size([1, self.input.shape[1]])
        features = csr_to_tensor(row, row_shape)

        # Wrap the matching label in a tensor as well.
        target = torch.tensor(self.label[index])
        return features, target

In [6]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input_size -> 128 -> 32 -> 128 -> input_size.

    Both halves use a single Tanh between their two linear layers and are
    moved to the module-level ``device`` when the model is constructed.
    """

    def __init__(self, input_size):
        """Build the encoder and decoder stacks for ``input_size`` features."""
        super(AutoEncoder, self).__init__()
        hidden_size, code_size = 128, 32

        # The encoder is created before the decoder so that parameter
        # initialisation draws from the RNG in the same order.
        encoder_layers = [
            nn.Linear(input_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, code_size),
        ]
        self.encoder = nn.Sequential(*encoder_layers).to(device)

        decoder_layers = [
            nn.Linear(code_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, input_size),
        ]
        self.decoder = nn.Sequential(*decoder_layers).to(device)

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input batch ``x``."""
        code = self.encoder(x.to(device))
        reconstruction = self.decoder(code)
        return code, reconstruction

In [7]:
# Restore the fully pickled model from the step-16000 checkpoint.
# map_location remaps the stored tensors onto the `device` selected in this
# notebook, so a checkpoint written on a GPU host also loads on a CPU-only one.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a trusted source. Recent PyTorch releases default to
# weights_only=True and may need weights_only=False for a full-model pickle;
# confirm against the installed version.
ae = torch.load('./useall_step16000_ae_full_model_checkpoint.pth', map_location=device)

In [8]:
# Load the pickled training features and labels, in that order.
train_X, train_y = [
    joblib.load(pickle_path)
    for pickle_path in (
        '../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/xxx.pkl',  # on Server
        '../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/yyy.pkl',
    )
]

In [9]:
# Display the dimensions of the loaded feature matrix.
train_X.shape

(8798814, 419862)

In [10]:
# Display the dimensions of the loaded labels; the length should match the row count of train_X.
train_y.shape

(8798814,)

In [11]:
# Index of the last batch step whose output is already stored in the step
# checkpoint that is reloaded further down.
completed_index = 21001
# First batch step that still has to be processed.
new_start_index = completed_index + 1

In [12]:
# Skip the rows covered by the batches that were already processed. The offset
# is derived from BATCH_SIZE (rather than a literal 256) so it stays consistent
# with the sanity assertion and the DataLoader configured later on.
# NOTE(review): this row offset only matches the already-encoded samples if the
# earlier run iterated the data in row order (no shuffling) -- confirm.
resume_row = new_start_index * BATCH_SIZE
remaining_train_X = train_X[resume_row:]
remaining_train_y = train_y[resume_row:]

In [13]:
# Display the dimensions of the feature rows that still have to be encoded.
remaining_train_X.shape

(3422302, 419862)

In [14]:
# Display the dimensions of the labels that belong to the remaining rows.
remaining_train_y.shape

(3422302,)

In [15]:
# Sanity check: the remaining slice must contain exactly the rows that are
# left after the already-processed batches are removed.
expected_remaining_rows = train_X.shape[0] - (completed_index + 1) * BATCH_SIZE
assert remaining_train_X.shape[0] == expected_remaining_rows

In [16]:
# Wrap the not-yet-encoded rows in a Dataset / DataLoader.
# shuffle=False: the encodings are appended to `results` in iteration order and
# the labels are not stored with them, so batches must follow row order.
# Otherwise the encoded vectors cannot be matched back to their rows, and
# resuming from a row offset would not be meaningful.
train_dataset = MyDataset(remaining_train_X, remaining_train_y)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Drop the notebook-level references to the large matrices. The remaining
# slices stay alive through train_dataset; the full train_X / train_y can be
# reclaimed by the garbage collector.
del train_X
del train_y
del remaining_train_X
del remaining_train_y
gc.collect()

0

In [17]:
# Reload the encodings accumulated up to the last completed step so that new
# batches can be appended to them.
checkpoint_path = './encoded_data_step{}.pkl'.format(completed_index)
results = joblib.load(checkpoint_path)

In [18]:
# Number of encoded batches restored from the checkpoint.
len(results)

21002

In [19]:
print('Start from step {}'.format(new_start_index)) # step starts from 0
# Inference only: disable autograd so no computation graph is built or kept
# for each forward pass.
with torch.no_grad():
    # step: batch index within this resumed run.
    # x: [batch, 1, n_features] (MyDataset yields rows of shape [1, n_features]).
    # The labels are not needed for encoding and are ignored.
    for step, (x, _) in enumerate(train_loader):
        # The loader already yields tensors; move the batch instead of
        # copy-constructing it with torch.tensor(), which warns and copies.
        b_x = x.to(device)

        # NOTE(review): the decoder output is computed and then discarded;
        # calling the encoder alone would avoid that work -- confirm that the
        # loaded checkpoint exposes `.encoder` before changing this.
        encoded, _ = ae(b_x)
        results.append(np.asarray(encoded.detach().cpu().numpy(), dtype=np.float32))

        # Step number continued from the previous run, used for logs and checkpoints.
        global_step = step + new_start_index
        if step % 300 == 0:
            print('Step {}, Current time: {}'.format(global_step, pu.get_time_str()))
        if step % 3000 == 0:
            # Periodic checkpoint of everything encoded so far.
            joblib.dump(results, './encoded_data_step{}.pkl'.format(global_step))
joblib.dump(results, './encoded_data.pkl')

Start from step 21002
Step 21002, Current time: 05:06:22
Step 21302, Current time: 05:10:15
Step 21602, Current time: 05:14:15
Step 21902, Current time: 05:18:17
Step 22202, Current time: 05:22:16
Step 22502, Current time: 05:26:03
Step 22802, Current time: 05:29:50
Step 23102, Current time: 05:33:35
Step 23402, Current time: 05:37:19
Step 23702, Current time: 05:41:06
Step 24002, Current time: 05:44:52
Step 24302, Current time: 05:48:40
Step 24602, Current time: 05:52:23
Step 24902, Current time: 05:56:07
Step 25202, Current time: 05:59:52
Step 25502, Current time: 06:03:36
Step 25802, Current time: 06:07:23
Step 26102, Current time: 06:11:09
Step 26402, Current time: 06:14:54
Step 26702, Current time: 06:18:41
Step 27002, Current time: 06:22:29
Step 27302, Current time: 06:26:18
Step 27602, Current time: 06:30:02
Step 27902, Current time: 06:33:45
Step 28202, Current time: 06:37:33
Step 28502, Current time: 06:41:19
Step 28802, Current time: 06:45:05
Step 29102, Current time: 06:48:4

['./encoded_data.pkl']