In [1]:
import torch
import torch.nn as nn
import torch.utils.data as Data
from sklearn.model_selection import StratifiedKFold
import os
import gc
import sys
from sklearn.externals import joblib
sys.path.append('../../../code/utils')
import data_utils as du
import perf_utils as pu

In [2]:
torch.manual_seed(20180429) # reproducible

<torch._C.Generator at 0x7fe15dc3bc70>

In [3]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(device)

cuda:1


In [4]:
# ================
# Hyper Parameters
# ================
# To be figured out
EPOCH = 3
BATCH_SIZE = 512
LR = 0.005     # learning rate

In [5]:
# =========
# Load data
# =========

# run locally
# -----------
# input_folder = '../../../data/input'
# input_file = 'train.raw.binary.pkl'
# input_path = os.path.join(input_folder, input_file)

# with pu.profiler('loading train binary data'):
#     (feat_names, X_tv) = du.load_pickle(input_path)

# run on server
# -------------
X = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/xxx.pkl')
y = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/yyy.pkl')

In [6]:
X.shape

(8798814, 419862)

In [7]:
print('Average non-zero entry per row: {:.1f}'.format(X.getnnz() / X.shape[0]))

Average non-zero entry per row: 75.2


In [8]:
# ===============================================
# Split data into training set and validation set
# ===============================================
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(X, y)]

with pu.profiler('splitting training set and validation set'):
    train_index, valid_index = split_indices[0]
    X_train, y_train = X[train_index, :], y[train_index]
    X_valid, y_valid = X[valid_index, :], y[valid_index]
    gc.collect()

[13:39:25] Finish splitting training set and validation set. △M: +3.2GB. △T: 13.8 seconds.


In [9]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(7039050, 419862)
(7039050,)
(1759764, 419862)
(1759764,)


In [10]:
# =====================
# Create dataset for nn
# =====================
def csr_to_tensor(csr_matrix, size):
    data = csr_matrix.data
    indices = csr_matrix.indices
    
    # http://pytorch.org/docs/stable/sparse.html
    i = torch.LongTensor([[0, num] for num in indices]).to(device)
    v = torch.FloatTensor(data.astype(np.float)).to(device)
    result_tensor = torch.sparse.FloatTensor(i.t(), v, size).to(device).to_dense()
    return result_tensor

class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        self.input = x
        self.label = y

    def __getitem__(self, index):#返回的是tensor
        # convert self.input[index] to tensor
        input_item = self.input[index]
        x = csr_to_tensor(input_item, torch.Size([1,self.input.shape[1]]))
        
        # convert self.label[index] to tensor
        y = torch.tensor(self.label[index])
        return x, y

    def __len__(self):
        return len(self.label)

train_dataset = MyDataset(X_train, y_train)
validation_dataset = MyDataset(X_valid, y_valid)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = Data.DataLoader(dataset=validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [11]:
# =====================
# Define Neural Network
# =====================
class  Net(nn.Module):
    def  __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 10000) 
        self.relu1 = nn.ELU()
        self.fc2 = nn.Linear(100000, 20000) 
        self.relu2 = nn.ELU()
        self.fc3 = nn.Linear(20000, 100)
        self.relu3 = nn.ELU()
        self.fc4 = nn.Linear(100, 2)
        

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [13]:
# ================================
# Declare mlp, loss, and optimizer
# ================================
with pu.profiler("Setting up mlp"):
    mlp = Net(32).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=LR)

[13:43:53] Finish Setting up mlp. △M: +9.16MB. △T: 21.8 seconds.


In [None]:
# ========
# Train nn
# ========
for epoch in range(EPOCH):
    for step, (inputs, labels) in enumerate(train_loader):
        print('inputs device is {}'.format(inputs.get_device())

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mlp(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        if step % 512 == 511:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, step + 1, loss))

print('Finished Training')