In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.utils.data as Data
from sklearn.model_selection import StratifiedKFold
import os
import gc
from sklearn.externals import joblib
import sys
sys.path.append('../../../code/utils')
import data_utils as du
import perf_utils as pu

In [2]:
torch.manual_seed(20180429) # reproducible

<torch._C.Generator at 0x7fa8cc66ac70>

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
# ================
# Hyper Parameters
# ================
# To be figured out
EPOCH = 3
BATCH_SIZE = 256
LR = 0.005     # learning rate

In [5]:
# =========
# Load data
# =========

# run locally
# -----------
# input_folder = '../../../data/input'
# input_file = 'train.raw.binary.pkl'
# input_path = os.path.join(input_folder, input_file)

# with pu.profiler('loading train binary data'):
#     (feat_names, X_tv) = du.load_pickle(input_path)

# run on server
# -------------
X = joblib.load('../autoencoder/encoded_data.pkl')
y = joblib.load('../../../../../zhangez698/TencentAlgo2018/playground/Elvin/autoencoder/yyy.pkl')

In [6]:
len(X)

34371

In [7]:
type(X[0])

numpy.ndarray

In [8]:
flat_X = np.asarray([item[0] for sublist in X for item in sublist])

In [9]:
type(flat_X)

numpy.ndarray

In [10]:
flat_X.shape

(8798814, 32)

In [11]:
flat_X[0]

array([ 0.69011015, -0.64750737,  0.40262654, -0.54495615,  0.73520672,
        0.00882893,  1.93452048,  0.00959935, -1.07703042, -0.01607291,
       -0.53368723, -0.59908909, -0.08460009,  0.29870245,  1.00886631,
       -0.37318343,  0.83975756,  0.66984785, -1.47023582,  0.51318473,
       -0.25033969,  0.14096561, -0.67528933,  0.41401684,  0.34977892,
        0.60410148, -0.71698856, -0.74840277, -0.46575007,  1.11945546,
        0.76301444, -0.43682319], dtype=float32)

In [12]:
# ===============================================
# Split data into training set and validation set
# ===============================================
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits)
split_indices = [(train_index, valid_index) for train_index, valid_index in skf.split(flat_X, y)]

In [13]:
len(split_indices[0][0])

7039050

In [14]:
with pu.profiler('splitting training set and validation set'):
    train_index, valid_index = split_indices[0]
    X_train, y_train = flat_X[train_index, :], y[train_index]
    X_valid, y_valid = flat_X[valid_index, :], y[valid_index]
    gc.collect()

[09:44:43] Finish splitting training set and validation set. △M: +1.11GB. △T: 0.8 seconds.


In [15]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)

(7039050, 32)
(7039050,)
(1759764, 32)
(1759764,)


In [16]:
# =====================
# Create dataset for nn
# =====================
# def csr_to_tensor(csr_matrix, size):
#     data = csr_matrix.data
#     indices = csr_matrix.indices
    
#     # http://pytorch.org/docs/stable/sparse.html
#     i = torch.LongTensor([[0, num] for num in indices]).to(device)
#     v = torch.FloatTensor(data.astype(np.float)).to(device)
#     result_tensor = torch.sparse.FloatTensor(i.t(), v, size).to(device).to_dense()
#     return result_tensor

class MyDataset(Data.Dataset):
    def __init__(self, x, y):
        self.input = x
        self.label = y

    def __getitem__(self, index):#返回的是tensor
        # convert self.input[index] to tensor
        input_item = self.input[index]
        #x = csr_to_tensor(input_item, torch.Size([1,self.input.shape[1]]))
        x = torch.tensor(self.input[index])
        
        # convert self.label[index] to tensor
        y = torch.tensor(self.label[index])
        return x, y

    def __len__(self):
        return len(self.label)

train_dataset = MyDataset(X_train, y_train)
validation_dataset = MyDataset(X_valid, y_valid)
train_loader = Data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_loader = Data.DataLoader(dataset=validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [36]:
# =====================
# Define Neural Network
# =====================
class  Net(nn.Module):
    def  __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 10000) 
        self.relu1 = nn.ELU()
        self.fc2 = nn.Linear(10000, 2000) 
        self.relu2 = nn.ELU()
        self.fc3 = nn.Linear(2000, 100)
        self.relu3 = nn.ELU()
        self.fc4 = nn.Linear(100, 2)
        

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.relu3(out)
        out = self.fc4(out)
        return out

In [37]:
# ================================
# Declare mlp, loss, and optimizer
# ================================
with pu.profiler("Setting up mlp"):
    mlp = Net(32) #.to(device) # not using gpu due to current memory limit
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=LR)

[10:14:26] Finish Setting up mlp. △M: +76.05MB. △T: 0.2 seconds.


In [38]:
# ========
# Train nn
# ========
for epoch in range(EPOCH):
    for step, (inputs, labels) in enumerate(train_loader):
#         print('inputs device is {}'.format(inputs.get_device()))

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mlp(inputs)
        labels_long = labels.long()
        loss = criterion(outputs, labels_long)
        loss.backward()
        optimizer.step()
        print('[%d, %5d] loss: %.3f' % (epoch + 1, step + 1, loss))
        # print statistics
        if step % 512 == 511:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, step + 1, loss))

print('Finished Training')

[1,     1] loss: 0.626
[1,     2] loss: 3.457
[1,     3] loss: 72.438
[1,     4] loss: 0.437
[1,     5] loss: 0.209
[1,     6] loss: 0.184
[1,     7] loss: 0.219
[1,     8] loss: 0.236
[1,     9] loss: 0.202
[1,    10] loss: 0.241
[1,    11] loss: 0.194
[1,    12] loss: 0.275
[1,    13] loss: 21.612
[1,    14] loss: 0.169
[1,    15] loss: 0.223


KeyboardInterrupt: 