In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd

# Data Pipeline

In [3]:
data = pd.read_csv('DSL-StrongPasswordData.csv')

In [4]:
time_stamp = data.iloc[:, 3:]

typist = np.zeros(data.shape[0])
typist_list = list(set(data.subject))
for i, s in data.subject.iteritems():
    typist[i] = typist_list.index(s)

In [5]:
for i, t in time_stamp.iterrows():
    print(t.values, typist_list[i])
    # _t = transform(t.values)
    # pred = model(_t)
    # loss = cross_entropy(pred, typist_list[i])
    # loss.backward()
    break

[0.1491 0.3979 0.2488 0.1069 0.1674 0.0605 0.1169 0.2212 0.1043 0.1417
 1.1885 1.0468 0.1146 1.6055 1.4909 0.1067 0.759  0.6523 0.1016 0.2136
 0.112  0.1349 0.1484 0.0135 0.0932 0.3515 0.2583 0.1338 0.3509 0.2171
 0.0742] s022


In [6]:
np_time_stamp = np.array(time_stamp)

In [7]:
training_step = 100
batch_size = 8
for t_s in range(training_step):
    idx_sample = np.random.randint(low=0, high=len(np_time_stamp), size=batch_size)
    # pred = model(np_time_stamp[idx_sample, :])
    # loss = cross_entropy(pred, typist_list[i])
    # loss.backward()
    break

1. read into (cpu) memory in advance
2. batching with multi-workers
3. more portable
4. shuffling the data

## Dataset

In [8]:
# https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataset.html#Dataset
class KeyStrokeDataset(torch.utils.data.Dataset):
    """
    A Map-style datasets.
    """
    def __init__(self, time_stamp, typist, transform=None):
        self.x = time_stamp
        self.label = typist
        self.transform = transform

    def __len__(self):
        # for sampler
        return len(self.label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        _x = self.x[idx, :]
        _label = self.label[idx]
        sample = {'x': _x, 'label': _label}

        if self.transform:
            sample = self.transform(sample)

        return sample
    
    def __iter__(self):
        """
        For a Iterable-style datasets.
        """
        raise NotImplementedError

In [9]:
training = KeyStrokeDataset(np_time_stamp, typist)

In [1]:
for t in training:
    break

NameError: name 'training' is not defined

In [10]:
len(training)

20400

In [11]:
training.__dict__

{'x': array([[0.1491, 0.3979, 0.2488, ..., 0.3509, 0.2171, 0.0742],
        [0.1111, 0.3451, 0.234 , ..., 0.2756, 0.1917, 0.0747],
        [0.1328, 0.2072, 0.0744, ..., 0.2847, 0.1762, 0.0945],
        ...,
        [0.0939, 0.1189, 0.025 , ..., 0.2017, 0.0983, 0.0905],
        [0.0923, 0.1294, 0.0371, ..., 0.1917, 0.0938, 0.0931],
        [0.0596, 0.131 , 0.0714, ..., 0.1993, 0.1186, 0.1018]]),
 'label': array([25., 25., 25., ..., 24., 24., 24.]),
 'transform': None}

In [12]:
training.x, training.label

(array([[0.1491, 0.3979, 0.2488, ..., 0.3509, 0.2171, 0.0742],
        [0.1111, 0.3451, 0.234 , ..., 0.2756, 0.1917, 0.0747],
        [0.1328, 0.2072, 0.0744, ..., 0.2847, 0.1762, 0.0945],
        ...,
        [0.0939, 0.1189, 0.025 , ..., 0.2017, 0.0983, 0.0905],
        [0.0923, 0.1294, 0.0371, ..., 0.1917, 0.0938, 0.0931],
        [0.0596, 0.131 , 0.0714, ..., 0.1993, 0.1186, 0.1018]]),
 array([25., 25., 25., ..., 24., 24., 24.]))

In [13]:
# https://pytorch.org/docs/stable/_modules/torch/utils/data/sampler.html#RandomSampler
sampler = torch.utils.data.RandomSampler(training)
# https://pytorch.org/docs/stable/_modules/torch/utils/data/sampler.html#BatchSampler
batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size=8, drop_last=True)

In [14]:
len(batch_sampler) == 20400 // 8

True

In [15]:
iter_sampler = iter(sampler)
iter_batch_sampler = iter(batch_sampler)
next(iter_sampler), next(iter_batch_sampler)

(2188, [20061, 14466, 19994, 17129, 19705, 1817, 14918, 11423])

### Customized Sampler

In [16]:
class RandomSampler(torch.utils.data.Sampler):
    r"""Samples elements randomly, without replacement.
    Arguments:
        data_source (Dataset): dataset to sample from
    """

    def __init__(self, model, data_source, train_data, train_target, batch_size):
        self.model = model
        self.data_source = data_source
        self.batch_size = batch_size
        self.data = train_data
        self.target = train_target
        # self.data = torch.unsqueeze(self.data, 1)
        # self.data = self.data.type(torch.cuda.FloatTensor)

    def get_hard_examples_score(self):
        with torch.no_grad():
            output, feat = self.model.forward(self.data)
            criterion = nn.CrossEntropyLoss(reduce=False)
            loss = criterion(output, self.target)
        return loss

    def __iter__(self):
        num_batches = len(self.data_source) // self.batch_size
        while num_batches > 0:
            loss = self.get_hard_examples_score()
            sampled = []
            while len(sampled) < self.batch_size:
                sampled.append(torch.argmax(loss))
            yield sampled
            num_batches -=1

    def __len__(self):
        return len(self.data_source)

In [17]:
class BatchSampler(torch.utils.data.Sampler):
    def __init__(self, sampler, batch_size, drop_last):
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last

    def __iter__(self):
        batch = []
        for _, idx in enumerate(iter(self.sampler)):
            batch = idx
            yield batch

        if len(batch) > 0 and not self.drop_last:
            yield batch

    def __len__(self):
        return len(self.sampler) // self.batch_size

## Dataloader

In [18]:
data_loader = torch.utils.data.DataLoader(
        training, batch_sampler=batch_sampler, num_workers=4,
        collate_fn=None, pin_memory=True)

In [19]:
for d in data_loader:
    print(d, type(d['x']))
    print(d['x'].shape)
    break

{'x': tensor([[ 0.0327,  0.2673,  0.2346,  0.0578,  0.1348,  0.0770,  0.0464,  0.1081,
          0.0617,  0.0652,  0.1401,  0.0749,  0.0699,  0.3638,  0.2939,  0.1061,
          0.2530,  0.1469,  0.0549,  0.1502,  0.0953,  0.0636,  0.1119,  0.0483,
          0.0528,  0.1454,  0.0926,  0.0594,  0.2217,  0.1623,  0.0578],
        [ 0.0886,  0.2576,  0.1690,  0.0855,  0.1164,  0.0309,  0.1027,  0.0934,
         -0.0093,  0.0971,  0.2582,  0.1611,  0.0907,  0.3226,  0.2319,  0.0910,
          0.1301,  0.0391,  0.0731,  0.1302,  0.0571,  0.0987,  0.1265,  0.0278,
          0.0902,  0.2025,  0.1123,  0.1071,  0.2731,  0.1660,  0.1034],
        [ 0.0855,  0.1411,  0.0556,  0.1032,  0.1185,  0.0153,  0.0744,  0.1082,
          0.0338,  0.0655,  0.2821,  0.2166,  0.1320,  0.7161,  0.5841,  0.1054,
          0.3165,  0.2111,  0.0774,  0.2108,  0.1334,  0.1312,  0.3070,  0.1758,
          0.1201,  0.0971, -0.0230,  0.1085,  0.3068,  0.1983,  0.0665],
        [ 0.0898,  0.2225,  0.1327,  0.0911,  

In [20]:
def collate_fn_train(batch):
    print(batch, type(batch))
    return batch

In [21]:
data_loader = torch.utils.data.DataLoader(
        training, batch_sampler=batch_sampler, num_workers=4,
        collate_fn=collate_fn_train, pin_memory=True)

In [22]:
for d in data_loader:
    break

[{'x': array([0.0675, 0.1608, 0.0933, 0.0902, 0.1841, 0.0939, 0.0921, 0.1589,
       0.0668, 0.1161, 0.6258, 0.5097, 0.1047, 0.3893, 0.2846, 0.0914,
       0.1505, 0.0591, 0.0575, 0.176 , 0.1185, 0.0636, 0.0803, 0.0167,
       0.057 , 0.1691, 0.1121, 0.0921, 0.2281, 0.136 , 0.1066]), 'label': 9.0}, {'x': array([ 0.0731,  0.1155,  0.0424,  0.0623,  0.1842,  0.1219,  0.0974,
        0.1439,  0.0465,  0.0581,  0.2629,  0.2048,  0.0903,  0.316 ,
        0.2257,  0.0866,  0.1106,  0.024 ,  0.0771,  0.0893,  0.0122,
        0.1006,  0.0842, -0.0164,  0.0724,  0.1426,  0.0702,  0.057 ,
        0.2195,  0.1625,  0.1006]), 'label': 21.0}, {'x': array([0.1013, 0.8044, 0.7031, 0.086 , 0.1103, 0.0243, 0.0736, 0.133 ,
       0.0594, 0.0865, 0.7848, 0.6983, 0.0615, 0.4437, 0.3822, 0.072 ,
       0.1673, 0.0953, 0.0668, 0.1428, 0.076 , 0.0995, 0.5223, 0.4228,
       0.091 , 0.2186, 0.1276, 0.095 , 0.2995, 0.2045, 0.086 ]), 'label': 44.0}, {'x': array([ 0.0795,  0.2209,  0.1414,  0.1053,  0.1346,  0.0

       0.0623, 0.1887, 0.1264, 0.0415, 0.1979, 0.1564, 0.0536]), 'label': 31.0}]<class 'list'><class 'list'> 

 <class 'list'><class 'list'>

[{'x': array([ 0.0971,  0.1345,  0.0374,  0.1016,  0.1293,  0.0277,  0.0829,
        0.0776, -0.0053,  0.1398,  0.1667,  0.0269,  0.106 ,  0.2081,
        0.1021,  0.1261,  0.0976, -0.0285,  0.0889,  0.108 ,  0.0191,
        0.0724,  0.0589, -0.0135,  0.0587,  0.1705,  0.1118,  0.1071,
        0.1626,  0.0555,  0.1026]), 'label': 29.0}, {'x': array([ 0.1293,  0.1456,  0.0163,  0.1892,  0.2106,  0.0214,  0.1473,
        0.1563,  0.009 ,  0.1423,  0.2307,  0.0884,  0.1201,  0.3066,
        0.1865,  0.1325,  0.1819,  0.0494,  0.112 ,  0.0785, -0.0335,
        0.182 ,  0.1664, -0.0156,  0.1226,  0.0988, -0.0238,  0.122 ,
        0.2217,  0.0997,  0.1288]), 'label': 50.0}, {'x': array([0.0816, 0.2349, 0.1533, 0.0731, 0.1636, 0.0905, 0.066 , 0.1518,
       0.0858, 0.0808, 0.1623, 0.0815, 0.0729, 0.2994, 0.2265, 0.0774,
       0.1637, 0.0863, 0.09  , 0.

        0.2692,  0.1721,  0.081 ]), 'label': 18.0}]<class 'list'><class 'list'> <class 'list'>


<class 'list'>


def collate_fn_train(batch):
    x = []
    label = []
    for b in batch:
        x.append(b['x'] * 10)
        label.append(b['label'])
        
    return torch.FloatTensor(x), torch.FloatTensor(label)

In [23]:
data_loader = torch.utils.data.DataLoader(
        training, batch_sampler=batch_sampler, num_workers=4,
        collate_fn=collate_fn_train, pin_memory=True)
for x, label in data_loader:
    print(x, label)
    print(x.shape, label.shape)
    break

[{'x': array([0.0892, 0.3617, 0.2725, 0.0731, 0.1863, 0.1132, 0.0547, 0.1067,
       0.052 , 0.0752, 0.5642, 0.489 , 0.0668, 0.3012, 0.2344, 0.0829,
       0.1391, 0.0562, 0.0795, 0.1202, 0.0407, 0.0708, 0.0822, 0.0114,
       0.0686, 0.1583, 0.0897, 0.08  , 0.5029, 0.4229, 0.081 ]), 'label': 7.0}, {'x': array([0.1217, 0.1306, 0.0089, 0.0863, 0.1333, 0.047 , 0.0787, 0.1112,
       0.0325, 0.0966, 0.2752, 0.1786, 0.0913, 0.4141, 0.3228, 0.1217,
       0.2363, 0.1146, 0.0956, 0.1619, 0.0663, 0.1206, 0.1294, 0.0088,
       0.0774, 0.1771, 0.0997, 0.0898, 0.2124, 0.1226, 0.0776]), 'label': 25.0}, {'x': array([ 0.0918,  0.1265,  0.0347,  0.0934,  0.1164,  0.023 ,  0.0615,
        0.0634,  0.0019,  0.0578,  0.1796,  0.1218,  0.0857,  0.3018,
        0.2161,  0.1315,  0.1193, -0.0122,  0.1302,  0.1244, -0.0058,
        0.1219,  0.0473, -0.0746,  0.1517,  0.1224, -0.0293,  0.0799,
        0.1703,  0.0904,  0.0681]), 'label': 6.0}, {'x': array([ 0.0708,  0.2407,  0.1699,  0.071 ,  0.0877,  0.01

        0.2127,  0.12  ,  0.0708]), 'label': 11.0}] 
 <class 'list'> <class 'list'>
<class 'list'>
[{'x': array([ 0.0931,  0.165 ,  0.0719,  0.0831,  0.0855,  0.0024,  0.0879,
        0.0855, -0.0024,  0.1166,  0.2118,  0.0952,  0.0913,  0.4967,
        0.4054,  0.1369,  0.1771,  0.0402,  0.0855,  0.2325,  0.147 ,
        0.1256,  0.1195, -0.0061,  0.0858,  0.1238,  0.038 ,  0.0942,
        0.2202,  0.126 ,  0.0791]), 'label': 23.0}, {'x': array([ 0.1619,  0.3599,  0.198 ,  0.1219,  0.2931,  0.1712,  0.1406,
        0.1395, -0.0011,  0.1396,  0.244 ,  0.1044,  0.0876,  0.425 ,
        0.3374,  0.1482,  0.1807,  0.0325,  0.1577,  0.1451, -0.0126,
        0.1667,  0.0963, -0.0704,  0.144 ,  0.1638,  0.0198,  0.163 ,
        0.2462,  0.0832,  0.1137]), 'label': 33.0}, {'x': array([0.0393, 0.9894, 0.9501, 0.062 , 0.1625, 0.1005, 0.0522, 0.6601,
       0.6079, 0.0464, 0.2059, 0.1595, 0.0565, 0.5342, 0.4777, 0.0367,
       0.5905, 0.5538, 0.0457, 0.2098, 0.1641, 0.0641, 0.3772, 0.3131,
     

ValueError: too many values to unpack (expected 2)