In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

import json
import pickle

import os
import moxing as mox

# Redirect `os`-style file access through MoXing so that the s3:// paths
# configured below can be used with ordinary file calls.
# NOTE(review): the exact set of functions patched depends on the MoXing
# version in use — confirm against the MoXing documentation.
mox.file.shift('os', 'mox')

# Seed the NumPy and PyTorch global RNGs so repeated runs draw the same values.
np.random.seed(2020)
torch.manual_seed(2020)

# Number of averaged samples drawn per record. `predict` and `evaluate` treat
# every NUM_SAMPLES consecutive rows as belonging to one record.
# NOTE(review): presumably kept odd so the ensemble vote in `predict` cannot
# tie — confirm before changing.
NUM_SAMPLES = 2001

# Use the GPU when one is available; otherwise fall back to the CPU so that
# the notebook still runs instead of failing on the first `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Storage locations for the input data and the saved artefacts.
root_dir = 's3://my-modelarts-xyx-010/notebook-001/'

data_dir = os.path.join(root_dir, 'data')
model_dir = os.path.join(root_dir, 'model')

pca_path = os.path.join(model_dir, 'classifier-pca.pkl')
model_path = os.path.join(model_dir, 'classifier.pt')

INFO:root:Using MoXing-v1.17.3-
INFO:root:Using OBS-Python-SDK-3.20.7


In [2]:
def get_data(filename):
    """Read the UTF-8 encoded JSON file at `filename` and return the parsed object."""
    with open(filename, encoding='utf-8') as handle:
        parsed = json.loads(handle.read())
    return parsed

def sample_feats(feats, num):
    """Average `num` randomly chosen entries of `feats` along the first axis.

    The entries are chosen without replacement by shuffling the index range
    with NumPy's global RNG and keeping the first `num` indices. An
    AssertionError is raised when fewer than `num` indices are available.
    """
    order = np.arange(len(feats))
    np.random.shuffle(order)
    chosen = order[:num]
    assert len(chosen) == num
    total = np.sum(feats[chosen], axis=0)
    return total / num

def sample_from_record(record, num_per_sample, X, y):
    """Draw one averaged sample from `record` and append it to `X` and `y` in place.

    The flattened average of `num_per_sample` entries of `record['feats']` is
    appended to `X`; the label appended to `y` is 0 when `record['name']` is
    'cotton' and 1 for any other name.
    """
    averaged = sample_feats(record['feats'], num_per_sample)
    X.append(averaged.reshape(-1))
    if record['name'] == 'cotton':
        label = 0
    else:
        label = 1
    y.append(label)

def repetitive_sample_from_record(record, num_samples, num_per_sample, X, y):
    """Append `num_samples` independently drawn samples of `record` to `X` and `y`."""
    remaining = num_samples
    for _ in range(remaining):
        sample_from_record(record, num_per_sample, X, y)

def get_id_dict(dataset):
    """Group the positions of the records in `dataset` by their 'name' field.

    Returns a dict with the keys 'cotton' and 'cotton_spandex', each mapping
    to the list of positions (in iteration order) of the records carrying
    that name. Any other name raises KeyError.
    """
    positions = {'cotton': [], 'cotton_spandex': []}
    index = 0
    for record in dataset:
        positions[record['name']].append(index)
        index += 1
    return positions

def sample_dataset(dataset, num_samples, num_per_sample, balance):
    """Build a feature matrix and label vector by repeatedly sampling every record.

    Every record in `dataset` contributes `num_samples` consecutive rows, each
    produced by `sample_from_record` with `num_per_sample` entries averaged.
    The number of 'cotton' and 'cotton_spandex' records is printed.

    When `balance` is true and the two classes have a different number of
    records, randomly chosen records of the smaller class are sampled again
    (another `num_samples` rows each) until both classes contribute the same
    number of row groups. If the smaller class has no records at all there
    is nothing to re-sample and the data is returned unbalanced.

    Returns:
        (X, y): `X` is the array of sampled rows, `y` the int64 label array.
    """
    id_dict = get_id_dict(dataset)
    X = []
    y = []
    for record in dataset:
        repetitive_sample_from_record(record, num_samples, num_per_sample, X, y)
    a = len(id_dict['cotton'])
    b = len(id_dict['cotton_spandex'])
    print(a, b)
    if balance and a != b:
        name = 'cotton_spandex' if a > b else 'cotton'
        minority = id_dict[name]
        diff = abs(a - b)
        picks = []
        # A single shuffled pass can supply at most len(minority) records.
        # Keep drawing fresh shuffled passes until `diff` records are chosen,
        # so the classes end up balanced even when the majority is more than
        # twice as large. With diff <= len(minority) this is exactly one
        # shuffle followed by a slice.
        while minority and len(picks) < diff:
            idx = np.arange(len(minority))
            np.random.shuffle(idx)
            picks.extend(idx[:diff - len(picks)])
        for i in picks:
            repetitive_sample_from_record(
                dataset[minority[i]],
                num_samples, num_per_sample, X, y
            )
    return np.array(X), np.array(y, dtype=np.int64)

def process_dataset(dataset, num_samples=NUM_SAMPLES, num_per_sample=15, balance=False, dev=0):
    """Rescale the raw features of every record in place, then sample X / y arrays.

    Each record's 'feat' entry is converted to a float32 array, rescaled,
    stripped of its first column and stored under the new key 'feats'; the
    original 'feat' key is removed, so the records in `dataset` are mutated.

    Returns:
        `(X, y)` from `sample_dataset` when `dev == 0`. Otherwise the list
        `[train_X, train_y, dev_X, dev_y]`, where `dev` randomly chosen
        records of each class are held out for the dev part and the rest
        form the training part.
    """
    id_dict = get_id_dict(dataset)
    for record in dataset:
        raw = np.array(record['feat'], np.float32)
        trimmed = []
        for block in raw:
            # NOTE(review): column 0 is scaled here and then dropped by the
            # slice below, so only the scaling of column 2 reaches the output.
            block[:, 0] /= 1800
            block[:, 2] /= 500000
            trimmed.append(block[:, 1:])
        del record['feat']
        record['feats'] = np.array(trimmed, np.float32)  # stored as `feats`, not `feat`

    if dev == 0:  # no dev set requested
        return sample_dataset(dataset, num_samples, num_per_sample, balance)

    def sample_id(id_list):
        # Shuffle the ids, keep the first `dev` for the dev set and the rest for training.
        order = np.arange(len(id_list))
        np.random.shuffle(order)
        held_out = [id_list[i] for i in order[:dev]]
        remaining = [id_list[i] for i in order[dev:]]
        return held_out, remaining

    c_dev, c_train = sample_id(id_dict['cotton'])
    cs_dev, cs_train = sample_id(id_dict['cotton_spandex'])
    dev_ds = [dataset[i] for i in c_dev + cs_dev]
    train_ds = [dataset[i] for i in c_train + cs_train]
    # The training part is sampled before the dev part.
    train_part = sample_dataset(train_ds, num_samples, num_per_sample, balance)
    dev_part = sample_dataset(dev_ds, num_samples, num_per_sample, balance)
    return list(train_part) + list(dev_part)

In [3]:
def get_models():
    """Create the ensemble: 15 freshly initialised classifiers moved to `device`.

    Every classifier is an `nn.Sequential` of four hidden stages
    (Linear -> Sigmoid -> LayerNorm -> Dropout) with widths
    228 -> 512 -> 512 -> 256 -> 128, followed by a Linear(128, 2) layer and a
    softmax over the last dimension.
    """
    widths = [228, 512, 512, 256, 128]

    def build():
        # Layers are created in the same order as they appear in the network.
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.Sigmoid())
            layers.append(nn.LayerNorm(n_out))
            layers.append(nn.Dropout())
        layers.append(nn.Linear(128, 2))
        layers.append(nn.Softmax(dim=-1))
        return nn.Sequential(*layers)

    models = []
    for _ in range(15):
        models.append(build())
    for model in models:
        model.to(device)
    return models

def predict_on_single_model(model, data_X):
    """Return the predicted class index for every row of `data_X` as a NumPy array.

    The model is switched to eval mode (and left there) and the rows are
    pushed through it in batches of 512 with gradients disabled; the
    prediction is the argmax over the last dimension of the model output.
    """
    loader = DataLoader(data_X, batch_size=512)
    # Start from an empty int64 tensor so an empty input still yields an empty result.
    chunks = [torch.zeros(0).long().to(device)]
    model.eval()
    with torch.no_grad():
        for batch in loader:
            scores = model(batch.to(device))
            chunks.append(torch.argmax(scores, dim=-1))
    return torch.cat(chunks).cpu().numpy()

def predict(models, data_X):
    """Majority-vote prediction of the ensemble, one label per group of rows.

    Every model predicts all rows of `data_X`. The rows are then split into
    consecutive groups of NUM_SAMPLES, and for each group the zeros and ones
    predicted by all models are counted; the more frequent label wins. A tie
    raises AssertionError, as does a prediction length that is not a multiple
    of NUM_SAMPLES.

    Returns a list with one label (0 or 1) per group.
    """
    per_model = [predict_on_single_model(model, data_X) for model in models]
    for votes in per_model:
        assert len(votes) % NUM_SAMPLES == 0
        assert len(per_model[0]) == len(votes)
    y_pred = []
    for start in range(0, len(per_model[0]), NUM_SAMPLES):
        stop = start + NUM_SAMPLES
        zeros = sum(np.sum(votes[start:stop] == 0) for votes in per_model)
        ones = sum(np.sum(votes[start:stop] == 1) for votes in per_model)
        assert zeros != ones
        if zeros > ones:
            y_pred.append(0)
        else:
            y_pred.append(1)
    return y_pred

def evaluate(models, data_X, y):
    """Print a classification report of the ensemble's per-group predictions.

    `y` is read in consecutive groups of NUM_SAMPLES labels; every group must
    consist of one repeated label (checked by assertion), which becomes the
    true label of that group. The predictions come from `predict`.
    """
    assert len(y) % NUM_SAMPLES == 0
    y_true = []
    for start in range(0, len(y), NUM_SAMPLES):
        group = y[start:start + NUM_SAMPLES]
        label = y[start]
        assert np.sum(group == label) == NUM_SAMPLES
        y_true.append(label)
    y_pred = predict(models, data_X)
    report = classification_report(y_true, y_pred, digits=4)
    print(report)

In [4]:
def train_single_model(model, pca_train_X, train_y, pca_dev_X, dev_y, pca_test_X, test_y):
    """Train one classifier on a resampled training set and return its test accuracy.

    A sample of 90% of the training-set size is drawn with `np.random.choice`
    (default settings, i.e. with replacement) and used for up to 100 epochs of
    Adam / BCELoss training in shuffled batches of 512. After every epoch the
    number of correct dev predictions is computed; training stops early once
    10 epochs have passed since the best dev score was last improved.

    NOTE(review): the weights from the best epoch are not restored — the model
    keeps whatever weights it has when training stops.

    Returns the fraction of correct predictions on the test rows.
    """
    n = len(pca_train_X)
    picked = np.random.choice(np.arange(n), size=int(n * 0.9))
    pairs = list(zip(pca_train_X[picked], train_y[picked]))
    training_set = DataLoader(pairs, batch_size=512, shuffle=True)
    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.BCELoss()
    # Rows of the 2x2 identity serve as one-hot targets for the two classes.
    one_hot = torch.eye(2).to(device)
    best_acc, best_epoch = 0, 0
    for epoch in range(1, 101):
        model.train()
        with torch.enable_grad():
            for batch_X, batch_y in training_set:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)
                optimizer.zero_grad()
                output = model(batch_X)
                loss = loss_fn(output, one_hot[batch_y])
                loss.backward()
                optimizer.step()
        # `acc` is a count of correct dev predictions, not a ratio.
        acc = np.sum(predict_on_single_model(model, pca_dev_X) == dev_y)
        if acc > best_acc:
            best_acc, best_epoch = acc, epoch
        elif epoch - best_epoch >= 10:
            print('early stopping on epoch {}, dev acc {:.4f}'.format(epoch, acc / dev_y.shape[0]))
            break
    y_pred = predict_on_single_model(model, pca_test_X)
    correct = np.sum(y_pred == test_y)
    return correct / y_pred.shape[0]

def train():
    """Run the whole training pipeline and save its artefacts.

    Steps: create the ensemble, load train/test JSON from `data_dir`, sample
    the train (balanced, 10 records per class held out as dev) and test sets,
    fit a 228-component PCA on the training rows and pickle it to `pca_path`,
    train every model, print classification reports for the train, dev and
    test sets, and save all model state dicts to `model_path`.
    """
    models = get_models()

    raw_train = get_data(os.path.join(data_dir, 'train.json'))
    raw_test = get_data(os.path.join(data_dir, 'test.json'))

    train_X, train_y, dev_X, dev_y = process_dataset(raw_train, balance=True, dev=10)
    test_X, test_y = process_dataset(raw_test)

    # The raw records are no longer needed once the arrays are built.
    del raw_train, raw_test

    pca = PCA(n_components=228)
    pca_train_X = pca.fit_transform(train_X)
    pca_dev_X = pca.transform(dev_X)
    pca_test_X = pca.transform(test_X)

    del train_X, dev_X, test_X

    print('\nsaving PCA to: ' + pca_path)
    with open(pca_path, 'wb') as sink:
        pickle.dump(pca, sink)

    print('\ndata preprocessed\n\ntraining models\n')

    total = len(models)
    for position, net in enumerate(models, start=1):
        acc = train_single_model(net,
                                 pca_train_X, train_y,
                                 pca_dev_X, dev_y,
                                 pca_test_X, test_y)
        print(f'{position}/{total} - test acc: {acc:.4f}')

    # Report on each split in turn; the header strings are printed verbatim.
    splits = [
        ('\nmodels trained\n\nevaluating on the training set:', pca_train_X, train_y),
        ('\nevaluating on the dev set:', pca_dev_X, dev_y),
        ('\nevaluating on the test set:', pca_test_X, test_y),
    ]
    for header, split_X, split_y in splits:
        print(header)
        evaluate(models, split_X, split_y)

    print('\nsaving models to ' + model_path)
    state_dicts = [net.state_dict() for net in models]
    torch.save(state_dicts, model_path)

In [5]:
# training
# Runs the full pipeline defined in `train`: preprocess the data, fit and save
# the PCA, train the ensemble, print the evaluation reports and save the
# model weights.
train()

60 60
10 10
25 25

saving PCA to: s3://my-modelarts-xyx-010/notebook-001/model/classifier-pca.pkl

data preprocessed

training models

early stopping on epoch 11, dev acc 1.0000
1/15 - test acc: 0.9624
early stopping on epoch 12, dev acc 1.0000
2/15 - test acc: 0.9514
early stopping on epoch 11, dev acc 1.0000
3/15 - test acc: 0.9596
early stopping on epoch 12, dev acc 1.0000
4/15 - test acc: 0.9540
early stopping on epoch 11, dev acc 1.0000
5/15 - test acc: 0.9302
early stopping on epoch 11, dev acc 1.0000
6/15 - test acc: 0.9532
early stopping on epoch 12, dev acc 1.0000
7/15 - test acc: 0.9521
early stopping on epoch 12, dev acc 1.0000
8/15 - test acc: 0.9776
early stopping on epoch 11, dev acc 1.0000
9/15 - test acc: 0.9423
early stopping on epoch 11, dev acc 1.0000
10/15 - test acc: 0.9452
early stopping on epoch 12, dev acc 1.0000
11/15 - test acc: 0.9407
early stopping on epoch 12, dev acc 1.0000
12/15 - test acc: 0.9539
early stopping on epoch 11, dev acc 1.0000
13/15 - test ac

In [6]:
# evaluating the saved models
# Re-samples the test set, reloads the pickled PCA and the saved ensemble
# weights, and prints the classification report for the test set.

dataset_test = get_data(os.path.join(data_dir, 'test.json'))
test_X, test_y = process_dataset(dataset_test)

del dataset_test

print('loading PCA from: ' + pca_path)
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a PCA file that was written by this notebook.
with open(pca_path, 'rb') as file:
    pca = pickle.load(file)

pca_test_X = pca.transform(test_X)

del pca, test_X

models = get_models()
print('loading models from: ' + model_path)
# map_location places the stored tensors on the current `device`, so weights
# saved from a GPU session can also be restored on a different device.
state_dicts = torch.load(model_path, map_location=device)
assert len(state_dicts) == len(models)
for model, state_dict in zip(models, state_dicts):
    model.load_state_dict(state_dict)

print('evaluating on the test set:')
evaluate(models, pca_test_X, test_y)

# Free the large objects once the report has been printed.
del models, pca_test_X, test_y

25 25
loading PCA from: s3://my-modelarts-xyx-010/notebook-001/model/classifier-pca.pkl
loading models from: s3://my-modelarts-xyx-010/notebook-001/model/classifier.pt
evaluating on the test set:
             precision    recall  f1-score   support

          0     1.0000    0.9200    0.9583        25
          1     0.9259    1.0000    0.9615        25

avg / total     0.9630    0.9600    0.9599        50

