In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import csv
import os

In [2]:
# Read the label sheet (it has no header row). Column 0 holds the sample id,
# columns 1 and 2 hold the values stored in the cotton / spandex lookup tables
# (presumably composition percentages -- confirm against the spreadsheet).
df = pd.read_excel('labels.xlsx', header=None)
print(df)

cotton_id2label = {}
spandex_id2label = {}
for row_pos in range(df.index.stop):
    row = df.iloc[row_pos]
    # Keys are strings because get_data_2 looks labels up by sub-directory name.
    sample_key = str(int(row[0]))
    cotton_id2label[sample_key] = row[1]
    spandex_id2label[sample_key] = row[2]

       0     1    2
0      1  97.0  3.0
1      2  98.8  1.2
2      3  97.0  3.0
3      4  98.0  2.0
4      5  97.5  2.5
..   ...   ...  ...
110  111  98.0  2.0
111  112  97.0  3.0
112  113  98.0  2.0
113  114  97.0  3.0
114  115  98.0  2.0

[115 rows x 3 columns]


In [3]:
def process_file(filename):
    """Return the CSV rows that follow the 'Wavelength (nm)' header row.

    Every row before the first row whose first cell is exactly
    'Wavelength (nm)' is discarded, as is the header row itself (and any
    later repetition of it).  Blank lines are ignored wherever they occur.

    Args:
        filename: path of a UTF-8 encoded CSV file.

    Returns:
        A list of rows, each a list of strings exactly as produced by
        ``csv.reader``.  The list is empty when the header row never appears.
    """
    res = []
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, encoding='utf-8', newline='') as file:
        start = False
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for a blank line; indexing row[0]
                # there would raise IndexError.
                continue
            if row[0] == 'Wavelength (nm)':
                start = True
            elif start:
                res.append(row)
    return res

def get_data_3(root_dir):
    """Load every (a, i, r) CSV triple found directly inside ``root_dir``.

    Files are grouped by the id ``filename[7:-6]`` (the name without its
    first 7 characters and without the 6-character ``_a.csv`` / ``_i.csv`` /
    ``_r.csv`` suffix).  Each group is parsed with ``process_file`` and merged
    into one float32 array.

    Args:
        root_dir: directory that contains the CSV files of one sample.

    Returns:
        ``(True, arrays)``.  The flag is always True.  ``arrays`` holds one
        float32 array per id with four columns: the shared first CSV column
        (presumably the wavelength, given the header ``process_file`` looks
        for), followed by the second column of the ``_a``, ``_i`` and ``_r``
        file respectively.  Arrays are ordered by sorted file name.

    Raises:
        KeyError: if an id does not have all three of the a/i/r files.
        AssertionError: if the first columns of the three files differ.
    """
    suffix2key = {'_a.csv': 'a', '_i.csv': 'i', '_r.csv': 'r'}

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the returned arrays (and any seeded sampling built on top of it)
    # reproducible across machines and file systems.
    filenames = sorted(name for name in os.listdir(root_dir)
                       if name.endswith(tuple(suffix2key)))

    id2file = {}
    for filename in filenames:
        id2file.setdefault(filename[7:-6], []).append(filename)

    res = []
    for file_id, group in id2file.items():
        feat = {
            suffix2key[filename[-6:]]: process_file(os.path.join(root_dir, filename))
            for filename in group
        }
        missing = [key for key in ('a', 'i', 'r') if key not in feat]
        if missing:
            raise KeyError(
                f'{root_dir}: id {file_id!r} is missing the file(s) for {missing}')

        # Every data row is expected to have exactly two cells.
        column0a = [col0 for col0, _ in feat['a']]
        column0i = [col0 for col0, _ in feat['i']]
        column0r = [col0 for col0, _ in feat['r']]
        assert column0a == column0i and column0a == column0r, \
            f'{root_dir}: first columns of the a/i/r files differ for id {file_id!r}'

        column1a = [col1 for _, col1 in feat['a']]
        column1i = [col1 for _, col1 in feat['i']]
        column1r = [col1 for _, col1 in feat['r']]
        merged = list(zip(column0a, column1a, column1i, column1r))
        # The cells are still strings here; NumPy parses them into float32.
        res.append(np.array(merged, dtype=np.float32))
    return True, res

def get_data_2(root_dir, name):
    """Build one record per sample sub-directory of ``root_dir``.

    Args:
        root_dir: directory whose sub-directories are named after sample ids.
        name: class name stored in every record; ``'cotton'`` selects
            ``cotton_id2label`` for the label lookup, any other value selects
            ``spandex_id2label``.

    Returns:
        A list of dicts with keys ``'name'``, ``'label'`` and ``'feat'``
        (the array list returned by ``get_data_3``), ordered by sorted
        sub-directory name.

    Raises:
        KeyError: if a sub-directory name has no entry in the label table.
    """
    id2label = cotton_id2label if name == 'cotton' else spandex_id2label
    dataset = []
    # os.listdir order is arbitrary; sort so the dataset order (and therefore
    # the seeded sampling done later) is reproducible.
    for sub_dir in sorted(os.listdir(root_dir)):
        sample_dir = os.path.join(root_dir, sub_dir)
        if not os.path.isdir(sample_dir):
            # Stray files cannot hold sample data; skipping them avoids a
            # misleading KeyError from the label lookup below.
            continue
        label = id2label[sub_dir]
        res, feat = get_data_3(sample_dir)
        if res:
            dataset.append({'name': name, 'label': label, 'feat': feat})
    return dataset

def get_data(root_dir):
    """Load both classes found under ``root_dir``.

    Reads ``<root_dir>/cotton`` first and ``<root_dir>/cotton_spandex``
    second, and returns the records of both as a single list in that order.
    """
    dataset = []
    for class_name in ('cotton', 'cotton_spandex'):
        dataset += get_data_2(os.path.join(root_dir, class_name), class_name)
    return dataset

In [4]:
# Load the raw records of both splits. get_data expects each of these
# directories to contain 'cotton' and 'cotton_spandex' sub-directories.
dataset_train = get_data('train')
dataset_test  = get_data('test')

# Simple completion marker for the (potentially slow) loading step.
print('OK')

OK


In [5]:
def sample_feats(feats, num):
    """Average ``num`` distinct, randomly chosen entries of ``feats``.

    The entries are picked by shuffling the index range with NumPy's global
    RNG and keeping the first ``num`` indices.  ``feats`` must support
    NumPy-style indexing with an index array.

    Raises:
        AssertionError: if fewer than ``num`` entries are available.
    """
    order = np.arange(len(feats))
    np.random.shuffle(order)
    chosen = order[:num]
    assert len(chosen) == num
    picked = feats[chosen]
    return np.sum(picked, axis=0) / num

def sample_from_record(record, num_per_sample, X, y):
    """Append one averaged, flattened sample of ``record`` to ``X`` and its class to ``y``.

    The class is 0 when ``record['name']`` is ``'cotton'`` and 1 otherwise.
    Both lists are modified in place.
    """
    averaged = sample_feats(record['feats'], num_per_sample)
    X.append(averaged.reshape(-1))
    class_index = 1
    if record['name'] == 'cotton':
        class_index = 0
    y.append(class_index)

def repetitive_sample_from_record(record, num_samples, num_per_sample, X, y):
    """Draw ``num_samples`` independent samples from ``record``.

    Each draw is appended to ``X`` / ``y`` by ``sample_from_record``, so the
    samples of one record end up as one contiguous run in both lists.
    """
    for _ in range(num_samples):
        sample_from_record(record, num_per_sample, X, y)

def get_id_dict(dataset):
    """Group the positions of the records in ``dataset`` by their ``'name'``.

    Returns a dict with the keys ``'cotton'`` and ``'cotton_spandex'``, each
    mapping to the list of positions (in order) of the records of that class.
    A record with any other name raises KeyError.
    """
    positions = {'cotton': [], 'cotton_spandex': []}
    names = (record['name'] for record in dataset)
    for position, class_name in enumerate(names):
        positions[class_name].append(position)
    return positions

def sample_dataset(dataset, num_samples, num_per_sample, balance):
    """Turn records into a sample matrix, optionally oversampling the smaller class.

    Every record contributes ``num_samples`` consecutive rows (see
    ``repetitive_sample_from_record``).  With ``balance`` set, records of the
    class that has fewer records are drawn again at random until both classes
    contribute the same number of record-groups; these extra groups are
    appended after all regular ones.

    Args:
        dataset: list of records carrying ``'name'`` and ``'feats'``.
        num_samples: rows generated per record.
        num_per_sample: number of feature arrays averaged per row.
        balance: whether to oversample the smaller class.

    Returns:
        ``(X, y)`` as NumPy arrays.

    Side effects:
        Prints the number of cotton and cotton_spandex records and consumes
        NumPy's global RNG.
    """
    id_dict = get_id_dict(dataset)
    X = []
    y = []
    for record in dataset:
        repetitive_sample_from_record(record, num_samples, num_per_sample, X, y)

    a = len(id_dict['cotton'])
    b = len(id_dict['cotton_spandex'])
    print(a, b)

    if balance and a != b:
        name = 'cotton_spandex' if a > b else 'cotton'
        diff = abs(a - b)
        idx = np.arange(min(a, b))
        # One shuffled pass can supply at most len(idx) extra records, which
        # is not enough when the larger class is more than twice the smaller
        # one.  Keep drawing shuffled passes until `diff` extras are
        # collected.  When diff <= len(idx) this is a single shuffle followed
        # by idx[:diff].  An empty smaller class cannot be oversampled and is
        # left as is.
        extra = []
        while len(idx) > 0 and len(extra) < diff:
            np.random.shuffle(idx)
            extra.extend(idx[:diff - len(extra)])
        for i in extra:
            repetitive_sample_from_record(
                dataset[id_dict[name][i]],
                num_samples, num_per_sample, X, y
            )
    return np.array(X), np.array(y)

def process_dataset(dataset, num_samples=2001, num_per_sample=15, balance=False, dev=0):
    """Prepare features for every record and sample a dataset from them.

    For each record, every array in ``record['feat']`` loses its first column
    and has the second of the remaining columns divided by 500000.  The
    stacked result is stored under ``record['feats']``; ``record['feat']``
    itself is left untouched.

    Args:
        dataset: list of records as produced by ``get_data``.
        num_samples: rows generated per record.
        num_per_sample: number of feature arrays averaged per row.
        balance: forwarded to ``sample_dataset``.
        dev: number of records *per class* held out as a dev set; 0 disables
            the split.

    Returns:
        ``(X, y)`` when ``dev == 0``, otherwise
        ``(train_X, train_y, dev_X, dev_y)``.
    """
    id_dict = get_id_dict(dataset)
    for record in dataset:
        scaled = []
        for feat in record['feat']:
            # np.array copies, so the in-place division below can never touch
            # the caller's raw arrays and re-running this function is safe.
            feat = np.array(feat)[:, 1:]
            feat[:, 1] /= 500000
            scaled.append(feat)
        record['feats'] = np.array(scaled)  # not `feat`

    if dev == 0:  # no dev set
        return sample_dataset(dataset, num_samples, num_per_sample, balance)

    def sample_id(id_list):
        """Randomly split ``id_list`` into (first ``dev`` ids, remaining ids)."""
        idx = np.arange(len(id_list))
        np.random.shuffle(idx)
        return [id_list[i] for i in idx[:dev]], [id_list[i] for i in idx[dev:]]

    # The split is done per class and on whole records, so no record
    # contributes rows to both the training and the dev set.
    c_dev, c_train = sample_id(id_dict['cotton'])
    cs_dev, cs_train = sample_id(id_dict['cotton_spandex'])
    dev_ds = [dataset[i] for i in c_dev + cs_dev]
    train_ds = [dataset[i] for i in c_train + cs_train]

    train_X, train_y = sample_dataset(train_ds, num_samples, num_per_sample, balance)
    dev_X, dev_y = sample_dataset(dev_ds, num_samples, num_per_sample, balance)
    return train_X, train_y, dev_X, dev_y

In [6]:
from sklearn.decomposition import PCA

# Seed NumPy's global RNG before the random splitting / sampling below.
np.random.seed(2020)

# Hold out 10 records per class of the training data as a dev set.
train_X, train_y, dev_X, dev_y = process_dataset(dataset_train, balance=True, dev=10)
test_X, test_y = process_dataset(dataset_test)

# Number of class-0 and class-1 rows in each split.
for split_y in (train_y, dev_y, test_y):
    print(np.sum(split_y == 0), np.sum(split_y == 1))

# The projection is fitted on the training rows only and then applied
# unchanged to the dev and test rows.
pca = PCA(n_components=228)
pca_train_X = pca.fit_transform(train_X)
pca_dev_X = pca.transform(dev_X)
pca_test_X = pca.transform(test_X)

print('OK')

60 60
10 10
25 25
120060 120060
20010 20010
50025 50025
OK


In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [8]:
def get_models():
    """Build and train an ensemble of 15 identical dense classifiers.

    Each model is trained on its own random draw of 90% of the rows of
    ``pca_train_X`` (``np.random.choice`` samples with replacement by
    default), with early stopping on the dev-set accuracy.  After training,
    each model's test-set metrics are printed by ``model.evaluate``.

    Reads the module-level ``pca_train_X``, ``train_y``, ``pca_dev_X``,
    ``dev_y``, ``pca_test_X`` and ``test_y``.

    Returns:
        The list of trained Keras models.
    """
    def build_classifier():
        """Return one uncompiled network: four hidden blocks and a 2-way softmax."""
        stack = []
        for units in (512, 512, 256, 128):
            stack.append(layers.Dense(units, activation=tf.sigmoid))
            stack.append(layers.LayerNormalization())
            stack.append(layers.Dropout(0.5))
        stack.append(layers.Dense(2))
        stack.append(layers.Softmax())
        return tf.keras.Sequential(stack)

    # All networks are created before any of them is trained.
    models = [build_classifier() for _ in range(15)]

    train_size = len(pca_train_X)
    for model in models:
        model.compile(optimizer=keras.optimizers.Adam(),
                      loss=keras.losses.SparseCategoricalCrossentropy(),
                      metrics=[keras.metrics.SparseCategoricalAccuracy()])

        early_stop = keras.callbacks.EarlyStopping(
            monitor='val_sparse_categorical_accuracy',
            patience=10,
            restore_best_weights=True)

        picks = np.random.choice(np.arange(train_size), size=int(train_size * 0.9))

        model.fit(pca_train_X[picks], train_y[picks],
                  callbacks=[early_stop],
                  batch_size=512,
                  epochs=100,
                  verbose=0,
                  validation_data=(pca_dev_X, dev_y))

        model.evaluate(pca_test_X, test_y, verbose=2)

    return models

In [9]:
from sklearn.metrics import classification_report

def get_pred(models, X):
    """Return, for every model, the arg-max class index of each row of ``X``.

    Each model is called directly on ``X`` and its output is converted with
    ``.numpy()``.  The result is a list with one index array per model, in
    the order of ``models``.
    """
    return [np.argmax(model(X).numpy(), axis=-1) for model in models]

def evaluate(preds, y, group=2001):
    """Print a per-record classification report from per-row ensemble predictions.

    Rows are processed in consecutive runs of ``group`` rows that must all
    share one true label.  For each run, the class-0 and class-1 predictions
    of all models are counted and the majority class becomes the run's
    prediction.

    Args:
        preds: list of per-model prediction arrays (as from ``get_pred``),
            all of the same length.
        y: true labels, one per row, aligned with the prediction arrays.
        group: number of consecutive rows per record.  The default matches
            the ``num_samples`` default of ``process_dataset``; pass the
            value actually used if the data was sampled differently.

    Raises:
        AssertionError: if ``preds`` is empty, the arrays differ in length,
            the length is not a multiple of ``group``, a run mixes true
            labels, or the vote of a run is tied.
    """
    assert len(preds) > 0, 'evaluate needs at least one prediction array'
    num_rows = len(preds[0])
    for pred in preds:
        assert len(pred) % group == 0, \
            f'{len(pred)} predictions is not a multiple of group={group}'
        assert len(pred) == num_rows, 'prediction arrays differ in length'

    y_true = []
    y_pred = []
    for k in range(0, num_rows, group):
        assert np.sum(y[k:k + group] == y[k]) == group, \
            f'rows {k}..{k + group - 1} do not share one true label'
        y_true.append(y[k])
        a = sum(np.sum(pred[k:k + group] == 0) for pred in preds)
        b = sum(np.sum(pred[k:k + group] == 1) for pred in preds)
        # A tie can only happen when len(preds) * group is even.
        assert a != b, f'tied vote for rows {k}..{k + group - 1}'
        y_pred.append(0 if a > b else 1)
    print(classification_report(y_true, y_pred, digits=4))

In [10]:
# Seed both TensorFlow and NumPy before the repeated training rounds.
tf.random.set_seed(2020)
np.random.seed(2020)

num_tests = 10

# (banner, features, labels) for every split reported after each round.
eval_splits = (
    ('training set:', pca_train_X, train_y),
    ('-' * 30 + '\ndev set:', pca_dev_X, dev_y),
    ('-' * 30 + '\ntest set:', pca_test_X, test_y),
)

for i in range(num_tests):
    print(f'round {i}:')
    models = get_models()
    for banner, split_X, split_y in eval_splits:
        print(banner)
        evaluate(get_pred(models, split_X), split_y)

round 0:
3127/3127 - 4s - loss: 0.3523 - sparse_categorical_accuracy: 0.9597
3127/3127 - 4s - loss: 0.3543 - sparse_categorical_accuracy: 0.9497
3127/3127 - 3s - loss: 0.3462 - sparse_categorical_accuracy: 0.9369
3127/3127 - 3s - loss: 0.3478 - sparse_categorical_accuracy: 0.9595
3127/3127 - 3s - loss: 0.4241 - sparse_categorical_accuracy: 0.9544
3127/3127 - 3s - loss: 0.3347 - sparse_categorical_accuracy: 0.9420
3127/3127 - 3s - loss: 0.3933 - sparse_categorical_accuracy: 0.9479
3127/3127 - 3s - loss: 0.2926 - sparse_categorical_accuracy: 0.9545
3127/3127 - 3s - loss: 0.4401 - sparse_categorical_accuracy: 0.9377
3127/3127 - 3s - loss: 0.4077 - sparse_categorical_accuracy: 0.9403
3127/3127 - 3s - loss: 0.3442 - sparse_categorical_accuracy: 0.9559
3127/3127 - 3s - loss: 0.3394 - sparse_categorical_accuracy: 0.9584
3127/3127 - 3s - loss: 0.3734 - sparse_categorical_accuracy: 0.9597
3127/3127 - 3s - loss: 0.3611 - sparse_categorical_accuracy: 0.9593
3127/3127 - 3s - loss: 0.3761 - sparse_

              precision    recall  f1-score   support

           0     1.0000    0.9200    0.9583        25
           1     0.9259    1.0000    0.9615        25

    accuracy                         0.9600        50
   macro avg     0.9630    0.9600    0.9599        50
weighted avg     0.9630    0.9600    0.9599        50

round 4:
3127/3127 - 3s - loss: 0.3409 - sparse_categorical_accuracy: 0.9576
3127/3127 - 3s - loss: 0.3624 - sparse_categorical_accuracy: 0.9484
3127/3127 - 3s - loss: 0.3458 - sparse_categorical_accuracy: 0.9585
3127/3127 - 3s - loss: 0.3119 - sparse_categorical_accuracy: 0.9543
3127/3127 - 3s - loss: 0.3721 - sparse_categorical_accuracy: 0.9577
3127/3127 - 3s - loss: 0.3554 - sparse_categorical_accuracy: 0.9574
3127/3127 - 3s - loss: 0.3909 - sparse_categorical_accuracy: 0.9578
3127/3127 - 3s - loss: 0.3027 - sparse_categorical_accuracy: 0.9578
3127/3127 - 3s - loss: 0.3477 - sparse_categorical_accuracy: 0.9444
3127/3127 - 3s - loss: 0.3655 - sparse_categorical_a

              precision    recall  f1-score   support

           0     1.0000    0.9200    0.9583        25
           1     0.9259    1.0000    0.9615        25

    accuracy                         0.9600        50
   macro avg     0.9630    0.9600    0.9599        50
weighted avg     0.9630    0.9600    0.9599        50

round 8:
3127/3127 - 3s - loss: 0.5026 - sparse_categorical_accuracy: 0.9292
3127/3127 - 3s - loss: 0.2841 - sparse_categorical_accuracy: 0.9597
3127/3127 - 3s - loss: 0.4185 - sparse_categorical_accuracy: 0.9397
3127/3127 - 3s - loss: 0.4514 - sparse_categorical_accuracy: 0.9284
3127/3127 - 3s - loss: 0.3672 - sparse_categorical_accuracy: 0.9425
3127/3127 - 3s - loss: 0.3869 - sparse_categorical_accuracy: 0.9598
3127/3127 - 3s - loss: 0.3817 - sparse_categorical_accuracy: 0.9401
3127/3127 - 3s - loss: 0.4710 - sparse_categorical_accuracy: 0.9397
3127/3127 - 3s - loss: 0.3710 - sparse_categorical_accuracy: 0.9547
3127/3127 - 3s - loss: 0.3935 - sparse_categorical_a