In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import csv
import os

In [2]:
# The label sheet is read without a header row, so its columns are addressed
# by position: column 0 is used as the key, columns 1 and 2 as the values.
df = pd.read_excel('labels.xlsx', header=None)
print(df)


def _label_lookup(column):
    """Map each row's column-0 value (as an integer string) to its `column` value."""
    lookup = {}
    for position in range(df.index.stop):
        row = df.iloc[position]
        lookup[str(int(row[0]))] = row[column]
    return lookup


cotton_id2label = _label_lookup(1)
spandex_id2label = _label_lookup(2)

       0     1    2
0      1  97.0  3.0
1      2  98.8  1.2
2      3  97.0  3.0
3      4  98.0  2.0
4      5  97.5  2.5
..   ...   ...  ...
110  111  98.0  2.0
111  112  97.0  3.0
112  113  98.0  2.0
113  114  97.0  3.0
114  115  98.0  2.0

[115 rows x 3 columns]


In [3]:
def process_file(filename):
    """Return the CSV rows that follow the ``Wavelength (nm)`` header row.

    Rows are skipped until a row whose first cell is exactly
    ``'Wavelength (nm)'`` has been seen. Every later non-empty row is
    collected as parsed by ``csv.reader`` (a list of strings). Rows whose
    first cell is the header text are never returned.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.

    Returns:
        list[list[str]]: The data rows found after the header row; empty when
        the header row never appears.
    """
    rows = []
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation asks for files passed to csv.reader.
    with open(filename, encoding='utf-8', newline='') as file:
        in_data = False
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for a blank line; indexing row[0] on it
                # would raise IndexError and the row holds no data anyway.
                continue
            if row[0] == 'Wavelength (nm)':
                in_data = True
            elif in_data:
                rows.append(row)
    return rows

def get_data_3(root_dir):
    """Load every ``_a``/``_i``/``_r`` CSV triple in ``root_dir`` as one array each.

    Files whose names end in ``_a.csv``, ``_i.csv`` or ``_r.csv`` are grouped
    by ``filename[7:-6]``, i.e. the name with its first 7 and last 6
    characters removed (presumably a fixed-width prefix and the suffix --
    TODO confirm the prefix length against the real file names). For each
    group the three files are parsed with ``process_file`` and merged into a
    ``float32`` array whose columns are: the shared first column, then the
    second column of the ``_a``, ``_i`` and ``_r`` file, in that order.

    Args:
        root_dir: Directory that directly contains the CSV files.

    Returns:
        tuple: ``(True, arrays)`` where ``arrays`` is a list with one array
        per group. The flag is always ``True``; it is kept because callers
        unpack and test it.

    Raises:
        ValueError: If a group lacks one of the three files (the original
            code failed with a bare ``KeyError`` here), or if a parsed row
            does not have exactly two cells.
        AssertionError: If the first columns of the three files differ.
    """
    suffix2kind = {'_a.csv': 'a', '_i.csv': 'i', '_r.csv': 'r'}
    kinds = ('a', 'i', 'r')

    # os.listdir returns names in arbitrary order. Sorting makes the order of
    # the returned arrays reproducible, which the seeded sampling done later
    # in the notebook depends on.
    id2file = {}
    for filename in sorted(os.listdir(root_dir)):
        if filename.endswith(tuple(suffix2kind)):
            id2file.setdefault(filename[7:-6], []).append(filename)

    res = []
    for file_id, group in id2file.items():
        # All three suffixes are 6 characters long, so the last 6 characters
        # of the name select the measurement kind.
        parsed = {
            suffix2kind[filename[-6:]]: process_file(os.path.join(root_dir, filename))
            for filename in group
        }
        missing = [kind for kind in kinds if kind not in parsed]
        if missing:
            raise ValueError(
                f'{root_dir}: id {file_id!r} has no '
                + ', '.join(f'_{kind}.csv' for kind in missing)
                + ' file'
            )

        first_column = {kind: [col0 for col0, _ in parsed[kind]] for kind in kinds}
        second_column = {kind: [col1 for _, col1 in parsed[kind]] for kind in kinds}
        assert first_column['a'] == first_column['i'] \
               and first_column['a'] == first_column['r'], \
            f'{root_dir}: first columns differ between the files of id {file_id!r}'

        feat = list(zip(first_column['a'],
                        second_column['a'],
                        second_column['i'],
                        second_column['r']))
        res.append(np.array(feat, dtype=np.float32))
    return True, res

def get_data_2(root_dir, name):
    """Build one record per sub-directory of ``root_dir``.

    Each record is a dict with the keys ``'name'`` (the ``name`` argument),
    ``'label'`` (looked up by sub-directory name) and ``'feat'`` (the list of
    arrays returned by ``get_data_3`` for that sub-directory).

    Args:
        root_dir: Directory whose sub-directories are named after the ids
            used as keys of the label lookups.
        name: Dataset name stored in every record. ``'cotton'`` selects
            ``cotton_id2label``; any other value selects ``spandex_id2label``.

    Returns:
        list[dict]: The records, ordered by sorted sub-directory name.

    Raises:
        KeyError: If a sub-directory name has no entry in the label lookup.
    """
    id2label = cotton_id2label if name == 'cotton' else spandex_id2label

    dataset = []
    # os.listdir order is arbitrary; sorting keeps record positions stable so
    # that the seeded shuffles applied to this list later are reproducible.
    for sub_dir in sorted(os.listdir(root_dir)):
        sub_path = os.path.join(root_dir, sub_dir)
        if not os.path.isdir(sub_path):
            # Stray files next to the sample folders are not samples; without
            # this guard they crash the label lookup below.
            continue
        record = {'name': name, 'label': id2label[sub_dir]}
        res, feat = get_data_3(sub_path)
        if res:
            record['feat'] = feat
            dataset.append(record)
    return dataset

def get_data(root_dir):
    """Load the records stored under ``<root_dir>/cotton_spandex``.

    Only the ``cotton_spandex`` sub-folder is read; nothing else below
    ``root_dir`` is loaded by this function.

    Args:
        root_dir: Directory of one data split.

    Returns:
        list[dict]: The records produced by ``get_data_2``.
    """
    blend = 'cotton_spandex'
    return get_data_2(os.path.join(root_dir, blend), blend)

In [4]:
# Read both splits from the 'train' and 'test' folders, train first.
dataset_train, dataset_test = [get_data(split) for split in ('train', 'test')]

print('OK')

OK


In [5]:
def sample_feats(feats, num):
    """Average ``num`` distinct entries of ``feats`` picked at random.

    The entries are chosen by shuffling all positions with the global NumPy
    random state and keeping the first ``num`` of them.

    Args:
        feats: NumPy array indexed along its first axis.
        num: Number of entries to average; an assertion fails when fewer
            than ``num`` positions are available.

    Returns:
        The element-wise sum of the chosen entries divided by ``num``.
    """
    order = np.arange(len(feats))
    np.random.shuffle(order)
    chosen = order[:num]
    assert len(chosen) == num
    total = np.sum(feats[chosen], axis=0)
    return total / num

def sample_from_record(record, num_per_sample, X, y):
    """Append one averaged sample of ``record`` to ``X`` and its label to ``y``.

    Args:
        record: Dict with a ``'feats'`` array and a ``'label'`` value.
        num_per_sample: How many entries of ``record['feats']`` are averaged.
        X: List that receives a 3-tuple holding the three columns of the
            averaged ``(228, 3)`` sample.
        y: List that receives ``record['label']``.
    """
    averaged = sample_feats(record['feats'], num_per_sample)
    assert averaged.shape == (228, 3)
    X.append(tuple(averaged[:, channel] for channel in range(3)))
    y.append(record['label'])

def repetitive_sample_from_record(record, num_samples, num_per_sample, X, y):
    """Call ``sample_from_record`` ``num_samples`` times for the same record.

    Args:
        record: Record to draw from.
        num_samples: Number of samples appended to ``X`` and ``y``.
        num_per_sample: Entries averaged per sample.
        X: Output list of samples, extended in place.
        y: Output list of labels, extended in place.
    """
    for _ in range(num_samples):
        sample_from_record(record, num_per_sample, X, y)

def get_id_dict(dataset):
    """Group record positions by their label rounded to 2 or 3.

    Args:
        dataset: Sequence of records, each with a numeric ``'label'``.

    Returns:
        dict: ``{2: [...], 3: [...]}`` holding the positions of the records
        whose label rounds to that key. Records that round to any other
        value are left out.
    """
    buckets = {2: [], 3: []}
    for position, record in enumerate(dataset):
        bucket = buckets.get(int(round(record['label'])))
        if bucket is not None:
            bucket.append(position)
    return buckets

def sample_dataset(dataset, num_samples, num_per_sample, balance):
    """Draw averaged samples from every record and optionally balance the classes.

    Every record contributes ``num_samples`` samples. With ``balance`` set,
    records of the smaller class (labels rounding to 2 versus 3, as grouped
    by ``get_id_dict``) are drawn again until both classes are backed by the
    same number of record draws. The two class sizes are printed.

    Args:
        dataset: List of records carrying ``'feats'`` and ``'label'``.
        num_samples: Samples drawn per record draw.
        num_per_sample: Entries averaged per sample.
        balance: Whether to add extra draws from the smaller class.

    Returns:
        tuple: ``(X, y)`` with ``X`` of shape ``(3, n, 228)`` and ``y`` of
        shape ``(n,)``.
    """
    id_dict = get_id_dict(dataset)
    X = []  # (n, 3, 228)
    y = []  # (n, )
    for record in dataset:
        repetitive_sample_from_record(record, num_samples, num_per_sample, X, y)

    a = len(id_dict[2])
    b = len(id_dict[3])
    print(a, b)
    if balance and a != b:
        minority_ids = id_dict[3] if a > b else id_dict[2]
        diff = abs(a - b)
        # One shuffled pass over the minority class covers the gap whenever
        # diff <= len(minority_ids), which matches the original draw exactly.
        # The original stopped after that single pass, so a larger gap was
        # silently left unbalanced; further passes now fill the remainder.
        # An empty minority class cannot be balanced and is skipped.
        extra = []
        while minority_ids and len(extra) < diff:
            order = np.arange(len(minority_ids))
            np.random.shuffle(order)
            extra.extend(order[:diff - len(extra)])
        for i in extra:
            repetitive_sample_from_record(
                dataset[minority_ids[i]],
                num_samples, num_per_sample, X, y
            )

    X = np.array(X)  # (n, 3, 228)
    X = X.transpose([1, 0, 2])
    assert X.shape[0] == 3 and X.shape[2] == 228
    # (3, n, 228), (n, )
    return X, np.array(y)

def process_dataset(dataset, num_samples=1001, num_per_sample=15, balance=False, dev=0):
    """Scale the raw features of every record and draw sampled arrays from them.

    Each record gains a ``'feats'`` entry: a copy of ``record['feat']`` with
    column 2 divided by 500000 and column 0 removed, so every entry has
    shape ``(228, 3)``.

    Args:
        dataset: List of records with a ``'feat'`` list and a ``'label'``.
        num_samples: Samples drawn per record.
        num_per_sample: Entries averaged per sample.
        balance: Passed on to ``sample_dataset``.
        dev: Number of records per class (labels rounding to 2 and to 3) held
            out as a dev set; ``0`` disables the split.

    Returns:
        ``(X, y)`` when ``dev == 0``, otherwise
        ``(train_X, train_y, dev_X, dev_y)``.
    """
    id_dict = get_id_dict(dataset)

    for record in dataset:
        scaled = []
        # np.array(...) copies, so the in-place divisions below do not touch
        # the arrays stored under record['feat'].
        for feat in np.array(record['feat']):
            # NOTE(review): column 0 is scaled here and then dropped by the
            # slice two lines down, so this division has no visible effect.
            feat[:, 0] /= 1800
            feat[:, 2] /= 500000
            trimmed = feat[:, 1:]
            assert trimmed.shape == (228, 3)
            scaled.append(trimmed)
        record['feats'] = np.array(scaled)  # not `feat`

    if dev == 0:  # no dev set
        return sample_dataset(dataset, num_samples, num_per_sample, balance)

    def split_ids(id_list):
        """Shuffle ``id_list`` and return (first ``dev`` ids, remaining ids)."""
        order = np.arange(len(id_list))
        np.random.shuffle(order)
        held_out = [id_list[i] for i in order[:dev]]
        kept = [id_list[i] for i in order[dev:]]
        return held_out, kept

    dev_2, train_2 = split_ids(id_dict[2])
    dev_3, train_3 = split_ids(id_dict[3])
    dev_ds = [dataset[i] for i in dev_2 + dev_3]
    train_ds = [dataset[i] for i in train_2 + train_3]

    # Train is sampled before dev so the random state is consumed in the
    # same order as a left-to-right evaluation of the two calls.
    train_part = sample_dataset(train_ds, num_samples, num_per_sample, balance)
    dev_part = sample_dataset(dev_ds, num_samples, num_per_sample, balance)
    return (*train_part, *dev_part)

In [6]:
from sklearn.decomposition import PCA

np.random.seed(2020)

# Train/dev come from the training records (5 records per class held out);
# the test records are sampled without a split.
train_X, train_y, dev_X, dev_y = process_dataset(dataset_train, balance=True, dev=5)
test_X, test_y = process_dataset(dataset_test)

_label_sets = (train_y, dev_y, test_y)
_feature_sets = (train_X, dev_X, test_X)

# Label range per split.
for _y in _label_sets:
    print(min(_y), max(_y))

# Number of samples whose label rounds to 2 and to 3, per split.
for _y in _label_sets:
    print(np.sum(np.round(_y) == 2), np.sum(np.round(_y) == 3))

# Feature value range per split.
for _X in _feature_sets:
    print(np.min(_X), np.max(_X))

pca_train_X = []
pca_dev_X = []
pca_test_X = []
_pca_sets = (pca_train_X, pca_dev_X, pca_test_X)

# One PCA per leading index of the (3, n, 228) arrays, fitted on the training
# samples only and then applied to dev and test.
for i in range(3):
    print(f'processing {i}:')
    for _X in _feature_sets:
        assert _X[i].shape[1:] == (228, )

    pca = PCA(n_components=228 // 2)
    pca_train_X.append(pca.fit_transform(train_X[i]))
    pca_dev_X.append(pca.transform(dev_X[i]))
    pca_test_X.append(pca.transform(test_X[i]))

    # NOTE(review): these ranges are taken over the whole list collected so
    # far, not only the entry appended in this iteration.
    for _projected in _pca_sets:
        print(np.min(_projected), np.max(_projected))

    for _projected in _pca_sets:
        print(_projected[-1].shape)

print('OK')

25 30
5 5
7 18
1.7 3.0
2.0 3.0
2.0 3.0
30030 30030
5005 5005
7007 18018
-0.24095383 1.7450907
-0.25725943 1.8084242
-0.21706857 1.6509572
processing 0:
-0.6419308 3.1682384
-0.6847996 2.608361
-0.5215433 2.0283337
(60060, 114)
(10010, 114)
(25025, 114)
processing 1:
-1.0805106 3.788773
-1.0300065 2.608361
-0.7315318 2.0283337
(60060, 114)
(10010, 114)
(25025, 114)
processing 2:
-1.1810124 3.788773
-1.3288139 3.2110639
-0.8398875 2.9191904
(60060, 114)
(10010, 114)
(25025, 114)
OK


In [7]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [8]:
def get_models():
    """Create and train a 3 x 3 grid of regression networks.

    ``models[i]`` holds three identically structured networks trained on
    ``pca_train_X[i]``. Each network is fitted on a random half-sized draw of
    the training samples, validated on the dev set with early stopping, and
    then evaluated on the test set (the evaluation is only printed).

    Reads the module-level ``pca_train_X``, ``pca_dev_X``, ``pca_test_X``,
    ``train_y``, ``dev_y`` and ``test_y``.

    Returns:
        list[list]: The trained models, indexed ``[i][j]``.
    """
    def build_regressor():
        """Four sigmoid blocks (Dense -> LayerNorm -> Dropout), then Dense(1)."""
        stack = []
        for width in (256, 256, 128, 64):
            stack.append(layers.Dense(width, activation=tf.sigmoid))
            stack.append(layers.LayerNormalization())
            stack.append(layers.Dropout(0.5))
        stack.append(layers.Dense(1))
        return tf.keras.Sequential(stack)

    # All nine networks are created before any training starts.
    models = [[build_regressor() for _ in range(3)] for _ in range(3)]

    for i, group in enumerate(models):
        print(f'#{i}:')
        for model in group:
            model.compile(optimizer=keras.optimizers.Adam(),
                          loss=keras.losses.MeanAbsoluteError(),
                          metrics=[keras.metrics.MeanAbsoluteError(),
                                   keras.metrics.MeanSquaredError()])

            callback = keras.callbacks.EarlyStopping(monitor='val_mean_absolute_error',
                                                     patience=10,
                                                     restore_best_weights=True)

            # np.random.choice samples with replacement by default, so the
            # half-sized draw may contain repeated positions.
            n = len(pca_train_X[i])
            idx = np.random.choice(np.arange(n), size=int(n * 0.5))

            model.fit(pca_train_X[i][idx], train_y[idx],
                      callbacks=[callback],
                      batch_size=512,
                      epochs=100,
                      verbose=0,
                      validation_data=(pca_dev_X[i], dev_y))

            model.evaluate(pca_test_X[i], test_y, verbose=2)

    return models

In [9]:
from sklearn.metrics import classification_report, \
    mean_absolute_error, mean_squared_error, r2_score

def get_pred(models, X):
    """Run every model on the input that belongs to its group.

    Args:
        models: Three groups of callables; group ``i`` is applied to ``X[i]``
            and each call result must offer ``.numpy()``.
        X: Sequence with one input per group.

    Returns:
        list: One flattened prediction array per model, group by group.
    """
    return [
        model(X[i]).numpy().reshape(-1)
        for i in range(3)
        for model in models[i]
    ]

def evaluate(preds, y, group=1001):
    """Score ensemble predictions per block of ``group`` consecutive samples.

    The samples are processed in consecutive blocks of ``group`` entries that
    must all share one label. For each block the prediction is the average
    over all models of that model's mean prediction inside the block.
    Regression metrics are computed on these block predictions and on their
    rounded values; a classification report compares rounded labels with
    rounded predictions. Everything is printed as well as returned.

    Args:
        preds: Non-empty list of 1-D prediction arrays of equal length.
        y: NumPy array of labels aligned with the prediction arrays.
        group: Number of consecutive samples per block. The default matches
            the default ``num_samples`` of ``process_dataset``; pass the
            value used there when it was changed.

    Returns:
        tuple: ``(mae, mse, r2, r_mae, r_mse, r_r2)`` where the ``r_``
        values use the rounded predictions.

    Raises:
        AssertionError: If ``preds`` is empty, the prediction arrays differ
            in length, their length is not a multiple of ``group``, or a
            block of ``y`` holds more than one label value.
    """
    assert len(preds) > 0, 'evaluate() needs at least one prediction array'
    n = len(preds[0])
    for pred in preds:
        assert len(pred) % group == 0, \
            f'prediction length {len(pred)} is not a multiple of group={group}'
        assert len(pred) == n, 'prediction arrays differ in length'

    y_true = []
    y_pred = []
    y_true_c = []
    y_pred_c = []
    for k in range(0, n, group):
        block = slice(k, k + group)
        assert np.sum(y[block] == y[k]) == group, \
            f'labels in block starting at {k} are not all equal'
        y_true.append(y[k])

        # Equal-weight average of the per-model block means.
        s = 0
        for pred in preds:
            s += np.mean(pred[block]) / len(preds)
        y_pred.append(s)

        y_true_c.append(int(round(y_true[-1])))
        y_pred_c.append(int(round(y_pred[-1])))

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Same metrics, but with the predictions rounded to integers first.
    r_mae = mean_absolute_error(y_true, y_pred_c)
    r_mse = mean_squared_error(y_true, y_pred_c)
    r_r2 = r2_score(y_true, y_pred_c)

    print('MAE =', mae)
    print('MSE =', mse)
    print('r^2 =', r2)
    print()
    print('Rounded MAE =', r_mae)
    print('Rounded MSE =', r_mse)
    print('Rounded r^2 =', r_r2)

    print(classification_report(y_true_c, y_pred_c))

    return mae, mse, r2, r_mae, r_mse, r_r2

def test_pipeline():
    """Train a fresh set of models and report scores on all three splits.

    The training, dev and test splits are evaluated in that order, each
    preceded by a banner line.

    Returns:
        The metric tuple that ``evaluate`` returns for the test split.
    """
    models = get_models()
    splits = (
        ('\n' + '=' * 30 + '\n\ntraining set:', pca_train_X, train_y),
        ('-' * 30 + '\ndev set:', pca_dev_X, dev_y),
        ('-' * 30 + '\ntest set:', pca_test_X, test_y),
    )
    scores = None
    for banner, inputs, labels in splits:
        print(banner)
        scores = evaluate(get_pred(models, inputs), labels)
    # The loop ends on the test split, so these are the test scores.
    return scores

In [10]:
tf.random.set_seed(2020)
np.random.seed(2020)

num_tests = 10

# One slot per round for each test-split metric; the r_ arrays hold the
# scores computed on rounded predictions.
mae, mse, r2 = (np.zeros(num_tests) for _ in range(3))
r_mae, r_mse, r_r2 = (np.zeros(num_tests) for _ in range(3))

for i in range(num_tests):
    print(f'round {i}:')
    _scores = test_pipeline()
    mae[i], mse[i], r2[i], r_mae[i], r_mse[i], r_r2[i] = _scores
    print('\n' + '+' * 50 + '\n')

# Mean and standard deviation of every metric over all rounds.
print('*' * 50)
for _label, _values in (('MAE', mae), ('MSE', mse), ('r^2', r2)):
    print(f'{_label} = {_values.mean():.4f} (+/- {_values.std():.4f})')
print()
for _label, _values in (('Rounded MAE', r_mae), ('Rounded MSE', r_mse), ('Rounded r^2', r_r2)):
    print(f'{_label} = {_values.mean():.4f} (+/- {_values.std():.4f})')

round 0:
#0:
783/783 - 1s - loss: 0.2539 - mean_absolute_error: 0.2539 - mean_squared_error: 0.2056
783/783 - 1s - loss: 0.2870 - mean_absolute_error: 0.2870 - mean_squared_error: 0.2262
783/783 - 1s - loss: 0.2627 - mean_absolute_error: 0.2627 - mean_squared_error: 0.2004
#1:
783/783 - 1s - loss: 0.3085 - mean_absolute_error: 0.3085 - mean_squared_error: 0.2145
783/783 - 1s - loss: 0.2948 - mean_absolute_error: 0.2948 - mean_squared_error: 0.2256
783/783 - 1s - loss: 0.3101 - mean_absolute_error: 0.3101 - mean_squared_error: 0.2096
#2:
783/783 - 1s - loss: 0.3214 - mean_absolute_error: 0.3214 - mean_squared_error: 0.3029
783/783 - 1s - loss: 0.3224 - mean_absolute_error: 0.3224 - mean_squared_error: 0.2390
783/783 - 1s - loss: 0.2505 - mean_absolute_error: 0.2505 - mean_squared_error: 0.1907


training set:
MAE = 0.13161218020651072
MSE = 0.028567983984431524
r^2 = 0.8903994817085623

Rounded MAE = 0.020000000000000004
Rounded MSE = 0.007333333333333334
Rounded r^2 = 0.971865808431732

#0:
783/783 - 1s - loss: 0.3094 - mean_absolute_error: 0.3094 - mean_squared_error: 0.2530
783/783 - 1s - loss: 0.2505 - mean_absolute_error: 0.2505 - mean_squared_error: 0.1896
783/783 - 1s - loss: 0.3107 - mean_absolute_error: 0.3107 - mean_squared_error: 0.2390
#1:
783/783 - 1s - loss: 0.3181 - mean_absolute_error: 0.3181 - mean_squared_error: 0.1744
783/783 - 1s - loss: 0.2777 - mean_absolute_error: 0.2777 - mean_squared_error: 0.1734
783/783 - 1s - loss: 0.3594 - mean_absolute_error: 0.3594 - mean_squared_error: 0.1615
#2:
783/783 - 1s - loss: 0.3290 - mean_absolute_error: 0.3290 - mean_squared_error: 0.3090
783/783 - 1s - loss: 0.3258 - mean_absolute_error: 0.3258 - mean_squared_error: 0.2943
783/783 - 1s - loss: 0.2607 - mean_absolute_error: 0.2607 - mean_squared_error: 0.2156


training set:
MAE = 0.159656526071054
MSE = 0.03674356435444179
r^2 = 0.8590340256660659

Rounded MAE = 0.020000000000000004
Rounded MSE = 0.007333333333333334
Rounded r^2 = 0.971865808431732
           

#0:
783/783 - 1s - loss: 0.3448 - mean_absolute_error: 0.3448 - mean_squared_error: 0.3013
783/783 - 1s - loss: 0.2331 - mean_absolute_error: 0.2331 - mean_squared_error: 0.1850
783/783 - 1s - loss: 0.3404 - mean_absolute_error: 0.3404 - mean_squared_error: 0.2837
#1:
783/783 - 1s - loss: 0.3282 - mean_absolute_error: 0.3282 - mean_squared_error: 0.1887
783/783 - 1s - loss: 0.2773 - mean_absolute_error: 0.2773 - mean_squared_error: 0.2122
783/783 - 1s - loss: 0.3956 - mean_absolute_error: 0.3956 - mean_squared_error: 0.1823
#2:
783/783 - 1s - loss: 0.3316 - mean_absolute_error: 0.3316 - mean_squared_error: 0.3016
783/783 - 1s - loss: 0.3551 - mean_absolute_error: 0.3551 - mean_squared_error: 0.3253
783/783 - 1s - loss: 0.2417 - mean_absolute_error: 0.2417 - mean_squared_error: 0.1793


training set:
MAE = 0.16600125109707867
MSE = 0.040646620545062864
r^2 = 0.8440600260430684

Rounded MAE = 0.020000000000000004
Rounded MSE = 0.007333333333333334
Rounded r^2 = 0.971865808431732
        