In [1]:
import os
import numpy as np
import re
import tqdm

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error as mae

### Functions related to the model
(i.e., k-fold cross-validation, fitting and evaluating the model, and the evaluation metrics)

In [2]:
def cs(gt, pred, alpha):
    """Cumulative score: percentage of predictions within ``alpha`` of the truth.

    Parameters
    ----------
    gt : sequence of numbers
        Ground-truth values.
    pred : sequence of numbers
        Predicted values, indexed in parallel with ``gt``.
    alpha : number
        Largest absolute error that still counts as a hit (inclusive).

    Returns
    -------
    float
        ``hits / len(gt) * 100``.
    """
    n_total = len(gt)
    hits = sum(1 for pos in range(n_total) if abs(gt[pos] - pred[pos]) <= alpha)
    return hits / n_total * 100

In [3]:
def kfold_cv(model, features, labels, k, seed):
    """Score ``model`` with shuffled, stratified k-fold cross-validation.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Re-fitted from the training part of every fold.
    features : array
        Must support indexing with an array of row indices.
    labels : sequence
        One label per row of ``features``; also used for stratification.
    k : int
        Number of folds.
    seed : int
        Random state of the splitter, so that splits repeat between runs.

    Returns
    -------
    tuple
        ``(mean MAE, std MAE, mean CS(25), std CS(25), mean CS(0), std CS(0))``
        taken over the ``k`` folds.
    """
    # The dataset is imbalanced, hence stratified folds; the fixed seed keeps
    # the train/validation splits identical from run to run.
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)

    fold_scores = []  # one (MAE, CS(25), CS(0)) triple per fold
    for fit_idx, val_idx in splitter.split(features, labels):
        fit_labels = [labels[i] for i in fit_idx]
        val_labels = [labels[i] for i in val_idx]

        model.fit(features[fit_idx], fit_labels)
        # Predictions are truncated to int before scoring.
        val_pred = [int(p) for p in model.predict(features[val_idx])]

        fold_scores.append((
            mae(val_labels, val_pred),
            cs(val_labels, val_pred, 25),
            cs(val_labels, val_pred, 0),
        ))

    mae_k = [scores[0] for scores in fold_scores]
    cs_k = [scores[1] for scores in fold_scores]
    cs_1 = [scores[2] for scores in fold_scores]
    return np.mean(mae_k), np.std(mae_k), np.mean(cs_k), np.std(cs_k), np.mean(cs_1), np.std(cs_1)

In [18]:
def kfold_cv_aug(model, features_norm, features_aug, labels_norm, labels_aug, k, n_aug, seed):
    """Stratified k-fold cross-validation that adds augmented samples to training.

    Folds are drawn over the original samples only. For every fold the model is
    trained on the original training samples plus every augmented sample whose
    source sample is not in the validation fold, and is scored on the original
    validation samples.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    features_norm, labels_norm : sequences
        Original samples and their labels.
    features_aug, labels_aug : sequences
        Augmented samples and their labels. Augmented sample ``idx`` is mapped
        to original sample ``idx // n_aug``.
        NOTE(review): this assumes exactly ``n_aug`` consecutive augmentations
        per original sample, in the same order as ``features_norm`` - confirm.
    k : int
        Number of folds.
    n_aug : int
        Number of augmented samples per original sample (must be non-zero).
    seed : int
        Random state of the splitter.

    Returns
    -------
    tuple
        ``(mean MAE, std MAE, mean CS(25), std CS(25), mean CS(0), std CS(0))``
        taken over the ``k`` folds.
    """
    mae_k = []
    cs_k = []
    cs_1 = []

    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    # kf.split() is a generator, so tqdm needs the fold count to show progress.
    for train_index, test_index in tqdm.tqdm(kf.split(features_norm, labels_norm), total=k):
        test = [features_norm[idx] for idx in test_index]
        test_labels = [labels_norm[idx] for idx in test_index]

        train = [features_norm[idx] for idx in train_index]
        train_labels = [labels_norm[idx] for idx in train_index]

        # A set gives constant-time lookups; testing membership against the
        # index array would rescan it for every augmented sample.
        held_out = {int(i) for i in test_index}
        for idx in range(len(features_aug)):
            if idx // n_aug not in held_out:
                train.append(features_aug[idx])
                train_labels.append(labels_aug[idx])

        model.fit(train, train_labels)
        # Predictions are truncated to int before scoring.
        pred = [int(i) for i in model.predict(test)]

        mae_k.append(mae(test_labels, pred))
        cs_k.append(cs(test_labels, pred, 25))
        cs_1.append(cs(test_labels, pred, 0))
    return np.mean(mae_k), np.std(mae_k), np.mean(cs_k), np.std(cs_k), np.mean(cs_1), np.std(cs_1)

In [5]:
def model(model, train_features, test_features, train_labels, test_labels):
    """Fit ``model`` on the training data and score it on the test data.

    Note that the ``model`` parameter shadows this function's own name inside
    the body.

    Returns
    -------
    tuple
        ``(MAE, CS(alpha=25), CS(alpha=1))``, each rounded to 4 decimals.
    """
    model.fit(train_features, train_labels)
    # Predictions are truncated to int before scoring.
    pred = [int(raw) for raw in model.predict(test_features)]

    scores = (
        mae(test_labels, pred),
        cs(test_labels, pred, 25),
        cs(test_labels, pred, 1),
    )
    return tuple(round(score, 4) for score in scores)

def plot_model(model, train_features, test_features, train_labels, test_labels):
    """Fit ``model``, scatter-plot predictions against true labels, and score it.

    Returns
    -------
    tuple
        ``(MAE, CS(alpha=25), CS(alpha=1))``, each rounded to 4 decimals.
    """
    model.fit(train_features, train_labels)
    # Predictions are truncated to int before plotting and scoring.
    pred = list(map(int, model.predict(test_features)))

    # True labels on the x axis, predictions on the y axis.
    plt.scatter(test_labels, pred)
    plt.xlabel('true')
    plt.ylabel('pred')
    plt.show()

    mae_score = round(mae(test_labels, pred), 4)
    cs_25 = round(cs(test_labels, pred, 25), 4)
    cs_1 = round(cs(test_labels, pred, 1), 4)
    return mae_score, cs_25, cs_1

### Functions to process the features

In [6]:
# Feature-file layouts understood by get_features2:
# HINGE - the first two tokens of every line are skipped, the rest is kept as-is.
# JUNC  - every token of a line is parsed as a float.
HINGE = 0
JUNC = 1

In [7]:
def rescale_features(features):
    """Min-max rescale every feature column to the range between 0 and 1.

    The scaler is fitted on, and then applied to, the very same ``features``.

    Returns
    -------
    The rescaled data as returned by ``MinMaxScaler``.
    """
    feature_matrix = np.asarray(features)
    return MinMaxScaler().fit_transform(feature_matrix)

In [8]:
def rescale_features_split(features1, features2):
    """Min-max rescale two feature sets with one shared scaler.

    The scaler is fitted on the row-wise concatenation of both sets, so the
    same per-column minimum and maximum are applied to each of them.

    Returns
    -------
    tuple
        ``(rescaled features1, rescaled features2)``.
    """
    first, second = np.array(features1), np.array(features2)
    stacked = np.concatenate((first, second), axis=0)
    scaler = MinMaxScaler().fit(stacked)
    return scaler.transform(first), scaler.transform(second)

In [13]:
def get_features2(feature_dir, hinge_or_junc, n_aug):
    """Read feature vectors and labels from the files in ``feature_dir``.

    Files are visited in sorted name order. A file whose name carries an
    augmentation suffix (``_<one or two digits>`` followed by any character and
    ``p``) is read only when that number is ``<= n_aug``; files without such a
    suffix are always read. ``.DS_Store`` and sub-directories are skipped.

    Every line of a file yields one sample:

    * ``HINGE``: the line is split on single spaces and the first two tokens are
      dropped; the remaining tokens are kept as strings (not converted here).
    * ``JUNC``: every token of the line is converted to ``float``.

    The label of each sample is the first run of four digits in the file name.

    Parameters
    ----------
    feature_dir : str
        Directory holding the feature files.
    hinge_or_junc : int
        ``HINGE`` or ``JUNC``; selects how lines are parsed.
    n_aug : int
        Largest augmentation number to include.

    Returns
    -------
    tuple
        ``(features, labels)`` - two parallel lists.

    Raises
    ------
    AttributeError
        If a non-empty file has no four-digit label in its name.
    """
    features = []
    labels = []
    print(n_aug, feature_dir)

    for file in sorted(os.listdir(feature_dir)):
        aug_num = re.search("(_[0-9]?[0-9].p)", file)
        if aug_num is not None:
            aug_num = re.search("([0-9]?[0-9])", aug_num.group())
        if aug_num is not None and int(aug_num.group()) > n_aug:
            continue  # augmentation number beyond the requested amount

        path = os.path.join(feature_dir, file)
        if file == ".DS_Store" or os.path.isdir(path):
            continue

        # The label depends only on the file name, so look it up once per file.
        label = re.search("([0-9][0-9][0-9][0-9])", file) #MPS
        #label = re.search("(-?[0-9][0-9][0-9])", file) #DSS

        # The context manager closes the handle even if parsing fails.
        with open(path) as f:
            for line in f:
                line = line.rstrip().split(" ")
                if hinge_or_junc == HINGE:
                    features.append(line[2:])
                elif hinge_or_junc == JUNC:
                    features.append([float(el) for el in line])

                labels.append(int(label.group()))

    return features, labels

In [10]:
def get_features(featuredir, size, n_aug):
    """Load the features and labels of one feature type.

    Parameters
    ----------
    featuredir : str
        Feature directory name below ``./Data/DSS/features/``. ``'junclets'``
        and ``'test/junclets'`` are read with the ``JUNC`` layout from their
        ``size_<size>/`` sub-directory; anything else is read with the
        ``HINGE`` layout.
    size : int
        Codebook size; only used for the junclets variants.
    n_aug : int
        Passed through to ``get_features2``.

    Returns
    -------
    tuple
        ``(features, labels)`` as returned by ``get_features2``.
    """
    prefix = './Data/DSS/features/'
    if featuredir in ('junclets', 'test/junclets'):
        # Previously `prefix` was never assigned on this path (its definition
        # was commented out), so the call failed before reading anything.
        # NOTE(review): the directory layout is restored from that
        # commented-out line - confirm it matches the data on disk.
        junc_dir = prefix + featuredir + '/size_' + str(size) + '/'
        return get_features2(junc_dir, JUNC, n_aug)
    return get_features2(prefix + featuredir, HINGE, n_aug)

def get_features_aug(featuredir, size, n_aug):
    """Load the original and the augmented features of one feature type.

    For ``'junclets'`` both sets are read with the ``JUNC`` layout from the
    ``size_<size>/`` sub-directories under ``./tfsom/MPS/all/``. Every other
    ``featuredir`` is read with the ``HINGE`` layout from
    ``./Data/DSS/features/``.

    Returns
    -------
    tuple
        ``(features_norm, labels_norm, features_aug, labels_aug)``.
    """
    if featuredir == 'junclets':
        # DSS alternatives that were used here before:
        #   './Data/DSS/features/' + featuredir + '/size_' + str(size) + '/'
        #   './Data/DSS/junc_aug/features_aug/' + featuredir + '/size_' + str(size) + '/'
        layout = JUNC
        norm_dir = './tfsom/MPS/all/junclets/size_' + str(size) + '/'
        aug_dir = './tfsom/MPS/all/junclets_aug/size_' + str(size) + '/'
    else:
        layout = HINGE
        base = './Data/DSS/features/'
        norm_dir = base + featuredir
        aug_dir = base + 'features_aug_15/' + featuredir + '_aug'

    # Original samples are read first, then the augmented ones.
    features_norm, labels_norm = get_features2(norm_dir, layout, n_aug)
    features_aug, labels_aug = get_features2(aug_dir, layout, n_aug)
    return features_norm, labels_norm, features_aug, labels_aug

### Getting indices for test set

In [13]:
# MPS
# Draw the held-out test positions once. The generator is seeded so that
# re-running this cell reproduces the same split instead of silently
# overwriting the saved file with a different one.
size_data = 3267
test_size = 0.1 #as a fraction
test_split_seed = 0
rng = np.random.default_rng(test_split_seed)
test_indices = rng.choice(size_data, int(size_data * test_size), replace=False)
# NOTE(review): the split is written to './test_indices.npy', while the loading
# cell reads './tfsom/MPS/test_indices.npy' - confirm the working directory.
np.save('./test_indices.npy', test_indices)

In [None]:
# EA and DSS
idx = 0
for file in sorted(os.listdir('./Data/DSS/DSS_jpg_re/')):
    label = re.search("(-?[0-9][0-9][0-9])", file)
    print(label.group(), idx)
    idx += 1

np.save('./Data/DSS/test_indices.npy', [0, 5, 6, 16, 17])

In [11]:
# Load the held-out test positions used by the tuning and testing cells below.
# Exactly one of the two lines should be active, matching the data set in use.
#test_indices = np.load('./Data/DSS/test_indices.npy')
test_indices = np.load('./tfsom/MPS/test_indices.npy')

## Tuning Hyperparameters
Hyperparameters are tuned with stratified k-fold cross-validation, using k = 10 for the MPS data and k = 4 for the EA and DSS data.

With original data

In [None]:
# Grid search over the SVM parameter C for every hinge-type feature, using
# k-fold cross-validation on the non-held-out samples, averaged over seeds.
feat_names = ['hinge', 'cohinge', 'quadhinge', 'deltahinge', 'tcchinge']

k = 4 ## EA and DSS
# k = 10 ## MPS
Cs = [pow(2, n) for n in range(-7, 10, 1)]
seeds = [0, 50, 100, 150, 200, 250]

# Set lookup instead of rescanning the index array for every sample.
held_out = {int(i) for i in test_indices}

for featuredir in feat_names:
    print(featuredir)
    features, labels = get_features(featuredir, 0, 0)
    # NOTE(review): the scaler is fitted on all samples, including the held-out
    # ones, before the split is made - confirm this is intended.
    features = rescale_features(features)
    test = [features[idx] for idx in test_indices]
    test_labels = [labels[idx] for idx in test_indices]

    train = []
    train_labels = []
    for idx in range(len(features)):
        if idx not in held_out:
            train.append(features[idx])
            train_labels.append(labels[idx])

    # Converted once here rather than once per (seed, C) combination.
    train_arr = np.array(train)
    train_labels_arr = np.array(train_labels)

    # One row per C: [C, mean MAE, std MAE, mean CS(25), std CS(25), mean CS(0), std CS(0)]
    results = [[C, 0, 0, 0, 0, 0, 0] for C in Cs]

    for seed in seeds:
        for row in results:
            clf = kfold_cv(svm.SVC(kernel='linear', decision_function_shape='ovr', C=row[0]), train_arr, train_labels_arr, k, seed)
            for res_idx, value in enumerate(clf, start=1):
                row[res_idx] += value

    # Turn the per-seed sums into averages; the C column stays untouched.
    for row in results:
        for res_idx in range(1, len(row)):
            row[res_idx] = row[res_idx]/len(seeds)

    for res in results:
        print(res)
    np.save('./Data/DSS/validation_' + featuredir + '.npy', results) ## Change according to data set

In [None]:
# Grid search over the SVM parameter C for the junclets feature at several
# codebook sizes, using k-fold cross-validation averaged over seeds.
featuredir = 'junclets'

k = 4 ## EA and DSS
# k = 10 ## MPS
Cs = [pow(2, n) for n in range(-7,10, 1)]
seeds = [0, 50, 100, 150, 200, 250]

# Set lookup instead of rescanning the index array for every sample.
held_out = {int(i) for i in test_indices}

for cb_size in range(5, 35, 5):
    print(cb_size)
    features, labels = get_features(featuredir, cb_size, 0)
    # NOTE(review): the scaler is fitted on all samples, including the held-out
    # ones, before the split is made - confirm this is intended.
    features = rescale_features(features)
    test = [features[idx] for idx in test_indices]
    test_labels = [labels[idx] for idx in test_indices]

    train = []
    train_labels = []
    for idx in range(len(features)):
        if idx not in held_out:
            train.append(features[idx])
            train_labels.append(labels[idx])

    # Converted once here rather than once per (seed, C) combination.
    train_arr = np.array(train)
    train_labels_arr = np.array(train_labels)

    # One row per C: [C, mean MAE, std MAE, mean CS(25), std CS(25), mean CS(0), std CS(0)]
    results = [[C, 0, 0, 0, 0, 0, 0] for C in Cs]
    for seed in seeds:
        for row in results:
            clf = kfold_cv(svm.SVC(kernel='linear', decision_function_shape='ovr', C=row[0]), train_arr, train_labels_arr, k, seed)
            for res_idx, value in enumerate(clf, start=1):
                row[res_idx] += value

    # Turn the per-seed sums into averages; the C column stays untouched.
    for row in results:
        for res_idx in range(1, len(row)):
            row[res_idx] = row[res_idx]/len(seeds)

    for res in results:
        print(res)
    np.save('./Data/DSS/validation_' + featuredir + '_' + str(cb_size) + '.npy', results) ## Change according to data set

With augmented data

In [None]:
# Grid search over the SVM parameter C with augmented training data, using
# k-fold cross-validation on the non-held-out samples, averaged over seeds.
feat_names = ['hinge', 'cohinge', 'quadhinge', 'deltahinge', 'tcchinge', 'junclets']

## EA and DSS
# k = 4
# n_aug = 15
# cb_size = 15

## MPS
k = 10
n_aug = 3
cb_size = 25

Cs = [pow(2, n) for n in range(-7,10, 1)]
seeds = [0, 50, 100, 150, 200, 250]

# Set lookup instead of rescanning the index array for every sample.
held_out = {int(i) for i in test_indices}

for featuredir in feat_names:
    print(featuredir)
    features_norm, labels_norm, features_aug, labels_aug = get_features_aug(featuredir, cb_size, n_aug)
    features_norm, features_aug = rescale_features_split(features_norm, features_aug)
    test = [features_norm[idx] for idx in test_indices]
    test_labels = [labels_norm[idx] for idx in test_indices]

    # Augmented sample idx is mapped to original sample idx // n_aug; drop the
    # augmentations whose source sample is held out.
    train_aug = []
    train_aug_labels = []
    for idx in range(len(features_aug)):
        if idx // n_aug not in held_out:
            train_aug.append(features_aug[idx])
            train_aug_labels.append(labels_aug[idx])

    train_norm = []
    train_norm_labels = []
    for idx in range(len(features_norm)):
        if idx not in held_out:
            train_norm.append(features_norm[idx])
            train_norm_labels.append(labels_norm[idx])

    # One row per C: [C, mean MAE, std MAE, mean CS(25), std CS(25), mean CS(0), std CS(0)]
    results = [[C, 0, 0, 0, 0, 0, 0] for C in Cs]

    for seed in tqdm.tqdm(seeds):
        print(seed)
        for row in results:
            print(row[0])
            clf = kfold_cv_aug(svm.SVC(kernel='linear', decision_function_shape='ovr', C=row[0]), train_norm, train_aug, train_norm_labels, train_aug_labels, k, n_aug, seed=seed)
            for res_idx, value in enumerate(clf, start=1):
                row[res_idx] += value

    # Turn the per-seed sums into averages; the C column stays untouched.
    for row in results:
        for res_idx in range(1, len(row)):
            row[res_idx] = row[res_idx]/len(seeds)

    for res in results:
        print(res)
    np.save('./Data/MPS/validation_' + featuredir + '_aug_' + '.npy', results) ## Change according to data set

# Testing

With original data

In [None]:
# Final evaluation on the held-out samples, trained on original data only.
# Cs[i] is the tuned C for feat_names[i].
feat_names = ['hinge', 'cohinge', 'quadhinge', 'deltahinge', 'tcchinge', 'junclets']

## EA and DSS
# k = 4
# n_aug = 15
# cb_size = 15
# Cs = [1, 1, 1, 0.125, 0.125, 0.03125]

## MPS
k = 10
n_aug = 3
cb_size = 25
Cs = [8, 0.0625, 0.125, 1, 1, 0.0625]

# The two lists are paired by position, so they must have the same length.
assert len(Cs) == len(feat_names), "need exactly one C per feature type"

# Set lookup instead of rescanning the index array for every sample.
held_out = {int(i) for i in test_indices}

for idx_c, (featuredir, C) in enumerate(zip(feat_names, Cs)):
    print(featuredir)
    features, labels = get_features(featuredir, cb_size, 0)
    # NOTE(review): the scaler is fitted on all samples, including the held-out
    # ones, before the split is made - confirm this is intended.
    features = rescale_features(features)
    test = [features[idx] for idx in test_indices]
    test_labels = [labels[idx] for idx in test_indices]

    train = []
    train_labels = []
    for idx in range(len(features)):
        if idx not in held_out:
            train.append(features[idx])
            train_labels.append(labels[idx])

    print(idx_c, C)
    mae_res, cs_res25, cs_res1 = plot_model(svm.SVC(kernel='linear', decision_function_shape='ovr', C=C), train, test, train_labels, test_labels)
    print("MAE: %.4f  \t CS (=25): %.4f  \t CS(=1): %.4f " % (mae_res, cs_res25, cs_res1))
    print("%.4f,%.4f,%.4f" % (mae_res, cs_res25, cs_res1))

With augmented data

In [None]:
# Final evaluation on the held-out samples, trained on original plus augmented
# data. Cs[i] is the tuned C for feat_names[i].
feat_names = ['hinge', 'cohinge', 'quadhinge', 'deltahinge','tcchinge', 'junclets']

## EA and DSS
# k = 4
# n_aug = 15
# cb_size = 15
# Cs = [1, 1, 1, 2, 0.25, 0.25]

## MPS
k = 10
n_aug = 3
cb_size = 25
Cs = [2, 0.0625, 0.0625, 1, 1, 0.0625]

# The two lists are paired by position, so they must have the same length.
assert len(Cs) == len(feat_names), "need exactly one C per feature type"

# Set lookup instead of rescanning the index array for every sample.
held_out = {int(i) for i in test_indices}

for idx_c, (featuredir, C) in enumerate(zip(feat_names, Cs)):
    print(featuredir)
    features_norm, labels_norm, features_aug, labels_aug = get_features_aug(featuredir, cb_size, n_aug)
    features_norm, features_aug = rescale_features_split(features_norm, features_aug)

    test = [features_norm[idx] for idx in test_indices]
    test_labels = [labels_norm[idx] for idx in test_indices]

    # Augmented sample idx is mapped to original sample idx // n_aug; drop the
    # augmentations whose source sample is held out.
    train_aug = []
    train_aug_labels = []
    for idx in range(len(features_aug)):
        if idx // n_aug not in held_out:
            train_aug.append(features_aug[idx])
            train_aug_labels.append(labels_aug[idx])

    train_norm = []
    train_norm_labels = []
    for idx in range(len(features_norm)):
        if idx not in held_out:
            train_norm.append(features_norm[idx])
            train_norm_labels.append(labels_norm[idx])

    train = train_norm + train_aug
    train_labels = train_norm_labels + train_aug_labels
    print(len(train), len(train_labels), len(train_aug))

    mae_res, cs_res25, cs_res1 = plot_model(svm.SVC(kernel='linear', decision_function_shape='ovr', C=C), train, test, train_labels, test_labels)
    print("MAE: %.4f  \t CS (=25): %.4f  \t CS(=1): %.4f " % (mae_res, cs_res25, cs_res1))
    print("%.4f,%.4f,%.4f" % (mae_res, cs_res25, cs_res1))

#### Other

In [None]:
# Count how many held-out samples fall on each 25-year label from 1300 to 1550.
# NOTE(review): `aug` lists the same directory as `origin` and is not used
# afterwards - confirm whether another directory was intended.
hinge_dir = './all/hinge/'
aug = sorted(os.listdir(hinge_dir))
origin = sorted(os.listdir(hinge_dir))
print(len(origin))

# The label is the first run of four digits in the file name, kept as a string.
test_labels = []
for idx in test_indices:
    label = re.search("([0-9][0-9][0-9][0-9])", origin[idx])
    test_labels += [label.group()]

for year in range(1300, 1575, 25):
    n_with_label = test_labels.count(str(year))
    print(n_with_label)

In [None]:
# Print the feature-file names at the held-out positions.
# Filtering, rather than list.remove(), also works when the directory holds
# no '.DS_Store' entry (remove() raises ValueError in that case).
# NOTE(review): 'cchinge' does not appear among the feature names used by the
# other cells - confirm the directory name.
files = [name for name in sorted(os.listdir('./Data/DSS/features/cchinge/')) if name != '.DS_Store']

print(len(test_indices))
for idx in test_indices:
    print(files[idx])
