In [1]:
import pickle
import numpy as np
# load the datasets
with open('datasets/datasets.pkl', 'rb') as f:
    datasets = pickle.load(f)

In [2]:
label2index = {k: v for v, k in enumerate(list(datasets["base"].keys()))}
label2index

{'0': 0,
 '1': 1,
 '2': 2,
 '3': 3,
 '4': 4,
 '5': 5,
 '6': 6,
 '7': 7,
 '8': 8,
 '9': 9,
 'A': 10,
 'B': 11,
 'C': 12,
 'D': 13,
 'E': 14,
 'F': 15,
 'G': 16,
 'H': 17,
 'J': 18,
 'K': 19,
 'L': 20,
 'M': 21,
 'N': 22,
 'P': 23,
 'Q': 24,
 'R': 25,
 'S': 26,
 'T': 27,
 'U': 28,
 'V': 29,
 'W': 30,
 'X': 31,
 'Y': 32,
 'Z': 33}

In [3]:
def from_dict_to_XY(adict, one_hot=False):
    num_class = len(label2index.keys())
    Y = []
    X = []
    for key in adict:
        if one_hot:
            label = np.zeros([len(adict[key]), num_class])
            label[:, label2index[key]] = 1.0
        else:
            label = np.ones(len(adict[key]))*label2index[key]
        Y.append(label)
        X.append(adict[key])
    X = np.vstack(X)
    if one_hot:
        Y = np.vstack(Y)
    else:
        Y = np.concatenate(Y)
    return X, Y

In [4]:
dataset_names = list(datasets.keys())
dataset_names

['base', 'challenge', 'db', 'fn', 'weather']

In [5]:
X_base, Y_base = from_dict_to_XY(datasets["base"])

In [6]:
Y_base.shape, X_base.shape

((8446,), (8446, 512))

In [7]:
model_zoo = {}

# SVM classification

In [8]:
from sklearn.svm import SVC

model = SVC(random_state=0, gamma='auto')
model.fit(X_base, Y_base)
model_zoo["svm"] = model

In [9]:
model.score(X_base, Y_base)

1.0

In [10]:
accs = {}
for each_name in dataset_names[1:]:
    print("Processing", each_name)
    X, Y = from_dict_to_XY(datasets[each_name])
    accs[each_name] = model.score(X, Y)

Processing challenge
Processing db
Processing fn
Processing weather


In [11]:
accs

{'challenge': 0.17335766423357665,
 'db': 0.19903691813804172,
 'fn': 0.2222222222222222,
 'weather': 0.19622245540398742}

# Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=0)
model.fit(X_base, Y_base)
model_zoo["lgr"] = model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
model.score(X_base, Y_base)

0.952758702344305

In [14]:
accs = {}
for each_name in dataset_names[1:]:
    print("Processing", each_name)
    X, Y = from_dict_to_XY(datasets[each_name])
    accs[each_name] = model.score(X, Y)

Processing challenge
Processing db
Processing fn
Processing weather


In [15]:
accs

{'challenge': 0.7992700729927007,
 'db': 0.7517388978063135,
 'fn': 0.9176954732510288,
 'weather': 0.8764428121720882}

# Save baseline models

In [16]:
with open('models/lgr.pkl', 'wb') as f:
    pickle.dump(model_zoo["lgr"], f)
with open('models/svm.pkl', 'wb') as f:
    pickle.dump(model_zoo["svm"], f)