In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

In [2]:
# Inputs written by the previous notebook: the train/test frames and the saved
# prediction array that gets thresholded further down.
TRAIN_CSV = '../data/train_2.csv'
TEST_CSV = '../data/test_2.csv'
PREDS_NPY = '../data/cat3_preds.npy'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)
preds = np.load(PREDS_NPY)

In [3]:
# Batch boundaries, expressed in units of 100K samples, over the concatenated
# train + test signal.
BATCHES = np.array([0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 65, 70])
# CATEGORIES[i] is the category of the batch spanning BATCHES[i]..BATCHES[i + 1].
CATEGORIES = np.array([1, 1, 2, 3, 5, 4, 2, 3, 4, 5, 6, 3, 4, 6, 2, 5, 4, 5, 6, 3, 6, 6])
CATEGORY = 3

UNIT = 100_000        # samples per boundary unit
FIRST_TEST_UNIT = 50  # batches starting at or past this unit get placeholder labels (-1)
TRIMMED_START = 35    # batch of which only the first and last 100K samples are kept
                      # NOTE(review): the middle of this batch is dropped on purpose,
                      # presumably because it is unusable -- confirm.

signal = np.concatenate((train['signal'].values, test['signal'].values))
open_channels = train['open_channels'].values

ix = np.where(CATEGORIES == CATEGORY)[0]
starts = BATCHES[ix]
ends = BATCHES[ix + 1]

# Collect array chunks and concatenate once instead of growing Python lists
# element by element. The lists are seeded with empty slices so the final
# concatenate still works (and yields empty arrays) when no batch matches.
signal_parts = [signal[:0]]
label_parts = [open_channels[:0]]
for start, end in zip(starts, ends):
    lo, hi = start * UNIT, end * UNIT
    subsignal = signal[lo:hi]
    if start < FIRST_TEST_UNIT:
        subchannels = open_channels[lo:hi]
    else:
        # No ground truth for these batches: mark every sample with -1.
        subchannels = np.full((end - start) * UNIT, -1, dtype=open_channels.dtype)

    if start == TRIMMED_START:
        subsignal = np.concatenate((subsignal[:UNIT], subsignal[-UNIT:]))
        subchannels = np.concatenate((subchannels[:UNIT], subchannels[-UNIT:]))

    signal_parts.append(subsignal)
    label_parts.append(subchannels)

X = np.concatenate(signal_parts)
y = np.concatenate(label_parts)
print(len(X), len(y))

900000 900000


In [4]:
def optimize_thres_unsupervised(pred, sig=None, batch_size=100_000, clean_tol=0.26, verbose=True):
    """
    Unsupervised threshold optimization.

    For every full batch of `batch_size` samples we first take the "clean" signal
    values, i.e. those that are very close to their rounded value
    (|x - round(x)| < clean_tol). We then calculate the percentage with which each
    open channel value occurs in these clean signal values. Afterwards, we
    determine the thresholds that extrapolate these percentages to the whole
    batch again, and use them to discretize the predictions of that batch.

    Parameters
    ----------
    pred : np.ndarray
        1-D array of predictions to discretize. It is not modified.
    sig : np.ndarray, optional
        1-D signal aligned index-by-index with `pred`. When omitted, the
        notebook-level array `X` is used (the original behaviour).
    batch_size : int, optional
        Number of samples per batch; thresholds are fitted per batch.
    clean_tol : float, optional
        Maximum distance to the nearest integer for a signal value to count
        as "clean".
    verbose : bool, optional
        Print the first five thresholds of every batch.

    Returns
    -------
    np.ndarray
        Copy of `pred` in which every value v of a full batch with
        -99 <= v < 99 is replaced by its class index (0..10). Values outside
        that range, and trailing samples that do not fill a whole batch, are
        returned unchanged.

    Raises
    ------
    ValueError
        If `sig` is shorter than the part of `pred` covered by full batches.
    """
    if sig is None:
        sig = X  # notebook-level signal built in an earlier cell
    pred = np.asarray(pred)
    sig = np.asarray(sig)

    n_batches = len(pred) // batch_size
    if len(sig) < n_batches * batch_size:
        raise ValueError(
            "sig has %d samples but %d are needed to cover the full batches of pred"
            % (len(sig), n_batches * batch_size)
        )

    Yopt = pred.copy()
    for k in range(n_batches):
        lo, hi = k * batch_size, (k + 1) * batch_size
        floc = sig[lo:hi]
        clean = floc[np.abs(floc - np.round(floc)) < clean_tol]
        clean_levels = np.round(clean)

        # Sorted batch signal plus a large sentinel, so that index len(floc)
        # (reached when a level covers the whole batch) is still valid.
        # Hoisted out of the level loop: it does not depend on the level.
        ordered = np.concatenate([np.sort(floc), [19]])

        thres = np.zeros(12)
        thres[0] = -99
        thres[-1] = 99
        for i in range(10):
            # Number of clean samples at level <= i, extrapolated to the batch.
            ni = int(np.count_nonzero(clean_levels <= i))
            ni2 = int(np.round(ni * len(floc) / max(1, len(clean))))
            # ni2 always lies in [0, len(floc)], so the original expression
            # 0.5 * (Ys[max(0, ni2)] + Ys[min(len(Ys) - 1, ni2)]) averaged one
            # element with itself.
            # NOTE(review): a midpoint between neighbours ni2 - 1 and ni2 may
            # have been intended; kept as-is so the thresholds do not change.
            thres[i + 1] = ordered[ni2]

        # Build every mask from the untouched predictions. Re-binning in place
        # would let a value that was just set to class i be captured again by
        # a later bin whose lower threshold is <= i.
        raw = pred[lo:hi]
        binned = Yopt[lo:hi]  # view: assignments land directly in Yopt
        for i in range(11):
            binned[(raw >= thres[i]) & (raw < thres[i + 1])] = i

        if verbose:
            print(thres[:5])

    return Yopt

In [5]:
# Discretize the predictions with thresholds fitted per 100K batch. The call
# also prints the first five thresholds of every batch.
Yopt = optimize_thres_unsupervised(preds)

[-99.           0.41302985   1.48158938   2.51492738   3.72526563]
[-99.           0.41097594   1.45913473   2.50627229   3.73387474]
[-99.           0.44383453   1.4555706    2.51116669   3.71789684]
[-99.           0.44615578   1.46626967   2.50449531   3.72105359]
[-99.           0.40283286   1.45451166   2.51360322   3.71912623]
[-99.           0.41234146   1.45968372   2.50484058   3.71966778]
[-99.           0.42733735   1.4651892    2.51040336   3.73400133]
[-99.           0.4115519    1.466672     2.49394677   3.7259163 ]
[-99.           0.41387161   1.4600116    2.49813522   3.72481574]


In [6]:
# Macro F1 restricted to samples that carry a real label; samples without
# ground truth were given the placeholder -1 when y was built.
has_label = y >= 0
print(f1_score(y[has_label], Yopt[has_label].astype(int), average='macro'))

0.9869704508621362
