In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import *
plt.style.use('ggplot')

In [10]:
import pandas as pd

def process_data(data_paths):
    rows = []
    raw_data = [np.load(data_path, allow_pickle=True) for data_path in data_paths]
    balanced_data = balance_data(raw_data)
    clean_data = filter_data(balanced_data)

    for data_idx, events in enumerate(clean_data):
        for event in tqdm(events, desc=f'Processing {data_paths[data_idx]}'):
            features = build_features(event)
            rows.append({'label': data_idx, **features})
    
    data = pd.DataFrame(rows)
    features_data = data[list(set(data.columns) - set(['label']))]
    features_data = (features_data - features_data.mean(axis=0)) / (features_data.std(axis=0))
    return pd.concat([data['label'], features_data], axis=1)

In [11]:
polymer_df = process_data(['../data/AA66266AA.npy', '../data/AA66466AA.npy', '../data/AA66566AA.npy'])

Processing ../data/AA66266AA.npy: 100%|██████████| 20085/20085 [00:40<00:00, 495.16it/s]
Processing ../data/AA66466AA.npy: 100%|██████████| 20776/20776 [01:22<00:00, 252.91it/s]
Processing ../data/AA66566AA.npy: 100%|██████████| 20018/20018 [00:30<00:00, 648.38it/s]


In [12]:
polymer_df

Unnamed: 0,label,num_lows,mean_lows,std_lows,min_current,max_current,dwell_start,min_amp,num_peaks,max_amp,dwell_end,mean_peaks,dwell_time,std_peaks,std_current,mean_current,duration,num_signals
0,0,-0.689178,-0.501487,-0.024723,-0.046507,-1.099090,-0.649125,-0.423924,-0.695280,-0.749460,-0.704987,-0.724043,-0.709915,-0.323105,-0.822554,-0.426987,-0.701811,-0.704204
1,0,-0.322069,-0.673217,-0.447235,-0.707269,-0.558921,-0.326952,-0.251180,-0.328456,-0.409549,-0.365428,-0.779077,-0.368856,-0.416507,-0.081836,-0.481903,-0.362236,-0.364718
2,0,-0.169108,-0.037411,0.049875,-0.051630,-0.813057,-0.165866,-0.534117,-0.175613,-0.204176,-0.211083,-0.348665,-0.215178,-0.285304,-0.625369,-0.192753,-0.212959,-0.213203
3,0,-0.658586,-0.823156,-0.397465,-0.468876,1.316526,-0.751634,5.165931,-0.634142,-0.650920,-0.633371,0.577108,-0.622291,1.808542,1.999361,-0.051296,-0.632301,-0.625598
4,0,-0.536216,-0.698665,-0.320544,-0.279717,-0.859295,-0.429462,-0.516547,-0.542436,-0.595117,-0.529651,-0.760512,-0.538711,-0.421848,-0.419895,-0.617717,-0.530884,-0.539018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60874,2,-0.566809,0.310190,1.749873,-0.536025,0.670718,-0.605192,-0.653558,-0.573005,-0.506702,-0.567929,0.628686,-0.564325,1.662352,3.338486,1.000304,-0.567348,-0.562941
60875,2,-0.169108,-0.373428,-0.253582,-1.254287,-0.655404,-0.107289,-0.503413,-0.175613,-0.143445,-0.123415,-0.382070,-0.124858,-0.357513,-0.172452,-0.360968,-0.121798,-0.125484
60876,2,-0.689178,4.129555,4.414361,4.358814,0.890190,-0.722346,0.142547,-0.664711,-0.588887,-0.691405,3.799765,-0.688346,3.439256,-1.058727,4.256529,-0.689277,-0.682559
60877,2,0.228593,-0.188667,-0.395984,-0.365299,-0.817822,0.244173,-0.434885,0.221779,0.167881,0.207502,-0.470838,0.204068,-0.646475,-0.602650,-0.455408,0.205243,0.201470


In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

label = 'label'
features = list(set(polymer_df.columns) - set([label]))

train_df, test_df = train_test_split(polymer_df, test_size=0.3, shuffle=True, random_state=42)

model = SVC()
model.fit(train_df[features], train_df[label])

preds = model.predict(test_df[features])
accuracy_score(test_df[label], preds), f1_score(test_df[label], preds)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

label = 'label'
features = list(set(polymer_df.columns) - set([label]))

train_df, test_df = train_test_split(polymer_df, test_size=0.2, shuffle=True, random_state=42)

model = RandomForestClassifier(class_weight='balanced', min_samples_leaf=4)
model.fit(train_df[features], train_df[label])

preds = model.predict(test_df[features])
accuracy_score(test_df[label], preds), f1_score(test_df[label], preds, average='weighted')

(0.688239159001314, 0.6894544697040937)

In [8]:
preds = model.predict(train_df[features])
accuracy_score(train_df[label], preds), f1_score(train_df[label], preds, average='weighted')

(0.924016588650735, 0.9240318199035062)