In [1]:
import re
import numpy as np
import pandas as pd
import subprocess

from matplotlib import pyplot as plt
from scipy.io import arff
from scipy.stats import zscore

from DBA_multivariate import performDBA
from search_subsequence import search_dtw, decision, search_ed, OUPUT_STARTS_REGEXP, OUPUT_BASE_REGEXP

# Эксперимент 1
## Сравнение производителности с оптимизациями и без них
**Датасет**: рукописные цифры

In [38]:
keys = pd.read_csv("../data/search_subseries/character/character_trajectories_labels.csv", index_col=0)
warp_window = 0.05
subseq_len = 100
closest_series_num = 80 
true_subseq_len = 182

In [39]:
(keys.labels == 1).sum()

73

### Усреднение: DBA

In [40]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/dba_averaged_{0}.csv".format(character),
                     1)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.932 | 1.959 
character:   2 | 0.793 | 2.016 
character:   3 | 0.932 | 1.727 
character:   4 | 0.924 | 1.349 
character:   5 | 0.975 | 1.967 
character:   6 | 0.912 | 2.003 
character:   7 | 0.843 | 1.490 
character:   8 | 0.787 | 1.477 
character:   9 | 1.000 | 3.243 
character:  10 | 0.566 | 1.272 
character:  11 | 0.983 | 2.083 
character:  12 | 0.887 | 2.215 
character:  13 | 0.860 | 1.808 
character:  14 | 0.511 | 2.227 
character:  15 | 0.981 | 2.412 
character:  16 | 0.755 | 1.596 
character:  17 | 1.000 | 2.444 
character:  18 | 0.720 | 1.669 
character:  19 | 0.804 | 2.261 
character:  20 | 0.973 | 2.867 
0.857  2.004


In [41]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/dba_averaged_{0}.csv".format(character),
                     2)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.877 | 1.426 
character:   2 | 0.741 | 1.501 
character:   3 | 0.898 | 1.129 
character:   4 | 0.924 | 0.873 
character:   5 | 0.975 | 1.259 
character:   6 | 0.877 | 1.406 
character:   7 | 0.804 | 1.062 
character:   8 | 0.853 | 0.876 
character:   9 | 1.000 | 1.672 
character:  10 | 0.528 | 0.895 
character:  11 | 0.948 | 1.460 
character:  12 | 0.792 | 1.576 
character:  13 | 0.740 | 1.335 
character:  14 | 0.447 | 1.712 
character:  15 | 0.981 | 1.651 
character:  16 | 0.623 | 1.153 
character:  17 | 1.000 | 1.658 
character:  18 | 0.660 | 1.077 
character:  19 | 0.804 | 1.793 
character:  20 | 0.890 | 2.264 
0.818  1.389


### Усреднение: mean

In [42]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/averaged_{0}.csv".format(character),
                     1)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.986 | 1.904 
character:   2 | 0.879 | 1.986 
character:   3 | 0.898 | 1.734 
character:   4 | 0.985 | 1.529 
character:   5 | 0.988 | 1.609 
character:   6 | 0.965 | 2.775 
character:   7 | 0.882 | 2.128 
character:   8 | 0.880 | 1.895 
character:   9 | 1.000 | 3.374 
character:  10 | 0.585 | 1.422 
character:  11 | 1.000 | 2.181 
character:  12 | 0.717 | 2.278 
character:  13 | 0.920 | 1.722 
character:  14 | 0.936 | 2.156 
character:  15 | 0.981 | 2.708 
character:  16 | 0.830 | 1.568 
character:  17 | 1.000 | 2.439 
character:  18 | 0.600 | 1.669 
character:  19 | 0.893 | 2.114 
character:  20 | 0.945 | 2.754 
0.894  2.097


In [43]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/averaged_{0}.csv".format(character),
                     2)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.863 | 1.287 
character:   2 | 0.828 | 1.410 
character:   3 | 0.881 | 1.112 
character:   4 | 0.970 | 1.007 
character:   5 | 0.988 | 1.072 
character:   6 | 0.947 | 1.395 
character:   7 | 0.863 | 1.045 
character:   8 | 0.893 | 1.024 
character:   9 | 0.980 | 1.676 
character:  10 | 0.547 | 0.971 
character:  11 | 0.948 | 1.314 
character:  12 | 0.717 | 1.528 
character:  13 | 0.780 | 1.242 
character:  14 | 0.872 | 1.477 
character:  15 | 1.000 | 1.722 
character:  16 | 0.660 | 1.073 
character:  17 | 1.000 | 1.632 
character:  18 | 0.540 | 1.113 
character:  19 | 0.893 | 1.441 
character:  20 | 0.918 | 1.961 
0.854  1.325


## Без оптимизаций

### Усреднение: DBA

In [None]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/dba_averaged_{0}.csv".format(character),
                     1, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.932 | 10.251 
character:   2 | 0.793 | 10.098 
character:   3 | 0.932 | 10.107 
character:   4 | 0.924 | 10.512 
character:   5 | 0.975 | 10.631 
character:   6 | 0.912 | 9.969 
character:   7 | 0.843 | 11.858 
character:   8 | 0.787 | 15.838 
character:   9 | 1.000 | 10.208 
character:  10 | 0.566 | 10.552 
character:  11 | 0.983 | 9.657 
character:  12 | 0.887 | 9.894 
character:  13 | 0.860 | 12.172 
character:  14 | 0.511 | 10.349 
character:  15 | 0.981 | 10.648 
character:  16 | 0.755 | 11.231 
character:  17 | 1.000 | 12.324 
character:  18 | 0.720 | 10.472 
character:  19 | 0.804 | 10.642 
character:  20 | 0.973 | 10.012 
0.857  10.871


In [None]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/dba_averaged_{0}.csv".format(character),
                     2, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.877 | 12.603 
character:   2 | 0.741 | 10.658 
character:   3 | 0.898 | 11.487 
character:   4 | 0.924 | 11.019 
character:   5 | 0.975 | 11.017 
character:   6 | 0.877 | 10.109 
character:   7 | 0.804 | 10.015 
character:   8 | 0.853 | 9.587 
character:   9 | 1.000 | 9.586 
character:  10 | 0.528 | 9.705 
character:  11 | 0.948 | 9.624 
character:  12 | 0.792 | 10.536 
character:  13 | 0.740 | 10.024 
character:  14 | 0.447 | 9.725 
character:  15 | 0.981 | 10.363 
character:  16 | 0.623 | 10.213 
character:  17 | 1.000 | 10.713 
character:  18 | 0.660 | 10.820 
character:  19 | 0.804 | 10.211 
character:  20 | 0.890 | 10.504 
0.818  10.426


### Усреднение: mean

In [None]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/averaged_{0}.csv".format(character),
                     1, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.986 | 10.756 
character:   2 | 0.879 | 10.900 
character:   3 | 0.898 | 11.644 
character:   4 | 0.985 | 10.198 
character:   5 | 0.988 | 10.454 
character:   6 | 0.965 | 10.363 
character:   7 | 0.882 | 10.921 
character:   8 | 0.880 | 11.726 
character:   9 | 1.000 | 10.984 
character:  10 | 0.585 | 11.174 
character:  11 | 1.000 | 10.602 
character:  12 | 0.717 | 10.580 
character:  13 | 0.920 | 10.362 
character:  14 | 0.936 | 10.204 
character:  15 | 0.981 | 10.445 
character:  16 | 0.830 | 10.159 
character:  17 | 1.000 | 9.956 
character:  18 | 0.600 | 12.026 
character:  19 | 0.893 | 10.490 
character:  20 | 0.945 | 10.104 
0.894  10.702


In [None]:
results = []
total_elapsed = 0

for character in range(1, 21):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/character/character_trajectories.csv",
                     "../data/search_subseries/character/averaged_{0}.csv".format(character),
                     2, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.863 | 9.542 
character:   2 | 0.828 | 10.756 
character:   3 | 0.881 | 9.653 
character:   4 | 0.970 | 9.763 
character:   5 | 0.988 | 9.674 
character:   6 | 0.947 | 9.441 
character:   7 | 0.863 | 9.502 


## Эпилепсия

In [28]:
keys = pd.read_csv("../data/search_subseries/epi/epi_labels.csv", index_col=0)
warp_window = 0.15
subseq_len = 150
closest_series_num = 60 
true_subseq_len = 206

### Усреднение: DBA

In [29]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/dba_averaged_{0}.csv".format(character),
                     1)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} | {3:d} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.587 | 13.305 | 46 
character:   2 | 0.885 | 13.222 | 52 
character:   3 | 0.837 | 16.371 | 49 
character:   4 | 0.667 | 14.443 | 39 
0.744  14.335


In [31]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/dba_averaged_{0}.csv".format(character),
                     2)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.522 | 12.208 
character:   2 | 0.769 | 12.277 
character:   3 | 0.714 | 12.049 
character:   4 | 0.744 | 12.835 
0.687  12.342


### Усреднение: mean

In [32]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/averaged_{0}.csv".format(character),
                     1)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.587 | 12.992 
character:   2 | 0.885 | 13.456 
character:   3 | 0.837 | 13.813 
character:   4 | 0.667 | 13.903 
0.744  13.541


In [33]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/averaged_{0}.csv".format(character),
                     2)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.522 | 15.411 
character:   2 | 0.769 | 12.266 
character:   3 | 0.714 | 15.394 
character:   4 | 0.744 | 13.723 
0.687  14.199


## Без оптимизаций

### Усреднение: DBA

In [34]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/dba_averaged_{0}.csv".format(character),
                     1, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.587 | 13.484 
character:   2 | 0.885 | 13.202 
character:   3 | 0.837 | 12.757 
character:   4 | 0.667 | 12.814 
0.744  13.064


In [35]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/dba_averaged_{0}.csv".format(character),
                     2, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.522 | 12.786 
character:   2 | 0.788 | 13.836 
character:   3 | 0.714 | 12.473 
character:   4 | 0.744 | 13.724 
0.692  13.205


### Усреднение: mean

In [36]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/averaged_{0}.csv".format(character),
                     1, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.587 | 16.215 
character:   2 | 0.885 | 13.295 
character:   3 | 0.837 | 12.869 
character:   4 | 0.667 | 13.269 
0.744  13.912


In [37]:
results = []
total_elapsed = 0

for character in range(1, 5):
    closest, elapsed = search_dtw(closest_series_num, subseq_len, warp_window,
                     "../data/search_subseries/epi/epi.csv",
                     "../data/search_subseries/epi/averaged_{0}.csv".format(character),
                     2, optimize=False)
    expected = np.array(list(closest.keys()))
    true_starts = keys[keys.labels == character].start.values
    total_elapsed += elapsed 
    
    results.append(decision(true_starts, expected, 182, subseq_len))
    print("character: {0:>3d} | {1:.3f} | {2:.3f} ".format(
        character,
        results[-1],
        elapsed,
        (keys.labels == character).sum()))
    
print("{0:.3f}  {1:.3f}".format(sum(results) / len(results), total_elapsed / len(results)))

character:   1 | 0.522 | 12.383 
character:   2 | 0.788 | 11.700 
character:   3 | 0.714 | 13.154 
character:   4 | 0.744 | 13.714 
0.692  12.738


In [95]:
(keys.labels == 1).sum()

46