** Import block **

Take a notice of cells in the Appendix.
Some sells below depend on these ones.

In [424]:
import os, os.path, re
import numpy as np
import time

from sklearn.preprocessing import normalize, scale, MinMaxScaler
from sklearn.cluster import *

from collections import Counter
from sys import getsizeof

from code.modules.levenshtein import levenshtein_distance
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [553]:
files_path = 'populations/collected_models2/'
#ts_labels = ['chest_volume', 'heart_rate', 'oxygen_concentration', 'open_apple']
ts_labels = sorted(['oxygen_concentration', 'heart_rate'])

#number_ts_pieces = len(os.listdir(files_path)) / len(ts_labels)
number_ts_pieces = 50

def atoi(text):
    return int(text) if text.isdigit() else text
def natural_keys(text):    
    return [ atoi(c) for c in re.split('(\d+)', text) ]

filenames = sorted(os.listdir(files_path), key=natural_keys)

** Collect primitive structural features from a population of models**

In [554]:
def get_simple_features_from_segment_of_ts(number_of_file, type_of_ts):
    number_of_retrieved_models = None
    
    tokens_codes, _ = create_map_tokens_params()
    filename = type_of_ts + '_' + str(number_of_file + 1) + '.txt'
    
    models = get_population_from_file(filename)[0:number_of_retrieved_models]
    
    primitive_frequences = np.zeros(len(tokens_codes) - 1)
    lower_bound_code_variables = tokens_codes.get('x0', len(tokens_codes))

    for model in models:
        matr, encodings = dfs_search_on_handle(model)
        model_primitive_frequences = Counter(encodings)
        for key in model_primitive_frequences:
            if key >= 0 and key < len(tokens_codes) - 1:
                primitive_frequences[key] += model_primitive_frequences[key]
    #primitive_frequences[-1] = len(encodings)
    #return normalize(primitive_frequences.reshape(-1,1), axis=0)
    #return primitive_frequences.reshape(-1,1)
    
    return scale(primitive_frequences.reshape(-1,1), axis=0)

In [566]:
tokens_codes, _ = create_map_tokens_params()
feature_matrices_of_ts = {label : np.zeros((number_ts_pieces, len(tokens_codes) - 1)) for label in ts_labels}

for label in ts_labels:
    for index in range(number_ts_pieces):
        feature_matrices_of_ts[label][index,:] = get_simple_features_from_segment_of_ts(index, label)[:,0]
# feature_matrices_of_ts = {label : scale(feature_matrices_of_ts[label].T) for label in ts_labels}          
unite_feature_matrix = np.vstack((feature_matrices_of_ts[label] for label in ts_labels))
unite_feature_matrix.shape

(100, 25)

In [567]:
ts_labels = list(feature_matrices_of_ts.keys())
ts_labels_in_my_own_order = ['chest_volume', 'heart_rate', 'oxygen_concentration', 'open_apple']
which_label_is_positive = ts_labels_in_my_own_order[1]

target_vector = np.zeros((unite_feature_matrix.shape[0],1))
ts_labels_in_order_from_dictionary = [label for label in feature_matrices_of_ts]
index_of_label_positive = ts_labels_in_order_from_dictionary.index(which_label_is_positive)
all_inidices_of_samples = np.arange(target_vector.shape[0])
positive_indices_samples = number_ts_pieces * index_of_label_positive + np.arange(number_ts_pieces)
negative_positive_samples = [ind for ind in all_inidices_of_samples if not ind in positive_indices_samples]
target_vector[number_ts_pieces * index_of_label_positive:number_ts_pieces * (index_of_label_positive + 1)] = np.ones((number_ts_pieces,1))
backup_target = target_vector

fraction_of_test_samples = 0.3

In [572]:
means_of_frequencies = np.zeros((unite_feature_matrix.shape[1],len(ts_labels)))
for ind_label, label in enumerate(ts_labels):
    matr = feature_matrices_of_ts[label]
    for ind_token in range(matr.shape[1]):
        means_of_frequencies[ind_token, ind_label] = np.mean(matr[:,ind_token])
np.savetxt('means.txt', means_of_frequencies, fmt = '%.3f')

In [568]:
from sklearn.svm import SVC
trials = 200
cumulative_error = 0
cumulative_error_pos = 0
cumulative_error_neg = 0

for trying in range(trials):
    indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
    target_vector = 2* backup_target - 1 
    
    train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
    test_matrix = unite_feature_matrix[indices_of_test_sample,:]
    train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
    test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)

    clf = SVC(kernel='linear')
    clf.fit(train_matrix, train_target) 
    
    predictions = clf.predict(test_matrix)
    #print("predictions: ", predictions)
    errors = predictions != test_target
    cumulative_error += sum(errors) / len(errors)
    cumulative_error_pos += sum(errors[test_target == max(test_target)]) / len(errors[test_target == max(test_target)])
    cumulative_error_neg += sum(errors[test_target == min(test_target)]) / len(errors[test_target == min(test_target)])

cumulative_error /= trials
cumulative_error_pos /= trials
cumulative_error_neg /= trials

print("error = ", cumulative_error)
print("error on positive = ", cumulative_error_pos)
print("error on negative = ", cumulative_error_neg)


error =  0.360280102357
error on positive =  0.319857050643
error on negative =  0.392197963298


In [404]:
indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
target_vector = 2* backup_target - 1 
train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
test_matrix = unite_feature_matrix[indices_of_test_sample,:]
train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)

clf = SVC(kernel='linear')
clf.fit(train_matrix, train_target) 
print(clf.n_support_)
predictions = clf.predict(test_matrix)
#print("predictions: ", predictions)
errors = predictions != test_target

print("error = ", sum(errors) / len(errors))
print("error on positive = ", sum(errors[test_target == max(test_target)]) / len(errors[test_target == max(test_target)]))
print("error on negative = ", sum(errors[test_target == min(test_target)]) / len(errors[test_target == min(test_target)]))

[39 39]
error =  0.333333333333
error on positive =  0.2
error on negative =  0.5


In [405]:
print(np.arange(len(indices_of_test_sample))[indices_of_test_sample])
print(np.arange(len(indices_of_test_sample))[indices_of_test_sample][np.arange(len(errors))[errors]])

[14 16 31 34 43 60 66 83 97]
[31 60 83]


In [87]:
from sklearn import neighbors, datasets

indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
target_vector = 2* backup_target - 1 
train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
test_matrix = unite_feature_matrix[indices_of_test_sample,:]
train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)


n_neighbors = 19
for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(train_matrix, train_target)

    predictions = clf.predict(np.c_[test_matrix])
    print("weights: ", weights)
    errors = predictions != test_target
    print("error = ", sum(errors) / len(errors))
    print("error on positive = ", sum(errors[test_target == max(test_target)]) / len(errors[test_target == max(test_target)]))
    print("error on negative = ", sum(errors[test_target == min(test_target)]) / len(errors[test_target == min(test_target)]))


weights:  uniform
error =  0.133333333333
error on positive =  0.25
error on negative =  0.0
weights:  distance
error =  0.1
error on positive =  0.1875
error on negative =  0.0


In [234]:
from sklearn import neighbors, datasets



h = .02  # step size in the mesh
error = np.inf
where_min = -1
trials = 30

for n_neighbors in 1 + np.arange(30):
    for weights in ['uniform', 'distance']:
        cumulative_error = 0
        for trying in range(trials):
            indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
            target_vector = 2* backup_target - 1 
            train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
            test_matrix = unite_feature_matrix[indices_of_test_sample,:]
            train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
            test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)
            
            # we create an instance of Neighbours Classifier and fit the data.
            clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
            clf.fit(train_matrix, train_target)

            predictions = clf.predict(np.c_[test_matrix])
            errors = predictions != test_target
            
            cumulative_error = cumulative_error + (sum(errors) / len(errors))
        if cumulative_error / trials < error:
            where_min = n_neighbors
            error = cumulative_error / trials


where_min, error

(11, 0.38288642896090991)

** Structural metrics (Levenshtein) **

In [358]:
def distance_between_populations(population_first, population_second):
    len_first, len_second = len(population_first), len(population_second)
    # calculate sum of distances for all pairs of models: one from first population, the other from second
    cumulative_distance = 0
    for model_from_first in population_first:
        for model_from_second in population_second:
            cumulative_distance = cumulative_distance + levenshtein_distance(model_from_first, model_from_second)
    cumulative_distance = cumulative_distance / (len_first * len_second)
    return cumulative_distance

In [359]:
def get_simple_features_from_ts(number_of_file, type_of_ts, tokens_codes):
    models = get_models_from_file(number_of_file, type_of_ts)
    
    primitive_frequences = np.zeros(len(tokens_codes))    

    for model in models:
        matr, encodings = dfs_search_on_handle(model)
        primitive_frequences = encodings

    return primitive_frequences.reshape(-1,1)
    #return primitive_frequences.reshape(-1,1)

In [360]:
def create_matrix_of_codes_of_one_population(number_of_file, type_of_ts, tokens_codes):
    codes = 'QWERTYUIOPASDFGHJKLZXCVBNM123456789/*-+=?!'
    models = get_models_from_file(number_of_file, type_of_ts)
    
    matrix_representation = []    
    for model in models:
        matr, encodings = dfs_search_on_handle(model)
        encodings = np.array(encodings)
        if len(model) == 0:
            break
        matrix_representation.append(''.join(np.array(list(codes))[encodings]))

    return matrix_representation
    #return primitive_frequences.reshape(-1,1)

In [361]:
tokens_codes, _ = create_map_tokens_params()
feature_matrices_of_ts = {label : [] for label in ts_labels}
    
for label in ts_labels:
    for index in range(number_ts_pieces):
        feature_matrices_of_ts[label].append(create_matrix_of_codes_of_one_population(index, label, tokens_codes))

In [362]:
distances_between_segments = np.zeros((len(ts_labels) * number_ts_pieces, len(ts_labels) * number_ts_pieces))

start = time.time()

indices_vs_labels = list(enumerate(feature_matrices_of_ts))

for ind_f in range(distances_between_segments.shape[0]):
    for ind_s in range(distances_between_segments.shape[0]):
        if ind_f % 20 == 0 and ind_s % 20 == 0:
            #print(ind_f, ind_s)
            pass
        label_f = indices_vs_labels[ind_f // number_ts_pieces][1]
        label_s = indices_vs_labels[ind_s // number_ts_pieces][1]
        
        population_f = feature_matrices_of_ts[label_f][ind_f % number_ts_pieces]
        population_s = feature_matrices_of_ts[label_s][ind_s % number_ts_pieces]
        
        if ind_f <= ind_s:
            if ind_f == 0 and ind_s == 0:
                print(population_f[0], population_s[0])
            distances_between_segments[ind_f][ind_s] = distance_between_populations(population_f[0:1], population_s[0:1])
        else:
            distances_between_segments[ind_f][ind_s] = distances_between_segments[ind_s][ind_f]
            
end = time.time()
print(end - start)
os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % ( 1, 700))

YCTE1XIUTXEE1E11 YCTE1XIUTXEE1E11
0.2306687831878662


0

In [363]:
trials = 200
measurements = np.zeros(trials)
for ii in range(trials):
    fraction_of_test_samples = 0.3

    indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
    indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
    indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
    #indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
    #                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
    indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

    responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

    indices_to_search = np.array(indices_to_search, dtype = int)
    for ind_label in range(len(ts_labels)):
        for index_of_index, index in enumerate(indices_of_test_sample):
            index_in_matrix  = ind_label * number_ts_pieces + index
            row_for_analysis = distances_between_segments[index_in_matrix,:] 

            #nearest_neighbor_label = int(find_closest_elem(row_for_analysis, indices_to_search) // number_ts_pieces)
            nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
            responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

    true_responses = np.zeros((len(ts_labels) * len(indices_of_test_sample), 1))
    for i in range(len(ts_labels)):
        true_responses[i*len(indices_of_test_sample):(i+1)*len(indices_of_test_sample)] = i * np.ones((len(indices_of_test_sample), 1))
    measurements[ii] = sum(responses != true_responses) / len(true_responses)
    #print("error = ", sum(responses != true_responses) / len(true_responses))
print("error = ", np.mean(measurements))

error =  0.512113875872


In [None]:
fraction_of_test_samples = 0.3

indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
#indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
#                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

indices_to_search = np.array(indices_to_search, dtype = int)

In [365]:
for ind_label in range(len(ts_labels)):
    for index_of_index, index in enumerate(indices_of_test_sample):
        index_in_matrix  = ind_label * number_ts_pieces + index
        row_for_analysis = distances_between_segments[index_in_matrix,:] 

        #nearest_neighbor_label = int(find_closest_elem(row_for_analysis, indices_to_search) // number_ts_pieces)
        nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
        responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

true_responses = np.zeros((len(ts_labels) * len(indices_of_test_sample), 1))
for i in range(len(ts_labels)):
    true_responses[i*len(indices_of_test_sample):(i+1)*len(indices_of_test_sample)] = i * np.ones((len(indices_of_test_sample), 1))
sum(responses != true_responses) / len(true_responses)

array([ 0.41666667])

In [371]:
print("indices_of_test_sample =", indices_of_test_sample)
print("responses =", [int(item[0]) for item in responses] )

indices_of_test_sample = [ 2  4 14 16 17 19 22 25 28 31 36 47]
responses = [1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1]


In [None]:
index_in_matrix  = 0 * number_ts_pieces + 2
row_for_analysis = distances_between_segments[0,:] 

nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

In [47]:
def find_closest_elem(row_for_analysis, indices_to_search):
    minim = np.Inf
    posit = -1
    for ind in indices_to_search:
        if row_for_analysis[ind] < minim:
            posit = ind
            minim = row_for_analysis[ind]
    return posit

In [48]:
distances_between_segments[0:5, 0:5]

array([[  0.,  15.,  12.,  11.,  11.],
       [ 15.,   0.,  14.,  12.,   8.],
       [ 12.,  14.,   0.,  11.,  10.],
       [ 11.,  12.,  11.,   0.,   7.],
       [ 11.,   8.,  10.,   7.,   0.]])

In [49]:
blocks_mat = np.empty((len(ts_labels),len(ts_labels)))
length_side = distances_between_segments.shape[0] // 3
for i in range(len(ts_labels)):
    for j in range(len(ts_labels)):
        ul = i * length_side
        ur = ul + length_side
        dl = j * length_side
        dr = dl + length_side
        for k in np.arange(ul, ur, 1):
            for q in np.arange(dl, dr, 1):
                blocks_mat[i,j] += distances_between_segments[k,q]

blocks_mat  / np.max(blocks_mat.reshape(-1,1))

array([[ 0.96040539,  1.        ],
       [ 1.        ,  0.98376707]])

In [34]:
np.arange(3,8,1)

array([3, 4, 5, 6, 7])

In [33]:
fraction_of_test_samples = 0.3

indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
#indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
#                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

array([ 1,  4,  7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 43])

In [23]:
#np.savetxt('matrix_distances2.txt', distances_between_segments, fmt='%.5f', delimiter=',')

In [63]:
a = distances_between_segments[10,:]
order_neighbors = sorted(indices_to_search, key = lambda x: a[x])
print(order_neighbors[0:5])
print(a[order_neighbors[0:5]])

[169, 22, 262, 157, 65]
[ 8.70222222  9.01777778  9.15555556  9.32        9.32444444]


In [None]:
bounds = np.arange(1,45,step=3)
min_error = np.Inf
which_min = -1

for bound in bounds:
    print("bound =",bound)
    distances_between_segments = np.zeros((len(ts_labels) * number_ts_pieces, len(ts_labels) * number_ts_pieces))

    indices_vs_labels = list(enumerate(feature_matrices_of_ts))

    for ind_f in range(distances_between_segments.shape[0]):
        for ind_s in range(distances_between_segments.shape[0]):
            if ind_f % 20 == 0 and ind_s % 20 == 0:
                #print(ind_f, ind_s)
                pass
            label_f = indices_vs_labels[ind_f // number_ts_pieces][1]
            label_s = indices_vs_labels[ind_s // number_ts_pieces][1]

            population_f = feature_matrices_of_ts[label_f][ind_f % number_ts_pieces]
            population_s = feature_matrices_of_ts[label_s][ind_s % number_ts_pieces]

            if ind_f <= ind_s:
                if ind_f == 0 and ind_s == 0:
                    print(population_f[0], population_s[0])
                distances_between_segments[ind_f][ind_s] = distance_between_populations(population_f[0:bound], population_s[0:bound])
            else:
                distances_between_segments[ind_f][ind_s] = distances_between_segments[ind_s][ind_f]


    trials = 200
    measurements = np.zeros(trials)
    for ii in range(trials):
        fraction_of_test_samples = 0.3

        indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
        indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
        indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
        #indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
        #                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
        indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

        responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

        indices_to_search = np.array(indices_to_search, dtype = int)
        for ind_label in range(len(ts_labels)):
            for index_of_index, index in enumerate(indices_of_test_sample):
                index_in_matrix  = ind_label * number_ts_pieces + index
                row_for_analysis = distances_between_segments[index_in_matrix,:] 

                #nearest_neighbor_label = int(find_closest_elem(row_for_analysis, indices_to_search) // number_ts_pieces)
                nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
                responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

        true_responses = np.zeros((len(ts_labels) * len(indices_of_test_sample), 1))
        true_responses[len(indices_of_test_sample):2 * len(indices_of_test_sample)] = np.ones((len(indices_of_test_sample), 1))
        #true_responses[2 * len(indices_of_test_sample):3 * len(indices_of_test_sample)] = 2 * np.ones((len(indices_of_test_sample), 1))
        measurements[ii] = sum(responses != true_responses) / len(true_responses)
        #print("error = ", sum(responses != true_responses) / len(true_responses))
    if np.mean(measurements) < min_error:
        min_error = np.mean(measurements)
        which_min = bound
        
print("min_error =", min_error,";\nwhich_min =", which_min)        
os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % ( 1, 700))

In [92]:
population_f = feature_matrices_of_ts['chest_volume']
population_s = feature_matrices_of_ts['heart_rate']
print(len(population_f))
import time
start = time.time()
distance_between_populations(population_f[0], population_s[0])
print(time.time() - start)


50
0.5323779582977295


In [109]:
100 % 2

0

In [56]:
random_state = 50
clusters = KMeans(n_clusters=len(ts_labels), random_state=random_state,n_init = 10).fit_predict(distances_between_segments)
true_clusters = np.zeros(len(ts_labels) * number_ts_pieces)
for ind,_ in enumerate(ts_labels):
    true_clusters[ind * number_ts_pieces:(ind + 1) * number_ts_pieces] = ind * np.ones(number_ts_pieces)
print(sum(clusters != true_clusters) / len(clusters))
clusters

0.735


array([0, 2, 2, 0, 0, 3, 0, 2, 2, 0, 0, 2, 2, 3, 3, 0, 2, 2, 0, 2, 0, 2, 0,
       2, 3, 2, 2, 2, 2, 0, 2, 0, 3, 0, 2, 2, 2, 3, 2, 3, 3, 3, 2, 0, 2, 2,
       3, 0, 2, 0, 2, 3, 1, 0, 2, 2, 0, 2, 2, 2, 3, 2, 2, 3, 0, 0, 1, 2, 2,
       3, 2, 2, 0, 0, 2, 2, 1, 0, 2, 2, 3, 0, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2,
       1, 2, 2, 2, 0, 2, 0, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 0, 0, 2, 1, 1, 1,
       2, 2, 0, 2, 2, 0, 2, 3, 0, 2, 0, 0, 3, 3, 2, 2, 2, 2, 0, 2, 3, 3, 1,
       2, 3, 2, 2, 3, 0, 3, 0, 2, 1, 1, 0, 3, 2, 2, 1, 2, 1, 2, 2, 2, 0, 3,
       2, 2, 0, 0, 1, 3, 2, 0, 3, 1, 0, 1, 2, 3, 2, 2, 0, 2, 3, 0, 0, 0, 0,
       2, 2, 2, 2, 2, 0, 0, 2, 1, 2, 2, 0, 3, 3, 2, 3], dtype=int32)

In [57]:
random_state = 170
clusters = AgglomerativeClustering(n_clusters=len(ts_labels)).fit_predict(distances_between_segments)
print(sum(clusters != true_clusters) / len(clusters))
clusters

0.77


array([2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 3, 2, 0, 0, 0, 0, 0, 0, 2,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0,
       3, 2, 0, 2, 0, 3, 3, 2, 0, 0, 2, 0, 0, 0, 3, 0, 0, 2, 2, 0, 3, 0, 0,
       2, 0, 0, 2, 0, 0, 0, 3, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 3, 3,
       0, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 2, 2, 3, 0, 0, 0, 0, 2, 0, 3, 2, 1,
       0, 2, 0, 0, 2, 2, 3, 2, 0, 1, 1, 0, 2, 0, 0, 1, 0, 1, 0, 0, 0, 2, 3,
       0, 0, 0, 2, 3, 2, 0, 2, 3, 1, 2, 1, 0, 2, 0, 0, 0, 0, 3, 0, 2, 2, 0,
       0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 0, 0, 2, 2, 0, 2])

In [None]:
fraction_of_test_samples = 0.3

test_indices = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
target_vector = 2* target_vector - 1 

train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
test_matrix = unite_feature_matrix[indices_of_test_sample,:]
train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)


** APPENDIX **

In [519]:
def get_population_from_file(filename):
    if not filename in filenames:
        print('Error:', filename, 'is not presented in the directory')
    
    
    
    lines_file_content = open(files_path + filename, 'r').readlines()
    population = np.empty(len(lines_file_content), dtype = object)
    
    for ind, entity in enumerate(lines_file_content):          
        model_name = entity.split(' ')[-1]
        population[ind] = re.sub(r'X\[(\d+)\]', r'x\1', model_name.strip())    
    return population

In [520]:
def find_num_vars_and_tokens(handle):    
    counter_tokens = 0
    counter_variables = 0

    for i in range(len(handle)):
        if handle[i] == '_': 
            counter_tokens += 1
        elif i < len(handle)-1 and handle[i] == 'x' and handle[i+1].isdigit():
            counter_variables += 1;

    return (counter_tokens, counter_variables)

In [521]:
def create_map_tokens_params():
    file_opened = open('data/numbParam.txt', 'r')
    primitives_lines = file_opened.readlines()
    tokens_codes = {line.split()[0] : int(ind) for ind,line in enumerate(primitives_lines)}    
    tokens_params = {line.split()[0] : int(line.split()[1]) for line in primitives_lines}    
    return (tokens_codes, tokens_params)

In [522]:
def dfs_search_on_handle(handle):
    counters = find_num_vars_and_tokens(handle)
    number_tokens = counters[0] + counters[1]
    
    waiting_tokens = []
    encodings = np.zeros(number_tokens, dtype = int)        
    current_token, left, right = 0, 0, 0
    is_a_token_processed_now = False    
    
    map_tokens_params = create_map_tokens_params()[0]
    
    for right in range(len(handle)):
        if handle[right] == '_':
            # the root is detected
            waiting_tokens.append(current_token)
            token = handle[left:right + 1]
            encodings[current_token] = map_tokens_params.get(token, -1)
            right += 1
            break;  
    
    matr = [[] for i in range(number_tokens)]            
    
    # now process the remaining vertices
    reserved_right = right
    for right in np.arange(right, len(handle)):
        if handle[right] == ')':
            waiting_tokens.pop()        
    
        if not is_a_token_processed_now and handle[right].isalpha():
            is_a_token_processed_now = True
            left = right
    
        # if a token is found
        if handle[right] == '_':
            # new token is detected
            current_token += 1
            matr[waiting_tokens[-1]].append(current_token)
            waiting_tokens.append(current_token)
            token = handle[left:right + 1]
            encodings[current_token] = map_tokens_params.get(token, -1)
            is_a_token_processed_now = False      
        
        # if a variable is found
        if right < len(handle)-1 and handle[right] == 'x' and handle[right+1].isdigit():
            # new variable is detected
            current_token += 1
            matr[waiting_tokens[-1]].append(current_token)
            while right < len(handle)-1 and handle[right] == 'x' and handle[right+1].isdigit():
                right += 1
            token = handle[left:right + 1]
            encodings[current_token] = map_tokens_params.get(token, -1)
            is_a_token_processed_now = False            
    
    return (matr, encodings)

In [523]:
def incidence_to_adjacency(incidence):
    size_of_mat = len(incidence)
    adjacency = np.zeros((size_of_mat, size_of_mat))    
    for ind, row in enumerate(incidence):
        adjacency[ind][row] = 1
    return adjacency

** Collect primitive structural features from a population of models**