** Import block **
==========

Take a notice of cells in the Appendix.
Some sells below depend on these ones.

In [55]:
import os, os.path, re
import numpy as np
import time

from sklearn.preprocessing import normalize, scale, MinMaxScaler
from sklearn.cluster import *

from collections import Counter
from sys import getsizeof

from code.modules.levenshtein import levenshtein_distance
from code.modules.model_simplifier_by_rules import simplify_by_rules
from code.modules.model_reconstructer import model_reconstruct
from code.modules.isomorphism_distance import isomorphism_distance
from code.modules.patterns_extracter import extract_patterns
from code.modules.patterns_extracter import extract_patterns


import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [56]:
files_path = 'populations/collected_models14/'
#ts_labels = ['chest_volume', 'heart_rate', 'oxygen_concentration', 'open_apple']
ts_labels = sorted(['chest_volume', 'open_apple'])

number_ts_pieces = 36
tokens_which_param_interest = ['normal_', 'sina_']

def atoi(text):
    return int(text) if text.isdigit() else text
def natural_keys(text):    
    return [ atoi(c) for c in re.split('(\d+)', text) ]
filenames = sorted(os.listdir(files_path), key=natural_keys)

** Collect primitive structural features from a population of models**
=====================================

In [84]:
def get_simple_features_from_segment_of_ts(number_of_file, type_of_ts):
    number_of_retrieved_models = 10
    
    tokens_codes, _ = create_map_tokens_params()
    filename = type_of_ts + '_' + str(number_of_file + 1) + '.txt'
    
    models = get_population_from_file(filename)[0:number_of_retrieved_models]
    primitive_frequences = np.zeros(len(tokens_codes) - 1, dtype = float)
    lower_bound_code_variables = tokens_codes.get('x0', len(tokens_codes))
    
    for model in models:
        matr, encodings = dfs_search_on_handle(model)
        model_primitive_frequences = Counter(encodings)
        for key in model_primitive_frequences:
            if key >= 0 and key < len(tokens_codes) - 1:
                primitive_frequences[key] += (model_primitive_frequences[key] / len(model_primitive_frequences))
    return scale(primitive_frequences.reshape(-1,1), axis=0)
    
    #primitive_frequences[-1] = len(encodings)
    #return normalize(primitive_frequences.reshape(-1,1), axis=0)
    #return primitive_frequences.reshape(-1,1)

** Trivial features creation**
==

In [85]:
tokens_codes, _ = create_map_tokens_params()
feature_matrices_of_ts = {label : np.zeros((number_ts_pieces, len(tokens_codes) - 1)) for label in ts_labels}

for label in ts_labels:
    for index in range(number_ts_pieces):
        feature_matrices_of_ts[label][index,:] = get_simple_features_from_segment_of_ts(index, label)[:,0]
# feature_matrices_of_ts = {label : scale(feature_matrices_of_ts[label].T) for label in ts_labels}          
unite_feature_matrix = np.vstack((feature_matrices_of_ts[label] for label in ts_labels))
unite_feature_matrix_backup = unite_feature_matrix
unite_feature_matrix.shape

(72, 18)

** Features on parameters of superpositions **
==

In [86]:
tokens_codes, tokens_params = create_map_tokens_params()
total_num_of_params         = sum(tokens_params.values())
    
feature_matrices_of_ts_ext = {label : np.zeros((number_ts_pieces, total_num_of_params)) for label in ts_labels}

for label in ts_labels:
    for index in range(number_ts_pieces):
        feature_matrices_of_ts_ext[label][index,:] = get_param_features_from_segment_of_ts(index, label)[:,0]
unite_feature_matrix_add = np.vstack((feature_matrices_of_ts_ext[label] for label in ts_labels))
unite_feature_matrix = np.hstack((unite_feature_matrix_backup, unite_feature_matrix_add)) 
unite_feature_matrix_param_backup = unite_feature_matrix
unite_feature_matrix.shape

(72, 39)

In [87]:
indices_of_params_interesting_tokens = get_indices_of_params_interesting_tokens(tokens_which_param_interest)
unite_feature_matrix_add_new = np.array([unite_feature_matrix_add[:,ind] for ind in indices_of_params_interesting_tokens])
unite_feature_matrix_add_new = unite_feature_matrix_add_new.T

unite_feature_matrix = np.hstack((unite_feature_matrix_backup, unite_feature_matrix_add_new)) 
unite_feature_matrix_param_backup = unite_feature_matrix
unite_feature_matrix.shape

(72, 22)

In [237]:
unite_feature_matrix[2,:]

** Pattern features **
==

In [88]:
countered_patterns_dict = {label : get_countered_useful_patterns(label,desired_number_of_patterns=20) for label in ts_labels}
useful_patterns_dict = {label : sorted(countered_patterns_dict[label].keys()) for label in ts_labels}

useful_patterns = [useful_patterns_dict[label] for label in ts_labels]
#useful_patterns_list = set.difference(*[set(useful_patterns_dict[label]) for label in ts_labels])
useful_patterns_list = set([item for sublist in useful_patterns for item in sublist])
#useful_patterns_list = set(useful_patterns_dict[ts_labels[0]]).difference(set(useful_patterns_dict[ts_labels[1]]))
#useful_patterns_list = useful_patterns_list.union(set(useful_patterns_dict[ts_labels[1]]).difference(set(useful_patterns_dict[ts_labels[0]])) )
feature_matrices_of_ts = {label : np.zeros((number_ts_pieces, len(useful_patterns_list))) for label in ts_labels}

for label in ts_labels:
    for index in range(number_ts_pieces):
        feature_matrices_of_ts[label][index,:] = \
        get_features_patterns_population(get_population_from_file(create_filename(label,index+1)),list(useful_patterns_list))[:,0]
#unite_feature_matrix_add = np.vstack((feature_matrices_of_ts_ext[label] for label in ts_labels))
unite_feature_matrix_add = np.vstack((feature_matrices_of_ts[label] for label in ts_labels))
#unite_feature_matrix = unite_feature_matrix_add
unite_feature_matrix = np.hstack((unite_feature_matrix_param_backup, unite_feature_matrix_add)) 

unite_feature_matrix.shape

(72, 43)

In [173]:
unite_feature_matrix[0,:]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [155]:
countered_patterns_dict['heart_rate'], countered_patterns_dict['open_apple']

(Counter({'lnl_(sin_(lnl_))': 18,
          'lnl_(sin_)': 29,
          'lnl_(sina_(sin_))': 32,
          'lnl_(sina_)': 63,
          'plus2_(sin_(x0),sin_(x0))': 17,
          'plus2_(sin_(x0),x0)': 29,
          'plus2_(sina_(x0),x0)': 60,
          'sin_(inv_)': 24,
          'sin_(sin_(x0))': 16,
          'sin_(sin_)': 31,
          'sin_(sina_(sina_))': 38,
          'sin_(sina_)': 102,
          'sin_(tana_)': 31,
          'sina_(sin_)': 83,
          'sina_(sina_(sina_))': 27,
          'sina_(sina_)': 102,
          'sina_(sqrtl_)': 34,
          'tana_(sina_)': 28,
          'times2_(sina_(x0),x0)': 20}),
 Counter({'lnl_(sin_(sina_))': 38,
          'lnl_(sin_)': 38,
          'lnl_(sina_)': 44,
          'minus2_(sin_(x0),x0)': 44,
          'minus2_(x0,sin_(x0))': 33,
          'plus2_(sin_(sin_(x0)),x0)': 19,
          'plus2_(sin_(x0),x0)': 44,
          'plus2_(sina_(sina_(x0)),x0)': 17,
          'plus2_(sina_(x0),x0)': 83,
          'sin_(sin_)': 97,
          'sin_

In [44]:
feature_matrices_of_ts['chest_volume'][0,:]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  2.,  1.,
        0.,  0.,  1.,  0.,  0.,  3.,  0.,  0.,  0.,  0.,  0.,  0.])

In [89]:
ts_labels = list(feature_matrices_of_ts.keys())
ts_labels_in_my_own_order = ['chest_volume', 'heart_rate', 'oxygen_concentration', 'open_apple']
which_label_is_positive = ts_labels_in_my_own_order[0]

target_vector = np.zeros((unite_feature_matrix.shape[0],1))
ts_labels_in_order_from_dictionary = [label for label in feature_matrices_of_ts]
index_of_label_positive = ts_labels_in_order_from_dictionary.index(which_label_is_positive)
all_inidices_of_samples = np.arange(target_vector.shape[0])
positive_indices_samples = number_ts_pieces * index_of_label_positive + np.arange(number_ts_pieces)
negative_positive_samples = [ind for ind in all_inidices_of_samples if not ind in positive_indices_samples]
target_vector[number_ts_pieces * index_of_label_positive:number_ts_pieces * (index_of_label_positive + 1)] = np.ones((number_ts_pieces,1))
backup_target = target_vector

fraction_of_test_samples = 0.25

In [90]:
from sklearn.svm import SVC
trials = 500
cumulative_error = 0
cumulative_error_pos = 0
cumulative_error_neg = 0

for trying in range(trials):
    indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
    target_vector = 2* backup_target - 1 
    
    train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
    test_matrix = unite_feature_matrix[indices_of_test_sample,:]
    train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
    test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)

    clf = SVC(kernel='linear')
    clf.fit(train_matrix, train_target) 
    
    predictions = clf.predict(test_matrix)
    #print("predictions: ", predictions)
    errors = predictions != test_target
    cumulative_error += sum(errors) / len(errors)
    cumulative_error_pos += sum(errors[test_target == max(test_target)]) / len(errors[test_target == max(test_target)])
    cumulative_error_neg += sum(errors[test_target == min(test_target)]) / len(errors[test_target == min(test_target)])

cumulative_error /= trials
cumulative_error_pos /= trials
cumulative_error_neg /= trials

print("error = ", cumulative_error)
print("error on positive = ", cumulative_error_pos)
print("error on negative = ", cumulative_error_neg)


error =  0.189253462942
error on positive =  0.181492832168
error on negative =  0.188921410126


In [69]:
l = []
means_of_frequencies = np.zeros((unite_feature_matrix.shape[1],len(ts_labels)))
for ind_label, label in enumerate(ts_labels):
    print(label)
    matr = feature_matrices_of_ts[label]
    for ind_token in range(matr.shape[1]):
        means_of_frequencies[ind_token, ind_label] = np.mean(matr[:,ind_token])# / np.sum(np.mean(matr[:,ind_token]))

np.savetxt('means.txt', means_of_frequencies, fmt = '%.3f')

chest_volume
open_apple


In [87]:
matr = np.loadtxt('means.txt')
f = matr[:,0]
s = matr[:,1]
values = np.zeros(number_ts_pieces)
pos_ind = [f[i] > s[i] for i in range(f.shape[0])]
print(pos_ind)
for ind in range(number_ts_pieces):
    row =  unite_feature_matrix[number_ts_pieces + ind,:] 
    
    fd = sum([row[i] for i in range(len(pos_ind)) if pos_ind[i]])
    sd = sum(row) - fd

    if (fd > sd):
        values[ind] = 1
values        

[True, False, True, False, True, False, False, True, False, True, True, False, True, True, False, True, False, False, True, False, False, False, True, True, False]


array([ 0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.])

In [76]:
sum(pos_ind)

12

In [248]:
indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
target_vector = 2* backup_target - 1 
train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
test_matrix = unite_feature_matrix[indices_of_test_sample,:]
train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)

clf = SVC(kernel='linear')
clf.fit(train_matrix, train_target) 
print(clf.n_support_)
predictions = clf.predict(test_matrix)
#print("predictions: ", predictions)
errors = predictions != test_target

print("error = ", sum(errors) / len(errors))
print("error on positive = ", sum(errors[test_target == max(test_target)]) / len(errors[test_target == max(test_target)]))
print("error on negative = ", sum(errors[test_target == min(test_target)]) / len(errors[test_target == min(test_target)]))

[27 28]
error =  0.5
error on positive =  0.375
error on negative =  0.571428571429


In [405]:
print(np.arange(len(indices_of_test_sample))[indices_of_test_sample])
print(np.arange(len(indices_of_test_sample))[indices_of_test_sample][np.arange(len(errors))[errors]])

[14 16 31 34 43 60 66 83 97]
[31 60 83]


In [None]:
short_unite_feature_matrix = unite_feature_matrix

In [249]:
from sklearn import neighbors, datasets



h = .02  # step size in the mesh
error = np.inf
where_min = -1
trials = 30

for n_neighbors in 1 + np.arange(30):
    for weights in ['uniform', 'distance']:
        cumulative_error = 0
        for trying in range(trials):
            indices_of_test_sample = np.random.choice([True, False], len(all_inidices_of_samples), p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
            target_vector = 2* backup_target - 1 
            train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
            test_matrix = unite_feature_matrix[indices_of_test_sample,:]
            train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
            test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)
            
            # we create an instance of Neighbours Classifier and fit the data.
            clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
            clf.fit(train_matrix, train_target)

            predictions = clf.predict(np.c_[test_matrix])
            errors = predictions != test_target
            
            cumulative_error = cumulative_error + (sum(errors) / len(errors))
        if cumulative_error / trials < error:
            where_min = n_neighbors
            error = cumulative_error / trials


where_min, error

(1, 0.44765882695184861)

** Structural metrics (Levenshtein on strings and graphs) **
==

In [17]:
def distance_between_populations_string(population_first, population_second):
    len_first, len_second = len(population_first), len(population_second)
    # calculate sum of distances for all pairs of models: one from first population, the other from second
    cumulative_distance = 0
    for model_from_first in population_first:
        for model_from_second in population_second:
            cumulative_distance = cumulative_distance + levenshtein_distance(model_from_first, model_from_second)
    cumulative_distance = cumulative_distance / (len_first * len_second)
    return cumulative_distance

In [18]:
def distance_between_populations_trees(population_first, population_second):
    len_first, len_second = len(population_first), len(population_second)
    # calculate sum of distances for all pairs of models: one from first population, the other from second
    cumulative_distance = 0
    for model_from_first in population_first:
        for model_from_second in population_second:
            cumulative_distance = cumulative_distance + isomorphism_distance(model_from_first, model_from_second)
    cumulative_distance = cumulative_distance / (len_first * len_second)
    return cumulative_distance

In [19]:
def create_matrix_of_codes_of_one_population(number_of_file, type_of_ts, tokens_codes):
    codes = 'QWERTYUIOPASDFGHJKLZXCVBNM123456789/*-+=?!'
    filename = type_of_ts + '_' + str(number_of_file + 1) + '.txt'
    models = get_population_from_file(filename)
    
    matrix_representation = []    
    for model in models:
        matr, encodings = dfs_search_on_handle(model)
        encodings = np.array(encodings)
        if len(model) == 0:
            break
        matrix_representation.append(''.join(np.array(list(codes))[encodings]))

    return matrix_representation
    #return primitive_frequences.reshape(-1,1)

** Create string-format DFS-code of the final selected models stored in the corresponding files **

In [21]:
tokens_codes, _ = create_map_tokens_params()
feature_matrices_of_ts = {label : [] for label in ts_labels}
    
for label in ts_labels:
    for index in range(number_ts_pieces):
        feature_matrices_of_ts[label].append(create_matrix_of_codes_of_one_population(index, label, tokens_codes))

In [24]:
for i in range(180):
    row = distances_between_segments[i,:]
    inds= np.array([i[0] for i in sorted(enumerate(row), key=lambda x:x[1])])[0:10]
    threshhold = number_ts_pieces

    pos = sum(inds < number_ts_pieces)
    neg = inds.shape[0] - pos
    print('pos =', pos, '; neg =', neg)

pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 4 ; neg = 6
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 4 ; neg = 6
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 4 ; neg = 6
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 3 ; neg = 7
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 4 ; neg = 6
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 5 ; neg = 5
pos = 5 ; neg = 5
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 6 ; neg = 4
pos = 6 ; 

In [315]:
isomorphism_distance('minus2_(linear_(sina_(sina_(x0))),normal_(normal_(x0)))', 'minus2_(linear_(sina_(sina_(x0))),normal_(normal_(x0)))')

8

In [22]:
distances_between_segments = np.zeros((len(ts_labels) * number_ts_pieces, len(ts_labels) * number_ts_pieces))

start = time.time()

indices_vs_labels = list(enumerate(feature_matrices_of_ts))

for ind_f in range(distances_between_segments.shape[0]):
    for ind_s in range(distances_between_segments.shape[0]):
        label_f = indices_vs_labels[ind_f // number_ts_pieces][1]
        label_s = indices_vs_labels[ind_s // number_ts_pieces][1]
        
        #population_f = feature_matrices_of_ts[label_f][ind_f % number_ts_pieces]
        #population_s = feature_matrices_of_ts[label_s][ind_s % number_ts_pieces]
        population_f = get_population_from_file(label_f + '_' + str(ind_f % number_ts_pieces + 1) + '.txt')
        population_s = get_population_from_file(label_s + '_' + str(ind_s % number_ts_pieces + 1) + '.txt')
        
        if ind_f <= ind_s:
            if ind_f == 0 and ind_s == 0:
                print(population_f[0], population_s[0])
            distances_between_segments[ind_f][ind_s] = distance_between_populations_trees(population_f[0:10], population_s[0:10])
        else:
            distances_between_segments[ind_f][ind_s] = distances_between_segments[ind_s][ind_f]
            
end = time.time()
print(end - start)
os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % ( 1, 700))

minus2_(linear_(sina_(sina_(x0))),normal_(normal_(x0))) minus2_(linear_(sina_(sina_(x0))),normal_(normal_(x0)))
771.6964039802551


0

In [23]:
trials = 200
measurements = np.zeros(trials)
for ii in range(trials):
    fraction_of_test_samples = 0.3

    indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
    indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
    indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
    #indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
    #                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
    indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

    responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

    indices_to_search = np.array(indices_to_search, dtype = int)
    for ind_label in range(len(ts_labels)):
        for index_of_index, index in enumerate(indices_of_test_sample):
            index_in_matrix  = ind_label * number_ts_pieces + index
            row_for_analysis = distances_between_segments[index_in_matrix,:] 

            #nearest_neighbor_label = int(find_closest_elem(row_for_analysis, indices_to_search) // number_ts_pieces)
            nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
            responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

    true_responses = np.zeros((len(ts_labels) * len(indices_of_test_sample), 1))
    for i in range(len(ts_labels)):
        true_responses[i*len(indices_of_test_sample):(i+1)*len(indices_of_test_sample)] = i * np.ones((len(indices_of_test_sample), 1))
    measurements[ii] = sum(responses != true_responses) / len(true_responses)
    #print("error = ", sum(responses != true_responses) / len(true_responses))
print("error = ", np.mean(measurements))

error =  0.494774985909


In [None]:
fraction_of_test_samples = 0.3

indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
#indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
#                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

indices_to_search = np.array(indices_to_search, dtype = int)

In [365]:
for ind_label in range(len(ts_labels)):
    for index_of_index, index in enumerate(indices_of_test_sample):
        index_in_matrix  = ind_label * number_ts_pieces + index
        row_for_analysis = distances_between_segments[index_in_matrix,:] 

        #nearest_neighbor_label = int(find_closest_elem(row_for_analysis, indices_to_search) // number_ts_pieces)
        nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
        responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

true_responses = np.zeros((len(ts_labels) * len(indices_of_test_sample), 1))
for i in range(len(ts_labels)):
    true_responses[i*len(indices_of_test_sample):(i+1)*len(indices_of_test_sample)] = i * np.ones((len(indices_of_test_sample), 1))
sum(responses != true_responses) / len(true_responses)

array([ 0.41666667])

In [None]:
index_in_matrix  = 0 * number_ts_pieces + 2
row_for_analysis = distances_between_segments[0,:] 

nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

In [47]:
def find_closest_elem(row_for_analysis, indices_to_search):
    minim = np.Inf
    posit = -1
    for ind in indices_to_search:
        if row_for_analysis[ind] < minim:
            posit = ind
            minim = row_for_analysis[ind]
    return posit

In [48]:
distances_between_segments[0:5, 0:5]

array([[  0.,  15.,  12.,  11.,  11.],
       [ 15.,   0.,  14.,  12.,   8.],
       [ 12.,  14.,   0.,  11.,  10.],
       [ 11.,  12.,  11.,   0.,   7.],
       [ 11.,   8.,  10.,   7.,   0.]])

In [49]:
blocks_mat = np.empty((len(ts_labels),len(ts_labels)))
length_side = distances_between_segments.shape[0] // 3
for i in range(len(ts_labels)):
    for j in range(len(ts_labels)):
        ul = i * length_side
        ur = ul + length_side
        dl = j * length_side
        dr = dl + length_side
        for k in np.arange(ul, ur, 1):
            for q in np.arange(dl, dr, 1):
                blocks_mat[i,j] += distances_between_segments[k,q]

blocks_mat  / np.max(blocks_mat.reshape(-1,1))

array([[ 0.96040539,  1.        ],
       [ 1.        ,  0.98376707]])

In [33]:
fraction_of_test_samples = 0.3

indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
#indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
#                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

In [63]:
a = distances_between_segments[10,:]
order_neighbors = sorted(indices_to_search, key = lambda x: a[x])
print(order_neighbors[0:5])
print(a[order_neighbors[0:5]])

[169, 22, 262, 157, 65]
[ 8.70222222  9.01777778  9.15555556  9.32        9.32444444]


In [73]:
bounds = np.arange(1,45,step=2)
min_error = np.Inf
which_min = -1

for bound in bounds:
    print("bound =",bound)
    distances_between_segments = np.zeros((len(ts_labels) * number_ts_pieces, len(ts_labels) * number_ts_pieces))

    indices_vs_labels = list(enumerate(feature_matrices_of_ts))

    for ind_f in range(distances_between_segments.shape[0]):
        for ind_s in range(distances_between_segments.shape[0]):
            if ind_f % 20 == 0 and ind_s % 20 == 0:
                #print(ind_f, ind_s)
                pass
            label_f = indices_vs_labels[ind_f // number_ts_pieces][1]
            label_s = indices_vs_labels[ind_s // number_ts_pieces][1]

            population_f = feature_matrices_of_ts[label_f][ind_f % number_ts_pieces]
            population_s = feature_matrices_of_ts[label_s][ind_s % number_ts_pieces]

            if ind_f <= ind_s:
                if ind_f == 0 and ind_s == 0:
                    print(population_f[0], population_s[0])
                distances_between_segments[ind_f][ind_s] = distance_between_populations(population_f[0:bound], population_s[0:bound])
            else:
                distances_between_segments[ind_f][ind_s] = distances_between_segments[ind_s][ind_f]


    trials = 200
    measurements = np.zeros(trials)
    for ii in range(trials):
        fraction_of_test_samples = 0.3

        indices_of_test_sample = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
        indices_of_test_sample = np.arange(number_ts_pieces)[indices_of_test_sample]
        indices_to_search = np.setdiff1d(range(number_ts_pieces), indices_of_test_sample)
        #indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)
        #                               , indices_to_search + 2 * number_ts_pieces * np.ones(indices_to_search.shape)))
        indices_to_search = np.hstack((indices_to_search, indices_to_search + number_ts_pieces * np.ones(indices_to_search.shape)))

        responses = np.empty((len(ts_labels) * len(indices_of_test_sample), 1))

        indices_to_search = np.array(indices_to_search, dtype = int)
        for ind_label in range(len(ts_labels)):
            for index_of_index, index in enumerate(indices_of_test_sample):
                index_in_matrix  = ind_label * number_ts_pieces + index
                row_for_analysis = distances_between_segments[index_in_matrix,:] 

                #nearest_neighbor_label = int(find_closest_elem(row_for_analysis, indices_to_search) // number_ts_pieces)
                nearest_neighbor_label = indices_to_search[np.argmin(row_for_analysis[indices_to_search])] // number_ts_pieces        
                responses[len(indices_of_test_sample) * ind_label + index_of_index] = nearest_neighbor_label

        true_responses = np.zeros((len(ts_labels) * len(indices_of_test_sample), 1))
        true_responses[len(indices_of_test_sample):2 * len(indices_of_test_sample)] = np.ones((len(indices_of_test_sample), 1))
        #true_responses[2 * len(indices_of_test_sample):3 * len(indices_of_test_sample)] = 2 * np.ones((len(indices_of_test_sample), 1))
        measurements[ii] = sum(responses != true_responses) / len(true_responses)
        #print("error = ", sum(responses != true_responses) / len(true_responses))
    if np.mean(measurements) < min_error:
        min_error = np.mean(measurements)
        which_min = bound
        
print("min_error =", min_error,";\nwhich_min =", which_min)        
os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % ( 1, 700))

bound = 1
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 3
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 5
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 7
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 9
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 11
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 13
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 15
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 17
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 19
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 21
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 23
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 25
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 27
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 29
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 31
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 33
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 35
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 37
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 39
JXBJVBEBDXEJEBFBEB JXBJVBEBDXEJEBFBEB
bound = 41
JXBJVBEBDXEJEB

0

In [92]:
population_f = feature_matrices_of_ts['chest_volume']
population_s = feature_matrices_of_ts['heart_rate']
print(len(population_f))
import time
start = time.time()
distance_between_populations(population_f[0], population_s[0])
print(time.time() - start)


50
0.5323779582977295


In [109]:
100 % 2

0

In [50]:
random_state = 50
clusters = KMeans(n_clusters=len(ts_labels), random_state=random_state,n_init = 10).fit_predict(distances_between_segments)
true_clusters = np.zeros(len(ts_labels) * number_ts_pieces)
for ind,_ in enumerate(ts_labels):
    true_clusters[ind * number_ts_pieces:(ind + 1) * number_ts_pieces] = ind * np.ones(number_ts_pieces)
print(sum(clusters != true_clusters) / len(clusters))
clusters

0.615384615385


array([0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0], dtype=int32)

In [51]:
random_state = 170
clusters = AgglomerativeClustering(n_clusters=len(ts_labels)).fit_predict(distances_between_segments)
print(sum(clusters != true_clusters) / len(clusters))
clusters

0.371794871795


array([0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0])

In [None]:
fraction_of_test_samples = 0.3

test_indices = np.random.choice([True, False], number_ts_pieces, p = [fraction_of_test_samples, 1 - fraction_of_test_samples])
target_vector = 2* target_vector - 1 

train_matrix = unite_feature_matrix[~indices_of_test_sample,:]
test_matrix = unite_feature_matrix[indices_of_test_sample,:]
train_target = target_vector[~indices_of_test_sample].reshape(sum(~indices_of_test_sample),)
test_target = target_vector[indices_of_test_sample].reshape(sum(indices_of_test_sample),)


** APPENDIX **
==

In [3]:
def get_population_from_file(filename):
    if not filename in filenames:
        print('Error:', filename, 'is not presented in the directory')
    
    lines_file_content = []
    
    with open(files_path + filename, 'r') as f_in:
        lines_file_content = (line.rstrip() for line in f_in) # All lines including the blank ones
        lines_file_content = [line for line in lines_file_content if line] # Non-blank lines
    #    #population = np.empty(len(lines_file_content) // 2, dtype = object)
    population = np.empty(len(lines_file_content) // 2, dtype = object)
    
    """for ind, entity in enumerate(lines_file_content):          
        model_name = entity.split(' ')[-1]
        population[ind] = re.sub(r'X\[(\d+)\]', r'x\1', model_name.strip())"""
    for ind, entity in enumerate(lines_file_content):          
        if ind % 2 == 0:
            model_name = entity.split(' ')[-1]
            population[ind // 2] = re.sub(r'X\[(\d+)\]', r'x\1', model_name.strip())
                                              
                                          
    return population

In [4]:
def find_num_vars_and_tokens(handle):    
    counter_tokens = 0
    counter_variables = 0

    for i in range(len(handle)):
        if handle[i] == '_': 
            counter_tokens += 1
        elif i < len(handle)-1 and handle[i] == 'x' and handle[i+1].isdigit():
            counter_variables += 1;

    return (counter_tokens, counter_variables)

In [83]:
def create_map_tokens_params():
    file_opened = open('data/tokensInterest.txt', 'r')
    primitives_lines = file_opened.readlines()
    tokens_codes = {line.split()[0] : int(ind) for ind,line in enumerate(primitives_lines)}    
    tokens_params = {line.split()[0] : int(line.split()[1]) for line in primitives_lines}    
    return (tokens_codes, tokens_params)

In [6]:
def dfs_search_on_handle(handle):
    counters = find_num_vars_and_tokens(handle)
    number_tokens = counters[0] + counters[1]
    
    waiting_tokens = []
    encodings = np.zeros(number_tokens, dtype = int)        
    current_token, left, right = 0, 0, 0
    is_a_token_processed_now = False    
    
    map_tokens_params = create_map_tokens_params()[0]
    
    for right in range(len(handle)):
        if handle[right] == '_':
            # the root is detected
            waiting_tokens.append(current_token)
            token = handle[left:right + 1]
            encodings[current_token] = map_tokens_params.get(token, -1)
            right += 1
            break;  
    
    matr = [[] for i in range(number_tokens)]            
    
    # now process the remaining vertices
    reserved_right = right
    for right in np.arange(right, len(handle)):
        if handle[right] == ')':
            waiting_tokens.pop()        
    
        if not is_a_token_processed_now and handle[right].isalpha():
            is_a_token_processed_now = True
            left = right
    
        # if a token is found
        if handle[right] == '_':
            # new token is detected
            current_token += 1
            matr[waiting_tokens[-1]].append(current_token)
            waiting_tokens.append(current_token)
            token = handle[left:right + 1]
            encodings[current_token] = map_tokens_params.get(token, -1)
            is_a_token_processed_now = False      
        
        # if a variable is found
        if right < len(handle)-1 and handle[right] == 'x' and handle[right+1].isdigit():
            # new variable is detected
            current_token += 1
            matr[waiting_tokens[-1]].append(current_token)
            while right < len(handle)-1 and handle[right] == 'x' and handle[right+1].isdigit():
                right += 1
            token = handle[left:right + 1]
            encodings[current_token] = map_tokens_params.get(token, -1)
            is_a_token_processed_now = False            
    
    return (matr, encodings)

In [7]:
def incidence_to_adjacency(incidence):
    size_of_mat = len(incidence)
    adjacency = np.zeros((size_of_mat, size_of_mat))    
    for ind, row in enumerate(incidence):
        adjacency[ind][row] = 1
    return adjacency

** Create features on parameters of nonlinear functions **
==

In [70]:
def get_parameters_from_file(filename):
    if not filename in filenames:
        print('Error:', filename, 'is not presented in the directory')
    
    lines_file_content = []
    
    with open(files_path + filename, 'r') as f_in:
        lines_file_content = (line.rstrip() for line in f_in) # All lines including the blank ones
        lines_file_content = [line for line in lines_file_content if line] # Non-blank lines
    
    array_of_parameters = []
    
    for ind, entity in enumerate(lines_file_content):          
        if ind % 2 == 1:
            list_of_parameters = list(map(float,entity.split(', ')))
            array_of_parameters.append(np.array(list_of_parameters))
    return array_of_parameters

In [82]:
def get_ordered_list_tokens():
    file_opened = open('data/tokensInterest.txt', 'r')
    primitives_lines = file_opened.readlines()
    tokens_names = [line.split()[0] for line in primitives_lines]
    return tokens_names

In [72]:
def get_array_ranges_parameters(array_param_nums):
    array_ranges = [[0,array_param_nums[0]],]    
    for ind, param_num in enumerate(array_param_nums):
        if ind == 0:
            continue
        array_ranges.append([array_ranges[-1][-1],array_ranges[-1][-1]+param_num])
    return array_ranges

In [73]:
def do_bsxfun_to_params(model, parameters):
    tokens_codes, tokens_params = create_map_tokens_params()
    tokens_names                  = get_ordered_list_tokens()
    matr, encodings               = dfs_search_on_handle(model)

    model_tokens     = [tokens_names[i] for i in encodings]
    array_param_nums = [tokens_params[tokens_names[i]] for i in encodings]
    
    array_ranges_parameters = get_array_ranges_parameters(array_param_nums)
    bsxfun_to_params = {token : np.zeros(tokens_params[token]) for ind, token in enumerate(tokens_names)}
    for ind, token in enumerate(model_tokens):
        bsxfun_to_params[token] += parameters[array_ranges_parameters[ind][0]:array_ranges_parameters[ind][1]]
    return bsxfun_to_params

In [51]:
def get_param_features_from_segment_of_ts(number_of_file, type_of_ts):
    number_of_retrieved_models = 10
    
    tokens_codes, tokens_params = create_map_tokens_params()
    tokens_names                = get_ordered_list_tokens()
    total_num_of_params         = sum(tokens_params.values())
    
    filename = type_of_ts + '_' + str(number_of_file + 1) + '.txt'
    
    
    array_of_parameters = get_parameters_from_file(filename)[0:number_of_retrieved_models]
    population          = get_population_from_file(filename)[0:number_of_retrieved_models]
    
    cumulated_params = np.zeros(total_num_of_params)
    def sigmoid_array(x):                                        
        return 1 / (1 + np.exp(-x))
    for model, parameters in zip(population, array_of_parameters):
        bsxfun_to_params    = do_bsxfun_to_params(model, parameters)
        row = np.array([param for token in tokens_names for param in bsxfun_to_params[token]])
        cumulated_params   += row
    
    #return scale(cumulated_params.reshape(-1,1), axis=0)
    return scale(cumulated_params.reshape(-1,1))

In [15]:
get_param_features_from_segment_of_ts(41, 'heart_rate')

array([[-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.40018362],
       [-0.41487209],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.02795842],
       [ 3.9262091 ],
       [ 0.42325242],
       [ 2.88716793],
       [ 1.46072947],
       [-0.4107989 ],
       [-0.4107989 ],
       [ 0.90030533],
       [ 0.28292565],
       [-0.4107989 ],
       [-0.4107989 ],
       [-0.4107989 ]])

In [13]:
def get_indices_of_params_interesting_tokens(interesting_tokens):
    tokens_codes, tokens_params = create_map_tokens_params()
    tokens_names                = get_ordered_list_tokens()
    total_num_of_params         = sum(tokens_params.values())

    array_param_nums = [tokens_params[token] for token in tokens_names]

    array_ranges_parameters = get_array_ranges_parameters(array_param_nums)

    array_params_interest = []
    for token in tokens_which_param_interest:
        range_params = array_ranges_parameters[tokens_names.index(token)]
        array_params_interest.extend(list(np.arange(range_params[0], range_params[1], 1)))

    return array_params_interest

** Functionality for retrieving nontrivial patterns from population's models **
==

In [14]:
Counter(['str', 'str', 'str2'])['str']

2

In [15]:
def simplify_model(handle):
    handle = re.sub(r'X\[(\d+)\]', r'x\1', handle)
    handle = model_reconstruct(handle)
    handle = simplify_by_rules(handle)

    handle = re.sub(r'x(\d+)', r'X[\1]', handle)
    return handle

In [16]:
population = get_population_from_file('heart_rate_1.txt')
simplified_population = []
for model in population:
    simplified_population.append(simplify_model(model))
for ind, model in enumerate(population):
    print(model, '\n-->', simplified_population[ind])

NameError: name 'filenames' is not defined

In [20]:
isomorphism_distance('plus_(sina_(normal_(X[0])))', 'X[0]')

4

** Extract and work with patterns of models from the stored populations ** 
==

In [17]:
def extract_patterns_from_model(model_handle):
    unite_string_with_patterns = extract_patterns(model_handle)
    patterns = unite_string_with_patterns.split('&')
    return patterns

In [18]:
def collect_patterns_from_population(population):
    patterns = []
    for model in population:
        model = model_reconstruct(model)
        patterns.extend(extract_patterns_from_model(model))
    patterns = [item for item in patterns if item]
    return patterns

In [19]:
def collect_patterns_from_label(label):
    patterns = []
    for i in np.arange(1, number_ts_pieces, 1):
        patterns.extend(collect_patterns_from_population(get_population_from_file(label+'_'+str(i)+'.txt')))
    return patterns

In [20]:
def get_popular_patterns(label, threshhold_popularity = 20):
    
    patterns = collect_patterns_from_label(label)
    countered_patterns = Counter(patterns)

    popular_patterns = [item for item in set(patterns) if countered_patterns[item] >= threshhold_popularity]
    
    return popular_patterns

In [21]:
def get_threshhold_for_patterns_selection(label, desired_number_of_patterns):
    
    threshhold_popularity = 0

    patterns = collect_patterns_from_label(label)
    unique_patterns = set(patterns)
    countered_patterns = Counter(patterns)

    for threshhold_popularity in np.arange(1,1000,1):
        popular_patterns = [item for item in unique_patterns if countered_patterns[item] >= threshhold_popularity]
        if len(popular_patterns) <= desired_number_of_patterns:
            break
    return threshhold_popularity

In [22]:
def get_countered_useful_patterns(label,desired_number_of_patterns = 40):
    
    threshhold_popularity = get_threshhold_for_patterns_selection(label,desired_number_of_patterns)
    unique_popular_patterns = get_popular_patterns(label, threshhold_popularity)

    all_patterns = collect_patterns_from_label(label)
    popular_counter_to_count = [item for item in all_patterns if item in unique_popular_patterns]
    return Counter(popular_counter_to_count)

In [23]:
def get_features_patterns_population(population, useful_patterns_list):
    patterns = collect_patterns_from_population(population)
    countered_patterns = Counter(patterns)
    
    features = [countered_patterns.get(item,0) for item in useful_patterns_list]
    return scale(np.array(features,dtype=float).reshape(-1,1))

In [24]:
def create_filename(label, number_of_segment):
    return label + '_' + str(number_of_segment) + '.txt'

In [25]:
counter_f = get_countered_useful_patterns('open_apple',desired_number_of_patterns = 15)
counter_s = get_countered_useful_patterns('chest_volume',desired_number_of_patterns = 15)
print(setlist(counter_f.keys()), list(counter_s.keys()))

NameError: name 'number_ts_pieces' is not defined

In [91]:
get_countered_useful_patterns('chest_volume',desired_number_of_patterns = 15),get_countered_useful_patterns('open_apple',desired_number_of_patterns = 15)

(Counter({'lnl_(normal_(lnl_(sina_)))': 2,
          'lnl_(normal_(lnl_))': 2,
          'lnl_(normal_(normal_(atana_)))': 3,
          'lnl_(normal_(normal_))': 2,
          'lnl_(normal_)': 5,
          'normal_(expl_(x0))': 2,
          'normal_(normal_(lnl_(inv_)))': 2,
          'normal_(normal_)': 3,
          'sina_(normal_)': 3}),
 Counter({'atana_(normal_)': 9,
          'expl_(atana_)': 3,
          'lnl_(normal_)': 11,
          'neg_(sina_)': 8,
          'normal_(atana_)': 3,
          'normal_(normal_(normal_))': 5,
          'normal_(normal_)': 8,
          'normal_(tana_)': 11,
          'plus2_(normal_(x0),x0)': 7,
          'sina_(atana_)': 3,
          'sina_(sina_)': 3,
          'sqrtl_(normal_)': 6,
          'tana_(normal_)': 9,
          'times2_(normal_(x0),x0)': 3}))

In [None]:
useful_patterns_list = 
patterns = collect_patterns_from_population(population)
countered_patterns = Counter(patterns)

features = [countered_patterns.get(item,0) for item in useful_patterns_list]
return scale(np.array(features,dtype=float).reshape(-1,1))

In [176]:
collect_patterns_from_population(get_population_from_file('heart_rate_1.txt'))

['sin_(tana_(tana_))']

In [178]:
get_population_from_file('heart_rate_1.txt')

array(['lnl_(normal_(linear_(sina_(sina_(sina_(x0))))))',
       'plus_(normal_(lnl_(linear_(sina_(x0)))))',
       'lnl_(normal_(lnl_(normal_(linear_(sina_(x0))))))',
       'lnl_(plus_(sina_(sqrtl_(sqrtl_(linear_(sina_(x0)))))))',
       'lnl_(lnl_(linear_(sina_(sina_(x0)))))',
       'lnl_(plus_(sina_(lnl_(normal_(sina_(x0))))))',
       'lnl_(plus_(normal_(sina_(x0))))', 'sina_(linear_(linear_(x0)))',
       'plus_(sina_(normal_(x0)))', 'linear_(sina_(sina_(x0)))',
       'lnl_(normal_(normal_(x0)))', 'lnl_(normal_(sina_(x0)))',
       'lnl_(sina_(normal_(x0)))', 'lnl_(sina_(x0))', 'lnl_(normal_(x0))',
       'lnl_(inv_(sina_(x0)))', 'sina_(normal_(lnl_(x0)))',
       'inv_(tana_(sina_(sina_(normal_(lnl_(x0))))))',
       'plus_(linear_(sina_(lnl_(x0))))',
       'sina_(linear_(normal_(sina_(x0))))',
       'times2_(x0,normal_(tana_(tana_(x0))))',
       'plus_(linear_(sina_(linear_(normal_(sina_(x0))))))',
       'linear_(sina_(x0))', 'lnl_(sina_(sqrtl_(sqrtl_(x0))))',
       'lnl

** Analysis of distributions of features **
==

In [None]:
def find_token_frequency_in_population(number_of_segment,label):
    filename = files_path + label + '_' + str(number_of_file + 1) + '.txt'
    population = get_population_from_file(filename)
    
    

In [None]:
tokens_names = get_ordered_list_tokens()
number_of_tokens = len(tokens_names)
for number_of_plot in range(number_of_tokens):
    plt.figure(number_of_plot + 1)
    plt.hist()