In [1]:
from collections import deque
from queue import PriorityQueue
from load_data import make_growth_target_df
import pandas as pd
import numpy as np
import math
import time
from itertools import combinations

## aantekeningen
\
windowsize is erg bepalend voor welke subgroups de hoogste quality measure halen (max 72 vs 250 bij windowsize 7 vs 5) \
\
windowsize, windowoverlap en aggregate functies moeten nog als input aan beam search toegevoegd worden (voor de quality measure) \
\
we doen nu alleen equal voor de class-kolommen, maar moet not equal niet ook? (dan neemt het aantal mogelijke opties wel gigantisch toe (exponentieel??)) \
\
worden de splitpoints voor numerical classes nu wel goed berekend, en hoeveel bins is optimaal? \
\
is equalsized bins de beste optie? \
\
in de results opslaan hoe groot de subgroup is

In [2]:
def time_function(func):
    """Timing decorator to profile any function.

    Wraps ``func`` so that every call prints the elapsed time and then
    returns ``func``'s own result unchanged. Nothing is printed when the
    call raises.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature that carries over the
        wrapped function's ``__name__`` and ``__doc__``.
    """
    import functools  # local import so this cell does not depend on the import cell

    @functools.wraps(func)  # keep the original name/docstring visible on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds")
        return result

    return wrapper

In [3]:
def gt(a, b):
    """Return the result of ``a > b``; used as the operator part of a descriptor condition."""
    return a > b

def leeq(a, b):
    """Return the result of ``a <= b``; used as the operator part of a descriptor condition."""
    return a <= b

def eq(a, b):
    """Return the result of ``a == b``; used as the operator part of a descriptor condition."""
    return a == b

def neq(a, b):
    """Return the logical negation of ``a == b`` (always a plain ``bool``)."""
    are_equal = a == b
    return not are_equal

def extract_subgroup(descriptors, data, col_index_dict):
    """Select the rows of ``data`` that satisfy every condition in ``descriptors``.

    Args:
        descriptors: Iterable of ``(attribute_name, value, operator)`` triples.
            A row satisfies a triple when ``operator(row_value, value)`` is truthy.
        data: Iterable of indexable rows.
        col_index_dict: Mapping from attribute name to its position in a row.

    Returns:
        A new list with the matching rows, in their original order. An empty
        ``descriptors`` matches every row.
    """
    def satisfies_all(row):
        # Stop at the first failing condition, like a short-circuiting AND.
        for att_name, descr_value, operator in descriptors:
            if not operator(row[col_index_dict[att_name]], descr_value):
                return False
        return True

    return [row for row in data if satisfies_all(row)]


def _distinct_values(data, column_index):
    """Return the distinct values of one column, sorted when they are mutually comparable."""
    values = {entry[column_index] for entry in data}
    try:
        # Sorting makes the order of generated refinements reproducible
        # between runs (set order of strings varies with hash randomisation).
        return sorted(values)
    except TypeError:
        # Values that cannot be ordered against each other: keep set order.
        return list(values)


def refin(seed, data, types, nr_bins, descr_indices, index_col_dict, col_index_dict):
    """Generate all one-condition refinements of the descriptor ``seed``.

    Args:
        seed: Sequence of ``(attribute_name, value, operator)`` conditions
            (empty for the root of the search).
        data: List of rows; attribute values are read by column index.
        types: Sequence giving ``'numeric'``, ``'binary'`` or another label
            (treated as nominal) per column. It is indexed with the same
            column index as the rows - NOTE(review): callers must make sure
            it lines up with the row layout.
        nr_bins: Number of bins for numeric attributes; ``nr_bins - 1`` split
            points are taken from the sorted values of the seed's subgroup.
        descr_indices: Column indices that may be used as attributes.
        index_col_dict: Mapping column index -> attribute name.
        col_index_dict: Mapping attribute name -> column index.

    Returns:
        A list of descriptors, each a list holding the seed's conditions plus
        one new condition:
        * numeric column: ``<= split`` and ``> split`` for every distinct
          split point (numeric columns may be refined repeatedly);
        * any other column not yet used in the seed: ``== value`` for every
          distinct value observed in ``data``.
    """
    res = []
    used_descr = [col_index_dict[i[0]] for i in seed]
    candidate_indices = [i for i in descr_indices
                         if i not in used_descr or types[i] == "numeric"]
    base = list(seed)

    # The seed's subgroup is the same for every numeric column, so extract it
    # at most once, and only when a numeric column is actually refined.
    seed_subgroup = None

    for i in candidate_indices:
        att_name = index_col_dict[i]

        if types[i] == 'numeric':
            if seed_subgroup is None:
                seed_subgroup = extract_subgroup(seed, data, col_index_dict)

            all_values = sorted(float(entry[i]) for entry in seed_subgroup)
            n = len(all_values)
            if n == 0:
                # No rows left under this seed: there is nothing to split on.
                continue

            # Equal-frequency split points; dict.fromkeys removes repeated
            # values (common for skewed columns) while keeping their order, so
            # the same refinement is not emitted more than once.
            split_points = list(dict.fromkeys(
                all_values[math.floor(j * (n / nr_bins))] for j in range(1, nr_bins)
            ))
            for split in split_points:
                res.append(base + [(att_name, split, leeq)])
                res.append(base + [(att_name, split, gt)])

        else:
            # Binary and nominal columns are refined on the values that are
            # actually present. Hard-coded 0/1 candidates for binary columns
            # would never match a two-valued column of labels; a column that
            # really holds 0 and 1 still yields "== 0" followed by "== 1".
            for value in _distinct_values(data, i):
                res.append(base + [(att_name, value, eq)])

    return res

            

def constraints_satisfied(descriptors, constraints):
    """Placeholder constraint check: ignores both arguments and always returns True."""
    return True

def put_item_in_queue(queue, quality, descriptor):
    """Insert ``(quality, descriptor)`` into a bounded min-priority queue.

    While the queue has room the item is simply added. Once it is full, the
    lowest-quality entry is taken out and only the better of that entry and
    the new item is put back; on equal quality the existing entry is kept.

    Args:
        queue: Queue offering ``full()``, ``get()`` and ``put()``, whose
            ``get()`` returns the smallest ``(quality, descriptor)`` pair.
        quality: Quality score of the new item.
        descriptor: Payload stored next to the score.
    """
    candidate = (quality, descriptor)

    if not queue.full():
        queue.put(candidate)
        return

    min_quality, min_descriptor = queue.get()
    survivor = (min_quality, min_descriptor) if min_quality >= quality else candidate
    queue.put(survivor)

def categorize_columns_in_order(df, att_columns):
    """Label each attribute column as ``'numeric'``, ``'binary'`` or ``'nominal'``.

    A column with a numeric dtype is ``'numeric'``. Any other column is
    ``'binary'`` when it has exactly two unique values and ``'nominal'``
    otherwise. The numeric check comes first, so a numeric two-valued column
    is reported as ``'numeric'``.

    Args:
        df: DataFrame containing the columns.
        att_columns: Column names to classify.

    Returns:
        List of labels in the same order as ``att_columns``.
    """
    def classify(series):
        if pd.api.types.is_numeric_dtype(series):
            return 'numeric'
        return 'binary' if series.nunique() == 2 else 'nominal'

    return [classify(df[col]) for col in att_columns]

def make_rolling_windows_np(growth_target, window_size):
    """Cut a series into consecutive, non-overlapping windows of ``window_size`` values.

    All sliding windows are built first and then every ``window_size``-th one
    is kept, so a trailing remainder shorter than a full window is dropped.

    Returns:
        Array whose last axis has length ``window_size``.
    """
    every_window = np.lib.stride_tricks.sliding_window_view(
        growth_target, window_shape=window_size
    )
    # A step equal to the window length leaves only windows that do not overlap.
    return every_window[::window_size]


def quality_measure_fast(targets_subgroup, targets_baseline,
                         aggregate_func_window=np.mean, aggregate_func=np.max):
    """Score a subgroup by how far its windowed target deviates from the baseline.

    Each window of each subgroup member is reduced with
    ``aggregate_func_window`` along axis 2. Per window position, the absolute
    difference between the subgroup mean and the baseline window mean is
    divided by the subgroup's standard error, and the resulting z-scores are
    reduced with ``aggregate_func``.

    Args:
        targets_subgroup: Sequence with one windowed target per subgroup
            member; it is reduced along axis 2, so it is expected to stack to
            ``(members, windows, window_size)``.
        targets_baseline: Windowed baseline, averaged along axis 1.
        aggregate_func_window: Reduction applied inside each window; must
            accept an ``axis`` keyword.
        aggregate_func: Reduction applied to the vector of z-scores.

    Returns:
        The aggregated z-score. Window positions where the standard error is
        zero contribute a z-score of 0.
    """
    subgroup_aggregated_windows = aggregate_func_window(targets_subgroup, axis=2)

    baseline_means = np.mean(targets_baseline, axis=1)
    subgroup_means = np.mean(subgroup_aggregated_windows, axis=0)
    abs_diff_mean = np.abs(subgroup_means - baseline_means)

    subgroup_std = np.std(subgroup_aggregated_windows, axis=0)
    standard_error_subgroup = subgroup_std / np.sqrt(len(targets_subgroup))

    # With `where=` but no `out=`, NumPy leaves the skipped positions
    # uninitialised, so arbitrary memory could end up as the score. A
    # zero-filled output gives those positions a defined value.
    z_scores = np.divide(
        abs_diff_mean,
        standard_error_subgroup,
        out=np.zeros_like(abs_diff_mean, dtype=float),
        where=standard_error_subgroup != 0,
    )

    return aggregate_func(z_scores)

def get_all_descriptors(pq):
    """List the descriptors stored in a priority queue without consuming it.

    The queue has no way to peek, so it is emptied and refilled with the very
    same items.

    Args:
        pq: Queue of ``(quality, descriptor)`` items.

    Returns:
        The descriptors (second element of each item) in the order the queue
        hands them out, i.e. lowest priority value first.
    """
    drained = []
    while not pq.empty():
        drained.append(pq.get())

    # Restore the queue before handing back the result.
    for entry in drained:
        pq.put(entry)

    return [entry[1] for entry in drained]

#@time_function
def similarity_descriptors(new_descriptor, pq):
    """Tell whether ``new_descriptor`` is similar to any descriptor stored in ``pq``.

    Two descriptors are similar when every attribute of ``new_descriptor``
    also occurs in the stored descriptor with the same operator and a matching
    value: string values must be equal, other values must lie within 15% of
    the new value. A single agreeing condition is not enough.

    Args:
        new_descriptor: Iterable of ``(attribute_name, value, operator)``.
            If an attribute occurs more than once, only its last condition is
            compared.
        pq: Priority queue of ``(quality, descriptor)`` items; left unchanged.

    Returns:
        True if at least one stored descriptor is similar, otherwise False.
        An empty ``new_descriptor`` is never similar to anything.
    """
    tolerance = 0.15
    results_descriptors = get_all_descriptors(pq)
    new_descriptors_dict = {metric: (value, func) for metric, value, func in new_descriptor}

    if not new_descriptors_dict:
        return False

    def condition_matches(metric, old_descriptors_dict):
        # An attribute missing from the stored descriptor rules out similarity.
        if metric not in old_descriptors_dict:
            return False

        value1, func1 = new_descriptors_dict[metric]
        value2, func2 = old_descriptors_dict[metric]

        if func1 != func2:
            return False

        # Strings are compared exactly; this also avoids subtracting a string
        # from a number when the two sides hold different kinds of value.
        if isinstance(value1, str) or isinstance(value2, str):
            return value1 == value2

        # abs() around the allowed deviation keeps the tolerance meaningful
        # for negative values.
        return abs(value1 - value2) <= abs(tolerance * value1)

    for old_descriptor in results_descriptors:
        old_descriptors_dict = {metric: (value, func) for metric, value, func in old_descriptor}
        if all(condition_matches(metric, old_descriptors_dict) for metric in new_descriptors_dict):
            return True

    return False

def are_descriptors_similar(descriptor1, pq):
    """Tell whether ``descriptor1`` closely resembles a descriptor already in ``pq``.

    Descriptors are compared attribute by attribute after being turned into
    ``{attribute: (value, operator)}`` dictionaries (so a repeated attribute
    keeps only its last condition). A stored descriptor counts as similar when
    it has the same number of attributes and, for each attribute of
    ``descriptor1``, uses the same operator and a matching value: equal for
    strings, within 25% of ``descriptor1``'s value otherwise.

    Args:
        descriptor1: Iterable of ``(attribute_name, value, operator)``.
        pq: Priority queue of ``(quality, descriptor)`` items; left unchanged.

    Returns:
        True as soon as one similar stored descriptor is found, else False.
    """
    tolerance = 0.25
    candidate = {metric: (value, func) for metric, value, func in descriptor1}

    def condition_agrees(metric, other):
        # The attribute must be present on both sides.
        if metric not in other:
            return False

        value1, func1 = candidate[metric]
        value2, func2 = other[metric]

        if func1 != func2:
            return False
        if isinstance(value1, str):
            return not (value1 != value2)
        # Non-string values may differ by at most the relative tolerance.
        return not (abs(value1 - value2) > abs(tolerance * value1))

    for stored_descriptor in get_all_descriptors(pq):
        other = {metric: (value, func) for metric, value, func in stored_descriptor}

        # Descriptors with a different number of attributes are never similar.
        if len(candidate) != len(other):
            continue

        if all(condition_agrees(metric, other) for metric in candidate):
            return True

    return False

def beam_search(data, targets_baseline, column_names, beam_width, beam_depth, nr_bins, nr_saved, subgroup_size, target, types, window_size):
    """Beam search over descriptors, scored with ``quality_measure_fast``.

    Starting from the empty descriptor, each level refines every seed in the
    beam with ``refin``, scores every refinement whose subgroup has at least
    ``subgroup_size`` rows, and keeps the ``beam_width`` best as seeds for the
    next level.

    Args:
        data: List of rows (lists); the ``target`` column holds a series.
        targets_baseline: Baseline series, windowed the same way as the rows.
        column_names: Column names in row order (a list).
        beam_width: Number of descriptors carried to the next level.
        beam_depth: Number of refinement levels.
        nr_bins: Number of bins used for numeric attributes.
        nr_saved: Capacity of the returned result queue.
        subgroup_size: Minimum number of rows a subgroup must contain.
        target: Name of the target column.
        types: Column type labels passed on to ``refin``.
        window_size: Length of the windows cut from each target series.

    Returns:
        ``PriorityQueue`` of ``(quality, descriptor_tuple)`` holding at most
        ``nr_saved`` of the best descriptors found, lowest quality first.
    """
    index_col_dict = dict(enumerate(column_names))
    col_index_dict = {name: position for position, name in enumerate(column_names)}
    target_ind = column_names.index(target)
    att_indices = [position for position in range(len(column_names)) if position != target_ind]

    def with_windowed_target(row):
        # Work on a copy so the caller's rows keep their raw series.
        windowed = row[:]
        windowed[target_ind] = make_rolling_windows_np(row[target_ind], window_size)
        return windowed

    data = [with_windowed_target(row) for row in data]
    targets_baseline = make_rolling_windows_np(targets_baseline, window_size)

    results = PriorityQueue(nr_saved)
    beam_queue = deque([()])  # the root seed is the empty descriptor

    for _ in range(beam_depth):
        beam = PriorityQueue(beam_width)

        while beam_queue:
            seed = beam_queue.popleft()

            for descriptor in refin(seed, data, types, nr_bins, att_indices, index_col_dict, col_index_dict):
                subgroup = extract_subgroup(descriptor, data, col_index_dict)
                if not len(subgroup) >= subgroup_size:
                    continue

                targets_subgroup = [row[target_ind] for row in subgroup]
                quality_result = quality_measure_fast(targets_subgroup, targets_baseline)
                candidate = tuple(descriptor)
                put_item_in_queue(results, quality_result, candidate)
                put_item_in_queue(beam, quality_result, candidate)

        # The survivors of this level become the seeds of the next one.
        while not beam.empty():
            beam_queue.append(beam.get()[1])

    return results


In [4]:
def beam_search_with_constraint(data, targets_baseline, column_names, beam_width, beam_depth, nr_bins, nr_saved, subgroup_size, target, types, window_size):
    """Beam search that skips candidates similar to descriptors already saved.

    Works like a plain beam search (refine each seed, score every refinement
    whose subgroup has at least ``subgroup_size`` rows, keep the best
    ``beam_width`` per level), but a candidate is only scored and stored when
    ``are_descriptors_similar`` finds no similar descriptor in the result
    queue.

    Args:
        data: List of rows (lists); the ``target`` column holds a series.
        targets_baseline: Baseline series, windowed the same way as the rows.
        column_names: Column names in row order (a list).
        beam_width: Number of descriptors carried to the next level.
        beam_depth: Number of refinement levels.
        nr_bins: Number of bins used for numeric attributes.
        nr_saved: Capacity of the returned result queue.
        subgroup_size: Minimum number of rows a subgroup must contain.
        target: Name of the target column.
        types: Column type labels passed on to ``refin``.
        window_size: Length of the windows cut from each target series.

    Returns:
        ``PriorityQueue`` of ``(quality, descriptor_tuple)`` with at most
        ``nr_saved`` entries. It contains only descriptors found by the
        search. As a side effect the number of accepted candidates is printed.
    """
    index_col_dict = {i: col for i, col in enumerate(column_names)}
    col_index_dict = {col: i for i, col in enumerate(column_names)}
    target_ind = column_names.index(target)
    att_indices = list(range(0, len(column_names)))
    att_indices.remove(target_ind)

    # Replace each row's raw target series by its windows, on a copy of the row.
    data_windows = []
    for row in data:
        new_row = row[:]
        new_row[target_ind] = make_rolling_windows_np(row[target_ind], window_size)
        data_windows.append(new_row)

    data = data_windows
    targets_baseline = make_rolling_windows_np(targets_baseline, window_size)

    beam_queue = deque([()])
    # The result queue starts empty: the similarity check copes with an empty
    # queue, so no placeholder entry is needed. A placeholder would take up a
    # result slot, could be handed back to the caller as if it were a finding,
    # and - being a list rather than a tuple - could not be ordered against
    # real descriptors when qualities tie.
    results = PriorityQueue(nr_saved)
    accepted_count = 0

    for depth in range(beam_depth):
        beam = PriorityQueue(beam_width)

        while beam_queue:
            seed = beam_queue.popleft()
            descriptor_set = refin(seed, data, types, nr_bins, att_indices, index_col_dict, col_index_dict)

            for descriptor in descriptor_set:
                subgroup = extract_subgroup(descriptor, data, col_index_dict)
                # The size check comes first so the (more expensive)
                # similarity check only runs for large-enough subgroups.
                if len(subgroup) >= subgroup_size and not are_descriptors_similar(descriptor, results):
                    targets_subgroup = [i[target_ind] for i in subgroup]
                    quality_result = quality_measure_fast(targets_subgroup, targets_baseline)
                    put_item_in_queue(results, quality_result, tuple(descriptor))
                    put_item_in_queue(beam, quality_result, tuple(descriptor))
                    accepted_count += 1

        # The survivors of this level become the seeds of the next one.
        while not beam.empty():
            new_combination = beam.get()
            beam_queue.append(new_combination[1])

    print(accepted_count)
    return results



In [5]:
# Build the stock table from the pickled dataset. make_growth_target_df is
# imported from the project's load_data module (not shown in this notebook);
# presumably it returns a pandas DataFrame - later cells use .drop/.values/.columns.
stock_df = make_growth_target_df('datasets/stock_data_for_emm.pkl')

In [6]:
# Drop the leftover 'index' column. Re-assigning with errors='ignore' keeps the
# cell idempotent: running it a second time is a no-op, whereas an in-place
# drop would raise a KeyError because the column is already gone.
stock_df = stock_df.drop(columns=['index'], errors='ignore')
stock_df

Unnamed: 0,country,industry,currency,exchangeTimezoneName,exchange,sector,averageVolume10days,enterpriseToEbitda,marketCap,debtToEquity,fullTimeEmployees,growth_target
0,France,Aerospace & Defense,EUR,Europe/Paris,PAR,Industrials,382.0,11.822,5.750674e+07,50.783,1102.0,"[0.0, 1.17, 2.41, 0.41, 2.65, -1.39, -7.24, -4..."
1,Germany,Software—Application,EUR,Europe/Berlin,GER,Technology,8329.0,33.294,1.241838e+09,32.492,650.0,"[0.0, -1.84, 17.01, 3.7, -8.24, 8.01, -3.6, 4...."
2,Italy,Entertainment,EUR,Europe/Rome,MIL,Communication Services,330.0,-21.050,2.849466e+07,181.181,34.0,"[0.0, -7.96, -2.61, 0.89, 13.94, -4.85, -6.12,..."
3,Italy,Packaged Foods,EUR,Europe/Rome,MIL,Consumer Defensive,3831.0,8.794,5.971590e+07,68.513,232.0,"[0.0, -3.39, 0.58, -10.17, 11.33, -2.91, 2.1, ..."
4,United Kingdom,Other Precious Metals & Mining,EUR,Europe/Berlin,FRA,Basic Materials,108.0,1.402,2.123667e+08,81.212,3474.0,"[0.0, 8.5, -15.79, 5.28, 2.24, -1.94, 7.5, -1...."
...,...,...,...,...,...,...,...,...,...,...,...,...
9774,Japan,Electrical Equipment & Parts,JPY,Europe/London,LSE,Industrials,4893.0,6.287,2.079737e+10,10.654,145696.0,"[0.0, 1.08, 10.44, -11.75, -11.15, 7.96, -9.17..."
9775,Japan,Business Equipment & Supplies,JPY,Europe/London,LSE,Industrials,1940.0,5.531,5.086319e+09,35.268,78360.0,"[0.0, 5.82, 1.71, 7.84, -10.55, 3.58, -7.94, 2..."
9776,Japan,Electronic Gaming & Multimedia,JPY,Europe/London,LSE,Communication Services,3530.0,7.976,6.297095e+09,18.311,4894.0,"[0.0, 6.5, 0.0, -10.85, -1.95, -3.62, -3.95, 9..."
9777,Japan,Electronic Components,JPY,Europe/London,LSE,Technology,350.0,7.200,1.774490e+09,1.943,1297.0,"[0.0, 0.0, 0.0, 0.0, 18.86, 0.0, 0.0, 0.0, 10...."


In [7]:
# Search input: the table as a list of row lists plus the matching column names.
data = stock_df.values.tolist()
column_names = list(stock_df.columns)
# Number of descriptors kept per level of the beam search.
beam_width = 10
# Number of refinement levels.
beam_depth = 3
# Number of bins used when splitting numeric attributes.
nr_bins = 8
# Capacity of the result queue.
nr_saved = 10
# Minimum number of rows a subgroup must contain: 5% of the data set.
subgroup_size = len(data)*0.05
# Column holding the target series.
target = 'growth_target'
# Length of the windows the target series are cut into.
window_size = 5

In [8]:
# Baseline target: the element-wise mean of the target series over all rows.
target_ind = column_names.index(target)
all_time_series = np.array([row[target_ind] for row in data])
targets_baseline = all_time_series.mean(axis=0)

In [9]:
# Every column except the target is an attribute; classify each one.
att_indices = [position for position in range(len(column_names)) if position != target_ind]
att_columns = [column_names[position] for position in att_indices]
types = categorize_columns_in_order(stock_df, att_columns)

In [10]:
# Run the unconstrained beam search with the configuration defined above.
results = beam_search(
    data=data,
    targets_baseline=targets_baseline,
    column_names=column_names,
    beam_width=beam_width,
    beam_depth=beam_depth,
    nr_bins=nr_bins,
    nr_saved=nr_saved,
    subgroup_size=subgroup_size,
    target=target,
    types=types,
    window_size=window_size,
)


In [11]:
# Print the saved descriptors from lowest to highest quality. The queue can
# only be read by removing items, so they are collected and put back
# afterwards; this leaves `results` intact and lets the cell be re-run.
printed_items = []
while not results.empty():
    item = results.get()
    printed_items.append(item)
    print(f"Priority: {item[0]}, Descriptor: {item[1]}")

for item in printed_items:
    results.put(item)

Priority: 275.1380321311154, Descriptor: (('marketCap', 730348608.0, <function gt at 0x00000219C77C5FC0>), ('fullTimeEmployees', 3206.0, <function gt at 0x00000219C77C5FC0>), ('enterpriseToEbitda', 23.293, <function leeq at 0x00000219C77C6290>))
Priority: 275.7357602237447, Descriptor: (('marketCap', 730348608.0, <function gt at 0x00000219C77C5FC0>), ('fullTimeEmployees', 1100.0, <function gt at 0x00000219C77C5FC0>), ('averageVolume10days', 0.0, <function gt at 0x00000219C77C5FC0>))
Priority: 275.80134937072995, Descriptor: (('marketCap', 730348608.0, <function gt at 0x00000219C77C5FC0>), ('enterpriseToEbitda', 24.684, <function leeq at 0x00000219C77C6290>), ('enterpriseToEbitda', 0.589, <function gt at 0x00000219C77C5FC0>))
Priority: 275.85788638129264, Descriptor: (('marketCap', 1855027770.7673597, <function gt at 0x00000219C77C5FC0>), ('enterpriseToEbitda', 25.656, <function leeq at 0x00000219C77C6290>), ('fullTimeEmployees', 2053.0, <function gt at 0x00000219C77C5FC0>))
Priority: 2