In [1]:
from collections import deque
from queue import PriorityQueue
from quality_measure import quality_measure
from load_data import make_growth_target_df
import pandas as pd
import numpy as np
import math

In [2]:
def gteq(a, b):
    return a >= b

def leeq(a, b):
    return a <= b

def eq(a, b):
    return a == b

def neq(a, b):
    return not eq(a, b)

def extract_subgroup(descriptors, data):
    result = []
    for row in data:
        check = True
        for attribute in descriptors:
            att_index, descr_value, operator = attribute # unpack 3 values from attribute
            value = operator(row[att_index], descr_value)

            if not value:
                check = False
                break
        
        if check:
            result.append(row)

    return result


def refin(seed, data, types, nr_bins, descr_indices):

    res = []
    used_descr = [i[0] for i in seed]
    not_used_indices = descr_indices[:]
    [not_used_indices.remove(i) for i in used_descr]
    # refinement of descriptors

    for i in not_used_indices: # functies voor voorwaarden gewoon erin doen
        aux = list(seed)[:]

        if types[i] == 'numeric':
            s = extract_subgroup(seed, data)
            all_values = [float(entry[i]) for entry in s]
            all_values = sorted(all_values)
            n = len(all_values)
            split_points = [all_values[math.floor(j * (n/nr_bins))] for j in range(1, nr_bins)]
            for s in split_points:
                func1 = leeq
                func2 = gteq
         
                local0 = aux[:]
                local0.append((i, s, func1))
                res.append(local0)
           
                local1 = aux[:]
                local1.append((i, s, func2))
                res.append(local1)

        elif types[i] == 'binary':
            func = eq
            local0 = aux[:]
            local0.append((i, 0, func))
            local1 = aux[:]
            local1.append((i, 1, func))
            res.append(local0)
            res.append(local1)

        else:
            all_values = [entry[i] for entry in data]
            for j in set(all_values):
                func1 = eq
                func2 = neq
                local0 = aux[:]
                local0.append((i, j, func1))
                res.append(local0)
    return res

            

def constraints_satisfied(descriptors, constraints): # and subgroup len > 0
    return True

def put_item_in_queue(queue, quality, descriptor):
    if queue.full():
        min_quality, min_descriptor = queue.get()
        if min_quality >= quality:
            queue.put((min_quality, min_descriptor))
        else:
            queue.put((quality, descriptor))
    else:
        queue.put((quality, descriptor))

def categorize_columns_in_order(df, att_columns):
    # Define empty list to store the categories in order
    column_types = []

    # Loop through attribute columns in the DataFrame
    for col in att_columns:
        # Check if the column is numeric
        if pd.api.types.is_numeric_dtype(df[col]):
            column_types.append('numeric')
        # Check if the column has exactly 2 unique values (binary)
        elif df[col].nunique() == 2:
            column_types.append('binary')
        # Otherwise, treat it as nominal
        else:
            column_types.append('nominal')

    return column_types


In [3]:
def beam_search(data, targets_baseline, column_names, quality, beam_width, beam_depth, nr_bins, nr_saved, subgroup_size, constraints, target, types):

    target_ind = column_names.index(target) 
    att_indices = list(range(0, len(column_names)))
    att_indices.remove(target_ind) # ONLY INCLUDE TARGET GROWTH!
    beam_queue = deque([()]) # initialize with emtpy tuple (the empty set)
    results = PriorityQueue(nr_saved) # queue with max amount of descriptors saved

    for depth in range(beam_depth):
        beam = PriorityQueue(beam_width) # amount of combinations we keep investigating
    
        while bool(beam_queue):# while there are items in the queue
            seed = beam_queue.popleft()
            descriptor_set = refin(seed, data, types, nr_bins, att_indices)

            for descriptor in descriptor_set:
                subgroup = extract_subgroup(descriptor, data)
                if constraints_satisfied(descriptor, constraints) and len(subgroup) >= subgroup_size:
                    targets_subgroup = [i[target_ind] for i in subgroup]
                    quality_result = quality(targets_subgroup, targets_baseline)
                    put_item_in_queue(results, quality_result, tuple(descriptor))
                    put_item_in_queue(beam, quality_result, tuple(descriptor))
        


        while not beam.empty(): 

            new_combination = beam.get()
            new_combination = new_combination[1]# dit moet het item gaan geven met de beste quality measure
            beam_queue.append(new_combination)

    return results



In [11]:
# Create stock_df from the target file
stock_df = make_growth_target_df('stock_data_1year.pkl')
stock_df

Unnamed: 0,stocks,country,industry,currency,exchangeTimezoneName,exchange,sector,averageVolume10days,enterpriseToEbitda,marketCap,...,growth_autocorr_lag1,growth_min,growth_max,growth_range,growth_window_min,growth_window_max,growth_trend_slope,growth_biggest_continuous_increase_perc,growth_biggest_continuous_decrease_perc,growth_target
0,AOF.DE,Germany,Software—Application,EUR,Europe/Berlin,GER,Technology,8329.0,33.294,1.241838e+09,...,0.97943,-11.01056,6.73235,17.74291,237,205,-0.00104,16.82441,-17.72197,"[0.0, -1.31, 2.49, -0.94, 3.71, -0.24, 1.21, -..."
1,LFG.MI,Italy,Entertainment,EUR,Europe/Rome,MIL,Communication Services,330.0,-21.050,2.849466e+07,...,0.95050,-8.00000,12.17949,20.17949,56,188,-0.00086,18.59324,-8.00000,"[0.0, 2.07, -2.03, 3.45, -2.0, -2.04, 0.0, 2.0..."
2,ENV.MI,Italy,Packaged Foods,EUR,Europe/Rome,MIL,Consumer Defensive,3831.0,8.794,5.971590e+07,...,0.90582,-6.57895,5.55555,12.13450,17,16,-0.00075,11.23512,-6.57895,"[0.0, 4.17, -0.67, -1.34, -1.36, 0.69, -1.37, ..."
3,FPO.F,United Kingdom,Other Precious Metals & Mining,EUR,Europe/Berlin,FRA,Basic Materials,108.0,1.402,2.123667e+08,...,0.98542,-10.05111,25.00001,35.05112,15,60,0.00129,25.00001,-21.49774,"[0.0, -1.17, -6.44, -3.09, -2.32, 0.15, 0.0, -..."
4,M07.F,United States,Security & Protection Services,EUR,Europe/Berlin,FRA,Industrials,0.0,14.054,5.451780e+09,...,0.97800,-4.24242,7.91367,12.15609,235,17,-0.00136,9.97164,-10.13175,"[0.0, 1.36, 0.0, -1.34, 0.0, 0.0, 0.0, -0.68, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9060,INRN.SW,Switzerland,Specialty Industrial Machinery,CHF,Europe/Zurich,EBS,Industrials,1124.0,15.530,1.979479e+09,...,0.96767,-7.21831,6.00000,13.21831,208,52,-0.00123,17.62885,-12.72116,"[0.0, -0.41, 0.62, -1.43, 2.49, 1.01, -0.4, -2..."
9061,RICO.L,Japan,Business Equipment & Supplies,JPY,Europe/London,LSE,Industrials,1940.0,5.531,5.086319e+09,...,0.97884,-9.63956,12.64970,22.28926,52,235,0.00194,13.67960,-9.63956,"[0.0, -4.75, 0.0, 0.0, 2.97, 0.0, 0.0, 0.59, 0..."
9062,KNM.L,Japan,Electronic Gaming & Multimedia,JPY,Europe/London,LSE,Communication Services,3530.0,7.976,6.297095e+09,...,0.99556,-8.39559,9.76675,18.16234,39,159,0.00133,9.76675,-8.39559,"[0.0, 0.0, 0.0, 0.0, -0.51, 0.0, 2.32, 0.0, 0...."
9063,MAW.L,Japan,Electronic Components,JPY,Europe/London,LSE,Technology,350.0,7.200,1.774490e+09,...,0.99399,-6.45624,14.62789,21.08413,110,53,-0.00040,14.62789,-10.74116,"[0.0, 0.0, 2.95, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [12]:
stock_df_realworld = stock_df.copy()
stock_df_realworld.drop(['stocks',
               'og_mean',
               'og_median',
               'og_std_dev',
               'og_autocorr_lag1',
               'og_min',
               'og_max',
               'og_range',
               'og_window_min',
               'og_window_max',
               'og_trend_slope',
               'og_longest_continuous_increase',
               'og_biggest_continuous_increase',
               'og_longest_continuous_decrease',
               'og_biggest_continuous_decrease',
               'growth_mean',
               'growth_median',
               'growth_std_dev',
               'growth_autocorr_lag1',
               'growth_min',
               'growth_max',
               'growth_range',
               'growth_window_min',
               'growth_window_max',
               'growth_trend_slope',
               'growth_biggest_continuous_increase_perc',
               'growth_biggest_continuous_decrease_perc'], axis=1, inplace=True)
stock_df_realworld

Unnamed: 0,country,industry,currency,exchangeTimezoneName,exchange,sector,averageVolume10days,enterpriseToEbitda,marketCap,debtToEquity,fullTimeEmployees,growth_target
0,Germany,Software—Application,EUR,Europe/Berlin,GER,Technology,8329.0,33.294,1.241838e+09,32.492,650.0,"[0.0, -1.31, 2.49, -0.94, 3.71, -0.24, 1.21, -..."
1,Italy,Entertainment,EUR,Europe/Rome,MIL,Communication Services,330.0,-21.050,2.849466e+07,181.181,34.0,"[0.0, 2.07, -2.03, 3.45, -2.0, -2.04, 0.0, 2.0..."
2,Italy,Packaged Foods,EUR,Europe/Rome,MIL,Consumer Defensive,3831.0,8.794,5.971590e+07,68.513,232.0,"[0.0, 4.17, -0.67, -1.34, -1.36, 0.69, -1.37, ..."
3,United Kingdom,Other Precious Metals & Mining,EUR,Europe/Berlin,FRA,Basic Materials,108.0,1.402,2.123667e+08,81.212,3474.0,"[0.0, -1.17, -6.44, -3.09, -2.32, 0.15, 0.0, -..."
4,United States,Security & Protection Services,EUR,Europe/Berlin,FRA,Industrials,0.0,14.054,5.451780e+09,71.777,4800.0,"[0.0, 1.36, 0.0, -1.34, 0.0, 0.0, 0.0, -0.68, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9060,Switzerland,Specialty Industrial Machinery,CHF,Europe/Zurich,EBS,Industrials,1124.0,15.530,1.979479e+09,15.043,2566.0,"[0.0, -0.41, 0.62, -1.43, 2.49, 1.01, -0.4, -2..."
9061,Japan,Business Equipment & Supplies,JPY,Europe/London,LSE,Industrials,1940.0,5.531,5.086319e+09,35.268,78360.0,"[0.0, -4.75, 0.0, 0.0, 2.97, 0.0, 0.0, 0.59, 0..."
9062,Japan,Electronic Gaming & Multimedia,JPY,Europe/London,LSE,Communication Services,3530.0,7.976,6.297095e+09,18.311,4894.0,"[0.0, 0.0, 0.0, 0.0, -0.51, 0.0, 2.32, 0.0, 0...."
9063,Japan,Electronic Components,JPY,Europe/London,LSE,Technology,350.0,7.200,1.774490e+09,1.943,1297.0,"[0.0, 0.0, 2.95, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [14]:
data = stock_df_realworld.values.tolist()
column_names = list(stock_df_realworld.columns)
quality = quality_measure
beam_width = 3
beam_depth = 5
nr_bins = 3
nr_saved = 10
subgroup_size = 10
constraints = None
target = 'growth_target'

In [15]:
target_ind = column_names.index(target)
all_time_series = [i[target_ind] for i in data]
all_time_series = np.array(all_time_series)
targets_baseline = np.mean(all_time_series, axis=0)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9065,) + inhomogeneous part.

In [16]:
pp=[]
for i in all_time_series:
    pp.append(i)
    print(len(i))
print(len())

255
254
254
255
255
255
255
255
255
255
255
255
255
254
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
248
254
38
255
255
255
255
255
255
255
254
255
255
254
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
244
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
240
255
255
255
255
255
255
255
255
255
255
255
255
255
255
254
255
255
255
255
255
255
254
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
254
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
254
255
255
255
255
255
255
255
222
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
255
254
255
255
255
255
255
255
255
255
255
255
255
255
255
255
2

In [7]:
att_indices = list(range(0, len(column_names)))
att_indices.remove(target_ind)
att_columns = [column_names[i] for i in att_indices]
types = categorize_columns_in_order(stock_df_realworld, att_columns)


In [8]:
results  = beam_search(data, targets_baseline, column_names, quality, beam_width, beam_depth, nr_bins, nr_saved, subgroup_size, constraints, target, types)


In [9]:
while not results.empty():
    item = results.get()
    print(f"Priority: {item[0]}, Descriptor: {item[1]}")

Priority: 64.86614408246143, Descriptor: ((3, 'Europe/Berlin', <function eq at 0x000001A92AB32050>), (2, 'EUR', <function eq at 0x000001A92AB32050>))
Priority: 64.86614408246143, Descriptor: ((3, 'Europe/Berlin', <function eq at 0x000001A92AB32050>), (6, 0.0, <function gteq at 0x000001A9787F6DD0>))
Priority: 65.62197956465624, Descriptor: ((10, 10000.0, <function gteq at 0x000001A9787F6DD0>), (6, 39.0, <function gteq at 0x000001A9787F6DD0>))
Priority: 65.81396857128121, Descriptor: ((2, 'EUR', <function eq at 0x000001A92AB32050>), (6, 0.0, <function gteq at 0x000001A9787F6DD0>), (7, 5.111, <function gteq at 0x000001A9787F6DD0>))
Priority: 65.81396857128121, Descriptor: ((2, 'EUR', <function eq at 0x000001A92AB32050>), (7, 5.111, <function gteq at 0x000001A9787F6DD0>))
Priority: 67.74505877067688, Descriptor: ((10, 10000.0, <function gteq at 0x000001A9787F6DD0>), (8, 6693983744.0, <function gteq at 0x000001A9787F6DD0>))
Priority: 68.37178453583074, Descriptor: ((2, 'EUR', <function eq a

In [13]:
stock_df_timeseries = stock_df.copy()
stock_df_timeseries.drop(['stocks',
                          'country',
                          'industry',
                          'currency',
                          'exchangeTimezoneName',
                          'exchange',
                          'sector',
                          'averageVolume10days',
                          'enterpriseToEbitda',
                          'marketCap',
                          'debtToEquity',
                          'fullTimeEmployees'], axis=1, inplace=True)
stock_df_timeseries

Unnamed: 0,og_mean,og_median,og_std_dev,og_autocorr_lag1,og_min,og_max,og_range,og_window_min,og_window_max,og_trend_slope,...,growth_autocorr_lag1,growth_min,growth_max,growth_range,growth_window_min,growth_window_max,growth_trend_slope,growth_biggest_continuous_increase_perc,growth_biggest_continuous_decrease_perc,growth_target
0,119.30059,119.80,11.41623,0.97943,97.600,143.20,45.600,19,225,0.10654,...,0.97943,-11.01056,6.73235,17.74291,237,205,-0.00104,16.82441,-17.72197,"[0.0, -1.31, 2.49, -0.94, 3.71, -0.24, 1.21, -..."
1,1.63551,1.65,0.11390,0.95050,1.400,1.85,0.450,37,61,0.00047,...,0.95050,-8.00000,12.17949,20.17949,56,188,-0.00086,18.59324,-8.00000,"[0.0, 2.07, -2.03, 3.45, -2.0, -2.04, 0.0, 2.0..."
2,3.14437,3.18,0.11375,0.90582,2.820,3.30,0.480,10,146,0.00093,...,0.90582,-6.57895,5.55555,12.13450,17,16,-0.00075,11.23512,-6.57895,"[0.0, 4.17, -0.67, -1.34, -1.36, 0.69, -1.37, ..."
3,0.48760,0.48,0.11365,0.98542,0.297,0.77,0.473,228,1,-0.00128,...,0.98542,-10.05111,25.00001,35.05112,15,60,0.00129,25.00001,-21.49774,"[0.0, -1.17, -6.44, -3.09, -2.32, 0.15, 0.0, -..."
4,162.37647,162.00,10.46251,0.97800,138.000,183.00,45.000,15,200,0.07836,...,0.97800,-4.24242,7.91367,12.15609,235,17,-0.00136,9.97164,-10.13175,"[0.0, 1.36, 0.0, -1.34, 0.0, 0.0, 0.0, -0.68, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9060,2646.73307,2630.00,189.42620,0.96767,2280.000,3050.00,770.000,28,119,0.86991,...,0.96767,-7.21831,6.00000,13.21831,208,52,-0.00123,17.62885,-12.72116,"[0.0, -0.41, 0.62, -1.43, 2.49, 1.01, -0.4, -2..."
9061,1304.29555,1302.50,117.13344,0.97884,1069.500,1613.00,543.500,56,249,1.27525,...,0.97884,-9.63956,12.64970,22.28926,52,235,0.00194,13.67960,-9.63956,"[0.0, -4.75, 0.0, 0.0, 2.97, 0.0, 0.0, 0.59, 0..."
9062,10045.17815,9977.00,1928.25172,0.99556,7234.000,15070.00,7836.000,39,249,24.77846,...,0.99556,-8.39559,9.76675,18.16234,39,159,0.00133,9.76675,-8.39559,"[0.0, 0.0, 0.0, 0.0, -0.51, 0.0, 2.32, 0.0, 0...."
9063,31957.93751,32900.00,5643.14925,0.99399,22710.000,41850.00,19140.000,1,253,74.18753,...,0.99399,-6.45624,14.62789,21.08413,110,53,-0.00040,14.62789,-10.74116,"[0.0, 0.0, 2.95, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
