In [2]:
import math
from collections import deque
from queue import PriorityQueue
from quality_measure import quality_measure
from load_data import make_growth_target_df
import pandas as pd
import numpy as np
import math

In [3]:
# Create stock_df from the target file
stock_df = make_growth_target_df('stock_data_for_emm.pkl')

# Replace infinite values with NaN
stock_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Extract the growth target column
growth_data = stock_df.loc[:, "growth_target"]

# Get the indices of rows with NaN or Inf values
nan_indices = growth_data[growth_data.apply(lambda x: any(pd.isna(i) for i in x))].index.tolist()
inf_indices = growth_data[growth_data.apply(lambda x: any(i in [np.inf, -np.inf] for i in x))].index.tolist()

# Get the union of the two indices
union = list(set(nan_indices).union(set(inf_indices)))


# Drop the rows with NaN or Inf values and reset the index
stock_df = stock_df.drop(union).reset_index(drop=True)

# Convert stock_df to a list of lists (values)
stock_data = stock_df.values.tolist()


In [4]:
all_time_series = [i[12] for i in stock_data]
all_time_series = np.array(all_time_series)
targets_baseline = np.mean(all_time_series, axis=0)

In [5]:
column_names = list(stock_df.columns)
targets = ['target', 'growth_target']
target_ind = [column_names.index(a) for a in targets]
att_indices = list(range(0, len(column_names)))
[att_indices.remove(i) for i in target_ind]
att_columns = [column_names[i] for i in att_indices]

In [6]:
def gteq(a, b):
    return a >= b

def leeq(a, b):
    return a <= b

def eq(a, b):
    return a == b

def neq(a, b):
    return not eq(a, b)


In [7]:
def extract_subgroup(descriptors, data):
    result = []
    for row in data:
        check = True
        for attribute in descriptors:
            att_index, descr_value, operator = attribute # unpack 3 values from attribute
            value = operator(row[att_index], descr_value)

            if not value:
                check = False
                break
        
        if check:
            result.append(row)

    return result


def refin(seed, data, types, nr_bins, descr_indices):

    res = []
    used_descr = [i[0] for i in seed]

    not_used_descr = descr_indices[:]
    [not_used_descr.remove(i) for i in used_descr]

    # refinement of descriptors

    for i in descr_indices: # functies voor voorwaarden gewoon erin doen
        aux = list(seed)[:]

        if types[i] == 'numeric':
            s = extract_subgroup(seed, data)
            all_values = [float(entry[i]) for entry in s]
            all_values = sorted(all_values)
            n = len(all_values)
            split_points = [all_values[math.floor(j * (n/nr_bins))] for j in range(0, nr_bins-1)]
            for s in split_points:
                func1 = leeq
                func2 = gteq
                local0 = aux[:]
                local0.append((i, s, func1))
                local1 = aux[:]
                local1.append((i, s, func2))
                res.append(local0)
                res.append(local1)

        elif types[i] == 'binary':
            func = eq
            local0 = aux[:]
            local0.append((i, 0, func))
            local1 = aux[:]
            local1.append((i, 1, func))
            res.append(local0)
            res.append(local1)

        else:
            all_values = [entry[i] for entry in data]
            for j in set(all_values):
                func1 = eq
                func2 = neq
                local0 = aux[:]
                local0.append((i, j, func1))
                res.append(local0)

    return res

            

def constraints_satisfied(descriptors, constraints): # and subgroup len > 0
    return True

def insert_to_pq(descriptors:tuple, quality):
    pass

def get_highest_pq():
    pass

In [8]:
def beam_search(data, targets_baseline, column_names, quality, refin, beam_width, beam_depth, nr_bins, nr_saved, constraints, targets, types):

    target_ind = [column_names.index(a) for a in targets]
    att_indices = list(range(0, len(column_names)))
    [att_indices.remove(i) for i in target_ind]
    target_ind.remove(11) # ONLY INCLUDE TARGET GROWTH!

    beam_queue = deque([()]) # initialize with emtpy tuple (the empty set)
    results = PriorityQueue(nr_saved) # queue with max amount of descriptors saved

    for depth in range(beam_depth):
        beam = PriorityQueue(beam_width) # amount of combinations we keep investigating
    
        while bool(beam_queue): # while there are items in the queue
            seed = beam_queue.popleft()
            descriptor_set = refin(seed, data, types, nr_bins, att_indices)

            for descriptor in descriptor_set:
                subgroup = extract_subgroup(descriptor, data)
                
                if constraints_satisfied(descriptor, constraints):
                    targets_subgroup = [i[target_ind[0]] for i in subgroup]
                    quality_result = quality(targets_subgroup, targets_baseline)
                    print(quality_result)
                    results.put(-quality_result, tuple(descriptor)) # nog checken of dit klopt
                    beam.put(-quality_result, tuple(descriptor))
                    print(beam)
                    print(results)


        while not beam.empty(): # is not empty
            new_combination = beam.get() # dit moet het item gaan geven met de beste quality measure
            beam_queue.append(new_combination)

        break

    return results




In [12]:
data = stock_data
quality = quality_measure
beam_width = 3
beam_depth = 1
nr_bins = 3
nr_saved = 10
constraints = None
targets = ['target', 'growth_target']



In [11]:
def categorize_columns_in_order(df, att_columns):
    # Define empty list to store the categories in order
    column_types = []
    
    # Loop through attribute columns in the DataFrame
    for col in att_columns:
        # Check if the column is numeric
        if pd.api.types.is_numeric_dtype(df[col]):
            column_types.append('numeric')
        # Check if the column has exactly 2 unique values (binary)
        elif df[col].nunique() == 2:
            column_types.append('binary')
        # Otherwise, treat it as nominal
        else:
            column_types.append('nominal')
    
    return column_types

# Apply the function to the stock DataFrame
types = categorize_columns_in_order(stock_df, att_columns)

In [13]:
beam_search(data, targets_baseline, column_names, quality, refin, beam_width, beam_depth, nr_bins, nr_saved, constraints, targets, types)

i
739.312334442922
131.55992375069735
382.01150464991457
524.7172056150646
