In [1]:
from utils.pre_process_data import get_data
from math import ceil
from utils.units import Volume, Dollar
from deap import gp, creator, base, tools
from deap.gp import Terminal
from operator import or_, and_, gt
from fitness_functions import *
from utils.plot_decision_trees import plot_tree
from utils.save_info import save_results
from genetic_functions.cx_functions import cxSubTree
from genetic_functions.mut_functions import mutation_half, mutBranch
# from genetic_functions.genetic_program import GPAlgo
import matplotlib.pyplot as plt
import pendulum
import pickle

In [2]:
def GPAlgo(population, toolbox, cxpb, mutpb, ngen, elite_pop_size,
           stats=None, halloffame=None, verbose=__debug__):
    """Run a generational genetic-programming loop with elitism.

    Every generation carries the ``elite_pop_size`` fittest individuals over
    unchanged, fills the remaining slots through ``toolbox.select`` and applies
    crossover and mutation to those non-elite slots only.

    Args:
        population: List of individuals exposing a ``fitness`` attribute. The
            list is overwritten in place with each new generation and is also
            returned.
        toolbox: Object providing ``map``, ``evaluate``, ``select``, ``mate``,
            ``mutate`` (called with a ``mut_per`` keyword) and ``clone``.
        cxpb: Probability that a non-elite slot is mated with a randomly
            chosen non-elite partner.
        mutpb: Probability that a non-elite slot is mutated.
        ngen: Number of generations to run.
        elite_pop_size: Number of top individuals copied unchanged into the
            next generation.
        stats: Optional statistics object (``fields`` / ``compile``) whose
            results are recorded in the logbook.
        halloffame: Optional object whose ``update`` method is called with
            the initial population and with every new generation.
        verbose: When true, print the logbook stream and a per-generation
            count of positions whose individual did not change.

    Returns:
        Tuple ``(population, logbook, store_generations)``.
        ``store_generations`` maps ``"gen<k>"`` to a dict holding a snapshot of
        the population before replacement (``"pop"``) and the offspring that
        replaced it (``"offspring"``).

    Raises:
        ValueError: If ``elite_pop_size`` is negative or larger than the
            population.
    """
    # Local imports keep the function self-contained: the notebook imports
    # ``attrgetter`` only in a later cell and never imports ``random`` explicitly.
    import random
    from operator import attrgetter

    if not 0 <= elite_pop_size <= len(population):
        raise ValueError(
            f"elite_pop_size must be between 0 and {len(population)}, got {elite_pop_size}"
        )

    def _evaluate_invalid(individuals):
        """Assign a fitness to every individual lacking one; return how many were evaluated."""
        invalid = [ind for ind in individuals if not ind.fitness.valid]
        for ind, fit in zip(invalid, toolbox.map(toolbox.evaluate, invalid)):
            ind.fitness.values = fit
        return len(invalid)

    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + (stats.fields if stats else [])

    # Generation 0: score the initial population
    nevals = _evaluate_invalid(population)

    if halloffame is not None:
        halloffame.update(population)

    record = stats.compile(population) if stats else {}
    logbook.record(gen=0, nevals=nevals, **record)
    if verbose:
        print(logbook.stream)

    store_generations = {}
    # Begin the generational process
    for gen in range(1, ngen + 1):
        # Select the non-elite part of the next generation
        selected = toolbox.select(population, len(population) - elite_pop_size)

        # Elitism: the best individuals are carried over unchanged
        elite_pop = sorted(population, key=attrgetter("fitness"), reverse=True)[:elite_pop_size]

        # Clone everything. Selection returns references, so without cloning a
        # slot could alias an elite, another slot or a member of ``population``;
        # an in-place operator or ``del ...fitness.values`` would then alter
        # those shared objects as well.
        offspring = [toolbox.clone(ind) for ind in elite_pop + selected]

        # Elites occupy indices 0 .. elite_pop_size-1, so the first slot open to
        # variation is index ``elite_pop_size`` (not ``elite_pop_size + 1``).
        first_variable = elite_pop_size

        # Apply crossover on the non-elite offspring
        for i in range(first_variable, len(offspring)):
            if random.random() < cxpb:
                bi = random.randint(first_variable, len(offspring) - 1)
                offspring[bi], offspring[i] = toolbox.mate(offspring[bi],
                                                            offspring[i])
                del offspring[bi].fitness.values, offspring[i].fitness.values

        # NOTE(review): the original expression ((ngen+1-gen)/(ngen+1-gen))*100
        # always evaluates to 100, so the mutation percentage never changes.
        # A schedule that decays with ``gen`` was probably intended - confirm.
        mut_per = 100.0

        # Apply mutation on the non-elite offspring
        for i in range(first_variable, len(offspring)):
            if random.random() < mutpb:
                offspring[i], = toolbox.mutate(offspring[i], mut_per=mut_per)
                del offspring[i].fitness.values

        if verbose:
            # Diagnostic: number of positions holding an individual equal to
            # the one at the same position in the previous generation.
            print(sum(old == new for old, new in zip(population, offspring)))

        # Evaluate the individuals with an invalid fitness
        nevals = _evaluate_invalid(offspring)

        # Update the hall of fame with the generated individuals
        if halloffame is not None:
            halloffame.update(offspring)

        # Store a snapshot (a new list) of the outgoing population: storing
        # ``population`` itself would be overwritten by the slice assignment below.
        store_generations[f"gen{gen}"] = {
            "pop": list(population),
            "offspring": offspring
        }
        # Replace the current population by the offspring
        population[:] = offspring

        # Append the current generation statistics to the logbook
        record = stats.compile(population) if stats else {}
        logbook.record(gen=gen, nevals=nevals, **record)
        if verbose:
            print(logbook.stream)

    return population, logbook, store_generations

In [3]:
# --- RUN CONFIGURATION ---
population_size = 50
tc = 0.01
num_generations = 5

# --- DATA: the first 70% of the rows are used for training, the remainder for testing ---
df = get_data()
df_train = df.iloc[:ceil(len(df) * 0.7)]
df_test = df.iloc[ceil(len(df) * 0.7):]

# Columns whose name contains "volume" (case-insensitive) are typed as Volume,
# every other column as Dollar.
arg_names = list(df_train.columns)
vol_args = [name for name in arg_names if "volume" in name.lower()]
dol_args = [name for name in arg_names if "volume" not in name.lower()]

# --- CREATE PRIMITIVE SETS AND TOOLS -----
pset = gp.PrimitiveSetTyped(
    "main",
    [Volume] * len(vol_args) + [Dollar] * len(dol_args),
    bool,
)

# Rename the ARG<n> inputs to the column names: volume columns first, then dollar columns
arg_vol_mapping = {f"ARG{pos}": col for pos, col in enumerate(vol_args)}
pset.renameArguments(**arg_vol_mapping)
arg_dol_mapping = {f"ARG{pos}": col for pos, col in enumerate(dol_args, start=len(vol_args))}
pset.renameArguments(**arg_dol_mapping)

# Report any argument that still carries an "ARG" name
unnamed_args = [arg for arg in pset.arguments if "ARG" in arg]
if unnamed_args:
    print(f"Some arguments were not renamed: {unnamed_args}")

# Comparison plus an identity primitive for each unit type
# (the identity "placeholder" nodes are stripped again in generate())
pset.addPrimitive(gt, [Dollar, Dollar], bool)
pset.addPrimitive(lambda x: x, [Dollar], Dollar, name="dollar placeholder")

pset.addPrimitive(gt, [Volume, Volume], bool)
pset.addPrimitive(lambda x: x, [Volume], Volume, name="volume placeholder")

# Boolean operators
pset.addPrimitive(and_, [bool, bool], bool)
pset.addPrimitive(or_, [bool, bool], bool)

# Register every column name as a terminal of its unit type
for v_arg in vol_args:
    pset.addTerminal(v_arg, Volume)
for d_arg in dol_args:
    pset.addTerminal(d_arg, Dollar)

# --- Drop every terminal whose name contains "ARG" ---
pset.terminals[Volume] = [term for term in pset.terminals[Volume] if "ARG" not in term.name]
pset.terminals[Dollar] = [term for term in pset.terminals[Dollar] if "ARG" not in term.name]

def generate(pset):
    """Build one individual with the placeholder primitives stripped out.

    Draws candidates from ``toolbox.individual()`` until one still has more
    than three nodes after every node named "dollar placeholder" or
    "volume placeholder" has been removed. A draw that raises ``IndexError``
    is discarded and retried.

    Args:
        pset: Accepted because the toolbox registration passes it; the body
            does not read it.

    Returns:
        A ``creator.Individual`` built from the filtered node list.
    """
    placeholder_names = ("dollar placeholder", "volume placeholder")
    while True:
        try:
            candidate = toolbox.individual()
            # Keep every node except the identity placeholders
            nodes = [node for node in candidate if node.name not in placeholder_names]
        except IndexError:
            continue
        if len(nodes) > 3:
            break
    return creator.Individual(nodes)

# --- GP OPERATORS ----

# Fitness class with a single weight of 1, attached to a PrimitiveTree-based individual
creator.create("fitness", base.Fitness, weights=(1,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.fitness)

toolbox = base.Toolbox()
# Tree generation: raw expression -> Individual -> placeholder-free individual -> population
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=5)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("custom_individual", generate, pset)
toolbox.register("population", tools.initRepeat, list, toolbox.custom_individual)
# Fitness is computed on the training split with the configured trading cost
toolbox.register("evaluate", fitness_function, df=df_train, tc=tc, pset=pset)

# Variation and selection operators
toolbox.register("mate", cxSubTree)
toolbox.register("select", tools.selRanked)
toolbox.register("mutate", mutation_half, pset=pset)

hof = tools.HallOfFame(maxsize=10)

# STATS: summary of the fitness values, taken along axis 0
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean, axis=0)
stats.register("std", np.std, axis=0)
stats.register("min", np.min, axis=0)
stats.register("max", np.max, axis=0)

# Initial population
pop = toolbox.population(n=population_size)
# with open(rf"/home/khann/masters/results/run_1_hof.pkl", 'rb') as file:
#     pop = pickle.load(file)

# Start timestamp for the (currently commented-out) run below
t1 = pendulum.now()
# population, logbook, store_generations = GPAlgo(
#     pop, 
#     toolbox,
#     cxpb=0.7, 
#     mutpb=0.6, 
#     ngen=num_generations, 
#     elite_pop_size= 10,
#     stats = stats, 
#     halloffame =hof
#     )
# t2 = pendulum.now()
# run_time = (t2-t1).seconds

# best_solution=hof.items[0]
# bh_start_train = (1000/df_train.iloc[0]['Open'])*df_train.iloc[-1]['Open']*(1-tc)**2
# strat_train_profit = trading_strat(individual = best_solution, df=df_train,pset=pset)[0]

# bh_start_test = (1000/df_test.iloc[0]['Open'])*df_test.iloc[-1]['Open']*(1-tc)**2
# strat_test_profit = trading_strat(individual = best_solution, df=df_test,pset=pset)[0]

# run_info= pd.DataFrame(columns=['population_number', 'generations', 'run_time', 'best_tree','trading_cost','buy_hold_train','strategy_value_train','buy_hold_test','strategy_value_test'])
# run_info = run_info._append({
#         "time":t1,
#         "population_number":population_size,
#         "generations": num_generations,
#         "run_time":run_time,
#         "best_tree":str(hof.items[1]),
#         "trading_cost":f"{tc*100}%",
#         "fitness_value": hof.items[1].fitness.values,
#         "buy_hold_train":bh_start_train,
#         "strategy_value_train":strat_train_profit,
#         "buy_hold_test":bh_start_test,
#         "strategy_value_test":strat_test_profit
# },
# ignore_index = True)


  df[f"lag_{column.lower()}_{lag}"]= df[column].shift(lag)
  df[f"lag_{column.lower()}_{lag}"]= df[column].shift(lag)
  df[f"percentage_{column.lower()}_{(round(percentage*100))}"]= df[column]*percentage
  df[f"percentage_{column.lower()}_{(round(percentage*100))}"]= df[column]*percentage
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}_{lag}"]= df[column].rolling(window = lag).mean()
  df[f"ma_{column.lower()}

In [4]:
from operator import attrgetter

In [5]:
# Parameters for stepping through one generation by hand
population = pop
elite_pop_size, cxpb, mutpb = 10, 0.5, 0.5

# Score every individual that does not yet carry a valid fitness
invalid_ind = [ind for ind in population if not ind.fitness.valid]
fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
for ind, fit in zip(invalid_ind, fitnesses):
    ind.fitness.values = fit

In [6]:
# Manual walk-through of one generation: select, prepend the elites, then probe
# what crossover returns compared with the individuals it was given.
offspring = toolbox.select(population, len(population)-elite_pop_size)
print([i.fitness.values for i in offspring])

elite_pop = sorted(population, key=attrgetter("fitness"), reverse=True)[:elite_pop_size]
offspring = elite_pop +offspring
# Snapshot of the offspring list before any crossover, used for the comparison at the end
offspring_save = [i.copy() for i in offspring]

# Apply crossover on the non-elite offspring.
# The elites occupy indices 0 .. elite_pop_size-1, so the first non-elite slot is
# index elite_pop_size; starting at elite_pop_size+1 skipped that individual.
for i in range(elite_pop_size, len(offspring)):
    if random.random() < cxpb:
        bi = random.randint(elite_pop_size, len(offspring)-1)
        print(i,bi)
        o1 = offspring[i].copy()
        # The children are only compared here, not written back into ``offspring``
        t1,t2 = toolbox.mate(offspring[bi],offspring[i])
        print(t1==offspring[bi], t2==o1 )
        del offspring[bi].fitness.values, offspring[i].fitness.values


# Count the positions whose individual still equals its pre-crossover snapshot
elits_check = []
for ind,val in enumerate(offspring_save):
    elits_check.append(val == offspring[ind])
print(elits_check)
print(sum(elits_check))

[(-1.6911288087620733,), (17650.137647419222,), (-1.6911288087620733,), (1384.5076781770294,), (401.70392625505633,), (-841.0085620478509,), (17797.63182820865,), (186.58249154344963,), (401.70392625505633,), (17650.137647419222,), (-27.70791575206951,), (-90.00443001339649,), (12270.434607467323,), (-84.7495234753752,), (27348.479539589753,), (60609.24093005038,), (186.58249154344963,), (16082.038999807599,), (186.58249154344963,), (17650.137647419222,), (1976.7259042796145,), (1976.7259042796145,), (401.70392625505633,), (186.58249154344963,), (1186.5449123491808,), (186.58249154344963,), (-358.28219613107535,), (17797.63182820865,), (186.58249154344963,), (17650.137647419222,), (51469.096948967075,), (221.84261561643757,), (51469.096948967075,), (66.22615941825549,), (-140.3708003796561,), (401.70392625505633,), (6822.038227267289,), (1000.0,), (23.831742467786825,), (9885.37650494468,)]
12 45
False False
13 39
False False
14 22
False False
15 15
False False
21 19
False False
23 28


In [7]:
# Display the index range that the crossover/mutation loops iterate over
range(elite_pop_size+1, len(offspring))

range(11, 50)

In [8]:
# Print the first parent before and after crossover, then the first child,
# to see whether cxSubTree changes its input
print(offspring[1])
t1, t2 = cxSubTree(offspring[1], offspring[2])
print(offspring[1])
print(gp.PrimitiveTree(t1))

or_(gt('ma_close_36', 'ma_low_11'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'lag_volume_7')))
or_(gt('ma_close_36', 'ma_low_11'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'lag_volume_7')))
or_(gt('ma_low_38', 'ma_open_28'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'lag_volume_7')))


In [9]:
# Display the first crossover child as an expression string
str(gp.PrimitiveTree(t1))

"or_(gt('ma_low_38', 'ma_open_28'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'lag_volume_7')))"

In [12]:
# Print the individual before and after mutation, then the mutant,
# to see whether mutBranch changes its input
print(offspring[1])
m1 = mutBranch(offspring[1], pset=pset)
print(offspring[1])
print(gp.PrimitiveTree(m1))

or_(gt('ma_close_36', 'ma_low_11'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'lag_volume_7')))
or_(gt('ma_close_36', 'ma_low_11'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'lag_volume_7')))
or_(gt('ma_close_36', 'ma_low_11'), or_(gt('ma_open_9', 'lag_high_49'), gt('lag_volume_45', 'Volume')))
