In [None]:
import os
import random
from collections import Counter

import numpy as np
import pandas as pd

In [None]:
# Read in the raw universe data; the 'date' column of the CSV becomes the index
uni_pure = pd.read_csv('EM_universe.csv', index_col='date')

In [None]:
# Work on a copy so the frame loaded from disk (uni_pure) stays untouched
uni = uni_pure.copy()

In [None]:
# Convert the index to datetime and sort the rows chronologically
uni.index = pd.to_datetime(uni.index)
uni = uni.sort_index()

In [None]:
# Keep only the identifier columns, the factor columns and the adjusted return;
# every other column of the raw file is dropped
uni = uni[['name','id','Alpha', 'Market Capitalisation', 'Book-to-Market Ratio', 'EBIT', 'Investment','Stock Price Volatility', 'Mean Return', 'ROA', 'ROE',
 'SGI', 'Debt-to-Equity Ratio', 'Market Risk Factor Loading', 'SMB Factor Loading', 'HML Factor Loading', 'RMW Factor Loading', 'CMA Factor Loading', 'return_adjusted']]

In [None]:
# Rename return_adjusted to 'return'. Reassign instead of using inplace=True:
# `uni` was produced by a column selection, and mutating such a derived frame
# in place is the pattern pandas warns about and makes the cell non-idempotent.
uni = uni.rename(columns={'return_adjusted': 'return'})

In [None]:
# Preview the first rows of the prepared universe
uni.head()

In [None]:
# Function that ranks data into quintiles
def rank_factors(info):
    """Return a copy of ``info`` with one quintile-label column added per column.

    For every column ``c`` present in ``info`` a new column ``"c Rank"`` is
    appended. Its values are the labels ``c1`` .. ``c5`` assigned by
    ``pd.qcut(info[c], 5)``, i.e. ``c1`` is the lowest quintile and ``c5`` the
    highest.

    The input frame is NOT modified. Callers in this notebook pass slices
    (``.iloc[:, 2:]`` / ``.iloc[:, 1:]``), and writing new columns into a slice
    is what triggers pandas' SettingWithCopy warnings.

    Parameters
    ----------
    info : pandas.DataFrame
        Frame whose columns are all numeric and have string names.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the ``"<column> Rank"`` columns.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` when a column has too few distinct values
        to form five unique bin edges.
    """
    ranked = info.copy()
    # Snapshot the column names first so the freshly added "Rank" columns are
    # not themselves ranked.
    for column in info.columns.tolist():
        labels = [f"{column}{quintile}" for quintile in range(1, 6)]
        ranked[f"{column} Rank"] = pd.qcut(info[column], 5, labels=labels)
    return ranked

In [None]:
# Build the per-row "transactions" (lists of quintile labels) for FP-Growth
def fpg_prep(info):
    """Turn a ranked frame into a list of label lists plus the antecedent frame.

    Parameters
    ----------
    info : pandas.DataFrame
        Frame holding the raw factor columns, 'return', and their
        "<column> Rank" counterparts.

    Returns
    -------
    tuple
        ``(final, new)`` where ``final`` is a list with one entry per row, each
        entry being the list of that row's rank labels (including the
        'return Rank' label), and ``new`` is the rank-only frame without the
        'return Rank' column.
    """
    raw_columns = [
        'Alpha',
        'Market Capitalisation',
        'Book-to-Market Ratio',
        'EBIT',
        'Investment',
        'Stock Price Volatility',
        'Mean Return',
        'ROA',
        'ROE',
        'SGI',
        'Debt-to-Equity Ratio',
        'Market Risk Factor Loading',
        'SMB Factor Loading',
        'HML Factor Loading',
        'RMW Factor Loading',
        'CMA Factor Loading',
        'return',
    ]
    # Only the "... Rank" columns survive this drop
    ranked_only = info.drop(columns=raw_columns)
    # Same frame without the return ranking
    antecedents_only = ranked_only.drop(columns="return Rank")
    # After reset_index the first column is the former index; it is skipped
    # below so that each transaction contains rank labels only
    flat = ranked_only.reset_index()
    label_columns = flat.columns[1:]
    transactions = [
        flat.loc[position, label_columns].tolist()
        for position in range(len(flat))
    ]
    return transactions, antecedents_only

In [None]:
# Function that mines association and lift rules and also returns the true/false (one-hot) DataFrame used by the FP-Growth algorithm together with its 5th-quintile returns column
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
def rules(final, sup, conf):
    """Mine frequent itemsets with FP-Growth and derive association rules.

    Parameters
    ----------
    final : list of lists
        One list of item labels per row (transaction).
    sup : float
        Minimum support passed to ``fpgrowth``.
    conf : float
        Minimum confidence used for the confidence-based rule set.

    Returns
    -------
    tuple
        ``(eg, high_returns, asso_rules, lift_rules)``: the one-hot encoded
        transaction frame, its ``return5`` column, the rules filtered by
        confidence >= ``conf`` and the rules filtered by lift >= 1.2.
    """
    # One-hot encode the transactions: one boolean column per distinct label
    encoder = TransactionEncoder()
    onehot = encoder.fit(final).transform(final)
    eg = pd.DataFrame(onehot, columns=encoder.columns_)
    # Boolean column flagging rows whose return falls in the top quintile
    high_returns = eg.return5
    # Frequent itemsets at the requested minimum support
    frequent_itemsets = fpgrowth(eg, min_support=sup, use_colnames=True)
    # Two rule sets from the same itemsets: one by confidence, one by lift
    by_confidence = association_rules(frequent_itemsets, metric="confidence", min_threshold=conf)
    by_lift = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
    return eg, high_returns, by_confidence, by_lift

In [None]:
# Get high return rules from rule set
def high_ret_rules(asso_rules):
    """Collect the antecedents of every rule whose consequents include 'return5'.

    Parameters
    ----------
    asso_rules : pandas.DataFrame
        Rule table with 'antecedents' and 'consequents' columns holding
        iterables of item labels.

    Returns
    -------
    list of lists
        One list of antecedent labels per qualifying rule, in row order.
    """
    target = {"return5"}
    return [
        list(asso_rules.loc[row_label, 'antecedents'])
        for row_label in asso_rules.index.tolist()
        if target.issubset(set(list(asso_rules.loc[row_label, 'consequents'])))
    ]

In [None]:
# Normalise antecedents so that every rule is a list
def get_rules(unique_asso):
    """Return ``unique_asso`` with every non-list entry wrapped in a list.

    Entries that already are lists are passed through unchanged (same object),
    so the result is always a list of lists.
    """
    return [asso if isinstance(asso, list) else [asso] for asso in unique_asso]

In [None]:
# Function to get rules for the period, by using the above functions
def rules_for_period(info, sup, conf):
    """Mine the high-return association rules for one block of data.

    Parameters
    ----------
    info : pandas.DataFrame
        Universe rows for the period(s) of interest. The first two columns are
        skipped (in this notebook they are 'name' and 'id'); the remaining
        columns are ranked into quintiles.
    sup, conf : float
        Minimum support and confidence forwarded to ``rules``.

    Returns
    -------
    tuple
        ``(rule_list, eg, high_returns, info)`` where ``rule_list`` is a sorted
        list of unique rules (each rule a sorted list of antecedent labels),
        ``eg`` is the one-hot transaction frame, ``high_returns`` its
        ``return5`` column and ``info`` the frame reduced to its rank columns.
    """
    raw_columns = [
        'Alpha', 'Market Capitalisation', 'Book-to-Market Ratio', 'EBIT',
        'Investment', 'Stock Price Volatility', 'Mean Return', 'ROA', 'ROE',
        'SGI', 'Debt-to-Equity Ratio', 'Market Risk Factor Loading',
        'SMB Factor Loading', 'HML Factor Loading', 'RMW Factor Loading',
        'CMA Factor Loading', 'return',
    ]
    info = rank_factors(info.iloc[:, 2:])
    final, _ = fpg_prep(info)
    eg, high_returns, asso_rules, _ = rules(final, sup, conf)
    factors = high_ret_rules(asso_rules)
    info = info.drop(raw_columns, axis=1)

    # De-duplicate while keeping each rule intact. The previous
    # np.unique(np.array(...)) call flattened equally long multi-antecedent
    # rules into individual labels and fails on ragged input with recent NumPy.
    # Sorting inside each rule makes ['A1', 'B2'] and ['B2', 'A1'] identical.
    unique_rules = sorted({tuple(sorted(rule)) for rule in get_rules(factors)})
    return [list(rule) for rule in unique_rules], eg, high_returns, info

# Chi Squared

In [None]:
# Function to get expected and actual frequency of a rule
def expected_freq(info, rule):
    """Return the observed and the independence-expected frequency of a rule.

    A row "satisfies" the rule when every antecedent label in ``rule`` appears
    somewhere in that row of ``info``.

    Parameters
    ----------
    info : pandas.DataFrame
        Frame of rank labels; must contain a 'return Rank' column.
    rule : sequence of str
        Antecedent labels. The sequence is not modified.

    Returns
    -------
    tuple
        ``(actual_freq, ef_ind)``: the number of rows that satisfy the rule
        AND have 'return5', and the count expected if the rule and a top
        quintile return were independent,
        ``P(return5) * P(rule) * len(info)``.
        ``(0, 0.0)`` is returned for an empty frame.
    """
    antecedents = list(rule)
    data_len = len(info)
    if data_len == 0:
        return 0, 0.0

    # Number of equities with top quintile returns
    num_ret_ind = len(info[info['return Rank'] == 'return5'])

    # Rows that contain ALL antecedents. The expected frequency has to use the
    # same "all antecedents" definition as the actual frequency below;
    # counting rows that match ANY antecedent inflates it for multi-item rules.
    antecedent_hits = info.isin(antecedents).sum(axis=1)
    num_rule_ind = int((antecedent_hits >= len(antecedents)).sum())
    ef_ind = (num_ret_ind / data_len) * (num_rule_ind / data_len) * data_len

    # Rows that contain all antecedents and the 'return5' label
    with_target = antecedents + ['return5']
    joint_hits = info.isin(with_target).sum(axis=1)
    actual_freq = int((joint_hits >= len(with_target)).sum())
    return actual_freq, ef_ind

In [None]:
# Wrap every element of a list in its own single-element list
def list_list(lis):
    """Return ``[[x] for x in lis]``, e.g. ``['a', 'b'] -> [['a'], ['b']]``."""
    return [[string] for string in lis]
    

In [None]:
# Order-preserving de-duplication that also works for unhashable items
def get_unique_elements(input_list):
    """Return the elements of ``input_list`` without repeats, first occurrence wins.

    Membership is tested with ``in`` (equality), so elements such as lists,
    which cannot be put in a set, are supported.
    """
    kept = []
    for candidate in input_list:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept

In [None]:
from scipy import stats

# Function to get rules that pass the chi-squared pruning
def causal_chi(info, rules):
    """Keep the rules whose chi-squared statistic is significant at the 99% level.

    For each rule the statistic ``(actual - expected)**2 / expected`` is built
    from ``expected_freq`` and compared against a chi-squared distribution
    whose degrees of freedom are the number of distinct 'return Rank' values
    minus one.

    Parameters
    ----------
    info : pandas.DataFrame
        Frame of rank labels containing a 'return Rank' column.
    rules : list
        Rules to test; each rule is a list of labels or a single string label.

    Returns
    -------
    list of lists
        The significant rules. If every rule is significant (nothing was
        pruned), only the (up to) three rules with the largest statistic are
        returned, without duplicates.
    """
    if len(rules) == 0:
        return []

    # Loop-invariant: depends only on the data, not on the rule
    dof = len(info['return Rank'].unique()) - 1

    causal = []
    chi_stats = []
    for rule in rules:
        # Single labels may arrive as (numpy) strings; treat them as 1-item rules
        if isinstance(rule, str):
            rule = [rule]
        actual_freq, ef_ind = expected_freq(info, rule)
        if ef_ind == 0:
            # No expected occurrences: the statistic is undefined, so the rule
            # cannot be called significant (and dividing would raise).
            continue
        stat = ((actual_freq - ef_ind) ** 2) / ef_ind
        if stats.chi2.cdf(stat, dof) > 0.99:
            causal.append(rule)
            chi_stats.append(stat)

    # If no rules were pruned take the 3 with the maximum chi-squared stat.
    # sorted() is stable, so ties keep their original order.
    if len(causal) == len(rules):
        by_stat = sorted(range(len(causal)), key=lambda pos: chi_stats[pos], reverse=True)
        return get_unique_elements([causal[pos] for pos in by_stat[:3]])

    return causal

# LLM

In [None]:
#!pip install --upgrade openai wandb
#!pip install requests

In [None]:
import requests
import json

In [None]:
# Set parameters for the OpenAI API. The key is taken from the OPENAI_API_KEY
# environment variable so that it never has to be saved inside the notebook;
# the placeholder is only used when that variable is not set.
API_KEY = os.environ.get('OPENAI_API_KEY', 'Put API key here')
API_ENDPOINT = "https://api.openai.com/v1/chat/completions"

In [None]:
# Function to prompt gpt-3.5 turbo
def generate_chat_completion(messages, model="gpt-3.5-turbo", temperature=1, max_tokens=None, timeout=60):
    """Send a chat-completion request and return the text of the first choice.

    Parameters
    ----------
    messages : list of dict
        Chat messages, each with "role" and "content" keys.
    model : str
        Model name sent in the request body.
    temperature : float
        Sampling temperature sent in the request body.
    max_tokens : int or None
        Upper bound on generated tokens; only sent when not None.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot block the notebook indefinitely.

    Returns
    -------
    str
        ``choices[0].message.content`` of the JSON response.

    Raises
    ------
    Exception
        When the HTTP status is not 200 (message carries status and body).
        ``requests`` exceptions (e.g. timeouts) propagate unchanged.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }

    data = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
    }
    # Previously this argument was accepted but silently ignored
    if max_tokens is not None:
        data["max_tokens"] = max_tokens

    response = requests.post(API_ENDPOINT, headers=headers, data=json.dumps(data), timeout=timeout)

    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    else:
        raise Exception(f"Error {response.status_code}: {response.text}")

In [None]:
# Translate an encoded rule label (e.g. "ROA5") into wording the LLM will understand
dic = {
    '1': 'very low',
    '2': 'low',
    "3": 'medium',
    '4': 'high',
    '5': 'very high',
}


def rephrase(rule):
    """Split a label into (quintile wording, factor name).

    The last character of ``rule`` is the quintile digit and is looked up in
    ``dic``; everything before it is the factor name.
    Example: ``"ROA5" -> ("very high", "ROA")``.
    """
    return dic[rule[-1]], rule[:-1]

In [None]:
# Get rules that survived the LLM pruning
import time
def _rule_to_question(rule):
    """Build the yes/no question that is sent to the LLM for one rule."""
    clauses = []
    for label in rule:
        quant, factor = rephrase(label)
        clauses.append('a ' + quant + ' quarterly ' + factor)
    suffix = ' significantly increase the probability of a high quarterly return for a given equity?'
    if len(clauses) == 1:
        return 'Does ' + clauses[0] + suffix
    # "Does a ..., a ... and a ... significantly increase ..."
    return 'Does ' + ', '.join(clauses[:-1]) + ' and ' + clauses[-1] + suffix


def causal_LLM(rules, max_retries=5, retry_delay=1.0):
    """Return the rules that the LLM confirms with a "yes".

    Parameters
    ----------
    rules : list
        Rules to check; each rule is a list of labels or a single string label.
    max_retries : int
        Maximum number of attempts per rule before giving up.
    retry_delay : float
        Seconds to wait before the second attempt; doubled after every
        further failure.

    Returns
    -------
    list of lists
        The rules whose answer starts with "yes" (case-insensitive, leading
        whitespace ignored).

    Raises
    ------
    Exception
        The last error from ``generate_chat_completion`` once ``max_retries``
        attempts have failed. The previous bare ``except`` retried forever,
        which never terminates on a persistent failure such as an invalid key
        and also swallowed KeyboardInterrupt.
    """
    system_message = {"role": "system", "content": "you only give one word, yes or no answers, no other answer is acceptable"}
    causal = []
    for rule in rules:
        # Single labels may arrive as (numpy) strings; treat them as 1-item rules
        if isinstance(rule, str):
            rule = [rule]
        messages = [system_message, {"role": "user", "content": _rule_to_question(rule)}]

        # Ask the LLM, retrying transient failures with exponential backoff
        response_text = None
        for attempt in range(1, max_retries + 1):
            try:
                response_text = generate_chat_completion(messages)
                break
            except Exception:
                print('error')
                if attempt >= max_retries:
                    raise
                time.sleep(retry_delay * 2 ** (attempt - 1))
        if response_text is None:
            # Only reachable when max_retries < 1: no request was made
            continue

        # Append valid rules to causal set
        if response_text.strip().lower().startswith('yes'):
            causal.append(rule)

    return causal

# Odds ratio

In [None]:
# Match exposed and unexposed rows that share an identical control profile
def get_fair_datasets(true_control, false_control):
    """Return the index labels of rows whose control values occur in both frames.

    Parameters
    ----------
    true_control, false_control : pandas.DataFrame
        Control-variable frames for the exposed and the unexposed group.

    Returns
    -------
    tuple of lists
        ``(true_match, false_match)``: for every profile found by an inner
        merge of the two frames, the index labels of the rows of
        ``true_control`` and of ``false_control`` that equal that profile,
        in merge order.
    """
    shared_profiles = pd.merge(true_control, false_control, how='inner')
    true_match, false_match = [], []
    for _, profile in shared_profiles.iterrows():
        # A row matches when every control column equals the shared profile
        in_false = (false_control == profile).all(axis=1)
        in_true = (true_control == profile).all(axis=1)
        false_match += false_control.loc[in_false].index.tolist()
        true_match += true_control.loc[in_true].index.tolist()
    return true_match, false_match

In [None]:
def get_oddsratio_CI(exposure, non_exposure, returns, z=1.96):
    """Confidence interval of the matched-pair odds ratio for a rule.

    Row ``i`` of ``exposure`` is paired with row ``i`` of ``non_exposure``.
    Only discordant pairs enter the estimate:

    * ``n12``: the exposed row has the consequent, the unexposed row does not
    * ``n21``: the unexposed row has the consequent, the exposed row does not

    The odds ratio is ``n12 / n21`` and the interval is
    ``exp(log(OR) -/+ z * sqrt(1/n12 + 1/n21))``.

    Parameters
    ----------
    exposure, non_exposure : pandas.Series
        Only their index labels are used, to look values up in ``returns``.
    returns : pandas.Series
        Boolean flag per row label: True when the row has the consequent.
    z : float
        Normal quantile applied to BOTH bounds. The default 1.96 gives a 95%
        interval and keeps the lower bound identical to the previous
        implementation, which used 1.96 below but 1.15 above.

    Returns
    -------
    tuple of float
        ``(lower_bound, upper_bound)``.
    """
    exposed_ids = list(exposure.index)
    control_ids = list(non_exposure.index)

    n12 = 0
    n21 = 0
    for position, exposed_id in enumerate(exposed_ids):
        exposed_high = bool(returns.loc[exposed_id])
        control_high = bool(returns.loc[control_ids[position]])
        if exposed_high and not control_high:
            n12 += 1
        elif control_high and not exposed_high:
            n21 += 1
        # Concordant pairs (both or neither) carry no information here

    # To ensure that you are not dividing by 0, if n12 or n21 are zero, set them to one
    if n21 == 0:
        n21 = 1
    if n12 == 0:
        n12 = 1

    # Point estimate and symmetric (in log space) confidence bounds
    log_odds_ratio = np.log(n12 / n21)
    margin = z * np.sqrt((1 / n12) + (1 / n21))
    lower_bound = np.exp(log_odds_ratio - margin)
    upper_bound = np.exp(log_odds_ratio + margin)
    return lower_bound, upper_bound

In [None]:
# Function that mines causal rules from established associations
def get_causal_rules(eg, rules, returns): 
    """Prune association rules with a matched-pair odds-ratio test.

    For every rule, rows where all antecedents are True (exposed) are matched
    with rows where all antecedents are False (unexposed) that share identical
    values on every remaining control column. A rule is kept when the lower
    bound returned by ``get_oddsratio_CI`` for those matched rows exceeds 1.

    Parameters
    ----------
    eg : pandas.DataFrame
        One-hot frame with one column per quintile label; it is compared
        against boolean values, so boolean columns are assumed.
    rules : iterable
        Rules to test; each rule is a list of labels or a single np.str_ label.
    returns : pandas.Series
        Consequent flag per row of ``eg``, indexed like ``eg`` (in this
        notebook it is the ``return5`` column of ``eg``).

    Returns
    -------
    list of lists
        Rules with lower bound > 1. If there are none, the rule(s) with the
        highest lower bound are returned instead, so the result is only empty
        when ``rules`` is empty.
    """
    # Array to store mined causal rules 
    causal_rules = []
    # Rules that failed the test, with their lower bounds, kept as a fallback
    back_up_rules = []
    back_up_ratios = []
    for rule in rules:
        # A single label may arrive as a numpy string; treat it as a 1-item rule
        if type(rule)==np.str_:
            rule = [rule]
    
        # Select the rows where every antecedent of the rule is True (exposed)
        # and the rows where every antecedent is False (unexposed)
        trues = list(np.ones(len(rule), dtype=bool)) 
        falses = list(np.zeros(len(rule), dtype=bool))
        
        true_indices = np.all(eg[rule].values == trues, axis=1)
        false_indices = np.all(eg[rule].values == falses, axis=1)
        true = eg[true_indices]
        false = eg[false_indices]

        # Remove the returns columns and columns with antecedants in question. Only the control variables remain
        remove = ["return1", "return2", "return3", "return4", 'return5'] 
        for cond in rule:
            # cond[:-1] strips the quintile digit, leaving the factor name;
            # all five quintile columns of that factor are removed
            remove.append(cond[:-1] + "1") 
            remove.append(cond[:-1] + "2") 
            remove.append(cond[:-1] + "3")
            remove.append(cond[:-1] + "4")
            remove.append(cond[:-1] + "5")

        
        true_control = true.drop(remove , axis = 1)
        false_control = false.drop(remove , axis = 1)
        # Drop duplicates from true and false control sets to ensure that there is at most one set of matching rows

        true_control = true_control.drop_duplicates(subset = true_control.columns , keep='first')
        false_control = false_control.drop_duplicates(subset = false_control.columns , keep='first')
        

        # Retrieve the row labels of the matched (fair) exposed / unexposed rows
        true_match , false_match = get_fair_datasets(true_control, false_control)
        
        
        # Getting returns columns for the rows of the fair dataset
        exposure_returns = returns[true_match]
        non_exposure_returns = returns[false_match]

        # Compute the bounds of the rule's odds ratio confidence interval;
        # only the lower bound is used
        lower, _ = get_oddsratio_CI(exposure_returns, non_exposure_returns , returns)

        # Lower bound above 1: exposure raises the odds of a top-quintile return
        if (lower > 1):  
            causal_rules.append(rule)
        else:
            back_up_rules.append(rule)
            back_up_ratios.append(lower)
            
    # Fallback: if no rule passed, return every rule that attains the highest
    # lower bound among the rejected ones
    if len(causal_rules) == 0 and len(back_up_ratios)>0: 
        high_ratios = max(back_up_ratios)
   
        for i in range(len(back_up_rules)):
            if back_up_ratios[i] == high_ratios:
                causal_rules.append(back_up_rules[i])
        
    return causal_rules

# Time the odds-ratio pruning on a single period.
# rules_for_period requires the support and confidence thresholds; the values
# used here are the same ones passed to the main simulation run.
assoc_rules, eg, ret, info = rules_for_period(uni[uni.index =='2011-03-31'], 0.05, 0.20)
start_time = time.time()
odds1 = get_causal_rules(eg, assoc_rules, ret)
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

Now that I have the causal pruning functions, I need to make a simulation that applies them at every period.

# Simulation

In [None]:
# Mine the association rules of a dataset and run all three causal prunings on them
def get_all_rule_sets(period_data, sup, conf):
    """Return the chi-squared, LLM and odds-ratio rule sets plus the raw rules.

    Parameters
    ----------
    period_data : pandas.DataFrame
        Data handed to ``rules_for_period``.
    sup, conf : float
        Minimum support and confidence for the rule mining.

    Returns
    -------
    tuple
        ``(chi, LLM, odds, assoc_rules)``. A progress line is printed after
        each pruning step.
    """
    assoc_rules, onehot, high_returns, ranked = rules_for_period(period_data, sup, conf)

    chi_rules = causal_chi(ranked, assoc_rules)
    print("chi done")

    llm_rules = causal_LLM(assoc_rules)
    print('LMM done')

    odds_rules = get_causal_rules(onehot, assoc_rules, high_returns)
    print('odds done')

    return chi_rules, llm_rules, odds_rules, assoc_rules

In [None]:
# Build the ensemble rule sets (union and intersection) of two causal rule sets
def combine_rules(rule_set1, rule_set2):
    """Return ``(or_rules, and_rules)`` for two lists of rules.

    ``or_rules`` holds every rule of either list once, in first-seen order;
    ``and_rules`` holds the rules of ``rule_set1`` that also occur in
    ``rule_set2``, in ``rule_set1`` order.
    """
    union = get_unique_elements(rule_set1 + rule_set2)
    intersection = []
    for rule in rule_set1:
        if rule in rule_set2:
            intersection.append(rule)
    return union, intersection

In [None]:
# Function to find the n most frequent items from a list
def find_n_most_frequent_items(lst, N):
    """Split the most frequent items around the count of the N-th most common one.

    Parameters
    ----------
    lst : list
        Hashable items, possibly repeated. The list is not modified.
    N : int
        Rank (1-based) of the item whose count is used as the cut-off; capped
        at the number of distinct items.

    Returns
    -------
    tuple of lists
        ``(above, tied)``: the items occurring strictly more often than the
        cut-off count, and the items occurring exactly that often.
        ``([], [])`` for an empty input.
    """
    if len(lst) == 0:
        # Nothing to rank; previously this raised IndexError
        return [], []

    # Shuffle a copy so that the order among equally frequent items is random
    # without reordering the caller's list as a side effect
    shuffled = list(lst)
    random.shuffle(shuffled)

    # (item, count) pairs, most frequent first
    ranked = Counter(shuffled).most_common()

    cutoff_position = min((len(ranked) - 1), (N - 1))
    cutoff_count = ranked[cutoff_position][1]

    above = [item for item, count in ranked if count > cutoff_count]
    tied = [item for item, count in ranked if count == cutoff_count]
    return above, tied

In [None]:
# This function, given a ruleset, picks the equities selected by those rules
def get_equities_data(rules, ranked_data, cur_data):
    """Return ``[gross mean return, portfolio size]`` for the stocks a rule set picks.

    Stocks are scored by how many rules they satisfy (a rule is satisfied when
    all of its labels appear in the stock's row of ``ranked_data``). Stocks
    scoring above the cut-off are fully included; stocks tied at the cut-off
    share the remaining weight, so the portfolio targets 25% of the universe.

    Parameters
    ----------
    rules : list
        Rules; each rule is a list of labels or a single string label.
    ranked_data : pandas.DataFrame
        Rank labels per stock, indexed by stock id.
    cur_data : pandas.DataFrame
        Current-period data with a 'return' column, indexed by stock id.

    Returns
    -------
    list
        ``[mean return + 1, number of stocks]``. When ``rules`` is empty, or
        when no stock satisfies any rule, the benchmark is returned:
        the mean over all of ``cur_data`` and ``len(cur_data)``.
    """
    # Return benchmark if ruleset is empty
    if len(rules) == 0:
        return [(cur_data['return'].mean() + 1), len(cur_data)]

    # Else pick equities that satisfy the max number of rules until portfolio size is 25% of universe
    max_stocks = round(len(ranked_data) * 0.25)
    stocks = []
    for rule in rules:
        if isinstance(rule, str):
            rule = [rule]
        matched = ranked_data[ranked_data.isin(rule)].dropna(thresh=len(rule))
        stocks += matched.index.tolist()

    # No stock satisfies any rule: fall back to the benchmark instead of
    # failing further down (previously an IndexError / undefined result)
    if len(stocks) == 0:
        return [(cur_data['return'].mean() + 1), len(cur_data)]

    stocks1, stocks2 = find_n_most_frequent_items(stocks, round(max_stocks))

    above_cutoff = cur_data[cur_data.index.isin(stocks1)]
    at_cutoff = cur_data[cur_data.index.isin(stocks2)]

    if len(stocks1) > 0 and len(stocks2) > 0:
        # Weight the fully included stocks by their share of the target size;
        # the tied group gets the remainder
        share = len(stocks1) / max_stocks
        mean_returns = (above_cutoff['return'].mean() + 1) * share + (at_cutoff['return'].mean() + 1) * (1 - share)
    elif len(stocks1) > 0:
        mean_returns = (above_cutoff['return'].mean() + 1)
    else:
        mean_returns = (at_cutoff['return'].mean() + 1)

    return [mean_returns, max_stocks]

In [None]:
# Rolling-window back-test: mine rules on each window, then invest in the following period
def simulation(data, periods, sup, conf):
    """Run the rolling simulation and collect returns and portfolio sizes.

    For every window of ``periods`` dates the rule sets are mined and pruned,
    combined into ensembles, and applied to the date right after the window.

    Parameters
    ----------
    data : pandas.DataFrame
        Universe indexed by date with 'name', 'id', factor and 'return' columns.
    periods : int
        Number of dates in each training window.
    sup, conf : float
        Minimum support and confidence for the rule mining.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(returns_df, size_df)``, one row per evaluated date and one column
        per strategy plus 'benchmark'.
    """
    dates = data.index.unique()
    column_names = ['assoc_rules','chi', 'LMM', 'odds', 'chi_or_LLM', 'chi_and_LLM','chi_or_odds', 'chi_and_odds', 'LLM_or_odds', 'LLM_and_odds', 'chi_or_odds_or_LLM', 'chi_and_odds_and_LLM', 'benchmark']
    strategy_columns = column_names[:-1]
    raw_factor_columns = [
        'Alpha', 'Market Capitalisation', 'Book-to-Market Ratio', 'EBIT',
        'Investment', 'Stock Price Volatility', 'Mean Return', 'ROA', 'ROE',
        'SGI', 'Debt-to-Equity Ratio', 'Market Risk Factor Loading',
        'SMB Factor Loading', 'HML Factor Loading', 'RMW Factor Loading',
        'CMA Factor Loading', 'return',
    ]
    # Empty result frames, filled one date at a time
    returns_df = pd.DataFrame(columns=column_names)
    size_df = pd.DataFrame(columns=column_names)

    # Iterate through the dates
    for i in range(5, len(dates)-periods):
        start_time = time.time()
        eval_date = dates[i+periods]

        # Rolling window including the evaluation date
        window_data = data[data.index.isin(dates[i:i+periods+1])]
        # Keep stocks observed on more than (periods/2)+1 dates of the window
        enough_history = window_data.groupby('name').name.transform('count') > ((periods/2)+1)
        window_data = window_data[enough_history].copy()
        # Evaluation-period rows, keyed by stock id
        current_data = window_data[window_data.index == eval_date].set_index('id')
        # Training rows: the window without the evaluation date
        window_data = window_data[window_data.index.isin(dates[i:i+periods])]

        # ARM and CRM rule sets
        chi, LLM, odds, assoc_rules = get_all_rule_sets(window_data, sup, conf)

        # Ensemble models
        chi_or_LLM, chi_and_LLM = combine_rules(chi, LLM)
        chi_or_odds, chi_and_odds = combine_rules(chi, odds)
        LLM_or_odds, LLM_and_odds = combine_rules(LLM, odds)
        chi_or_odds_or_LLM, _ = combine_rules(LLM_or_odds, chi)
        _, chi_and_odds_and_LLM = combine_rules(LLM_and_odds, chi)

        # Encode current data (the first remaining column is dropped, then ranked)
        current_data = current_data.iloc[:,1:]
        info2 = rank_factors(current_data).drop(raw_factor_columns, axis=1)

        # Evaluate every rule set, keyed by its result column
        evaluation_order = [
            ('chi', chi),
            ('LMM', LLM),
            ('odds', odds),
            ('chi_or_LLM', chi_or_LLM),
            ('chi_and_LLM', chi_and_LLM),
            ('chi_or_odds', chi_or_odds),
            ('chi_and_odds', chi_and_odds),
            ('LLM_or_odds', LLM_or_odds),
            ('LLM_and_odds', LLM_and_odds),
            ('chi_or_odds_or_LLM', chi_or_odds_or_LLM),
            ('chi_and_odds_and_LLM', chi_and_odds_and_LLM),
            ('assoc_rules', assoc_rules),
        ]
        picks = {}
        for label, rule_set in evaluation_order:
            picks[label] = get_equities_data(rule_set, info2, current_data)

        # Store results, benchmark last
        returns_df.loc[eval_date,:] = [picks[label][0] for label in strategy_columns] + [(current_data['return'].mean()+1)]
        size_df.loc[eval_date,:] = [picks[label][1] for label in strategy_columns] + [len(current_data)]

        elapsed_time = time.time() - start_time
        print(assoc_rules)
        print(chi)
        print(elapsed_time)
        print(eval_date)
    return returns_df, size_df

In [None]:
# Run the simulation on the full universe with a 12-period window,
# minimum support 0.05 and minimum confidence 0.20, and print the wall-clock
# time in seconds
start_time = time.time()
ret, siz = simulation(uni, 12, 0.05, 0.20)
end_time = time.time()
elapsed_time = end_time - start_time
print(elapsed_time)

In [None]:
# Save the simulation results to CSV. Both files keep the date index:
# siz.csv was previously written with index=False, which discarded the dates
# that the index had just been named for.
ret.index.name = 'date'
siz.index.name = 'date'
ret.to_csv('ret.csv',index=True)
siz.to_csv('siz.csv',index=True)

# Target shuffling

In [None]:
# A function to shuffle the target variables of the data
def target_shuffle(data):
    """Return a copy of ``data`` whose 'return' values are randomly permuted.

    All other columns and the index are left exactly as they are, so the link
    between the factors and the returns is broken while the distribution of
    returns is preserved. The input frame is not modified.

    The permutation is drawn directly from NumPy's global random state. The
    previous implementation picked one of only 100 integer seeds, so repeated
    runs could reuse the same shuffle, and it required the index to be named
    'date'.
    """
    shuffled = data.copy()
    shuffled['return'] = np.random.permutation(data['return'].to_numpy())
    return shuffled

In [None]:
# Function to run the simulation again but with target shuffling, this time only recording the compounded returns
def target_shuffle_simulation(data, periods, sup=0.05, conf=0.2):
    """Run the rolling simulation on return-shuffled training windows.

    Same procedure as ``simulation``, except that the returns of every
    training window are shuffled with ``target_shuffle`` before the rules are
    mined. The evaluation period keeps its true returns.

    Parameters
    ----------
    data : pandas.DataFrame
        Universe indexed by date with 'name', 'id', factor and 'return' columns.
    periods : int
        Number of dates in each training window.
    sup, conf : float
        Minimum support and confidence for the rule mining. The defaults are
        the values that used to be hard-coded here.

    Returns
    -------
    pandas.Series
        Product over all evaluated dates of each strategy's gross return.
    """
    dates = data.index.unique()
    column_names = ['assoc_rules','chi', 'LMM', 'odds', 'chi_or_LLM', 'chi_and_LLM','chi_or_odds', 'chi_and_odds', 'LLM_or_odds', 'LLM_and_odds', 'chi_or_odds_or_LLM', 'chi_and_odds_and_LLM']
    raw_factor_columns = [
        'Alpha', 'Market Capitalisation', 'Book-to-Market Ratio', 'EBIT',
        'Investment', 'Stock Price Volatility', 'Mean Return', 'ROA', 'ROE',
        'SGI', 'Debt-to-Equity Ratio', 'Market Risk Factor Loading',
        'SMB Factor Loading', 'HML Factor Loading', 'RMW Factor Loading',
        'CMA Factor Loading', 'return',
    ]
    # Empty result frame, filled one date at a time
    returns_df = pd.DataFrame(columns=column_names)

    # Iterate through periods
    for i in range(5, len(dates)-periods):
        eval_date = dates[i+periods]

        # Rolling window including the evaluation date
        window_data = data[data.index.isin(dates[i:i+periods+1])]

        # Keep stocks observed on more than (periods/2)+1 dates of the window.
        # This must be ">" as in simulation(); "==" kept only the stocks with
        # exactly that many observations.
        enough_history = window_data.groupby('name').name.transform('count') > ((periods/2)+1)
        window_data = window_data[enough_history].copy()
        # Evaluation-period rows, keyed by stock id
        current_data = window_data[window_data.index == eval_date].set_index('id')
        # Training rows (window without the evaluation date) with shuffled returns
        window_data = target_shuffle(window_data[window_data.index.isin(dates[i:i+periods])])

        # CRM and ARM rule sets for the shuffled data
        chi, LLM, odds, assoc_rules = get_all_rule_sets(window_data, sup, conf)

        # Ensemble model sets
        chi_or_LLM, chi_and_LLM = combine_rules(chi, LLM)
        chi_or_odds, chi_and_odds = combine_rules(chi, odds)
        LLM_or_odds, LLM_and_odds = combine_rules(LLM, odds)
        chi_or_odds_or_LLM, _ = combine_rules(LLM_or_odds, chi)
        _, chi_and_odds_and_LLM = combine_rules(LLM_and_odds, chi)

        # Encode current data
        current_data = current_data.iloc[:,1:]
        info2 = rank_factors(current_data).drop(raw_factor_columns, axis=1)

        # Evaluate every rule set, keyed by its result column
        evaluation_order = [
            ('chi', chi),
            ('LMM', LLM),
            ('odds', odds),
            ('chi_or_LLM', chi_or_LLM),
            ('chi_and_LLM', chi_and_LLM),
            ('chi_or_odds', chi_or_odds),
            ('chi_and_odds', chi_and_odds),
            ('LLM_or_odds', LLM_or_odds),
            ('LLM_and_odds', LLM_and_odds),
            ('chi_or_odds_or_LLM', chi_or_odds_or_LLM),
            ('chi_and_odds_and_LLM', chi_and_odds_and_LLM),
            ('assoc_rules', assoc_rules),
        ]
        picks = {}
        for label, rule_set in evaluation_order:
            picks[label] = get_equities_data(rule_set, info2, current_data)

        # Store returns in dataframe
        returns_df.loc[eval_date,:] = [picks[label][0] for label in column_names]

    return returns_df.prod(axis=0)

In [None]:
# Run the target shuffling simulation repeatedly (150 runs, set in the for loop
# below), collecting one row of compounded strategy returns per run
column_names = ['assoc_rules','chi', 'LMM', 'odds', 'chi_or_LLM', 'chi_and_LLM','chi_or_odds', 'chi_and_odds', 'LLM_or_odds', 'LLM_and_odds', 'chi_or_odds_or_LLM', 'chi_and_odds_and_LLM']
# Create an empty DataFrame with column names
target_shuffle_results = pd.DataFrame(columns=column_names)

for i in range(150):
    # Append the result of this run as the next row
    target_shuffle_results.loc[len(target_shuffle_results.index)] = target_shuffle_simulation(uni, 12)
    print(target_shuffle_results)
    # Rewrite the CSV after every run so that the completed runs are on disk
    # even if a later run fails
    target_shuffle_results.to_csv('target_shuffle_results.csv',index=False)



