In [61]:


import pandas as pd
# Load the price history from CSV; the labelling cells below read its
# `close`, `high` and `low` columns.
data = pd.read_csv("historical_data/XLE/XLE.csv")


# Label creation if only using daily data
def get_daily_volatility(df, span0=20):
    """Estimate daily volatility from close-to-close percentage returns.

    The volatility is the exponentially weighted standard deviation of the
    simple returns of ``df.close``.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with a ``close`` column.
    span0 : int, default 20
        Span handed to ``Series.ewm``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(raw, clean)``: ``raw`` is a single-column frame named
        ``volatility`` with one row per row of ``df`` (NaN where the
        estimate is undefined); ``clean`` is ``raw`` without the NaN rows.
    """
    pct_returns = df.close.pct_change()
    ewm_std = pct_returns.ewm(span=span0).std()
    volatility_raw = ewm_std.to_frame("volatility")
    volatility_clean = volatility_raw.dropna()
    return volatility_raw, volatility_clean

def adjust_data(df, volatilities_raw):
    """Return the rows of ``df`` whose raw volatility estimate is not NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    volatilities_raw : pandas.DataFrame
        Frame with a ``volatility`` column; its NaN entries mark the rows to
        drop from ``df``.
    """
    has_volatility = volatilities_raw['volatility'].notna()
    return df[has_volatility]

def get_barriers(df, volatilities, upper_lower_multipliers):
    """Build per-row profit-taking and stop-loss barriers.

    The barriers of row ``i`` (``i >= 1``) are derived from the close and the
    volatility of row ``i - 1``::

        top_barrier[i]    = close[i-1] + close[i-1] * upper_mult * vol[i-1]
        bottom_barrier[i] = close[i-1] - close[i-1] * lower_mult * vol[i-1]

    Row 0 has no previous row and receives the placeholder value 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with ``close``, ``high`` and ``low`` columns.
    volatilities : pandas.DataFrame
        Frame with a ``volatility`` column. Values are read by position for
        the barrier arithmetic and copied (index-aligned) into the result.
    upper_lower_multipliers : sequence of two numbers
        ``(upper, lower)`` volatility multipliers. A multiplier that is not
        strictly positive disables that barrier: its column is all NaN.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df[['close', 'high', 'low']]`` with added ``volatility``,
        ``top_barrier`` and ``bottom_barrier`` columns.

    Raises
    ------
    ValueError
        If ``volatilities`` has fewer rows than are needed to derive the
        barriers.
    """
    barriers = df[['close','high','low']].copy()
    barriers['volatility'] = volatilities['volatility']

    n_rows = len(barriers)
    n_prev = max(n_rows - 1, 0)  # rows that act as "previous day" for a barrier
    if len(volatilities) < n_prev:
        raise ValueError("volatilities has fewer rows than needed to build the barriers")

    prev_close = barriers['close'].to_numpy(dtype=float)[:n_prev]
    prev_vol = volatilities['volatility'].to_numpy(dtype=float)[:n_prev]

    def _barrier_values(multiplier, upward):
        # A disabled barrier is all NaN, so comparisons against it are never true.
        if not multiplier > 0:
            return [float('nan')] * n_rows
        move = prev_close * multiplier * prev_vol
        levels = prev_close + move if upward else prev_close - move
        # Leading 0 is the placeholder for the first row; the slice keeps an
        # empty frame empty.
        return ([0.0] + levels.tolist())[:n_rows]

    barriers['top_barrier'] = _barrier_values(upper_lower_multipliers[0], True)
    barriers['bottom_barrier'] = _barrier_values(upper_lower_multipliers[1], False)
    return barriers

def get_labels_daily(df, upper_lower_multipliers=(2,2)):
    """Label every row of ``df`` by what the next usable row does.

    For each row that has a volatility estimate, the label describes the
    following such row:

    * ``1``  - its high reaches the top barrier (profit-taking limit),
    * ``-1`` - otherwise, its low reaches the bottom barrier (stop-loss limit),
    * ``0``  - neither barrier is reached.

    When both barriers are reached on the same row, the top barrier wins.
    Rows without a volatility estimate, and the last row (which has no
    following row), are labelled ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Price data with ``close``, ``high`` and ``low`` columns.
    upper_lower_multipliers : sequence of two numbers, default (2, 2)
        ``(upper, lower)`` volatility multipliers passed to ``get_barriers``.

    Returns
    -------
    list of int
        One label per row of ``df``, in row order.
    """
    daily_volatility_raw, daily_volatility_clean = get_daily_volatility(df)

    # Start from "no event" everywhere, so the result always has one entry per
    # input row regardless of how many rows lack a volatility estimate.
    labels = [0] * len(df)
    if len(daily_volatility_clean) == 0:
        return labels

    # Positions (in the original frame) of the rows that survive the cleaning.
    has_volatility = daily_volatility_raw['volatility'].notna().to_numpy()
    kept_positions = [pos for pos, keep in enumerate(has_volatility) if keep]

    df_clean = adjust_data(df, daily_volatility_raw)
    barriers_df = get_barriers(df = df_clean, volatilities = daily_volatility_clean, upper_lower_multipliers = upper_lower_multipliers)

    # Comparisons against a NaN (disabled) barrier are False, i.e. "not hit".
    hit_top = barriers_df['high'].to_numpy() >= barriers_df['top_barrier'].to_numpy()
    hit_bottom = barriers_df['low'].to_numpy() <= barriers_df['bottom_barrier'].to_numpy()

    # Row 0 of barriers_df only holds placeholder barriers and is never tested.
    for row in range(1, len(barriers_df)):
        if hit_top[row]:
            outcome = 1
        elif hit_bottom[row]:
            outcome = -1
        else:
            outcome = 0
        # The outcome observed on `row` is the label of the previous kept row.
        labels[kept_positions[row - 1]] = outcome

    return labels

# Triple-barrier label for every row of `data` (default 2x volatility barriers).
labels_triple_barrier = get_labels_daily(data)
cols =data.columns
# Copy the frame instead of rebuilding it from `data.values`: the mixed-type
# value array is dtype `object`, which would strip the numeric dtypes from
# every column. The index is reset to keep the fresh 0..n-1 index.
barriers_df = data.copy().reset_index(drop=True)
barriers_df['tb_label']=labels_triple_barrier

In [62]:
# Display the price frame with the triple-barrier labels attached.
barriers_df

Unnamed: 0.1,Unnamed: 0,timestamp,open,high,low,close,volume,tb_label
0,0,1998-12-22,13.527403,13.572736,13.45487,13.500203,15200,0
1,1,1998-12-23,13.545537,13.781269,13.527404,13.781269,67800,0
2,2,1998-12-24,13.817541,13.817541,13.690608,13.708741,12300,0
3,3,1998-12-28,13.781271,13.781271,13.545538,13.636205,13500,0
4,4,1998-12-29,13.672469,13.772202,13.545536,13.772202,22000,0
...,...,...,...,...,...,...,...,...
5725,5725,2021-09-23,49.080002,50.669998,48.849998,50.52,24867100,0
5726,5726,2021-09-24,50.25,51.16,50.150002,50.900002,27748900,0
5727,5727,2021-09-27,52.02,52.919998,51.950001,52.709999,40848700,0
5728,5728,2021-09-28,53.209999,53.830002,52.77,52.889999,66467100,0


In [63]:
# Display the frame as loaded from the CSV, for comparison with barriers_df.
data

Unnamed: 0.1,Unnamed: 0,timestamp,open,high,low,close,volume
0,0,1998-12-22,13.527403,13.572736,13.454870,13.500203,15200
1,1,1998-12-23,13.545537,13.781269,13.527404,13.781269,67800
2,2,1998-12-24,13.817541,13.817541,13.690608,13.708741,12300
3,3,1998-12-28,13.781271,13.781271,13.545538,13.636205,13500
4,4,1998-12-29,13.672469,13.772202,13.545536,13.772202,22000
...,...,...,...,...,...,...,...
5725,5725,2021-09-23,49.080002,50.669998,48.849998,50.520000,24867100
5726,5726,2021-09-24,50.250000,51.160000,50.150002,50.900002,27748900
5727,5727,2021-09-27,52.020000,52.919998,51.950001,52.709999,40848700
5728,5728,2021-09-28,53.209999,53.830002,52.770000,52.889999,66467100


In [8]:
# Print the rows of each triple-barrier label group.
# The label column is named 'tb_label', and the grouping is over rows (the
# default axis), not over columns.
grouped_df=barriers_df.groupby(by='tb_label')
for key, group in grouped_df:
    print(group, "\n\n")


In [48]:
# Number of rows per triple-barrier label, printed in the order 0, 1, -1.
# The column holding these labels is 'tb_label'.
for label_value in (0, 1, -1):
    print(barriers_df[barriers_df['tb_label']==label_value].shape[0])

5061
283
384


In [47]:
# Number of rows per value of the 'labels2' column, printed in the order 0, 1, -1.
for label_value in (0, 1, -1):
    matching_rows = barriers_df[barriers_df['labels2']==label_value]
    print(matching_rows.shape[0])

5053
334
341


In [14]:
# (rows, columns) of the loaded price frame.
data.shape

(5730, 7)

In [18]:
from tqdm.auto import tqdm
import numpy as np

def create_labels(df, col_name, window_size=11):
    """
    Label each row by whether it is the extreme of the window centred on it.

    Every run of `window_size` consecutive rows is inspected; the row in the
    middle of the window (offset (window_size - 1) // 2) is labelled.

    Label code : BUY => 1   (middle row holds the first minimum of the window)
                 SELL => -1 (middle row holds the first maximum of the window)
                 HOLD => 0  (anything else)
    SELL takes precedence when both conditions hold. NaN prices are ignored
    when locating the extremes.

    params :
        df => Dataframe with data
        col_name => name of column which should be used to determine strategy
        window_size => number of consecutive rows per window (positive integer)

    returns : numpy float array with one entry per row of df. Rows that are
              never the middle of a full window (the first
              (window_size - 1) // 2 rows and the last window_size // 2 rows)
              stay NaN.

    raises : ValueError if window_size is smaller than 1
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    print("creating label with bazel's strategy")
    # Pull the column out once; no per-row DataFrame access and therefore no
    # progress bar is needed.
    prices = df[col_name].to_numpy(dtype=float)
    number_of_days_in_File = len(prices)
    labels = np.full(number_of_days_in_File, np.nan)
    print("Calculating labels")

    # Not enough rows for a single full window: nothing can be labelled.
    if number_of_days_in_File < window_size:
        return labels

    # One row per window, starting with the window that begins at row 0.
    windows = np.lib.stride_tricks.sliding_window_view(prices, window_size)
    missing = np.isnan(windows)
    has_data = ~missing.all(axis=1)

    # argmax/argmin return the first occurrence, matching a strict ">" / "<"
    # scan; NaNs are masked so they can never be picked as an extreme.
    max_offset = np.where(missing, -np.inf, windows).argmax(axis=1)
    min_offset = np.where(missing, np.inf, windows).argmin(axis=1)

    middle_offset = (window_size - 1) // 2
    window_labels = np.zeros(len(windows))  # HOLD
    window_labels[has_data & (min_offset == middle_offset)] = 1  # BUY
    # Assigned last so that SELL wins when both conditions hold.
    window_labels[has_data & (max_offset == middle_offset)] = -1  # SELL

    labels[middle_offset:middle_offset + len(windows)] = window_labels
    return labels
# Window-based labels computed from the closing price (default window size).
labels = create_labels(data, 'close')

creating label with bazel's strategy
Calculating labels


  0%|          | 0/5730 [00:00<?, ?it/s]

In [20]:
# Quick look at the label array; rows that were never labelled show as nan.
print(labels)

[nan nan nan ... nan nan nan]


In [52]:
# Wrap the window labels in a Series and treat unlabelled (NaN) rows as 0.
l = pd.Series(labels).fillna(0)


In [64]:
# (rows, columns) of the loaded price frame.
data.shape

(5730, 7)

In [65]:
# Attach the NaN-filled window labels as column 'labels2' (aligned on the index).
barriers_df['labels2']=l

In [46]:
# Shape of the subset where the triple-barrier label is -1 but 'labels2' is 1.
# The triple-barrier column is named 'tb_label'.
barriers_df[(barriers_df['tb_label']==-1) & (barriers_df['labels2']==1)].shape

(2, 8)

In [66]:
# Display the frame with both label columns ('tb_label' and 'labels2').
barriers_df

Unnamed: 0.1,Unnamed: 0,timestamp,open,high,low,close,volume,tb_label,labels2
0,0,1998-12-22,13.527403,13.572736,13.45487,13.500203,15200,0,0.0
1,1,1998-12-23,13.545537,13.781269,13.527404,13.781269,67800,0,0.0
2,2,1998-12-24,13.817541,13.817541,13.690608,13.708741,12300,0,0.0
3,3,1998-12-28,13.781271,13.781271,13.545538,13.636205,13500,0,0.0
4,4,1998-12-29,13.672469,13.772202,13.545536,13.772202,22000,0,0.0
...,...,...,...,...,...,...,...,...,...
5725,5725,2021-09-23,49.080002,50.669998,48.849998,50.52,24867100,0,0.0
5726,5726,2021-09-24,50.25,51.16,50.150002,50.900002,27748900,0,0.0
5727,5727,2021-09-27,52.02,52.919998,51.950001,52.709999,40848700,0,0.0
5728,5728,2021-09-28,53.209999,53.830002,52.77,52.889999,66467100,0,0.0
