In [None]:
import mlfinlab as mlfin
import pandas as pd
import pandas_datareader as pdr
import pandas_profiling as pf
import numpy as np
from yahoo_finance import Share
from datetime import datetime, timedelta, date
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
import pywt
from ta.momentum import RSIIndicator, StochasticOscillator, WilliamsRIndicator, ROCIndicator
from ta.trend import MACD, ADXIndicator
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, cross_val_score
from xgboost import XGBClassifier, plot_importance, plot_tree
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.validators.scatter.marker import SymbolValidator
import ipywidgets as widgets
from ipywidgets import interact, HBox, Label
import plotly.graph_objs as go
import warnings
import sys
import os

# Enable plotly's offline rendering inside the notebook.
init_notebook_mode(connected=True)

# NOTE(review): hard-coded, machine-specific Windows path appended to PATH
# (presumably so Graphviz is found by xgboost's plot_tree — confirm); re-running this cell appends it again.
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'
# Global seaborn look: notebook context with larger titles/labels, white background.
sns.set_context('notebook', rc={"axes.titlesize":14,"axes.labelsize":13})
sns.set_style('white')
%matplotlib inline
# Colour palette shared by the matplotlib/seaborn plots (via set_palette) and the plotly figure (via colorway).
bbva = ['#004481','#2DCCCD', '#D8BE75','#1973B8', '#5BBEFF', '#F7893B', '#02A5A5', '#48AE64', '#F8CD51', '#F78BE8'];
sns.set_palette(bbva);
# Last expression of the cell: displays the palette swatches.
sns.color_palette(bbva)

# Data Loading & EDA 

In [None]:
def load_data(data='countries', data_type='News_Social', asset_code_id='US'):
    """
    Load the MarketPsych sentiment indicators (Thomson Reuters) for one asset.

    The bulk export ``*_CARGA_INICIAL.csv`` is read from the current working
    directory and truncated at 2020-03-30. Every other entry of the working
    directory whose name contains the dimension code ('COU', 'CUR' or 'CMPNY')
    is read as an incremental extract, its columns are harmonised with the bulk
    schema, and it is appended to the bulk data.

    Parameters
    ----------
    data: str
        One of 'countries', 'companies', 'currencies'.
    data_type: str
        Value matched against the ``data_type`` column,
        e.g. 'News', 'Social', 'News_Social'.
    asset_code_id: str
        Value matched against the ``asset_code_id`` column
        (the notebook uses 'US', 'USD' and 'MPTRXUS500').

    Return
    ------
    Pandas dataframe indexed by ``Date`` (datetime), sorted by date, restricted
    to rows on or after 2000-01-01, with the identifier/audit columns removed
    and every remaining column suffixed with '_USA', '_USD' or '_US500'.

    Raises
    ------
    ValueError
        If `data` is not one of the supported groups.
    """
    # data group -> (file-name marker, bulk export file, column suffix)
    sources = {
        'countries': ('COU', 'COU_CARGA_INICIAL.csv', '_USA'),
        'currencies': ('CUR', 'CUR_CARGA_INICIAL.csv', '_USD'),
        'companies': ('CMPNY', 'CMPNY_GRP_CARGA_INICIAL.csv', '_US500'),
    }
    if data not in sources:
        raise ValueError("data must be one of {}, got {!r}".format(sorted(sources), data))
    dim, file, suffix = sources[data]

    # Sorted so the append order does not depend on the OS directory listing order.
    dirs = sorted(dire_x for dire_x in os.listdir('./') if dim in dire_x)
    dataset = pd.read_csv(file, sep=';')
    dataset = dataset[dataset.date <= '2020-03-30']
    base_columns = dataset.columns

    # Bulk-schema column -> column carrying the same information in the monthly extracts.
    fallback_columns = {
        'asset_code_id': 'assetCode',
        'data_type': 'dataType',
        'id_refinitiv': 'id',
        'system_version': 'systemVersion',
    }
    # Audit columns that only exist in the bulk export; filled with a placeholder.
    audit_columns = ('date_audit_laod', 'process_audit_load')

    frames = [dataset]
    for dire in dirs:
        if dire == file:
            continue
        # Monthly extracts are tab-separated; fall back to ';' when nothing was split.
        new_month = pd.read_csv(dire, delimiter='\t')
        if len(new_month.columns) == 1:
            new_month = pd.read_csv(dire, delimiter=';')
        if 'date' not in new_month.columns:
            # The date is taken from characters 3..12 of the record id.
            new_month['date'] = new_month.id.apply(lambda x: x[3:13])
        for target, source in fallback_columns.items():
            if target not in new_month.columns:
                new_month[target] = new_month[source]
        for col in audit_columns:
            if col not in new_month.columns:
                new_month[col] = 'NA'
        frames.append(new_month[base_columns])

    # Single concat instead of growing the frame inside the loop.
    dataset = pd.concat(frames, ignore_index=True)

    selected = ((dataset.data_type == data_type) &
                (dataset.asset_code_id == asset_code_id) &
                (dataset.date >= '2000-01-01'))
    dataset = dataset[selected].sort_values(by='date')

    # Non in-place operations: the filtered frame is never mutated through a slice.
    dataset = dataset.assign(Date=pd.to_datetime(dataset.date)).set_index('Date')
    dataset = dataset.drop(['date', 'asset_code_id', 'data_type', 'id_refinitiv',
                            'system_version', 'date_audit_laod', 'process_audit_load'], axis=1)
    dataset.columns = [col + suffix for col in dataset.columns]

    return dataset

In [None]:
# Countries info
countries = load_data(data='countries', data_type='News_Social', asset_code_id='US')

In [None]:
countries.describe()

In [None]:
# Currencies info
currencies = load_data(data='currencies', data_type='News_Social', asset_code_id='USD')

In [None]:
currencies.describe()

In [None]:
companies = load_data(data='companies', data_type='News_Social', asset_code_id='MPTRXUS500')

# Market Data

In [None]:
# Download window for the index; later on we will adapt dates to the sentiment data loaded before
start_date = datetime(1999, 11, 30)
end_date = datetime(2020, 8, 31)

# S&P 500 (^GSPC) from Yahoo Finance — requires network access
sp500_yahoo = pdr.get_data_yahoo(symbols='^GSPC', start=start_date, end=end_date)
# Reindex to a daily calendar; the inserted days are left as NaN (method=None) so that there are no date jumps
sp500_yahoo = sp500_yahoo.asfreq('D', method=None)
display(sp500_yahoo.head())

# Adding return and volatility just for plotting (as this has to be calculated separately in train and test)
# original_columns remembers the downloaded columns so the plotting-only columns can be discarded later
original_columns = list(sp500_yahoo.columns)
original_columns.remove('Volume') # unreliable volume data. It will not be used
# NOTE(review): these are computed on the calendar-day frame, i.e. before weekends are removed below —
# confirm this is acceptable for the plots.
sp500_yahoo['Daily Return'] = sp500_yahoo['Adj Close'].pct_change(periods=1)*100
sp500_yahoo['Daily Volatility'] = sp500_yahoo['Daily Return'].ewm(span=30).std() # exponential moving std
sp500_yahoo['Daily Expected Return'] = sp500_yahoo['Daily Return'].ewm(span=30).mean()

def daterange(start_date, end_date):
    """Yield every day from start_date to end_date (both inclusive), one day apart.

    Nothing is yielded when end_date lies at least one full day before start_date.
    """
    total_days = int((end_date - start_date).days) + 1
    yield from (start_date + timedelta(days=offset) for offset in range(total_days))

# ISO weekday numbers of Saturday and Sunday
weekend = [6, 7]
# 'YYYY-MM-DD' strings of every Monday-Friday date in the download window
weekdays = []
for dt in daterange(start_date, end_date):
    if dt.isoweekday() in weekend:
        continue
    weekdays.append(dt.strftime('%Y-%m-%d'))

# Markets are closed during weekends, so only the weekday rows are kept
sp500_yahoo = sp500_yahoo.loc[sp500_yahoo.index.isin(weekdays)]

In [None]:
# Plotting price, volatility and return
# Figure 1: daily volatility (left axis) against the S&P 500 close price (right axis)
fig, ax1 = plt.subplots()

ax1.set_xlabel('Date')
ax1.set_ylabel('Daily Volatility [%]')
ax1.plot(sp500_yahoo['Daily Volatility'], label='Daily Volatility', color='lightgrey')
ax1.legend(loc='upper left')

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

ax2.set_ylabel('S&P 500 Close Price')  # we already handled the x-label with ax1
ax2.plot(sp500_yahoo['Close'], color=bbva[0], label='S&P 500 Close Price')
ax2.legend(loc='upper right')

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

# Figure 2: daily return with its exponentially weighted mean and a +/- one-volatility band around that mean
_, ax = plt.subplots()
ax.plot(sp500_yahoo['Daily Return'], color=bbva[0], label='Daily Return');
# The upper band is drawn without a label so the legend shows a single 'Volatility Bands' entry
ax.plot(sp500_yahoo['Daily Expected Return'] + sp500_yahoo['Daily Volatility'], color='lightgrey')
ax.plot(sp500_yahoo['Daily Expected Return'] - sp500_yahoo['Daily Volatility'], color='lightgrey', label='Volatility Bands')
ax.plot(sp500_yahoo['Daily Expected Return'], color=bbva[1], label='Daily Expected Return')
ax.legend(loc='upper right')
ax.set_ylabel('Daily Return [%]');
ax.set_xlabel('Date');
#plt.xticks(rotation=30)

In [None]:
# Calculating events with cusum filter (just for plotting)
# cusum_events is later used both as x values and to index sp500_yahoo['Adj Close'] in the interactive figure
cusum_events = mlfin.filters.cusum_filter(sp500_yahoo['Adj Close'], 
                                          threshold=0.1) #threshold abs(change)

# interactive plot
# NOTE(review): this silences every warning for the rest of the kernel session, not only for this plot
warnings.filterwarnings("ignore")

# creating widgets
# 'dependent' selects which S&P 500 column is drawn on the secondary axis
dependent=widgets.Select(options=['Close', 'Daily Return', 'Daily Volatility'],
                         value='Close', description='View', disabled=False)
# 'dataframe' selects which sentiment frame (and therefore which of the three dropdowns) is used
dataframe=widgets.RadioButtons(options=['Companies', 'Countries', 'Currencies'], 
                               value='Companies', description='Indices', disabled=False)
# One dropdown per sentiment frame; the default values must exist as columns of the respective frame
sentiment1=widgets.Dropdown(options=companies.columns,
                            value='sentiment_US500', description='Comp. Value', disabled=False)
sentiment2=widgets.Dropdown(options=countries.columns,
                            value='stockIndexSentiment_USA', description='Count. Value', disabled=False)
sentiment3=widgets.Dropdown(options=currencies.columns,
                            value='sentiment_USD', description='Curr. Value',
                            disabled=False, layout={'positioning': 'right'})

# setting the ui for our widgets: view selector | frame selector | stacked dropdowns
ui = widgets.HBox([dependent, dataframe, widgets.VBox([sentiment1, sentiment2, sentiment3])])

#@interact
def plot_sentiment_index(dependent, dataframe, sentiment1, sentiment2, sentiment3):
    """
    Draw the interactive S&P 500 vs sentiment-index figure.

    Reads the notebook-level frames `sp500_yahoo`, `companies`, `countries`,
    `currencies`, the `cusum_events` index and the `bbva` palette.

    Parameters
    ----------
    dependent: str
        Column of `sp500_yahoo` plotted on the secondary y axis.
    dataframe: str
        One of 'Companies', 'Countries', 'Currencies'; selects the sentiment frame.
    sentiment1, sentiment2, sentiment3: str
        Sentiment column chosen for companies, countries and currencies
        respectively; only the one matching `dataframe` is plotted.

    Raises
    ------
    ValueError
        If `dataframe` is not one of the three supported values.
    """
    sources = {
        'Companies': (sentiment1, companies),
        'Countries': (sentiment2, countries),
        'Currencies': (sentiment3, currencies),
    }
    if dataframe not in sources:
        raise ValueError("dataframe must be one of {}, got {!r}".format(sorted(sources), dataframe))
    sentiment, df = sources[dataframe]

    figura = make_subplots(specs=[[{"secondary_y": True}]])

    # Market series on the secondary axis. Gaps are forward-filled; the previous
    # .fillna('ffill') replaced NaN with the literal string 'ffill'.
    figura.add_trace(go.Scatter(y=sp500_yahoo[dependent].ffill(),
                                x=sp500_yahoo.index,
                                mode='lines',
                                name='S&P 500 '+ dependent),
                     secondary_y=True,)

    # Raw sentiment index, hidden until enabled from the legend.
    figura.add_trace(
        go.Scatter(y=df[sentiment],
                   x=df.index,
                   mode='lines',
                   name=sentiment + ' Index',
                   visible='legendonly'),
                   secondary_y=False,)

    # Exponentially weighted means: (legend label, span in days, hidden by default?)
    ewma_traces = (('1y', 365, False), ('6m', 180, False), ('3m', 90, False), ('1m', 30, True))
    for label, span, hidden in ewma_traces:
        visibility = {'visible': 'legendonly'} if hidden else {}
        figura.add_trace(
            go.Scatter(y=df[sentiment].ewm(span=span).mean(),
                       x=df.index,
                       mode='lines',
                       name='EWMA {} '.format(label) + sentiment[:5] + '. Index',
                       **visibility),
                       secondary_y=False,)

    figura.add_trace(go.Scatter(y=sp500_yahoo['Adj Close'][cusum_events],
                                x=cusum_events,
                                mode='markers',
                                name='S&P 500 Index CUSUM Events'),
                     secondary_y=True,)

    # Reference events drawn as markers at y=0 on the sentiment axis.
    landmark_events = (
        ('2001-09-11', 'Sept 11 Attacks'),
        ('2002-10-09', 'Dot-Com Bubble Burst'),
        ('2008-09-15', 'Lehman Brothers Collapse'),
        ('2018-12-22', 'U.S. Federal Government Shutdown'),
        ('2020-01-20', '1st COVID-19 Case USA'),
    )
    for event_date, event_name in landmark_events:
        figura.add_trace(
            go.Scatter(y=[0],
                       x=[event_date],
                       mode='markers',
                       name=event_name,
                       marker=dict(size=15),
                       marker_symbol=17),
                       secondary_y=False,)

    figura.update_layout(
        title_text='S&P 500 Index vs Sentiment Indices | Indicator: {}'.format(sentiment),
        colorway = bbva)

    figura.update_yaxes(title_text="<b>Sentiment Index</b>", secondary_y=False)
    figura.update_yaxes(title_text="<b>S&P 500 Close Price</b>", secondary_y=True)

    # Range slider plus quick-zoom buttons on the shared x axis.
    figura.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            font=dict(color="black"),
            buttons=[
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(count=3, label="3y", step="year", stepmode="backward"),
                dict(count=5, label="5y", step="year", stepmode="backward"),
                dict(step="all"),
            ]
        )
    )

    figura.update_layout(template='simple_white', hovermode='x')
    iplot(figura)
    
# Bind each widget to the plot_sentiment_index argument of the same name, then show controls and figure together
out = widgets.interactive_output(plot_sentiment_index, {'dependent': dependent, 'dataframe': dataframe, 
                                                        'sentiment1': sentiment1, 'sentiment2': sentiment2,
                                                        'sentiment3': sentiment3})
display(ui, out)

# Data Preparation 

In [None]:
# We'll only take the original columns (remember that we created three new ones just for plotting)
sp500_yahoo = sp500_yahoo[original_columns]

In [None]:
# Merging the datasets
# The three frames are joined on their Date index (column names are already disambiguated by suffix)
sentiments = companies.merge(currencies.merge(countries, 
                                              left_index=True, right_index=True), left_index=True, right_index=True)
# NOTE(review): drop_duplicates compares row values, not the index, so two dates with identical
# values would be collapsed as well — confirm this is the intent.
sentiments.drop_duplicates(inplace=True)
sentiments.head()

In [None]:
# We'll calculate a weighted average on Mondays adding info from the weekend.
# Collect the 'YYYY-MM-DD' strings of every Monday (1), Saturday (6) and Sunday (7) in the window.
mondays_weekends = [
    dt.strftime('%Y-%m-%d')
    for dt in daterange(start_date, end_date)
    if dt.isoweekday() in [1, 6, 7]
]

# Sentiment rows restricted to those dates; dates absent from `sentiments` become NaN rows
senti_mon = sentiments.reindex(pd.DatetimeIndex(mondays_weekends))

In [None]:
# First day was Saturday (this will be useful for allocating weights)
senti_mon.index[0]

In [None]:
# Continuation:
# senti_mon holds only Saturdays, Sundays and Mondays in chronological order and starts on a Saturday,
# so the 3-row window ending on a Monday covers (Sat, Sun, Mon).
# We give Mondays the largest weight (mon 2/3, sun 2/9, sat 1/9); the weights below are rounded versions.
# Windows ending on a Saturday or Sunday are computed too, but those rows are deleted further down.
for col in senti_mon.columns:
    senti_mon[col] = \
        senti_mon[col].rolling(3).apply(lambda x: np.average(x, weights=[0.12, 0.22, 0.66])) 
        
# Substituting the Monday/weekend rows of sentiments with the weighted values from senti_mon
sentiments[sentiments.index.isin(mondays_weekends)] = senti_mon

del senti_mon

# Deleting weekends
sentiments = sentiments[sentiments.index.isin(weekdays)]
sentiments.head()

# Selecting train and test periods

In [None]:
# Splitting into train and test sets (chronological split, both bounds inclusive)

train_start, train_end = '2001-08-31', '2016-08-31'
test_start, test_end = '2016-09-01', '2020-08-31'

# Explicit copies: the four frames are modified in place further down (column
# assignment, drop/fillna/interpolate with inplace=True). Working on copies
# guarantees `sentiments` and `sp500_yahoo` stay untouched and avoids
# chained-assignment (SettingWithCopy) ambiguity on slices.
sentiments_test = sentiments.loc[test_start:test_end].copy()
sentiments_train = sentiments.loc[train_start:train_end].copy()
sp500_test = sp500_yahoo.loc[test_start:test_end].copy()
sp500_train = sp500_yahoo.loc[train_start:train_end].copy()

In [None]:
# Share of rows (from train_start onwards) that fall in the train and in the test period
print(format(len(sp500_train) / len(sp500_yahoo[train_start:]), '.0%'))
print(format(len(sp500_test) / len(sp500_yahoo[train_start:]), '.0%'))

# Missing values

In [None]:
# We'll plot the rate of variables with missing values over time
# Two helper columns are added for the plots and removed again at the end of the cell
sentiments_train['year'] = sentiments_train.index.year
sentiments_train['missing'] = sentiments_train.isnull().sum(axis=1)

# Figure 1: per year, 1 - (non-null count of a column / non-null count of 'bondBuzz_USA'),
# taking the worst column (min count) and the average over columns.
# NOTE(review): 'bondBuzz_USA' is used as the per-year row count, which presumes it has no missing values — confirm.
((1-(sentiments_train.groupby('year').count().min(axis=1)/
    sentiments_train.groupby('year').count()['bondBuzz_USA']))*100).plot(legend=False, marker='o');
((1-(sentiments_train.groupby('year').count().mean(axis=1)/
    sentiments_train.groupby('year').count()['bondBuzz_USA']))*100).plot(legend=False, marker='o');
plt.title('Max. and mean missing-value rates per year');
plt.ylabel('Missing rate [%]');
plt.xlabel('Date');

# Figure 2: per year, the maximum and the mean of the per-row number of missing variables
plt.figure()
plt.plot(sentiments_train.groupby('year').max().index, 
         list(sentiments_train.groupby('year').max()['missing']), marker='o');
plt.plot(sentiments_train.groupby('year').mean().index, 
         list(sentiments_train.groupby('year').mean()['missing']), marker='o');
plt.title('Max. and mean number of variables with missing values per year');
plt.ylabel('Number of variables');
plt.xlabel('Date');

# Remove the helper columns so they do not leak into the feature set
sentiments_train.drop(['year', 'missing'], axis=1, inplace=True)

# y axis represents the percentage of variables with missing values for every year 
# disclaimer: these are not missing value rates, see section Data Loading for checking those

In [None]:
# Dropping cols whose missing rate (measured on the train set only) is 20% or more
missing_rate = (sentiments_train.isnull().sum() / len(sentiments_train))*100
drop_cols = missing_rate[missing_rate >= 20].index.tolist()
print(drop_cols)
# The same columns are removed from both sets so that train and test keep identical features
sentiments_train.drop(columns=drop_cols, inplace=True)
sentiments_test.drop(columns=drop_cols, inplace=True)

In [None]:
# Imputing missing values for the rest of variables:
# each gap takes the last observed value (forward fill), since we're dealing with news.
# Train and test are filled separately, so no value crosses the split.

sentiments_train.ffill(inplace=True)
sentiments_test.ffill(inplace=True)

In [None]:
#msno.matrix(sp500_yahoo[['Close']])

# We'll interpolate over the days where there is no data (be it due to the closing of markets on holidays or
# just because of an absence of data due to system errors).
# Train and test are interpolated separately, each with a cubic spline applied in place.
# NOTE(review): a spline fitted over the whole series presumably uses observations after each gap
# as well as before it — confirm this is acceptable with respect to look-ahead.

sp500_train.interpolate(method='spline', order=3, limit_direction='forward', axis=0, inplace=True) 
sp500_test.interpolate(method='spline', order=3, limit_direction='forward', axis=0, inplace=True) 

# Transformations

In [None]:
# We'll apply Yeo-Johnson to balance dispersion amongst variables.
# In this chunk it is only illustrated on one sentiment variable; later on we'll apply it to the financial variables.
# Example: normal probability plot of buzz_USD before and after the transformation
fig = plt.figure()
ax1 = fig.add_subplot(211)
x = currencies.buzz_USD
prob = stats.probplot(x, dist=stats.norm, plot=ax1)
ax1.set_xlabel('')
ax1.set_title('Probplot against normal distribution')

ax2 = fig.add_subplot(212)
# With no lambda given, yeojohnson estimates it from the data and returns (transformed values, lambda)
xt, l = stats.yeojohnson(x)
prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
# Title fixed: the transformation applied above is Yeo-Johnson, not Box-Cox
ax2.set_title('Probplot after Yeo-Johnson transformation')
plt.show()
print('Lambda: ', l)

# Histograms of the original (top) and transformed (bottom) variable
fig, ax = plt.subplots(2,1)
ax[0].hist(x, bins=30);
ax[1].hist(xt, bins=30);
ax[0].set_title('Original and transformed variable');

In [None]:
# We use the same lambdas for test dataset

def apply_transformation(data_train, data_test, transformation):
    """
    Applies dispersion and scale transformations on data split in train and test.

    Rows containing missing values are dropped from both sets first. Every
    parameter (Yeo-Johnson lambdas, scaler mean/std) is fitted on the train set
    only and then reused on the test set. The input frames are not modified.

    Parameters
    ----------
    data_train: pandas dataframe
        Train set.
    data_test: pandas dataframe
        Test set (same columns as the train set).
    transformation: str
        One of 'dispersion' (Yeo-Johnson), 'scale' (standardization)
        or 'dispersion_and_scale' (Yeo-Johnson followed by standardization).

    Returns
    -------
    Tuple with train set and test set. When scaling is applied the index of
    both frames is named 'Date'.

    Raises
    ------
    ValueError
        If `transformation` is not one of the supported values.
    """
    valid = ('dispersion', 'scale', 'dispersion_and_scale')
    if transformation not in valid:
        raise ValueError("transformation must be one of {}, got {!r}".format(valid, transformation))

    # Explicit copies so the column assignments below never touch the caller's frames.
    data_train = data_train.dropna().copy()
    data_test = data_test.dropna().copy()

    if transformation in ('dispersion', 'dispersion_and_scale'):
        _fit_apply_yeojohnson(data_train, data_test)

    if transformation in ('scale', 'dispersion_and_scale'):
        data_train, data_test = _fit_apply_standard_scaler(data_train, data_test)

    return data_train, data_test


def _fit_apply_yeojohnson(data_train, data_test):
    """Fit one Yeo-Johnson lambda per column on train and reuse it on test (modifies both frames in place)."""
    for col in data_train.columns:
        data_train[col], fitted_lambda = stats.yeojohnson(data_train[col])
        data_test[col] = stats.yeojohnson(data_test[col], fitted_lambda)


def _fit_apply_standard_scaler(data_train, data_test):
    """Standardize both frames with a StandardScaler fitted on train; returns new frames indexed by 'Date'."""
    scaler = StandardScaler().fit(data_train)

    def _scaled(frame):
        return pd.DataFrame(scaler.transform(frame),
                            columns=frame.columns,
                            index=frame.index.rename('Date'))

    return _scaled(data_train), _scaled(data_test)

In [None]:
# checking linear correlation before transformation
sentiments_train.optimism_US500.ewm(90).mean().corr(sp500_yahoo.Close, method='pearson')

In [None]:
# We'll also apply standardization to even scales. This will make distributions more comparable
sentiments_train, sentiments_test = apply_transformation(sentiments_train, sentiments_test, 'dispersion_and_scale')

display(sentiments_train.describe())

In [None]:
# checking linear correlation after transformation
sentiments_train.optimism_US500.ewm(90).mean().corr(sp500_yahoo.Close, method='pearson') 
# correlation has improved a little

# Smoothing

In [None]:
# testing fast fourier transform
def filter_signal(signal, threshold=1e8, preserve_length=False):
    """
    Low-pass filter a signal with a real Fast Fourier Transform.

    Every frequency component above `threshold` is set to zero and the signal
    is transformed back to the time domain.

    Parameters
    ----------
    signal: array-like
        One-dimensional samples (numpy array, pandas Series, list, ...).
    threshold: double
        Cut-off frequency in cycles per sample; components with a strictly
        higher frequency are removed. The default keeps every component.
    preserve_length: bool
        When False (default, historical behaviour) the inverse transform uses
        numpy's default output size, so an odd-length input comes back one
        sample shorter. When True the output always has the input's length.

    Returns
    -------
    Numpy array with the filtered signal.
    """
    samples = np.asarray(signal)
    fourier = np.fft.rfft(samples)
    frequencies = np.fft.rfftfreq(samples.size, d=1.)
    fourier[frequencies > threshold] = 0
    if preserve_length:
        # irfft cannot infer an odd output length from the spectrum alone; pass it explicitly.
        return np.fft.irfft(fourier, n=samples.size)
    return np.fft.irfft(fourier)

# Visual comparison of FFT filtering and an EWM on the last `span` samples of one sentiment variable
span = 500
# NOTE(review): the first sample is dropped, presumably to work around filter_signal returning
# one sample less for odd-length input — confirm.
signal = np.array(sentiments_train.sentiment_US500[1:])
# Cut-off frequency; this module-level name is also read inside select_smoothing() for the 'fft' technique
threshold = 0.1
filtered = filter_signal(signal, threshold=threshold)
plt.plot(signal[-span:], label='Raw')

plt.plot(filtered[-span:], label='Filtered')
# ewm(22) passes 22 as the first positional argument of ewm
plt.plot(np.array(pd.Series(signal).ewm(22).mean())[-span:], label='22-day EWM')

plt.legend()
plt.title("FFT Denoising with threshold = {} cycles per day".format(threshold), size=15)
plt.show()

In [None]:
# First 300 train-period rows: raw vs exponentially smoothed company sentiment (left axis)
# against the index price (right axis)
fig, ax1 = plt.subplots()

ax1.set_xlabel('Date')
ax1.set_ylabel('Sentiment Index')

# Taken from `sentiments` (not sentiments_train), i.e. the series before transformation
ax1.plot(sentiments['sentiment_US500'][train_start:train_end][:300], label='Raw Sentiment US500', color=bbva[2])
ax1.plot(sentiments['sentiment_US500'][train_start:train_end].ewm(22).mean()[:300], label='EWMA Sentiment US500', 
         color=bbva[1])
ax1.legend(loc='lower left')

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

ax2.set_ylabel('S&P 500 Close Price')  # we already handled the x-label with ax1
# NOTE(review): the plotted column is 'Adj Close' while the label says 'Close Price' — confirm which is intended
ax2.plot(sp500_train['Adj Close'][:300], color=bbva[0], label='S&P 500 Close Price')
ax2.legend(loc='upper right');

In [None]:
# Function for selecting smoothing technique
def select_smoothing(data, technique, span0=22, fft_threshold=None):
    """
    Performs a smoothing (EWMA) or filtering (FFT) technique over data.

    Parameters
    ----------
    data: pandas dataframe column
    technique: str
        One of 'fft' for Fast Fourier Transform or 'ewma' for Exponentially Weighted Moving Average.
        Calls function filter_signal() for FFT.
    span0: int
        Decay in terms of span (only used by 'ewma').
    fft_threshold: float, optional
        Cut-off frequency handed to filter_signal() (only used by 'fft').
        When omitted, the notebook-level variable `threshold` is used, which
        is what this function has always read.

    Returns
    -------
    For 'ewma' a pandas Series; for 'fft' a list with exactly len(data) values
    (zero-padded at the end when the inverse transform returns fewer samples).

    Raises
    ------
    ValueError
        If `technique` is neither 'fft' nor 'ewma'.
    """
    if technique not in ('fft', 'ewma'):
        raise ValueError("technique must be 'fft' or 'ewma', got {!r}".format(technique))

    if technique == 'ewma':
        return data.ewm(span=span0).mean()

    cutoff = threshold if fft_threshold is None else fft_threshold
    filtered = list(filter_signal(data, threshold=cutoff))
    # Pad only when samples are actually missing; an unconditional append made
    # the result one element too long whenever the transform kept the length.
    filtered.extend([0] * max(len(data) - len(filtered), 0))
    return filtered

# we'll be using the ewma (default span of 22) on every sentiment column
# Train and test are smoothed independently, so the test average starts afresh at the first test row
for col in sentiments_train.columns:
    sentiments_train[col] = select_smoothing(sentiments_train[col], 'ewma')
    sentiments_test[col] = select_smoothing(sentiments_test[col], 'ewma')

# Feature Engineering

In [None]:
# We'll add some technical indicators used in trading for evaluating momemtum and trends
def add_technical_indicators(data):
    """
    Adds technical indicators widely used by traders when checking for bearish or bullish signals.

    Rows with missing values are dropped first. The input frame is left
    untouched; a new frame is returned.

    Parameters
    ----------
    data: pandas dataframe
        Price data with the columns 'Open', 'High', 'Low', 'Close' and 'Adj Close'.

    Returns
    -------
    Pandas dataframe with the original columns plus, in this order:
    'ROC', 'RSI', 'Stoch', 'Williams', 'MACD', 'ADX' (from the `ta` package),
    'Close/Open' (1 when Close > Open, else 0), 'Daily Return' (percent),
    'Daily Volatility' (EWM std of the return, span 22) and
    'sp_cross_Adj Close' (1.0 when the price is at or above its slow EWMA, -1.0 otherwise).
    """
    # Work on a NaN-free copy: the caller's frame (often a slice of a larger
    # one) is never mutated and no chained-assignment ambiguity can arise.
    data = data.dropna().copy()

    adj_close = data['Adj Close']
    high_low_close = dict(high=data['High'], low=data['Low'], close=data['Close'])

    # Momentum / trend indicators, 10-period look-back unless stated otherwise
    data['ROC'] = ROCIndicator(adj_close, 10).roc()
    data['RSI'] = RSIIndicator(adj_close, 10).rsi()
    data['Stoch'] = StochasticOscillator(n=10, **high_low_close).stoch()
    data['Williams'] = WilliamsRIndicator(lbp=10, **high_low_close).wr()
    data['MACD'] = MACD(data['Close'],
                        n_slow=22,
                        n_fast=8,
                        n_sign=5).macd()
    data['ADX'] = ADXIndicator(n=10, **high_low_close).adx()

    # Vectorized flag instead of a per-row Python loop with positional lookups
    data['Close/Open'] = (data['Close'] > data['Open']).astype('int64')

    data['Daily Return'] = adj_close.pct_change(periods=1)*100
    data['Daily Volatility'] = data['Daily Return'].ewm(span=22).std() # exponential moving std

    ### only compute this cross if it won't later be the primary model!!
    # The "fast" series is the price itself; the "slow" one is its EWMA.
    slow_window = 60
    col = 'Adj Close'
    fast = data[col]
    slow = data[col].ewm(com=slow_window).mean()  # first positional argument of ewm is `com`

    # 1.0 = price at/above slow average, -1.0 = below, NaN when either side is missing
    data['sp_cross_{}'.format(col)] = np.select(
        [(fast >= slow).values, (fast < slow).values], [1.0, -1.0], default=np.nan)

    return data

# Indicators are computed separately on train and test
sp500_train = add_technical_indicators(sp500_train)
sp500_test = add_technical_indicators(sp500_test)

# Visual check on the last 100 train rows: price first, then the indicators together in a second figure
sp500_train.Close[-100:].plot(legend=True);
plt.figure()
sp500_train.MACD[-100:].plot(legend=True);
sp500_train.ADX[-100:].plot(legend=True);
sp500_train.RSI[-100:].plot(legend=True);
sp500_train.Stoch[-100:].plot(legend=True);
sp500_train.Williams[-100:].plot(legend=True);

In [None]:
sp500_train.drop(['Close'], axis=1, inplace=True)
sp500_test.drop(['Close'], axis=1, inplace=True)

In [None]:
# We're shifting forward the financial variables (which are related to price) since news from the
# previous day have to be used for predicting next day's prices.
# shift(1) moves every value down one row, so each row holds the previous row's market features
# and the first row becomes NaN.
sp500_train = sp500_train.shift(1) 
sp500_test = sp500_test.shift(1)

In [None]:
sp500_train.head()

In [None]:
# Before continuing, we'll concat the financial and the sentiment datasets column-wise (aligned on the index)
X_train = pd.concat([sp500_train, sentiments_train], axis=1)
X_test = pd.concat([sp500_test, sentiments_test], axis=1)
# Rows with any NaN are removed, in train and test alike
X_train.dropna(inplace=True) # there are na values at the beginning, for the newly created variables (technical indicators)
X_test.dropna(inplace=True)

# Now we'll create y_train and y_test, our labels

In [None]:
# we'll add these ewmas as predictor or explanatory variables
def add_crossing_ewmas(data, fast_window, slow_window,
                       columns=('sentiment_US500', 'stockIndexSentiment_USA')):
    """
    Adds, for each selected column, a crossing signal between the column and its slow EWMA.

    The column itself plays the role of the fast average (it is expected to be
    smoothed already), so only the slow average is computed here. A new column
    'cross_<col>' is added to `data` in place: 1.0 where the column is at or
    above its slow EWMA, -1.0 where it is below, NaN where either is missing.

    Parameters
    ----------
    data: pandas dataframe
        Modified in place and also returned.
    fast_window: int
        Currently unused (the fast series is the column itself); kept for
        backward compatibility with existing calls.
    slow_window: int
        Slow decay, passed to `ewm` as its centre of mass (`com`).
    columns: iterable of str
        Columns for which a crossing signal is added.

    Returns
    -------
    Pandas dataframe (the same object as `data`).
    """
    for col in columns:
        fast = data[col]  # already averaged
        slow = data[col].ewm(com=slow_window).mean()

        # Signal computed directly, without temporary 'Fast/Slow EWMA' columns
        data['cross_{}'.format(col)] = np.select(
            [(fast >= slow).values, (fast < slow).values], [1.0, -1.0], default=np.nan)

    return data

X_train = add_crossing_ewmas(X_train, 10, 60)
X_test = add_crossing_ewmas(X_test, 10, 60)

# Labelling Target Variable

In [None]:
# overriding function get_bins
# we want this to return the sign of the return when the vertical barrier is touched first
# instead of what it's currently implemented (0 if vertical barrier is touched first)
def get_bins(triple_barrier_events, close):
    """
    Label triple-barrier events by the sign of their return.

    Variant of the labeling routine from Advances in Financial Machine Learning
    (Snippet 3.7, page 51): the label is the sign of the return realised between
    the start and the end of each event, also when the vertical barrier is the
    one that was touched.

    :param triple_barrier_events: (pd.DataFrame)
                -events.index is event's starttime
                -events['t1'] is event's endtime (rows without it are ignored)
                -events['trgt'] is event's target
                -events['side'] (optional) implies the algo's position side
                Case 1: ('side' not in events): bin is the sign of the price return (-1, 0 or 1)
                Case 2: ('side' in events): bin in (0,1) <-label by pnl (meta-labeling)
    :param close: (pd.Series) Close prices
    :return: (pd.DataFrame) Events with the columns 'ret' (simple return, multiplied by the
             side when one is given), 'trgt', 'bin' and, when provided, 'side'
    """
    valid_events = triple_barrier_events.dropna(subset=['t1'])
    has_side = 'side' in valid_events

    # Prices at every event start and end; dates without a quote take the next available one
    timestamps = valid_events.index.union(other=valid_events['t1'].array).drop_duplicates()
    aligned_prices = close.reindex(timestamps, method='bfill')

    # Simple return from the event start to its end
    exit_prices = aligned_prices.loc[valid_events['t1'].values].values
    entry_prices = aligned_prices.loc[valid_events.index]

    labels = pd.DataFrame(index=valid_events.index)
    labels['ret'] = exit_prices / entry_prices - 1
    labels['trgt'] = valid_events['trgt']

    # Meta-labeling: a correct side turns the return positive
    if has_side:
        labels['ret'] = labels['ret'] * valid_events['side']

    # The outcome is the sign of the return, whichever barrier ended the event
    labels['bin'] = np.sign(labels['ret'])

    # Meta-labeling: bets that did not make money are labelled 0
    if has_side:
        labels.loc[labels['ret'] <= 0, 'bin'] = 0

    # Keep the side next to the label; useful when a meta-label model must be fit
    if 'side' in triple_barrier_events.columns:
        labels['side'] = triple_barrier_events['side']

    return labels

In [None]:
# Now we'll apply the labeling
def apply_trading_labeling(data, 
                           compute_side=False, 
                           horizon=14,
                           pt_sl=[1, 2], # multipliers for daily vol (contribution to horizontal barriers)
                           min_ret=0.005,
                           primary_model=False):
    """
    Applies selected trading strategy with the given parameters. Labeling uses Triple Barrier Method.

    NOTE: `data` is modified in place. Rows containing NaN are dropped, and when
    `compute_side` is set the columns 'Fast EWMA', 'Slow EWMA' and 'Side' are added.

    Parameters
    __________
    data: pandas dataframe 
        Data to use. The columns 'Adj Close' and 'Daily Volatility' are read; 'Side' is
        also read when `primary_model` is set without `compute_side`.
    compute_side: boolean
        Whether to use a primary model that tells the side (buy or sell).
        When True, a trend-following strategy (fast vs. slow EWMA crossover)
        will be applied as primary model. Default is False.
    horizon: int
        Prediction horizon in natural days.
    pt_sl: list
        Profit taking and stop loss multipliers to the volatility. Width of the TBM box.
        The default list is never mutated here.
    min_ret: float
        Minimum target return to run the search for triple barriers.
    primary_model: boolean
        Whether a primary model computed by the user has already decided the side
        (stored in data['Side']).
        
    Returns
    -------
    Labels dataframe and triple-barrier events dataframe.

    Raises
    ------
    ValueError
        If `primary_model` is set, `compute_side` is not, and `data` has no 'Side' column.
    """
    side_is_known = bool(compute_side) or bool(primary_model)
    if primary_model and not compute_side and 'Side' not in data.columns:
        raise ValueError("primary_model=True requires a 'Side' column in data")

    ####--------------------- Primary models ------------------------####
    if compute_side:
        # compute exponentially moving averages
        fast_window = 20
        slow_window = 90 # optimize the decay for fast and slow averages

        # ewm's first positional argument is `com` (center of mass), so it is spelled out.
        # The averages are lagged by one day BEFORE the side is derived: the signal for
        # day t must only use prices up to t-1, otherwise the side looks ahead.
        close = data['Adj Close']
        data['Fast EWMA'] = close.ewm(com=fast_window).mean().shift(1)
        data['Slow EWMA'] = close.ewm(com=slow_window).mean().shift(1)
    
        # Compute sides (rows where an average is still NaN keep Side = NaN and are dropped below)
        data['Side'] = np.nan
        data.loc[data['Fast EWMA'] >= data['Slow EWMA'], 'Side'] = 1
        data.loc[data['Fast EWMA'] < data['Slow EWMA'], 'Side'] = -1

        data[['Fast EWMA', 'Slow EWMA']].plot();
    
    # In-place on purpose: later cells read the columns/rows left on the caller's frame.
    data.dropna(inplace=True)

    close = data['Adj Close']
    target = data['Daily Volatility']/100 # values in conjunction with pt_sl for width of barrier

    ####--------------------- CUSUM filters ------------------------####
    # Apply Symmetric CUSUM Filter and get timestamps for events
    cusum_events = mlfin.filters.cusum_filter(close, threshold=target)

    ####--------------------- Vertical barriers ------------------------####
    # Compute vertical barrier
    vertical_barriers = mlfin.labeling.add_vertical_barrier(t_events=cusum_events,
                                                            close=close,
                                                            num_days=horizon) # this is the length of the tbm box
    
    ####--------------------- Triple barriers ------------------------####
    # Computing triple barriers; the only difference between both modes is the side prediction
    triple_barrier_events = mlfin.labeling.get_events(close=close,
                                                      t_events=cusum_events,
                                                      pt_sl=pt_sl, # profit taking and stop loss multiples
                                                      target=target,
                                                      min_ret=min_ret,
                                                      num_threads=3, # num of parallel tasks
                                                      vertical_barrier_times=vertical_barriers,
                                                      side_prediction=data['Side'] if side_is_known else None)

    ####--------------------- Meta-labels / Side ------------------------####
    meta_labeled_events = get_bins(triple_barrier_events, close)
    if not side_is_known:
        # Without a side prediction, get_bins returns the sign of the return in 'bin':
        # that sign becomes the side to learn, and every event counts as a bet.
        meta_labeled_events['side'] = meta_labeled_events['bin']
        meta_labeled_events['bin'] = 1

    return meta_labeled_events, triple_barrier_events

In [None]:
# computing labels for side (+1, -1) 
# side will tell the sign of the bet
# Both splits are labeled with exactly the same configuration (pt_sl=[0, 0] disables
# the horizontal barriers, so only the vertical barrier is active).
side_labeling_kwargs = dict(primary_model=False,
                            compute_side=False,
                            horizon=14,
                            pt_sl=[0, 0],
                            min_ret=0.005)

train_labels, tbm_train = apply_trading_labeling(X_train, **side_labeling_kwargs)
test_labels, tbm_test = apply_trading_labeling(X_test, **side_labeling_kwargs)

In [None]:
# first rows of the train labels and of the triple-barrier events they come from
display(train_labels.head(), tbm_train.head())

In [None]:
# plotting labels to see outcome
#configure_plotly_browser_state()
@interact
def plot_sentiment_index(data=widgets.RadioButtons(
                            options=['Train', 'Test'],
                            value='Test',
                            # rows=10,
                            description='Data',
                            disabled=False)):
    """
    Plots the close price of the selected split together with its labeled positions.

    Reads the notebook globals X_train / train_labels or X_test / test_labels,
    left-joins the labels onto the prices by index and draws:
    the 'Adj Close' line, the fast/slow EWMA lines when those columns exist,
    and one marker per event with bin == 1 (side == 1 -> 'Buy', side == -1 -> 'Sell').

    Parameters
    ----------
    data: str
        'Test' selects the test split; any other value selects the train split.
    """
    if data == 'Test':
        prices, labels = X_test, test_labels
    else:
        prices, labels = X_train, train_labels
    dataset = prices.merge(labels, left_index=True, right_index=True, how='left')

    figura = make_subplots(specs=[[{"secondary_y": False}]])

    def add_line(series, name):
        # every line shares the same (primary) y axis
        figura.add_trace(go.Scatter(y=series, x=series.index, mode='lines', name=name),
                         secondary_y=False)

    # same legend name for both splits (the train branch used to say 'SP500 Close Price')
    add_line(prices['Adj Close'], 'S&P 500 Close Price')
    if ('Fast EWMA' in dataset.columns) and ('Slow EWMA' in dataset.columns):
        add_line(dataset['Fast EWMA'], 'SP500 Fast EWMA')
        add_line(dataset['Slow EWMA'], 'SP500 Slow EWMA')

    # marker symbols 5 / 6 are plotly's triangle-up / triangle-down
    for side, name, color, symbol in ((1, 'Buy', '#008000', 5),
                                      (-1, 'Sell', '#FF0000', 6)):
        events = dataset[(dataset['bin'] == 1) & (dataset['side'] == side)]
        figura.add_trace(
            go.Scatter(y=events['Adj Close'],
                       x=events.index,
                       mode='markers',
                       name=name,
                       marker=dict(size=8, color=color),
                       marker_symbol=symbol),
            secondary_y=False)

    figura.update_layout(
        title_text='S&P 500 Index and labeled positions | {}'.format(data),
        colorway=bbva,
        template='simple_white',
        hovermode='x')

    figura.update_yaxes(title_text="<b>S&P 500 Close Price</b>", secondary_y=False)

    figura.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            font=dict(color="black"),
            buttons=[
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="YTD", step="year", stepmode="todate"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(count=3, label="3y", step="year", stepmode="backward"),
                dict(count=5, label="5y", step="year", stepmode="backward"),
                dict(step="all"),
            ]
        )
    )

    iplot(figura)

In [None]:
# number of observations per label
# label = bin * side, i.e. the signed outcome of each event
for heading, split_labels in (('Train:\n', train_labels), ('\nTest:\n', test_labels)):
    split_labels['label'] = split_labels['bin'] * split_labels['side']
    print(heading, split_labels.groupby('label').count()[['ret']])

In [None]:
# scaling price variables since we've already computed our sign labels
# (labels were derived from the raw prices above, so the transformation comes after)
price_cols_train = sp500_train.columns
price_cols_test = sp500_test.columns

scaled_train, scaled_test = apply_transformation(X_train[price_cols_train],
                                                 X_test[price_cols_test],
                                                 'dispersion_and_scale')
X_train[price_cols_train] = scaled_train
X_test[price_cols_test] = scaled_test

In [None]:
# eliminating label 0 (0 would mean that pct change between days is null)
train_labels = train_labels.loc[train_labels['label'].ne(0)]
test_labels = test_labels.loc[test_labels['label'].ne(0)]

# downsampling with events: keep only the feature rows that have a label
X_train = X_train.reindex(index=train_labels.index)
X_test = X_test.reindex(index=test_labels.index)

X_train.head()

# Model

In [None]:
def _plot_feature_importances(estimator, feature_names):
    """Draws a horizontal bar chart with the non-zero feature importances of a fitted estimator."""
    importances = np.asarray(estimator.feature_importances_)
    # Names are filtered with the same mask as the importances; filtering only the
    # importances would shift the positions and put the wrong label on the bars.
    nonzero = importances > 0
    shown_importances = importances[nonzero]
    shown_names = np.asarray(feature_names)[nonzero]
    order = np.argsort(shown_importances)

    # 'seaborn-poster' was renamed to 'seaborn-v0_8-poster' in matplotlib 3.6
    style = next((name for name in ('seaborn-poster', 'seaborn-v0_8-poster')
                  if name in plt.style.available), 'default')
    with plt.style.context(style):
        plt.title('Feature Importances')
        plt.barh(range(len(order)), shown_importances[order], color='b', align='center')
        plt.yticks(range(len(order)), list(shown_names[order]))
        plt.xlabel('Relative Importance')
        plt.show()


# applying our model
def apply_model(train_labels, test_labels, X_train_, X_test_, model, scoring, sfs=True):
    """
    Applies model RF or XGB to data with Cross-Validation. It may perform variable selection on demand.

    Prints cross-validated train metrics and test metrics (accuracy, AUC, classification
    report, confusion matrix) and plots the feature importances of the best estimator.
    
    Parameters
    ----------
    train_labels: pandas dataframe
        Dataframe containing train labels as returned by apply_trading_labeling
        (the 'label' column is used as target).
    test_labels: pandas dataframe
        Dataframe containing test labels as returned by apply_trading_labeling
        (the 'label' column is used as target).
    X_train_: pandas dataframe
        Training data.
    X_test_: pandas dataframe
        Test data.
    model: str
        One of 'RF' for Random Forest or 'XGB' for Extreme Gradient Boosting.
    scoring: str
        One of the admitted possibilities in sklearn's GridSearchCV .
    sfs: boolean
        Whether to perform Sequential Forward Selection with a simple RF for variable selection.
        It will select from 10 to 30 total variables.
        
    Returns
    -------
    Best estimator from CV and the selected variables (a list when sfs is used,
    otherwise the columns of X_train_).

    Raises
    ------
    ValueError
        If model is neither 'RF' nor 'XGB'.
    """
    # fail before the (expensive) feature selection instead of after it
    if model not in ('RF', 'XGB'):
        raise ValueError("model must be 'RF' or 'XGB', got {!r}".format(model))

    y_test = test_labels.label
    y_train = train_labels.label
    
    # negatives / positives ratio, used as scale_pos_weight by XGB
    n_neg = int((y_train <= 0).sum())
    n_pos = int((y_train > 0).sum())
    pos_ratio = float(np.round(n_neg / n_pos, 1)) if n_pos else 1.0
    
    if sfs:
        # Sequential Forward Floating Selection
        # NOTE(review): cv=5 is not a time-ordered split, unlike the TimeSeriesSplit used
        # for the grid search below -- confirm this is intended for the selection step.
        sffs = SFS(RandomForestClassifier(n_jobs=-1, random_state=1, n_estimators=40, max_depth=2,
                                          class_weight='balanced_subsample'), 
                   k_features=(10, 30), 
                   forward=True, 
                   floating=True, 
                   scoring=scoring,
                   cv=5,
                   n_jobs=-1,
                   verbose=0)

        sffs = sffs.fit(X_train_, y_train)

        # k is chosen within (10, 30), so report the size actually selected
        print('\nSequential Forward Floating Selection (k=%d):' % len(sffs.k_feature_idx_))
        print('CV Score: %.2f' % sffs.k_score_)

        plot_sfs(sffs.get_metric_dict(), kind='std_dev')
        plt.title('Sequential Forward Selection (w. StdDev)')
        plt.show()

        chosen_idx = set(sffs.k_feature_idx_)
        selected_cols = [col for i, col in enumerate(X_train_.columns) if i in chosen_idx]

        X_train_ = X_train_[selected_cols]
        X_test_ = X_test_[selected_cols]
    else:
        selected_cols = X_train_.columns
        
    # Fitting model.
    # As this is a time-series problem, we do not shuffle samples for Cross-Validation,
    # and test always with newer registers:
    if model == 'RF':        
        parameters = {'n_estimators': [40, 60, 100, 150, 180],
                      'max_depth':[2, 3, 4, 5],
                      'min_samples_split': [4, 6],
                      'min_samples_leaf': [1, 2],
                      'ccp_alpha': [0, 0.01, 0.02]}
        
        clf = RandomForestClassifier(n_jobs=-1, oob_score=True, criterion='gini', 
                                     random_state=1, class_weight='balanced_subsample')
        
        gridcv = GridSearchCV(clf, parameters, cv=TimeSeriesSplit(max_train_size=None, n_splits=5), 
                              scoring=scoring, verbose=1, n_jobs=-1, refit=True)
    else:  # 'XGB'
        parameters = {'n_estimators': [40, 60, 100, 180, 1000],
                      'max_depth':[2, 3, 4, 5],
                      'eta': [0.00005, 0.0005],
                      # base_score must lie strictly inside (0, 1) for binary:logistic;
                      # the former candidates 0 and 1 only produced failed fits.
                      'base_score': [0.5],
                      # NOTE(review): no eval_set is passed to fit, so early stopping
                      # presumably cannot take effect -- confirm against the xgboost version in use.
                      'early_stopping_rounds':[5, 10]}#,
                      #'scale_pos_weight': [0.5, 0.8, 1]}
        
        clf = XGBClassifier(objective='binary:logistic', predictor='gpu_predictor',
                            random_state=1, min_child_weight=2, scale_pos_weight=pos_ratio)
    
        gridcv = GridSearchCV(clf, parameters, cv=TimeSeriesSplit(max_train_size=None, n_splits=3), 
                              verbose=1, n_jobs=-1, refit=True, scoring=scoring, return_train_score=True)

    gridcv.fit(X_train_, y_train)

    # Results:
    best_estimator = gridcv.best_estimator_
    
    # cross-validated metrics of the best estimator on the training data
    for heading, metric in (("Train F1-score: ", 'f1'),
                            ("Train accuracy: ", 'accuracy'),
                            ("Train AUC: ", 'roc_auc')):
        scores = cross_val_score(best_estimator, X_train_, y_train,
                                 cv=TimeSeriesSplit(max_train_size=None, n_splits=3), scoring=metric)
        print(heading, scores.mean())

    print("Train best score %.2f" % gridcv.best_score_)
    print('Best parameters: {}'.format(gridcv.best_params_))

    # metrics on the held-out test data
    predictions = best_estimator.predict(X_test_)
    accuracy = accuracy_score(y_test, predictions)*100
    print("Accuracy: %.2f%%" % accuracy)
    y_pred = best_estimator.predict_proba(X_test_)[:,1]
    print("AUC: %.2f" % roc_auc_score(y_test, y_pred))
    if model == 'RF':
        print("OOB Score: %.2f" % best_estimator.oob_score_)
    print("Classification report:")
    print(classification_report(y_test, predictions))

    conf_mat = confusion_matrix(y_test, predictions)
    print("Confusion matrix:")
    print(conf_mat)
    
    _plot_feature_importances(best_estimator, X_train_.columns)
    
    return best_estimator, selected_cols

In [None]:
# choosing variables
def choose_variables(X_train, X_test, var_type='technical'):
    """
    Selects variables distinguishing between technical indicators and sentiment indices.

    A column counts as sentiment when its name ends with 'USA', 'US500' or 'USD',
    or starts with 'pc' or 'cross'. The sentiment columns are detected on X_test
    and applied to both dataframes.
    
    Parameters
    ----------
    X_train: pandas dataframe
        Train dataframe.
    X_test: pandas dataframe
        Test dataframe.
    var_type: str
        One of 'technical', 'sentiment' or 'all'.
        
    Returns
    -------
    Train and test dataframes with selected variables.
    With 'all' the input dataframes themselves are returned (no copy).

    Raises
    ------
    ValueError
        If var_type is not one of the admitted values.
    """
    if var_type not in ('technical', 'sentiment', 'all'):
        raise ValueError("var_type must be 'technical', 'sentiment' or 'all', got {!r}".format(var_type))

    if var_type == 'all':
        return X_train, X_test

    sentiment_cols = [col for col in X_test.columns 
                      if col.endswith(('USA', 'US500', 'USD')) 
                      or col.startswith(('pc', 'cross'))]

    if var_type == 'sentiment':
        # only sentiment variables
        return X_train[sentiment_cols], X_test[sentiment_cols]

    # only technical indicators or financial variables
    sentiment_set = set(sentiment_cols)
    X_train_ = X_train[[col for col in X_train.columns if col not in sentiment_set]]
    X_test_ = X_test[[col for col in X_test.columns if col not in sentiment_set]]
    return X_train_, X_test_

# keep every variable (technical + sentiment); the features are picked by name below
X_train_, X_test_ = choose_variables(X_train, X_test, var_type='all')

In [None]:
# saving here the sffs selections for different configurations
# (*_all: chosen among every variable, *_sent: chosen among sentiment variables only)

# --- without stop loss and profit taking limits ---
features_all = [
    'ROC', 'ADX', 'Daily Return',
    'fear_US500', 'fundamentalStrength_US500',
    'optimism_USD', 'surprise_USD', 'timeUrgency_USD', 'longShort_USD',
    'bondDefault_USA', 'bondPriceForecast_USA', 'interestRates_USA',
    'cross_stockIndexSentiment_USA',
]  # all selection

features_sent = [
    'fear_US500', 'fundamentalStrength_US500',
    'optimism_USD', 'surprise_USD', 'longShort_USD', 'priceForecast_USD',
    'stockIndexStress_USA', 'bondUncertainty_USA', 'bondPriceForecast_USA', 'interestRates_USA',
    'cross_stockIndexSentiment_USA',
]  # sentiment selection

# --- with stop loss and profit taking limits ---
features_ptsl_all = [
    'Stoch', 'Williams', 'MACD', 'ADX', 'Close/Open', 'sp_cross_Adj Close',
    'longShortForecast_US500', 'analystRating_US500', 'dividends_US500',
    'stress_USD',
    'stockIndexPriceDirection_USA', 'stockIndexPriceForecast_USA',
    'cross_stockIndexSentiment_USA',
]  # pt sl all selection

features_ptsl_sent = [
    'longShortForecast_US500', 'priceForecast_US500', 'analystRating_US500',
    'trust_USD', 'stress_USD', 'surprise_USD', 'timeUrgency_USD', 'longShort_USD', 'volatility_USD',
    'stockIndexPriceDirection_USA', 'bondUncertainty_USA', 'interestRates_USA',
    'cross_stockIndexSentiment_USA',
]  # pt sl sentiment selection

In [None]:
# applying first model for setting the side (sign) of the bet (buy or sell)
# the features are the saved selection, so the sequential selection step is skipped
first_model, features = apply_model(train_labels,
                                    test_labels,
                                    X_train_[features_all],
                                    X_test_[features_all],
                                    model='XGB',
                                    scoring='neg_log_loss',
                                    sfs=False)

In [None]:
# generating meta-labels with previous forecast
# these will be the labels for the next model
def _meta_label_split(events, features, close):
    """Builds the meta-labels of one split from the side predicted by the first model.

    Returns the events aligned to the features (with the predicted 'side'),
    the meta-labels without NaN rows, and the features restricted to the
    meta-labeled events with the predicted 'side' added as a column.
    """
    events = events.reindex(features.index)
    events['side'] = first_model.predict(features[features_all]) # outcome of first model

    meta = get_bins(events, close)
    meta['label'] = meta.bin
    meta.dropna(inplace=True)

    features = features.reindex(meta.index)
    features['side'] = meta.side
    return events, meta, features


tbm_train, metalabels_train, X_train_ = _meta_label_split(tbm_train, X_train_, sp500_train['Adj Close'])
tbm_test, metalabels_test, X_test_ = _meta_label_split(tbm_test, X_test_, sp500_test['Adj Close'])

In [None]:
# number of observations per meta-label, train first and test second
display(metalabels_train.groupby('label').count(),
        metalabels_test.groupby('label').count())