In [1]:
import pandas as pd
import yfinance as yf
import plotly.graph_objs as go
import numpy as np
import vectorbt as vbt

Utilizaremos a base de dados disponivel no kaggle (https://www.kaggle.com/code/shtrausslearning/news-sentiment-based-trading-strategy/notebook), que contem uma base de dados com as colunas "Datetime" "headline" "Ticker" e "sentiment".
###### "Datetime" - Data da noticia;
###### "headline" - Titulo;
###### "Ticker" - Ticker vinculado a noticia;
###### "sentiment" - Sentimento da noticia, sendo 1 positivo e 0 negativo;

In [2]:
#Lendo dados das noticias com sentimento
Data_w_sentiment = pd.read_csv('https://raw.githubusercontent.com/tatsath/fin-ml/master/Chapter%2010%20-%20Natural%20Language%20Processing/Case%20Study%201%20-%20NLP%20and%20Sentiments%20Analysis%20based%20Trading%20Strategy/Data/LabelledNewsData.csv', sep=',', header=0, encoding = "ISO-8859-1")
Data_w_sentiment['datetime'] = pd.to_datetime(Data_w_sentiment['datetime'])

#Filtrando apenas os dados da American Express entre janeiro e dezembro de 2019
data_inicio = '2019-01-01'
data_fim = '2019-12-31'

data_filtrado = Data_w_sentiment[(Data_w_sentiment['datetime'] >= data_inicio) & (Data_w_sentiment['datetime'] <= data_fim)]
noticias_mercado = data_filtrado[data_filtrado['ticker'] == 'DOW']

###### Utilizando a biblioteca Yahoo Finance para realizar o download dos dados historicos do ticker DOW

In [None]:
#Realizando download da base de dados do yahoo finance

tickers = ['DOW'] #Define uma lista de tickers, que neste caso contém apenas o ticker 'DOW';

start = '2019-01-01'#Define as datas de início e término para o período dos dados históricos que serão baixados.
end = '2020-01-10'

valores_ticker_return = pd.DataFrame() #Cria um DataFrame vazio chamado 

#itera sobre cada ticker na lista tickers. Neste caso contem apenas o ticker da American Express
for ticker in tickers:
    ticker_yf = yf.Ticker(ticker)

    data_temp = ticker_yf.history(start=start, end=end).reset_index()
    data_temp['ticker'] = ticker

    if valores_ticker_return.empty:
        valores_ticker_return = data_temp
    else:
        valores_ticker_return = pd.concat([valores_ticker_return, data_temp], ignore_index=True)

valores_ticker_return.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,2019-03-20 00:00:00-04:00,38.85756,39.410037,36.463492,36.684483,2350800,0.0,0.0,DOW
1,2019-03-21 00:00:00-04:00,36.824449,36.831814,35.505869,36.080444,1764700,0.0,0.0,DOW
2,2019-03-22 00:00:00-04:00,35.947834,36.794967,35.476388,35.800507,844700,0.0,0.0,DOW
3,2019-03-25 00:00:00-04:00,35.800513,36.389825,35.358533,36.205666,440900,0.0,0.0,DOW
4,2019-03-26 00:00:00-04:00,36.095177,36.647654,35.491135,35.98468,504700,0.0,0.0,DOW


###### Deixando as bases de dados com o mesmo formato de data, para um melhor filtro, em seguida ordenando pela data.

In [4]:
valores_mercado = valores_ticker_return
noticias_mercado['datetime'] = pd.to_datetime(noticias_mercado['datetime'], utc= True)
valores_mercado['Date'] = pd.to_datetime(valores_mercado['Date'], utc=True)

#ordenando as bases pela data
valores_mercado.sort_values('Date', inplace=True)
noticias_mercado.sort_values('datetime', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noticias_mercado['datetime'] = pd.to_datetime(noticias_mercado['datetime'], utc= True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  noticias_mercado.sort_values('datetime', inplace=True)


###### Criando a coluna de fechamento em percentual, sendo o percentual calculado em relação ao primeiro valor do mercado.

In [5]:
# Calculando close em percentual com relação ao primeiro valor
primeiro_valor = valores_mercado['Close'].iloc[0]
valores_mercado['Close_pct'] = (valores_mercado['Close'] / primeiro_valor)

###### Criando uma nova base de dados, juntando as bases valores_dow, correspondente aos valores de fechamento e a base noticias_axp, correspondendo as noticias e seus sentimentos.

In [6]:
# Juntando as bases
noticias_precificadas = pd.merge_asof(valores_mercado, noticias_mercado, left_on='Date', right_on='datetime', by='ticker', tolerance=pd.Timedelta('24 hours'))

# Renomeando as colunas
noticias_precificadas.rename(columns={'headline': 'headline_same_day', 'sentiment': 'sentiment_same_day'}, inplace=True)

noticias_precificadas['sentiment_same_day'].replace({10: 1, 0: -1}, inplace=True)# Ajustando os sentimentos 10 = 1; 0 = -1

noticias_precificadas['sentiment_same_day'].fillna(value=0, inplace=True)# Substituindo os valores ausentes por 0

noticias_precificadas['sentiment_same_day']= noticias_precificadas['sentiment_same_day'].astype(int)


###### Criando uma nova coluna para melhor exibição das noticias

In [7]:
# Exibição 1

noticias_precificadas['sentimento_grafico'] = ''

noticias_precificadas.loc[noticias_precificadas['headline_same_day'].notnull(), 'sentimento_grafico'] = noticias_precificadas['sentiment_same_day']

# Exibição 2

noticias_precificadas['teste_1'] = 0

previous_sentiment = 0  # Valor inicial para o primeiro dia

for index, row in noticias_precificadas.iterrows():
    if row['sentiment_same_day'] == 1: #Se o sentimento for um define a linha como 1
        noticias_precificadas.at[index, 'teste_1'] = 1
        previous_sentiment = 1
        
    elif row['sentiment_same_day'] == -1: #Se o sentimento for um define a linha como -1
        noticias_precificadas.at[index, 'teste_1'] = -1
        previous_sentiment = -1
        
    else:
        noticias_precificadas.at[index, 'teste_1'] = previous_sentiment #Se o sentimento for 0, mantem o valor do dia anterior (1 ou -1)

noticias_precificadas.at[noticias_precificadas.index[-1], 'sentiment_same_day'] = -1
noticias_precificadas.at[noticias_precificadas.index[-1], 'teste_1'] = -1

In [8]:
#Criando a coluna de capital
#Capital = venda-compra/compra 
noticias_precificadas['capital'] = 0.0

var = noticias_precificadas['teste_1'] == 1
noticias_precificadas.loc[var, 'capital'] = ((noticias_precificadas.loc[var, 'Close'] - noticias_precificadas.loc[var, 'Close'].shift(1))) / noticias_precificadas.loc[var, 'Close'].shift(1)


In [9]:
# Calcula a série de retorno composto usando np.cumprod
noticias_precificadas['retorno_composto'] = (1 + noticias_precificadas['capital']).cumprod() - 1


# Substituindo os valores NaN (resultantes do primeiro dia) por 1
noticias_precificadas['retorno_composto'].fillna(0, inplace=True)


##### Criando a coluna ganho, metodo que utilizaremos no mercado

O valor de ganho é definido da seguinte forma, caso a noticia no dia é boa (+1), então entraremos no mercado. Caso a noticia seja ruim (-1), sairemos do mercado. Por ultimo, caso não tenhamos noticias boas ou ruins, ou seja neutro, manteremos a ação anterior.

Ganho é calculado em percentual ao primeiro valor de entrada no mercado.

In [10]:
# Encontre o índice onde o sentiment_same_day é 1 pela primeira vez
first_positive_index = noticias_precificadas[noticias_precificadas['teste_1'] == 1].index[0]

# Este valor é usado como o ponto de referência para calcular as mudanças percentuais nos preços subsequentes.
reference_value = noticias_precificadas.loc[first_positive_index, 'Close']

print(reference_value)

noticias_precificadas['ganho'] = 1.0
ganho_da_ultima_trading = 1.0

for idx in range(1, len(noticias_precificadas)):

    if noticias_precificadas.loc[idx, 'teste_1'] == 1 and noticias_precificadas.loc[idx - 1, 'teste_1'] == 1:
        ganho_parcial = (noticias_precificadas.loc[idx, 'Close'] - reference_value) / reference_value      
        noticias_precificadas.loc[idx, 'ganho'] = ganho_parcial + ganho_da_ultima_trading

    elif noticias_precificadas.loc[idx, 'teste_1'] == -1 and noticias_precificadas.loc[idx - 1, 'teste_1'] == 1:
        ganho_parcial = (noticias_precificadas.loc[idx, 'Close'] - reference_value) / reference_value
        noticias_precificadas.loc[idx, 'ganho'] = ganho_parcial + ganho_da_ultima_trading
        
    elif noticias_precificadas.loc[idx, 'teste_1'] == -1 and noticias_precificadas.loc[idx - 1, 'teste_1'] == -1:
        noticias_precificadas.loc[idx, 'ganho'] = noticias_precificadas.loc[idx - 1, 'ganho'] 
        ganho_da_ultima_trading = noticias_precificadas.loc[idx, 'ganho']
    
    # Atualizar o reference_value quando ocorrer uma nova compra
    elif noticias_precificadas.loc[idx, 'teste_1'] == 1 and noticias_precificadas.loc[idx - 1, 'teste_1'] == -1:
        reference_value = noticias_precificadas.loc[idx, 'Close']
        noticias_precificadas.loc[idx, 'ganho'] = ganho_da_ultima_trading

noticias_precificadas.to_csv('valores_dow_com_sentimento_trading.csv', index=False)
noticias_precificadas.head()

35.800506591796875


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Close_pct,datetime,headline_same_day,sentiment_same_day,sentimento_grafico,teste_1,capital,retorno_composto,ganho
0,2019-03-20 04:00:00+00:00,38.85756,39.410037,36.463492,36.684483,2350800,0.0,0.0,DOW,1.0,NaT,,0,,0,0.0,0.0,1.0
1,2019-03-21 04:00:00+00:00,36.824449,36.831814,35.505869,36.080444,1764700,0.0,0.0,DOW,0.983534,NaT,,0,,0,0.0,0.0,1.0
2,2019-03-22 04:00:00+00:00,35.947834,36.794967,35.476388,35.800507,844700,0.0,0.0,DOW,0.975903,2019-03-21 12:17:00+00:00,"$SPY Understanding, where we stand is key, har...",1,1.0,1,,0.0,1.0
3,2019-03-25 04:00:00+00:00,35.800513,36.389825,35.358533,36.205666,440900,0.0,0.0,DOW,0.986948,NaT,,0,,1,0.011317,0.011317,1.011317
4,2019-03-26 04:00:00+00:00,36.095177,36.647654,35.491135,35.98468,504700,0.0,0.0,DOW,0.980924,NaT,,0,,1,-0.006104,0.005144,1.005144


###### Grafico do ganho

In [11]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=noticias_precificadas['Date'], 
                         y=noticias_precificadas['ganho'], 
                         mode='lines', 
                         name='Trading',
                         line=dict(color='blue', width=1.4)))  # Ajuste da cor e largura da linha

fig.update_layout(
    xaxis=dict(title='', showspikes=True),  
    yaxis=dict(title='[Percentual]', gridcolor='lightgrey', tickfont=dict(size=12)),
    hovermode='x',
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

# Adicionar linha vertical para exibir os valores da data selecionada
fig.update_layout(
    xaxis=dict(
        showspikes=True,
        spikemode='across',
        spikesnap='cursor',
        spikethickness=1,
        spikedash='solid',
        spikecolor='black'
    )
)

fig.show()

In [12]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=noticias_precificadas['Date'], 
                         y=noticias_precificadas['Close_pct'], 
                         mode='lines', 
                         name='Buy and hold',
                         line=dict(color='black', width=1.4)))  # Ajuste da largura da linha

fig.add_trace(go.Scatter(x=noticias_precificadas['Date'], 
                         y=noticias_precificadas['ganho'], 
                         mode='lines', 
                         name='Estratégia',
                         line=dict(color='blue', width=1.4)))  # Ajuste da cor e largura da linha

fig.update_layout(
    title='Estratégia x Variação de mercado',
    xaxis=dict(title='', showspikes=True),  
    yaxis=dict(title='[Percentual]', gridcolor='lightgrey', tickfont=dict(size=12)),
    hovermode='x',
    plot_bgcolor='white',
    paper_bgcolor='white',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

# Adicionar linha vertical para exibir os valores da data selecionada
fig.update_layout(
    xaxis=dict(
        showspikes=True,
        spikemode='across',
        spikesnap='cursor',
        spikethickness=1,
        spikedash='solid',
        spikecolor='black'
    )
)



colors = {1: 'green', -1: 'red'}

fig2 = go.Figure()

# Adicionando a linha de teste_1
fig2.add_trace(go.Scatter(x=noticias_precificadas['Date'], 
                         y=noticias_precificadas['teste_1'], 
                         mode='lines',
                         line=dict(color='black', width=1.2),  # Linha preta com largura de 1
                         name='Sentimento da notícia'))

for val in [-1, 0, 1]:
    subset = noticias_precificadas[noticias_precificadas['sentimento_grafico'] == val]
    fig2.add_trace(go.Scatter(x=subset['Date'], 
                             y=subset['sentimento_grafico'], 
                             mode='markers',
                             marker=dict(color=colors.get(val, 'black'), size=5),
                             showlegend=False))

# Adicionando a legenda explicativa
fig2.add_trace(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(color='green', size=6), showlegend=True, name='Notícia Positiva'))
fig2.add_trace(go.Scatter(x=[None], y=[None], mode='markers', marker=dict(color='red', size=6), showlegend=True, name='Notícia Negativa'))

fig2.update_layout(title='Sentimento das notícias ao longo do tempo',
                   xaxis_title='Data',
                   yaxis_title='Sentimento da notícia', 
                   xaxis=dict(tickangle=-45, showgrid=False),  # Removendo o grid do eixo x
                   yaxis=dict(
                       tickmode='linear',  # Define o modo de tick como linear
                       showgrid=True),  # Removendo o grid do eixo y
                   showlegend=True,
                   plot_bgcolor='white',  # Fundo branco
                   paper_bgcolor='white',
                   legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))  # Fundo do papel branco



fig3 = go.Figure()

fig3.add_trace(go.Scatter(x=noticias_precificadas['Date'], 
                         y=noticias_precificadas['capital'], 
                         mode='lines',
                         name='Capital ',
                         line=dict(color='black',width=1.4)))

fig3.update_layout(title='Variação percentual do capital',
                   xaxis_title='Data',
                   yaxis_title='[Percentual]',
                   xaxis=dict(tickangle=-45),
                   yaxis=dict(showgrid=True, gridcolor='lightgrey'),
                   showlegend=True,
                   plot_bgcolor='white',
                   legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))

fig.show()

fig2.show()

fig3.show()

#### Backtesting

In [13]:
valores = noticias_precificadas
# Definir a coluna 'Date' como o índice do DataFrame
valores.set_index('Date', inplace=True)
valores.head()
valores.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 205 entries, 2019-03-20 04:00:00+00:00 to 2020-01-09 05:00:00+00:00
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   Open                205 non-null    float64            
 1   High                205 non-null    float64            
 2   Low                 205 non-null    float64            
 3   Close               205 non-null    float64            
 4   Volume              205 non-null    int64              
 5   Dividends           205 non-null    float64            
 6   Stock Splits        205 non-null    float64            
 7   ticker              205 non-null    object             
 8   Close_pct           205 non-null    float64            
 9   datetime            38 non-null     datetime64[ns, UTC]
 10  headline_same_day   38 non-null     object             
 11  sentiment_same_day  205 non-null    int32       

In [14]:
# Criar array de sinais
signals = np.where(valores['sentiment_same_day'] == 1, 1, np.where(valores['teste_1'] == -1, -1, 0))

# Inserir o array de sinais no DataFrame
valores['signals'] = signals
valores.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Close_pct,datetime,headline_same_day,sentiment_same_day,sentimento_grafico,teste_1,capital,retorno_composto,ganho,signals
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2019-03-20 04:00:00+00:00,38.85756,39.410037,36.463492,36.684483,2350800,0.0,0.0,DOW,1.0,NaT,,0,,0,0.0,0.0,1.0,0
2019-03-21 04:00:00+00:00,36.824449,36.831814,35.505869,36.080444,1764700,0.0,0.0,DOW,0.983534,NaT,,0,,0,0.0,0.0,1.0,0
2019-03-22 04:00:00+00:00,35.947834,36.794967,35.476388,35.800507,844700,0.0,0.0,DOW,0.975903,2019-03-21 12:17:00+00:00,"$SPY Understanding, where we stand is key, har...",1,1.0,1,,0.0,1.0,1
2019-03-25 04:00:00+00:00,35.800513,36.389825,35.358533,36.205666,440900,0.0,0.0,DOW,0.986948,NaT,,0,,1,0.011317,0.011317,1.011317,0
2019-03-26 04:00:00+00:00,36.095177,36.647654,35.491135,35.98468,504700,0.0,0.0,DOW,0.980924,NaT,,0,,1,-0.006104,0.005144,1.005144,0


#### Utilização da biblioteca VectorBt para obtenção do backtest do sistema criado

In [15]:
# Criar um objeto de portfólio
portfolio = vbt.Portfolio.from_signals(
    close=valores['Close'],  # Preço de fechamento
    entries=valores['signals'] == 1,  # Sinais de entrada
    exits=valores['signals'] == -1,   # Sinais de saída
    freq='D',
)


# Calcular estatísticas do backtest
stats = portfolio.stats()

# Imprimir as estatísticas
print(stats)

Start                         2019-03-20 04:00:00+00:00
End                           2020-01-09 05:00:00+00:00
Period                                205 days 00:00:00
Start Value                                       100.0
End Value                                    123.800038
Total Return [%]                              23.800038
Benchmark Return [%]                           9.491155
Max Gross Exposure [%]                            100.0
Total Fees Paid                                     0.0
Max Drawdown [%]                               8.072341
Max Drawdown Duration                 117 days 00:00:00
Total Trades                                          6
Total Closed Trades                                   6
Total Open Trades                                     0
Open Trade PnL                                      0.0
Win Rate [%]                                  66.666667
Best Trade [%]                                20.000047
Worst Trade [%]                               -6

In [16]:
portfolio.plot().show()

#### Detalhes das entradas e saidas

In [17]:
positions_df = portfolio.positions.records_readable

# Salvar DataFrame em um arquivo CSV
positions_df.to_csv('positions.csv', index=False)

portfolio.positions.records_readable

Unnamed: 0,Position Id,Column,Size,Entry Timestamp,Avg Entry Price,Entry Fees,Exit Timestamp,Avg Exit Price,Exit Fees,PnL,Return,Direction,Status
0,0,0,2.793257,2019-03-22 04:00:00+00:00,35.800507,0.0,2019-04-17 04:00:00+00:00,42.960625,0.0,20.000047,0.2,Long,Closed
1,1,0,3.311031,2019-05-24 04:00:00+00:00,36.2425,0.0,2019-06-11 04:00:00+00:00,38.893471,0.0,8.777446,0.073145,Long,Closed
2,2,0,3.494343,2019-07-05 04:00:00+00:00,36.853138,0.0,2019-07-15 04:00:00+00:00,38.063881,0.0,4.230751,0.032853,Long,Closed
3,3,0,3.371222,2019-07-24 04:00:00+00:00,39.45401,0.0,2019-07-26 04:00:00+00:00,36.703667,0.0,-9.272019,-0.06971,Long,Closed
4,4,0,3.455021,2019-10-11 04:00:00+00:00,35.81345,0.0,2019-10-22 04:00:00+00:00,35.577888,0.0,-0.81387,-0.006577,Long,Closed
5,5,0,3.08219,2019-12-03 05:00:00+00:00,39.881504,0.0,2020-01-09 05:00:00+00:00,40.166264,0.0,0.877683,0.00714,Long,Closed
