# <font color='red'>Pairs trading</font>

Based on 'Evaluation of pairs-trading strategy at the Brazilian
financial market'

## Getting data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pandas_datareader import data
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from statsmodels.tsa.stattools import adfuller

In [3]:
%matplotlib inline 

In [4]:
painel = pd.read_csv('base-stocks.csv')

In [5]:
painel.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7
0,0,2001-01-02,5.97125,5.725,5.725,5.9125,32521600.0,4.16012,PETR4.SA
1,1,2001-01-03,6.2125,5.825,5.8875,6.2125,60508800.0,4.371204,PETR4.SA
2,2,2001-01-04,6.26875,6.19625,6.21625,6.2375,46118400.0,4.388796,PETR4.SA
3,3,2001-01-05,6.3875,6.1625,6.25,6.25,41360000.0,4.397591,PETR4.SA
4,4,2001-01-08,6.2875,6.1375,6.25,6.17375,35968000.0,4.343941,PETR4.SA


In [6]:
painel.tail()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7
544641,544641,2019-06-26,5.3,5.15,5.23,5.15,29300.0,5.15,BBRK3.SA
544642,544642,2019-06-27,5.22,5.04,5.13,5.15,45900.0,5.15,BBRK3.SA
544643,544643,2019-06-28,5.3,5.15,5.29,5.25,104400.0,5.25,BBRK3.SA
544644,544644,2019-07-01,5.38,5.17,5.28,5.23,56500.0,5.23,BBRK3.SA
544645,544645,2019-07-02,5.25,5.17,5.25,5.2,157400.0,5.2,BBRK3.SA


In [7]:
painel = painel.iloc[:, 1:]

In [8]:
painel.columns = ['Date', 'High','Low','Open','Close','Volume','Adj Close', 'Symbol']

In [9]:
Tickerlist = painel['Symbol'].unique()

In [10]:
Tickerlist = Tickerlist[Tickerlist!='NEOE3.SA']   ####NEO3 giving problem

In [11]:
N = len(Tickerlist)

In [12]:
### Dataset with close prices

In [13]:
index = painel['Date'].unique()

In [14]:
index.sort()

In [15]:
painel_close0 = pd.DataFrame(index = index)

In [16]:
painel_close0.tail()

2019-06-26
2019-06-27
2019-06-28
2019-07-01
2019-07-02


In [17]:
for ticker in Tickerlist:
    df = (painel[painel['Symbol']==ticker]).set_index('Date')
    painel_close0[ticker] = df['Close']
    

In [18]:
### removing symbols with more than 0.1 of NaN

In [19]:
prop_na = painel_close0.isna().sum()/len(index)

In [20]:
Symbols = [ticker for ticker in Tickerlist if prop_na[ticker]<0.1]

In [21]:
painel_close = painel_close0.loc[:,Symbols]

In [22]:
painel_close.tail(20)

Unnamed: 0,PETR4.SA,VALE3.SA,ITUB4.SA,BBAS3.SA,PETR3.SA,ABEV3.SA,ITSA4.SA,VVAR3.SA,BRFS3.SA,LREN3.SA,...,ITSA3.SA,ELPL3.SA,VIVT3.SA,PMAM3.SA,RPMG3.SA,FJTA4.SA,PNVL3.SA,KEPL3.SA,SHUL4.SA,RSID3.SA
2019-06-04,26.200001,49.110001,35.02,52.939999,29.02,17.35,12.45,4.86,28.059999,43.259998,...,14.1,34.0,43.200001,21.84,1.82,3.39,399.0,18.950001,7.7,3.97
2019-06-05,25.860001,48.389999,34.34,51.48,28.610001,17.41,12.15,4.65,27.1,42.669998,...,13.86,34.0,42.650002,21.84,1.85,3.38,401.0,18.9,7.7,4.06
2019-06-06,26.280001,48.799999,34.700001,52.790001,29.059999,17.639999,12.29,4.95,27.41,43.099998,...,13.96,34.099998,42.099998,21.450001,1.88,3.33,401.0,18.799999,7.83,4.16
2019-06-07,26.76,48.66,35.07,52.360001,29.85,17.610001,12.4,5.0,26.16,44.360001,...,14.15,33.91,41.959999,21.82,1.87,3.38,401.0,18.700001,8.0,4.39
2019-06-10,26.65,48.34,34.630001,51.869999,29.35,17.65,12.21,5.06,27.200001,44.77,...,13.95,34.23,43.299999,21.469999,1.86,3.35,401.0,18.48,7.95,4.47
2019-06-11,27.16,51.43,34.799999,52.900002,29.950001,17.719999,12.24,5.0,27.0,45.27,...,13.85,34.23,43.299999,21.6,1.85,3.36,404.0,18.52,8.0,4.61
2019-06-12,26.85,51.490002,34.599998,52.259998,29.51,17.68,12.18,4.84,27.0,45.400002,...,13.75,34.049999,43.619999,21.41,1.86,3.31,404.0,18.5,7.9,4.48
2019-06-13,27.18,51.849998,33.98,51.43,29.91,17.85,12.02,5.04,28.57,45.75,...,13.35,34.029999,43.75,21.200001,1.82,3.28,404.0,19.0,8.0,4.31
2019-06-14,27.059999,51.400002,33.990002,50.52,30.190001,17.639999,11.98,4.97,29.1,45.84,...,13.37,34.009998,43.869999,21.07,1.84,3.27,404.0,19.58,7.88,4.09
2019-06-17,27.110001,50.200001,33.950001,50.470001,30.15,17.799999,11.97,5.07,28.700001,46.25,...,13.47,34.490002,42.610001,20.83,1.82,3.33,404.0,19.360001,7.81,4.3


In [23]:
painel_close= painel_close.fillna(method='ffill')

In [24]:
painel_close = painel_close.dropna()

In [25]:
### Transform data

In [26]:
painel_transform = pd.DataFrame(index = painel_close.index)

In [27]:
for ticker in Symbols:
    df = painel_close[ticker]
    average = df.rolling(window=20).mean()
    std = df.rolling(window=20).std()
    
    painel_transform[ticker] = (df-average)/std

In [28]:
painel_transform = painel_transform.dropna()

In [29]:
painel_transform.tail()

Unnamed: 0,PETR4.SA,VALE3.SA,ITUB4.SA,BBAS3.SA,PETR3.SA,ABEV3.SA,ITSA4.SA,VVAR3.SA,BRFS3.SA,LREN3.SA,...,ITSA3.SA,ELPL3.SA,VIVT3.SA,PMAM3.SA,RPMG3.SA,FJTA4.SA,PNVL3.SA,KEPL3.SA,SHUL4.SA,RSID3.SA
2019-06-26,1.012916,0.744374,2.142602,1.861509,0.725531,0.983173,2.627019,0.245688,0.561417,0.947592,...,0.726485,0.463893,0.521613,0.859232,-1.163868,-0.293575,0.521186,0.900787,1.095281,1.437296
2019-06-27,0.388252,0.75125,1.569247,1.736039,0.011862,0.756806,2.048696,0.06584,1.201229,0.990734,...,-1.96497,4.157642,0.303425,1.043611,-0.453258,0.878108,0.444789,0.86804,1.285828,1.04423
2019-06-28,0.554583,0.761171,1.418502,1.647363,0.088402,0.202242,1.634065,0.757676,1.592685,1.072698,...,-2.059257,2.893629,-0.818366,1.31436,-0.036099,1.029559,0.36394,0.762547,1.605883,1.045704
2019-07-01,0.266533,1.709629,1.091244,1.406938,-0.119565,-0.096018,1.53456,1.114309,2.996202,0.529488,...,-1.575604,2.324193,-1.609754,0.938621,2.129854,1.556186,0.3093,0.408321,1.192457,0.96601
2019-07-02,-0.498947,0.242,0.865073,0.666214,-0.715159,0.858397,1.016771,2.861342,2.788849,0.644902,...,-1.35375,1.955864,-0.045784,0.8306,3.977012,1.257434,0.834001,-0.257141,0.337231,0.753568


In [30]:
painel_transform_cut = painel_transform.loc[:'2009-01-02',:]

In [31]:
### Calculate the distance

In [32]:
distance = painel_transform_cut.cov()
deviation = painel_transform_cut.cov()
mean = painel_transform_cut.cov()

In [33]:
l = painel_transform_cut.shape[0]
for ticker1 in Symbols:
    for ticker2 in Symbols:
        dist = painel_transform_cut[ticker1] - painel_transform_cut[ticker2]
        dist2 = dist**2
        distance.loc[ticker1, ticker2] = np.sqrt(dist2.sum())/l
        deviation.loc[ticker1, ticker2] = dist2.std()
        mean.loc[ticker1, ticker2] = dist2.mean()

In [34]:
lista=[]

In [35]:
for ticker in Symbols:
    df = distance[ticker]
    df = df[df>0]
    lista.append([ticker, df.argmin(), mean.loc[ticker, df.argmin()],   deviation.loc[ticker, df.argmin()]])

In [52]:
pairs = pd.DataFrame(lista)

In [53]:
pairs.head()

Unnamed: 0,0,1,2,3
0,PETR4.SA,PETR3.SA,0.079271,0.266877
1,VALE3.SA,BRAP4.SA,0.230948,0.445027
2,ITUB4.SA,ITUB3.SA,0.417984,0.95544
3,BBAS3.SA,ITUB4.SA,0.960974,1.382933
4,PETR3.SA,PETR4.SA,0.079271,0.266877


In [54]:
pairs.columns = ['stock1', 'stock2', 'mean', 'std']

In [55]:
pairs.head()

Unnamed: 0,stock1,stock2,mean,std
0,PETR4.SA,PETR3.SA,0.079271,0.266877
1,VALE3.SA,BRAP4.SA,0.230948,0.445027
2,ITUB4.SA,ITUB3.SA,0.417984,0.95544
3,BBAS3.SA,ITUB4.SA,0.960974,1.382933
4,PETR3.SA,PETR4.SA,0.079271,0.266877


In [56]:
### testing if the series is stationary

p_values=[]
for i in pairs.index:
    dist_t = (painel_transform_cut[pairs.iloc[i,0]] - painel_transform_cut[pairs.iloc[i,1]])**2
    X = dist_t.values
    result = adfuller(X)
    p_values.append(result[1])

In [57]:
pairs['p-value']=p_values

### Backtest

In [58]:
painel_transform_back = painel_transform.loc['2009-01-02':'2017-12-01',:]

In [59]:
painel_transform_back.head()

Unnamed: 0,PETR4.SA,VALE3.SA,ITUB4.SA,BBAS3.SA,PETR3.SA,ABEV3.SA,ITSA4.SA,VVAR3.SA,BRFS3.SA,LREN3.SA,...,ITSA3.SA,ELPL3.SA,VIVT3.SA,PMAM3.SA,RPMG3.SA,FJTA4.SA,PNVL3.SA,KEPL3.SA,SHUL4.SA,RSID3.SA
2009-01-02,1.36675,1.584271,0.458773,-0.199781,1.429734,1.424503,0.163698,-1.918781,-0.92006,1.339365,...,0.09552,-0.917203,-0.484933,2.923993,0.0,-0.560777,1.554695,-0.082671,-0.84099,1.417941
2009-01-05,1.516518,2.058733,0.435716,0.051871,1.545751,0.46585,0.022534,-0.687682,-0.9318,0.698889,...,-0.496327,0.07862,0.055241,2.271352,0.0,1.490705,1.422618,1.404663,-0.796987,2.52768
2009-01-06,1.52932,2.089076,1.06885,0.904163,1.56867,0.083306,0.667197,-0.513428,-0.986041,1.068028,...,0.061194,0.071764,-0.688374,2.020148,0.0,0.404798,1.273372,1.299573,-0.606483,2.898951
2009-01-07,0.955258,1.130561,-0.058785,0.434782,0.877068,-0.125289,-0.455849,-0.375151,-0.847402,0.586096,...,-0.526956,0.064913,-0.706641,1.648507,0.0,0.757985,0.282525,1.174322,1.994105,1.858801
2009-01-08,1.570302,1.664388,0.005857,0.039984,1.572637,0.038659,-0.205043,-0.198924,-0.048722,1.027563,...,-0.620083,0.058064,-0.686429,1.828665,0.0,1.224461,1.297441,1.107056,1.697682,2.032383


In [60]:
painel_close_back = painel_close.loc['2009-01-02':'2017-12-01',:]

In [61]:
painel_close_back.tail()

Unnamed: 0,PETR4.SA,VALE3.SA,ITUB4.SA,BBAS3.SA,PETR3.SA,ABEV3.SA,ITSA4.SA,VVAR3.SA,BRFS3.SA,LREN3.SA,...,ITSA3.SA,ELPL3.SA,VIVT3.SA,PMAM3.SA,RPMG3.SA,FJTA4.SA,PNVL3.SA,KEPL3.SA,SHUL4.SA,RSID3.SA
2017-11-27,15.87,35.41,28.459999,32.16,16.32,20.74,9.80909,7.77,39.799999,31.209101,...,9.2256,16.1,39.66,25.16,9.36,2.11,480.0,20.4,4.87143,6.51
2017-11-28,15.84,36.25,28.506701,32.639999,16.280001,20.780001,9.77273,7.6,40.040001,31.790899,...,9.24358,15.99,40.07,25.16,9.45,2.2,480.0,20.280001,4.82143,6.49
2017-11-29,15.33,35.700001,27.92,31.16,15.91,20.559999,9.53636,7.55,39.290001,31.0909,...,9.07274,15.86,39.459999,25.33,9.37,2.36,480.0,19.0,4.79286,6.5
2017-11-30,15.38,35.139999,27.5133,29.93,15.95,20.559999,9.43636,7.36,38.48,30.8454,...,8.94685,16.0,38.939999,25.16,9.37,2.23,454.98999,18.26,4.71429,6.3
2017-12-01,15.61,35.48,27.4867,30.780001,16.110001,20.450001,9.50909,7.45,38.5,30.9454,...,9.07274,16.049999,39.919998,25.16,9.37,2.3,468.0,18.5,4.57143,6.35


In [62]:
pairs['position1'] = np.zeros(pairs.shape[0])
pairs['position2'] = np.zeros(pairs.shape[0])
pairs['notional'] =  np.zeros(pairs.shape[0])
pairs['price1'] =  np.zeros(pairs.shape[0])
pairs['price2'] =  np.zeros(pairs.shape[0])

In [84]:
backtest_data = pd.DataFrame(index=painel_close_back.index)

In [86]:
n = len(painel_close_back.index)

In [87]:
for i in pairs.index:
    strat = pairs.iloc[i,0] + '-' + pairs.iloc[i,1]
    backtest_data[strat] = np.zeros(n)

In [63]:
pairs.head()

Unnamed: 0,stock1,stock2,mean,std,p-value,position1,position2,notional
0,PETR4.SA,PETR3.SA,0.079271,0.266877,8.057474e-16,0.0,0.0,0.0
1,VALE3.SA,BRAP4.SA,0.230948,0.445027,0.0105284,0.0,0.0,0.0
2,ITUB4.SA,ITUB3.SA,0.417984,0.95544,1.382085e-05,0.0,0.0,0.0
3,BBAS3.SA,ITUB4.SA,0.960974,1.382933,1.178907e-19,0.0,0.0,0.0
4,PETR3.SA,PETR4.SA,0.079271,0.266877,8.057474e-16,0.0,0.0,0.0


In [82]:
for day in painel_close_back.index:
    for j in pairs.index:
        price1 = painel_close_back.loc[day, pairs.iloc[j,0]]
        price2 = painel_close_back.loc[day, pairs.iloc[j,1]]
       
        dist = abs((price1-price2)**2 - pairs.iloc[j,2])>2*pairs.iloc[j,3]
        
        if (pairs.iloc[j, 5]==0) & (dist):
            pairs.iloc[j,8] = price1
            pairs.iloc[j,9] = price2
            if price1>price2:
                pairs.iloc[j,5] = -1
                pairs.iloc[j,6] =  1
                
                pairs.iloc[j,7] = pairs.iloc[j,7] + price1 - price2
                
            else:
                pairs.iloc[j,5] =  1
                pairs.iloc[j,6] =  -1
                                             
                pairs.iloc[j,7] = pairs.iloc[j,7] - price1 + price2
                
        if (pairs.iloc[j, 5]!=0) & (not dist):  
            
                pairs.iloc[j,7] = pairs.iloc[j,7] + pairs.iloc[j,5]* price1 + pairs.iloc[j,6]*price2
                pairs.iloc[j,5] = 0
                pairs.iloc[j,6] = 0
            
            
        
        
        
        


In [83]:
pairs.head()

Unnamed: 0,stock1,stock2,mean,std,p-value,position1,position2,notional
0,PETR4.SA,PETR3.SA,0.079271,0.266877,8.057474e-16,0.0,0.0,14.940001
1,VALE3.SA,BRAP4.SA,0.230948,0.445027,0.0105284,-1.0,1.0,9.700001
2,ITUB4.SA,ITUB3.SA,0.417984,0.95544,1.382085e-05,-1.0,1.0,13.133583
3,BBAS3.SA,ITUB4.SA,0.960974,1.382933,1.178907e-19,-1.0,1.0,13.054106
4,PETR3.SA,PETR4.SA,0.079271,0.266877,8.057474e-16,0.0,0.0,14.940001


In [76]:
pairs.iloc[2, 5]!=0

False

In [75]:
a !=0

False

In [None]:
pairs.iloc[0,0]

In [None]:
ticker1 = 'PETR4.SA'
ticker2 = 'PETR3.SA'

In [None]:
dist_t = (painel_transform_cut[ticker1] - painel_transform_cut[ticker2])**2

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
X = dist_t.values
result = adfuller(X)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
	print('\t%s: %.3f' % (key, value))

In [None]:
def get_pair(distance, ticker):
    df = distance[ticker]
    df = df[df>0]
    return df.argmin()

In [None]:
get_pair(distance, ticker)

In [None]:
ticker='PETR4.SA'

In [None]:
df = distance[ticker]
df = df[df>0]

In [None]:
df.argmin()

In [None]:
df = painel_cut['PETR4.SA']


In [None]:
average = df.rolling(window=20).std()

In [None]:
average

In [None]:
averageexp=base['Close'].ewm(span=17, adjust=False).mean()

In [None]:
painel_cut = painel[painel['Date']<'2009-07-01']

In [None]:
start_date = '2000-01-01'
end_date = '2019-05-01'

In [None]:
ibov = data.DataReader('^BVSP', 'yahoo', start_date, end_date)

In [None]:
ibov=ibov.reset_index

In [None]:
ibov.head()

In [None]:
ibov.reset_index

In [None]:
dax = data.DataReader('^GDAXI', 'yahoo', start_date, end_date)

In [None]:
nikkey = data.DataReader('^N225', 'yahoo', start_date, end_date)

In [None]:
dji = data.DataReader('^DJI', 'yahoo', start_date, end_date)

In [None]:
sp500=data.DataReader('^GSPC', 'yahoo', start_date, end_date)

In [None]:
hsi = data.DataReader('^HSI', 'yahoo', start_date, end_date)

In [None]:
# Função para calcular a variação diaria da abertura até o fechamento.
def var(x):
    x['var'] = x['Close'] - x['Open']
    return x

In [None]:
dax = var(dax)
dji = var(dji)
sp500 = var(sp500)
nikkey = var(nikkey)
hsi = var(hsi)
ibov = var(ibov)

Transformando o target em 1 se a variação é positiva e -1 caso contrario

In [None]:
ibov['signal'] = np.where(ibov['var']>0,1,0)

In [None]:
dax_var_shift = dax['var'].shift(+1)
dji_var_shift = dji['var'].shift(+1)
sp500_var_shift = sp500['var'].shift(+1)
ibov_var_shift = ibov['var'].shift(+1)
nikkey_var = nikkey['var']
hsi_var = hsi['var']

In [None]:
dax_var_shift.name = 'var_dax_shift'
dji_var_shift.name = 'var_dji_shift'
sp500_var_shift.name = 'var_sp500_shift'
nikkey_var.name = 'var_nikkey'
hsi_var.name = 'var_hsi'
ibov_var_shift.name = 'var_ibov_shift'

In [None]:
df = ibov.join([dax_var_shift,ibov_var_shift,dji_var_shift,sp500_var_shift,nikkey_var,hsi_var])

In [None]:
# Drop nas linhas com NA's
df=df.dropna()

In [None]:
df.columns

In [None]:
df_f = df[['Open', 'Close','signal', 'var_dax_shift','var_ibov_shift', 'var_dji_shift', 'var_sp500_shift',
       'var_nikkey', 'var_hsi', 'var']]

In [None]:
df_f.columns

In [None]:
df_f.columns = ['Open', 'Close','signal', 'dax', 'ibov_s', 'dji', 'sp500', 'nikkey', 'hsi', 'var']

In [None]:
df_f.shape

### Divisião da base - Parte da base para modelagem e testes e outra parte para Backtest

In [None]:
backtest = df_f[df_f.index>'2018-01-01']

In [None]:
df_f = df_f[df_f.index<'2018-01-01']
df_f = df_f[['signal', 'dax', 'ibov_s', 'dji', 'sp500', 'nikkey', 'hsi']]

### Dividindo a base de dados em treino e teste

In [None]:
size=df_f.shape[0]

In [None]:
selecao_train = np.random.choice(size, int(0.7*size), replace=False)

In [None]:
selecao_test = np.setdiff1d(np.arange(size), selecao_train)

In [None]:
df_train = df_f.iloc[selecao_train]
df_test = df_f.iloc[selecao_test]

### Normalização

In [None]:
minimos = df_train.min()
maximos = df_train.max()

In [None]:
# Função para a normalização dos dados 
def normalize(x, minimos, maximos):
    name = x.name
    return (x-minimos[name])/(maximos[name]-minimos[name])

Normalização da base de treino

In [None]:
df_train_n=df_train.apply(normalize, args=[minimos, maximos])

Normalização da base teste

In [None]:
df_test_n=df_test.apply(normalize, args=[minimos, maximos])

Obtendo dataframe apenas com os valores

In [None]:
array_train = df_train_n.values
X_train = array_train[:,1:7]
Y_train = array_train[:,0]

array_test = df_test_n.values
X_test = array_test[:,1:7]
Y_test = array_test[:,0]

## Construindo e treinando os modelos

Usaremos 4 modelos de calssificação:
    - Naive Bayes
    - Randon Forest
    - Regressão logística
    - K-nn

In [None]:
from sklearn.naive_bayes import GaussianNB # Utilizando um classificador Naive Bayes
from sklearn.ensemble import RandomForestClassifier # Randon forest
from sklearn.linear_model import LogisticRegression # logistic
from sklearn.neighbors import KNeighborsClassifier #k-nn

#### Naive Bayes

In [None]:
# Criando o modelo preditivo
modelo_v1 = GaussianNB()

In [None]:
# Treinando o modelo
modelo_v1.fit(X_train, Y_train.ravel())

#### Randon forest

In [None]:
modelo_v2 = RandomForestClassifier(random_state = 42)
modelo_v2.fit(X_train, Y_train.ravel())

#### Logistica

In [None]:
modelo_v3 = LogisticRegression(C = 0.7, random_state = 42)
modelo_v3.fit(X_train, Y_train.ravel())

#### K-nn

In [None]:
modelo_v4 = KNeighborsClassifier(n_neighbors=3)
modelo_v4.fit(X_train, Y_train.ravel())

### Verificando a exatidão no modelo na base de treino

In [None]:
from sklearn import metrics

In [None]:
nb_predict_train = modelo_v1.predict(X_train)
rf_predict_train = modelo_v2.predict(X_train)
lr_predict_train = modelo_v3.predict(X_train)
knn_predict_train = modelo_v4.predict(X_train)

In [None]:
print("Naive Bayes - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_train, nb_predict_train)))
print("Randon forest - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_train, rf_predict_train)))
print("Logistica - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_train, lr_predict_train)))
print("k-nn - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_train, knn_predict_train)))
print()

### Verificando a exatidão dos modelos na base de teste


In [None]:
nb_predict_test = modelo_v1.predict(X_test)
rf_predict_test = modelo_v2.predict(X_test)
lr_predict_test = modelo_v3.predict(X_test)
knn_predict_test = modelo_v4.predict(X_test)

In [None]:
print("Naive Bayes - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_test, nb_predict_test)))
print("Randon forest - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_test, rf_predict_test)))
print("Logistica - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_test, lr_predict_test)))
print("k-nn - Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_test, knn_predict_test)))
print()

## Backtest
Escolhemos o modelo de regressão logística por apresentar melhor acuracia na base de teste e de treino

In [None]:
backtest_n=backtest[['signal','dax', 'ibov_s', 'dji', 'sp500', 'nikkey', 'hsi']].apply(normalize, args=[minimos, maximos])

In [None]:
array_backtest = backtest_n.values
X = array_backtest[:,1:7]

In [None]:
lr_predict_backtest = modelo_v3.predict(X)

In [None]:
signal = pd.Series(lr_predict_backtest, index =backtest.index) 

In [None]:
backtest['signal_predict'] = np.where(signal==1,1,-1)

In [None]:
backtest['Return'] = (backtest['Close']- backtest['Open'])/backtest['Open']

In [None]:
backtest['retorno_diario'] = backtest['Return'] * backtest['signal_predict']

In [None]:
# Calculo do retorno acumulado.
geometric_returns = (backtest['retorno_diario']+1).cumprod()


In [None]:
geometric_returns.plot(figsize=(10,5))
plt.ylabel("Strategy Returns%")
plt.xlabel("Date")
plt.show()

## Conclusão

In [None]:
print("O retorno acumulado da estratégia foi de : {0:.2f} % em {1:.0f} trading days".format(geometric_returns[-1],backtest.shape[0] ))

Uma otimização poderia ser obtida com a inserção de regras de gerenciamento de risco (stop e target), bem como a otimização do modelo para uma melhor acuraria.