In [33]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import accuracy_score, classification_report

import yfinance as yf


In [34]:
price_data = yf.download('IBM', start="2005-01-01")
df = yf.download('JPM', start="2005-01-01")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [35]:
price_data['Ativo'] = 'IBM'
df['Ativo'] = 'JPM'

In [36]:
price_data = pd.concat([price_data, df], ignore_index=False)

In [37]:
price_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM
2005-01-05,92.351814,93.527725,92.160614,92.256210,53.021126,5906448,IBM
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM
2005-01-07,92.256210,92.543022,91.271507,91.567879,52.625526,6485932,IBM
...,...,...,...,...,...,...,...
2023-09-11,144.750000,145.050003,143.690002,144.460007,144.460007,6854200,JPM
2023-09-12,144.500000,147.320007,144.050003,146.339996,146.339996,8363200,JPM
2023-09-13,147.339996,147.699997,145.820007,146.410004,146.410004,8325900,JPM
2023-09-14,147.839996,149.899994,147.520004,149.250000,149.250000,10034900,JPM


In [38]:
price_data["datetime"] = price_data.index

In [39]:
# sort the values by symbol and then date
price_data.sort_values(by = ['Ativo','datetime'], inplace = True)

# calculate the change in price
price_data['change_in_price'] = price_data['Close'].diff()

In [40]:
# identify rows where the symbol changes
mask = price_data['Ativo'] != price_data['Ativo'].shift(1)

# For those rows, let's make the value null
price_data['change_in_price'] = np.where(mask == True, np.nan, price_data['change_in_price'])

# print the rows that have a null value, should only be 5
price_data[price_data.isna().any(axis = 1)]

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,
2005-01-03,39.48,39.689999,39.009998,39.150002,23.560514,14957900,JPM,2005-01-03,


In [41]:
# define the number of days out you want to predict
days_out = 30

# Group by symbol, then apply the rolling function and grab the Min and Max.
price_data_smoothed = price_data.groupby(['Ativo'])[['Close','Low','High','Open','Volume']].transform(lambda x: x.ewm(span = days_out).mean())

# Join the smoothed columns with the symbol and datetime column from the old data frame.
smoothed_df = pd.concat([price_data[['Ativo','datetime']], price_data_smoothed], axis=1, sort=False)

smoothed_df

Unnamed: 0_level_0,Ativo,datetime,Close,Low,High,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-03,IBM,2005-01-03,93.451241,92.973228,94.741875,94.617592,5.538779e+06
2005-01-04,IBM,2005-01-04,92.932599,92.612651,94.405993,94.010038,5.763491e+06
2005-01-05,IBM,2005-01-05,92.691944,92.451819,94.093511,93.420052,5.814354e+06
2005-01-06,IBM,2005-01-06,92.492858,92.279384,93.713709,93.109909,5.527021e+06
2005-01-07,IBM,2005-01-07,92.282403,92.050068,93.447350,92.915672,5.745196e+06
...,...,...,...,...,...,...,...
2023-09-11,JPM,2023-09-11,148.199025,147.244442,149.374833,148.364813,8.089804e+06
2023-09-12,JPM,2023-09-12,148.079088,147.038349,149.242264,148.115470,8.107442e+06
2023-09-13,JPM,2023-09-13,147.971405,146.959747,149.142762,148.065440,8.121536e+06
2023-09-14,JPM,2023-09-14,148.053895,146.995892,149.191616,148.050895,8.244979e+06


In [42]:
# define the number of days out you want to predict
days_out = 30

# create a new column that will house the flag, and for each group calculate the diff compared to 30 days ago. Then use Numpy to define the sign.
smoothed_df['Signal_Flag'] = smoothed_df.groupby('Ativo')['Close'].transform(lambda x : np.sign(x.diff(days_out)))

# print the first 50 rows
smoothed_df.head(50)

Unnamed: 0_level_0,Ativo,datetime,Close,Low,High,Open,Volume,Signal_Flag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2005-01-03,IBM,2005-01-03,93.451241,92.973228,94.741875,94.617592,5538779.0,
2005-01-04,IBM,2005-01-04,92.932599,92.612651,94.405993,94.010038,5763491.0,
2005-01-05,IBM,2005-01-05,92.691944,92.451819,94.093511,93.420052,5814354.0,
2005-01-06,IBM,2005-01-06,92.492858,92.279384,93.713709,93.109909,5527021.0,
2005-01-07,IBM,2005-01-07,92.282403,92.050068,93.44735,92.915672,5745196.0,
2005-01-10,IBM,2005-01-10,92.123914,91.854739,93.13764,92.651997,5567689.0,
2005-01-11,IBM,2005-01-11,91.898771,91.628199,92.867793,92.447957,5463404.0,
2005-01-12,IBM,2005-01-12,91.76211,91.362207,92.590372,92.194277,5562223.0,
2005-01-13,IBM,2005-01-13,91.552578,91.107343,92.501603,92.051432,5565481.0,
2005-01-14,IBM,2005-01-14,91.341706,90.885789,92.183942,91.763029,5593222.0,


In [43]:
# price_data["datetime"] = price_data["datetime"].dt.strftime('%Y-%m-%d')

In [44]:
# price_data = price_data.set_index('datetime', inplace=True)

In [45]:
# Calculate the 14 day RSI
n = 14

# First make a copy of the data frame twice
up_df, down_df = price_data[['Ativo','change_in_price']].copy(), price_data[['Ativo','change_in_price']].copy()

# For up days, if the change is less than 0 set to 0.
up_df.loc['change_in_price'] = up_df.loc[(up_df['change_in_price'] < 0), 'change_in_price'] = 0

# For down days, if the change is greater than 0 set to 0.
down_df.loc['change_in_price'] = down_df.loc[(down_df['change_in_price'] > 0), 'change_in_price'] = 0

# We need change in price to be absolute.
down_df['change_in_price'] = down_df['change_in_price'].abs()

# Calculate the EWMA (Exponential Weighted Moving Average), meaning older values are given less weight compared to newer values.
ewma_up = up_df.groupby('Ativo')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
ewma_down = down_df.groupby('Ativo')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())

# Calculate the Relative Strength
relative_strength = ewma_up / ewma_down

# Calculate the Relative Strength Index
relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))



In [46]:
relative_strength_index = pd.DataFrame(relative_strength_index)

In [47]:
down_df = down_df.drop(down_df.index[-1])
up_df = up_df.drop(up_df.index[-1])
relative_strength_index = relative_strength_index.drop(relative_strength_index.index[-1])



In [48]:
down_df.reset_index(drop=True, inplace=True)
up_df.reset_index(drop=True, inplace=True)
relative_strength_index.reset_index(drop=True, inplace=True)

# combined_df = price_data.copy()
# price_data = combined_df

# Add the info to the data frame.
price_data['down_days'] = down_df['change_in_price'].values
price_data['up_days'] = up_df['change_in_price'].values
price_data['RSI'] = relative_strength_index["change_in_price"].values

# Display the head.
price_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.0,0.0
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.0,0.0
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.0,0.0
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.0,0.0
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10,-0.095604,0.095604,0.0,0.0
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196987,4964734,IBM,2005-01-11,-0.650093,0.650093,0.0,0.0
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.312355,6096716,IBM,2005-01-12,0.20076,0.0,0.20076,11.291135
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894787,5585012,IBM,2005-01-13,-0.726578,0.726578,0.0,7.673164
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702469,5774757,IBM,2005-01-14,-0.334602,0.334602,0.0,6.556781


In [49]:
# Calculate the Stochastic Oscillator
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['Ativo','Low']].copy(), price_data[['Ativo','High']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14.groupby('Ativo')['Low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14.groupby('Ativo')['High'].transform(lambda x: x.rolling(window = n).max())

# Calculate the Stochastic Oscillator.
k_percent = 100 * ((price_data['Close'] - low_14) / (high_14 - low_14))

# Add the info to the data frame.
price_data['low_14'] = low_14
price_data['high_14'] = high_14
price_data['k_percent'] = k_percent

# Display the head.
price_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,,,,
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.0,0.0,,,
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.0,0.0,,,
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.0,0.0,,,
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.0,0.0,,,
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10,-0.095604,0.095604,0.0,0.0,,,
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196987,4964734,IBM,2005-01-11,-0.650093,0.650093,0.0,0.0,,,
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.312355,6096716,IBM,2005-01-12,0.20076,0.0,0.20076,11.291135,,,
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894787,5585012,IBM,2005-01-13,-0.726578,0.726578,0.0,7.673164,,,
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702469,5774757,IBM,2005-01-14,-0.334602,0.334602,0.0,6.556781,,,


In [50]:
# Calculate the Williams %R
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['Ativo','Low']].copy(), price_data[['Ativo','High']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14.groupby('Ativo')['Low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14.groupby('Ativo')['High'].transform(lambda x: x.rolling(window = n).max())

# Calculate William %R indicator.
r_percent = ((high_14 - price_data['Close']) / (high_14 - low_14)) * - 100

# Add the info to the data frame.
price_data['r_percent'] = r_percent

# Display the head.
price_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,,,,,
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.0,0.0,,,,
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.0,0.0,,,,
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.0,0.0,,,,
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.0,0.0,,,,
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10,-0.095604,0.095604,0.0,0.0,,,,
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196987,4964734,IBM,2005-01-11,-0.650093,0.650093,0.0,0.0,,,,
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.312355,6096716,IBM,2005-01-12,0.20076,0.0,0.20076,11.291135,,,,
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894787,5585012,IBM,2005-01-13,-0.726578,0.726578,0.0,7.673164,,,,
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702469,5774757,IBM,2005-01-14,-0.334602,0.334602,0.0,6.556781,,,,


In [51]:
# Calculate the MACD
ema_26 = price_data.groupby('Ativo')['Close'].transform(lambda x: x.ewm(span = 26).mean())
ema_12 = price_data.groupby('Ativo')['Close'].transform(lambda x: x.ewm(span = 12).mean())
macd = ema_12 - ema_26

# Calculate the EMA
ema_9_macd = macd.ewm(span = 9).mean()

# Store the data in the data frame.
price_data['MACD'] = macd
price_data['MACD_EMA'] = ema_9_macd

# Print the head.
price_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,,,,,,0.0,0.0
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.0,0.0,,,,,-0.022522,-0.012512
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.0,0.0,,,,,-0.034626,-0.021575
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.0,0.0,,,,,-0.049638,-0.031082
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.0,0.0,,,,,-0.072728,-0.043471
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10,-0.095604,0.095604,0.0,0.0,,,,,-0.089329,-0.055901
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196987,4964734,IBM,2005-01-11,-0.650093,0.650093,0.0,0.0,,,,,-0.127887,-0.074119
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.312355,6096716,IBM,2005-01-12,0.20076,0.0,0.20076,11.291135,,,,,-0.142138,-0.090465
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894787,5585012,IBM,2005-01-13,-0.726578,0.726578,0.0,7.673164,,,,,-0.185891,-0.112509
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702469,5774757,IBM,2005-01-14,-0.334602,0.334602,0.0,6.556781,,,,,-0.231459,-0.139161


In [52]:
# Calculate the Price Rate of Change
n = 9

# Calculate the Rate of Change in the Price, and store it in the Data Frame.
price_data['Price_Rate_Of_Change'] = price_data.groupby('Ativo')['Close'].transform(lambda x: x.pct_change(periods = n))

# Print the first 30 rows
price_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,,,,,,0.0,0.0,
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.0,0.0,,,,,-0.022522,-0.012512,
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.0,0.0,,,,,-0.034626,-0.021575,
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.0,0.0,,,,,-0.049638,-0.031082,
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.0,0.0,,,,,-0.072728,-0.043471,
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10,-0.095604,0.095604,0.0,0.0,,,,,-0.089329,-0.055901,
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196987,4964734,IBM,2005-01-11,-0.650093,0.650093,0.0,0.0,,,,,-0.127887,-0.074119,
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.312355,6096716,IBM,2005-01-12,0.20076,0.0,0.20076,11.291135,,,,,-0.142138,-0.090465,
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894787,5585012,IBM,2005-01-13,-0.726578,0.726578,0.0,7.673164,,,,,-0.185891,-0.112509,
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702469,5774757,IBM,2005-01-14,-0.334602,0.334602,0.0,6.556781,,,,,-0.231459,-0.139161,-0.03734


In [53]:
def obv(group):

    # Grab the volume and close column.
    volume = group['Volume']
    change = group['Close'].diff()

    # intialize the previous OBV
    prev_obv = 0
    obv_values = []

    # calculate the On Balance Volume
    for i, j in zip(change, volume):

        if i > 0:
            current_obv = prev_obv + j
        elif i < 0:
            current_obv = prev_obv - j
        else:
            current_obv = prev_obv

        # OBV.append(current_OBV)
        prev_obv = current_obv
        obv_values.append(current_obv)
    
    # Return a panda series.
    return pd.Series(obv_values, index = group.index)
        

# apply the function to each group
obv_groups = price_data.groupby('Ativo').apply(obv)



In [54]:
price_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,,,,,,0.000000,0.000000,
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.000000,0.000000,,,,,-0.022522,-0.012512,
2005-01-05,92.351814,93.527725,92.160614,92.256210,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.000000,0.000000,,,,,-0.034626,-0.021575,
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.000000,0.000000,,,,,-0.049638,-0.031082,
2005-01-07,92.256210,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.000000,0.000000,,,,,-0.072728,-0.043471,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-11,144.750000,145.050003,143.690002,144.460007,144.460007,6854200,JPM,2023-09-11,0.630005,0.000000,0.630005,31.782795,142.649994,149.639999,25.894297,-74.105703,-1.909254,-1.494445,-0.021008
2023-09-12,144.500000,147.320007,144.050003,146.339996,146.339996,8363200,JPM,2023-09-12,1.879990,0.000000,1.879990,49.228777,142.649994,149.410004,54.585756,-45.414244,-1.746639,-1.544884,-0.016268
2023-09-13,147.339996,147.699997,145.820007,146.410004,146.410004,8325900,JPM,2023-09-13,0.070007,0.000000,0.070007,49.780611,142.649994,149.410004,55.621366,-44.378634,-1.593744,-1.554656,-0.011812
2023-09-14,147.839996,149.899994,147.520004,149.250000,149.250000,10034900,JPM,2023-09-14,2.839996,0.000000,2.839996,66.714805,142.649994,149.899994,91.034567,-8.965433,-1.229240,-1.489573,0.019955


In [55]:
obv_groups = obv_groups.T


In [56]:
df_aux1 = obv_groups["IBM"].reset_index()
df_aux2 = obv_groups["JPM"].reset_index()

In [57]:
df_aux1 = df_aux1.drop(columns="Date")

In [58]:
df_aux2 = df_aux2.drop(columns="Date")

In [59]:
df_aux1.columns = ["On Balance Volume"]
df_aux2.columns = ["On Balance Volume"]

In [61]:
df_aux2

Unnamed: 0,On Balance Volume
0,0
1,-11360900
2,-1590700
3,7525200
4,-2446000
...,...
4703,-2013351100
4704,-2004987900
4705,-1996662000
4706,-1986627100


In [64]:
novo_indice = range(4708, 9416)
df_aux2["index"] = novo_indice
df_aux2 = df_aux2.set_index('index', inplace=False)


In [65]:
df_aux2

Unnamed: 0_level_0,On Balance Volume
index,Unnamed: 1_level_1
4708,0
4709,-11360900
4710,-1590700
4711,7525200
4712,-2446000
...,...
9411,-2013351100
9412,-2004987900
9413,-1996662000
9414,-1986627100


In [66]:
result = df_aux1.append(df_aux2)

In [67]:
result

Unnamed: 0,On Balance Volume
0,0
1,-5973706
2,-11880154
3,-16651692
4,-23137624
...,...
9411,-2013351100
9412,-2004987900
9413,-1996662000
9414,-1986627100


In [69]:
# add to the data frame, but drop the old index, before adding it.
price_data['On Balance Volume'] = result.reset_index(level=0, drop=True).values




# display the data frame.
price_data.head(30)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.707932,5538779,IBM,2005-01-03,,,,,,,,,0.0,0.0,,0
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131031,5973706,IBM,2005-01-04,-1.003822,1.003822,0.0,0.0,,,,,-0.022522,-0.012512,,-5973706
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021126,5906448,IBM,2005-01-05,-0.191208,0.191208,0.0,0.0,,,,,-0.034626,-0.021575,,-11880154
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06,-0.286804,0.286804,0.0,0.0,,,,,-0.049638,-0.031082,,-16651692
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625526,6485932,IBM,2005-01-07,-0.401527,0.401527,0.0,0.0,,,,,-0.072728,-0.043471,,-23137624
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10,-0.095604,0.095604,0.0,0.0,,,,,-0.089329,-0.055901,,-27975479
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196987,4964734,IBM,2005-01-11,-0.650093,0.650093,0.0,0.0,,,,,-0.127887,-0.074119,,-32940213
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.312355,6096716,IBM,2005-01-12,0.20076,0.0,0.20076,11.291135,,,,,-0.142138,-0.090465,,-26843497
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894787,5585012,IBM,2005-01-13,-0.726578,0.726578,0.0,7.673164,,,,,-0.185891,-0.112509,,-32428509
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702469,5774757,IBM,2005-01-14,-0.334602,0.334602,0.0,6.556781,,,,,-0.231459,-0.139161,-0.03734,-38203266


In [None]:
# Group by the `Symbol` column, then grab the `Close` column.
close_groups = price_data.groupby('Ativo')['Close']

# Apply the lambda function which will return -1.0 for down, 1.0 for up and 0.0 for no change.
close_groups = close_groups.transform(lambda x : np.sign(x.diff()))

# add the data to the main dataframe.
price_data['Prediction'] = close_groups

# for simplicity in later sections I'm going to make a change to our prediction column. To keep this as a binary classifier I'll change flat days and consider them up days.
price_data.loc[price_data['Prediction'] == 0.0] = 1.0

# print the head
price_data.head(50)

# OPTIONAL CODE: Dump the data frame to a CSV file to examine the data yourself.
# price_data.to_csv('final_metrics.csv')

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,...,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume IBM,On Balance Volume JPM,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-03,94.617592,94.741875,92.973228,93.451241,53.70789,5538779,IBM,2005-01-03 00:00:00,,,...,,,,,0.0,0.0,,,,
2005-01-04,93.441681,94.091782,92.275337,92.447418,53.131027,5973706,IBM,2005-01-04 00:00:00,-1.003822,1.003822,...,,,,,-0.022522,-0.012512,,,,-1.0
2005-01-05,92.351814,93.527725,92.160614,92.25621,53.021137,5906448,IBM,2005-01-05 00:00:00,-0.191208,0.191208,...,,,,,-0.034626,-0.021575,,,,-1.0
2005-01-06,92.294456,92.715103,91.826004,91.969406,52.856304,4771538,IBM,2005-01-06 00:00:00,-0.286804,0.286804,...,,,,,-0.049638,-0.031082,,,,-1.0
2005-01-07,92.25621,92.543022,91.271507,91.567879,52.625534,6485932,IBM,2005-01-07 00:00:00,-0.401527,0.401527,...,,,,,-0.072728,-0.043471,,,,-1.0
2005-01-10,91.567879,91.864243,91.051628,91.472275,52.570591,4837855,IBM,2005-01-10 00:00:00,-0.095604,0.095604,...,,,,,-0.089329,-0.055901,,,,-1.0
2005-01-11,91.472275,91.577438,90.54493,90.822182,52.196976,4964734,IBM,2005-01-11 00:00:00,-0.650093,0.650093,...,,,,,-0.127887,-0.074119,,,,-1.0
2005-01-12,90.822182,91.089867,89.923515,91.022942,52.31234,6096716,IBM,2005-01-12 00:00:00,0.20076,0.0,...,,,,,-0.142138,-0.090465,,,,1.0
2005-01-13,91.19503,91.969406,89.579353,90.296364,51.894802,5585012,IBM,2005-01-13 00:00:00,-0.726578,0.726578,...,,,,,-0.185891,-0.112509,,,,-1.0
2005-01-14,89.875717,90.105164,89.435944,89.961761,51.702461,5774757,IBM,2005-01-14 00:00:00,-0.334602,0.334602,...,,,,,-0.231459,-0.139161,-0.03734,,,-1.0


In [None]:
for

SyntaxError: invalid syntax (3193057967.py, line 1)

In [None]:
# We need to remove all rows that have an NaN value.
print('Before NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Any row that has a `NaN` value will be dropped.
price_data = price_data.dropna()

# Display how much we have left now.
print('After NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Print the head.
price_data.head()

Before NaN Drop we have 9414 rows and 22 columns
After NaN Drop we have 40 rows and 22 columns


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,...,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume IBM,On Balance Volume JPM,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-08-08,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2005-08-17,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2005-11-25,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2006-10-11,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2006-12-20,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
price_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ativo,datetime,change_in_price,down_days,...,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On Balance Volume IBM,On Balance Volume JPM,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-08-08,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2005-08-17,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2005-11-25,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2006-10-11,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2006-12-20,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2008-06-24,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2010-04-01,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2010-07-15,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2014-11-17,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2015-03-19,1.0,1.0,1.0,1.0,1.0,1,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
