In [119]:
%%bash
pwd
pip install yfinance
pip install ta

/home/jovyan


In [120]:
import yfinance as yf
import pandas as pd
import numpy as np
ticker_data = yf.download(tickers = "AAPL",  # list of tickers
            period = "730d",       # time period
            interval = "60m",       # trading interval
            ignore_tz = True,      # ignore timezone when aligning data from different exchanges?
            prepost = False) 
ticker_data.index = pd.DatetimeIndex(ticker_data.index).to_period('D')
print(ticker_data.shape)
ticker_data.head()

[*********************100%***********************]  1 of 1 completed
(5094, 6)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-05-29,79.8125,80.0,79.1175,79.485001,79.485001,6272955
2020-05-29,79.485001,79.695,79.2425,79.580002,79.580002,3088345
2020-05-29,79.572502,79.887497,79.447502,79.712502,79.712502,3024228
2020-05-29,79.714973,79.894997,79.267502,79.741501,79.741501,2735059
2020-05-29,79.745003,80.065002,79.567497,79.982178,79.982178,2902417


In [121]:
import numpy as np
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from sklearn.metrics import mean_squared_error

def time_series_cross_validation(data, alpha_values):
    train_size = int(len(data) * 0.8)
    train, validation = data[:train_size], data[train_size:]
    
    best_alpha = None
    best_mse = np.inf
    
    for alpha in alpha_values:
        smoothing_model = SimpleExpSmoothing(train, initialization_method="heuristic").fit(smoothing_level=alpha, optimized=False)
        predictions = smoothing_model.forecast(len(validation))
        mse = mean_squared_error(validation, predictions)
        
        if mse < best_mse:
            best_mse = mse
            best_alpha = alpha
            
    return best_alpha, best_mse

# Test different alpha values
alpha_values = np.linspace(0.0001, 1, 1000)

# Assume 'Close' column contains the closing prices of the stock
best_alpha, best_mse = time_series_cross_validation(ticker_data['Close'], alpha_values)

print(f"Best alpha value: {best_alpha}, with MSE: {best_mse}")

Best alpha value: 1.0, with MSE: 475.9224184961105


In [122]:
# Choose the smoothing factor (alpha) based on the cross-validation results
alpha = best_alpha

def apply_smoothing(column, alpha):
    smoothing_model = SimpleExpSmoothing(column, initialization_method="heuristic")
    smoothing_model = smoothing_model.fit(optimized=True)
    return smoothing_model.fittedvalues

# Apply exponential smoothing to all numeric columns
smoothed_data = ticker_data.select_dtypes(include=[np.number]).apply(apply_smoothing, alpha=alpha)

# Add the smoothed values to the DataFrame as new columns
for col in smoothed_data.columns:
    ticker_data[col] = smoothed_data[col]

ticker_data.sample(20)



Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-08-12,148.664993,148.889999,148.610001,148.714996,148.714996,7525781.0
2020-07-09,96.078751,96.136421,94.672501,95.110001,95.110001,4078931.0
2021-10-05,138.585007,139.229996,138.470001,139.169998,139.169998,11477490.0
2020-11-03,110.089996,110.860001,109.975502,110.855003,110.855003,15532790.0
2023-03-27,158.768707,158.830002,157.869995,158.350006,158.350006,7377008.0
2021-07-29,144.929993,146.550003,144.720001,145.839996,145.839996,12662750.0
2022-04-19,164.809998,165.119995,163.979996,165.110001,165.110001,8524284.0
2022-02-17,169.800003,169.873001,168.895004,169.210007,169.210007,7933507.0
2022-09-12,157.429993,157.820007,157.320007,157.350006,157.350006,9132511.0
2023-02-07,150.990006,151.880005,150.779999,151.679993,151.679993,10963010.0


In [123]:
# Generate technical indicators using the TA-Lib library
from ta.utils import dropna
import ta
import pandas as pd

ticker_data = dropna(ticker_data)

# List of technical indicators to calculate
indicators = ['SMA', 'EMA', 'MACD', 'RSI', 'ADX', 'BollingerBands', 'Stochastic_Oscillator', 'Williams_R', 'Price_ROC', 'OBV']

# Create a new DataFrame to store the technical indicators
technical_indicators = pd.DataFrame(index=ticker_data.index)

# Calculate technical indicators
technical_indicators['SMA7'] = ta.trend.SMAIndicator(close=ticker_data['Close'], window=int(7*6.5)).sma_indicator()
technical_indicators['SMA21'] = ta.trend.SMAIndicator(close=ticker_data['Close'], window=int(21*6.5)).sma_indicator()
technical_indicators['EMA7'] = ta.trend.EMAIndicator(close=ticker_data['Close'], window=int(7*6.5)).ema_indicator()
technical_indicators['EMA21'] = ta.trend.EMAIndicator(close=ticker_data['Close'], window=int(21*6.5)).ema_indicator()

macd = ta.trend.MACD(close=ticker_data['Close']).macd()
macdsignal = ta.trend.MACD(close=ticker_data['Close']).macd_signal()
technical_indicators['MACD'] = macd
technical_indicators['MACD_signal'] = macdsignal

technical_indicators['RSI'] = ta.momentum.RSIIndicator(close=ticker_data['Close'], window=int(14*6.5)).rsi()
technical_indicators['ADX'] = ta.trend.ADXIndicator(high=ticker_data['High'], low=ticker_data['Low'], close=ticker_data['Close'], window=int(14*6.5)).adx()

technical_indicators['BB_upper'], technical_indicators['BB_middle'], technical_indicators['BB_lower'] = ta.volatility.BollingerBands(close=ticker_data['Close'], window=int(20*6.5)).bollinger_mavg(), ta.volatility.BollingerBands(close=ticker_data['Close'], window=int(20*6.5)).bollinger_hband(), ta.volatility.BollingerBands(close=ticker_data['Close'], window=int(20*6.5)).bollinger_lband()

# Adding Stochastic Oscillator, Williams %R, Price Rate of Change, and On Balance Volume
# Stochastic Oscillator has a window of 2 weeks
technical_indicators['Stochastic_Oscillator'] = ta.momentum.StochasticOscillator(high=ticker_data['High'], low=ticker_data['Low'], close=ticker_data['Close'], window=int(14*6.5)).stoch()
# Williams %R has a window of 2 weeks
technical_indicators['Williams_R'] = ta.momentum.WilliamsRIndicator(high=ticker_data['High'], low=ticker_data['Low'], close=ticker_data['Close'], lbp=int(14*6.5)).williams_r()
technical_indicators['Price_ROC'] = ta.momentum.ROCIndicator(close=ticker_data['Close'], window=int(12*6.5)).roc()
technical_indicators['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=ticker_data['Close'], volume=ticker_data['Volume']).on_balance_volume()


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [124]:
technical_indicators = technical_indicators.dropna()
technical_indicators.reset_index(inplace=True)
technical_indicators.rename(columns={'index': 'Datetime'}, inplace=True)
technical_indicators.sample(10)

Unnamed: 0,Datetime,SMA7,SMA21,EMA7,EMA21,MACD,MACD_signal,RSI,ADX,BB_upper,BB_middle,BB_lower,Stochastic_Oscillator,Williams_R,Price_ROC,OBV
1545,2021-05-13,127.372477,131.372894,126.981187,129.348644,-1.404061,-1.437509,46.080599,7.956754,131.249912,138.204688,124.295136,21.659902,-78.340098,-6.64484,-43035240.0
2668,2021-12-31,177.776546,173.897578,177.730833,173.129936,0.338156,0.672337,55.835008,17.999923,174.418581,183.171081,165.666082,77.44431,-22.55569,2.248593,378493300.0
401,2020-09-16,115.031978,121.421157,116.065356,117.829344,-0.371869,-0.275735,47.701483,19.510148,121.652798,134.080642,109.224954,11.664177,-88.335823,-12.545425,148301100.0
719,2020-11-19,119.209951,116.082673,118.997468,117.413947,-0.091383,0.126742,50.861612,5.839738,116.118483,123.570716,108.666249,72.801647,-27.198353,6.445353,19230500.0
673,2020-11-10,115.148969,115.951187,116.406301,115.635395,0.421699,1.066437,50.900171,8.740495,115.71826,122.806981,108.629538,64.695297,-35.304703,2.156454,10367300.0
3054,2022-03-22,158.724142,160.934927,161.206878,161.735004,2.370328,2.068236,53.663171,9.477601,160.83154,169.513767,152.149313,98.370149,-1.629851,4.506902,256786700.0
4481,2023-01-12,129.767277,131.174851,130.695257,132.324145,1.053206,1.031554,49.545007,9.181308,130.620614,137.011563,124.229666,89.469768,-10.530232,2.455667,-18574230.0
1949,2021-08-04,146.098787,146.346163,146.511603,144.920999,0.294789,0.260308,54.222997,11.616177,146.464078,149.803538,143.124618,65.47794,-34.52206,0.049752,134116300.0
4271,2022-11-29,148.089943,145.820776,146.721817,146.815098,-2.154151,-1.593516,46.011836,6.196802,145.567674,156.037501,135.097847,32.988344,-67.011656,-4.065927,150604900.0
550,2020-10-15,119.331898,114.763908,119.532006,116.903188,0.158989,0.444307,53.747574,10.151102,115.144862,123.770928,106.518796,66.324995,-33.675005,3.601715,144859400.0


In [125]:
# Perform normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
technical_indicators_scaled = pd.DataFrame(scaler.fit_transform(technical_indicators.drop(columns=['Datetime'])), columns=technical_indicators.drop(columns=['Datetime']).columns)

technical_indicators_scaled

Unnamed: 0,SMA7,SMA21,EMA7,EMA21,MACD,MACD_signal,RSI,ADX,BB_upper,BB_middle,BB_lower,Stochastic_Oscillator,Williams_R,Price_ROC,OBV
0,0.000000,0.000000,0.000000,0.000000,0.552075,0.584861,0.698980,0.000000,0.000000,0.000000,0.000000,0.746363,0.746363,0.542701,0.373925
1,0.000655,0.000907,0.000680,0.000749,0.549502,0.577453,0.714546,0.000000,0.000896,0.000788,0.000984,0.775736,0.775736,0.533120,0.383872
2,0.001205,0.001799,0.001241,0.001456,0.544950,0.570495,0.701300,0.000000,0.001803,0.001421,0.002148,0.754836,0.754836,0.530022,0.374428
3,0.001671,0.002663,0.001655,0.002112,0.538229,0.563405,0.683509,0.000000,0.002652,0.002018,0.003233,0.720832,0.720832,0.525586,0.365528
4,0.002211,0.003558,0.002302,0.002843,0.537826,0.557642,0.708755,0.000000,0.003563,0.002693,0.004362,0.772108,0.772108,0.518476,0.374309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4954,0.844664,0.853914,0.849328,0.860852,0.516780,0.564585,0.514921,0.360972,0.854976,0.821165,0.868898,0.633651,0.633651,0.475959,0.837815
4955,0.845848,0.854401,0.848993,0.861205,0.499998,0.550010,0.508823,0.359011,0.855407,0.821325,0.869594,0.614450,0.614450,0.485810,0.826518
4956,0.846183,0.854868,0.848610,0.861530,0.485443,0.535051,0.503810,0.356934,0.855792,0.821495,0.870188,0.598747,0.598747,0.499273,0.815685
4957,0.846636,0.855432,0.848474,0.861929,0.479090,0.521643,0.519578,0.355029,0.856245,0.821707,0.870875,0.655727,0.655727,0.496781,0.826252


In [126]:
# Create two copies of the technical_indicators DataFrame
technical_indicators_binary = technical_indicators.copy()
technical_indicators_signed = technical_indicators.copy()

# Add a new column called "Label" to each DataFrame, initialized with 0
technical_indicators_binary['Label'] = 0
technical_indicators_signed['Label'] = 0

# Define a function to label the data based on the fixed number of hours (d)
def label_data_fixed_days(row_index, d, close_prices, binary=True):
    if row_index + d >= len(close_prices):
        return None
    current_price = close_prices.iloc[row_index]
    future_price = close_prices.iloc[row_index + d]
    if binary:
        return 1 if future_price > current_price else 0
    else:
        return 1 if future_price > current_price else -1

# Iterate over the rows of the DataFrame and apply the label_data_fixed_days function
d = 5  # Choose the appropriate number of hours
for idx in range(len(technical_indicators)):
    binary_label = label_data_fixed_days(idx, d, ticker_data['Close'], binary=True)
    signed_label = label_data_fixed_days(idx, d, ticker_data['Close'], binary=False)
    if binary_label is not None and signed_label is not None:
        technical_indicators_binary.loc[technical_indicators_binary.index[idx], 'Label'] = binary_label
        technical_indicators_signed.loc[technical_indicators_signed.index[idx], 'Label'] = signed_label

# Remove the rows without a label (the last d rows)
technical_indicators_binary = technical_indicators_binary[:-d]
technical_indicators_signed = technical_indicators_signed[:-d]

print("Technical Indicators with Binary Labels:")
print(technical_indicators_binary.shape)
print("\nTechnical Indicators with Signed Labels:")
print(technical_indicators_signed.shape)

Technical Indicators with Binary Labels:
(4954, 17)

Technical Indicators with Signed Labels:
(4954, 17)


In [127]:
technical_indicators_binary.head()

Unnamed: 0,Datetime,SMA7,SMA21,EMA7,EMA21,MACD,MACD_signal,RSI,ADX,BB_upper,BB_middle,BB_lower,Stochastic_Oscillator,Williams_R,Price_ROC,OBV,Label
0,2020-06-25,89.27882,85.309657,89.309288,86.149686,0.434262,0.666726,59.27572,0.0,85.570209,92.817025,78.323394,74.636324,-25.363676,3.429836,84039200.0,1
1,2020-06-25,89.338043,85.392231,89.370406,86.216333,0.414392,0.616259,59.734835,0.0,85.651594,92.890053,78.413134,77.573625,-22.426375,3.022299,90054420.0,1
2,2020-06-25,89.387737,85.473435,89.420768,86.279288,0.379244,0.568856,59.344159,0.0,85.733988,92.948748,78.519228,75.483602,-24.516398,2.890534,84343610.0,1
3,2020-06-25,89.429903,85.552093,89.458017,86.337656,0.327342,0.520553,58.819444,0.0,85.811123,93.00409,78.618155,72.083178,-27.916822,2.701857,78961500.0,1
4,2020-06-25,89.478659,85.633582,89.516147,86.402727,0.324229,0.481288,59.564032,0.0,85.893892,93.066694,78.72109,77.210767,-22.789233,2.399408,84271320.0,1


In [128]:
# Create correlation matrix
import pandas as pd

# Compute the correlation matrix
correlation_matrix = technical_indicators_signed.corr()

# Display the correlation between class labels ('Label') and features
label_correlation = correlation_matrix['Label']
print("Correlation between class labels and features:")
print(label_correlation)

Correlation between class labels and features:
SMA7                     0.006446
SMA21                    0.007332
EMA7                     0.006993
EMA21                   -0.003180
MACD                     0.029870
MACD_signal              0.011686
RSI                      0.066700
ADX                      0.038850
BB_upper                 0.009030
BB_middle                0.008477
BB_lower                 0.009386
Stochastic_Oscillator   -0.014501
Williams_R              -0.014501
Price_ROC                0.009530
OBV                      0.052729
Label                    1.000000
Name: Label, dtype: float64


  correlation_matrix = technical_indicators_signed.corr()


In [129]:
# Save the data to CSV files
technical_indicators_binary.to_csv('/home/jovyan/technical_indicators_binary.csv', index=False)
technical_indicators_signed.to_csv('/home/jovyan/technical_indicators_signed.csv', index=False)