In [334]:
%%bash
pwd
pip install yfinance
pip install ta

/home/jovyan


In [335]:
import yfinance as yf
import pandas as pd
import numpy as np
apple_data = yf.download(tickers = "AAPL",  # list of tickers
            period = "10y",         # time period
            interval = "1d",       # trading interval
            ignore_tz = True,      # ignore timezone when aligning data from different exchanges?
            prepost = False) 
apple_data.index = pd.DatetimeIndex(apple_data.index).to_period('D')
print(apple_data.shape)
apple_data.head()

[*********************100%***********************]  1 of 1 completed
(2517, 6)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-04-15,15.25,15.281786,14.983929,14.994643,12.950588,317520000
2013-04-16,15.056071,15.236071,15.020357,15.222857,13.147696,305771200
2013-04-17,15.009643,15.021429,14.218214,14.385714,12.424669,945056000
2013-04-18,14.463929,14.4925,13.919286,14.001786,12.09308,666299200
2013-04-19,13.856071,14.271429,13.753571,13.9475,12.046193,609274400


In [336]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Choose the smoothing factor (alpha) between 0 and 1
alpha = 0.1

def apply_smoothing(column, alpha):
    smoothing_model = SimpleExpSmoothing(column, initialization_method="heuristic")
    smoothing_model = smoothing_model.fit(optimized=True)
    return smoothing_model.fittedvalues

# Apply exponential smoothing to all numeric columns
smoothed_data = apple_data.select_dtypes(include=[np.number]).apply(apply_smoothing, alpha=alpha)

# Add the smoothed values to the DataFrame as new columns
for col in smoothed_data.columns:
    apple_data[col] = smoothed_data[col]

apple_data.sample(20)



Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-30,143.149667,147.229996,143.080002,145.818807,145.595974,66062290.0
2015-02-11,30.016379,30.5375,30.040001,30.473744,27.496925,211671800.0
2015-05-22,32.516929,32.907501,32.4575,32.82974,29.746578,162989900.0
2016-07-20,24.875351,25.0,24.834999,24.966213,23.084792,116954800.0
2015-02-03,29.516137,29.7925,29.02,29.638953,26.6386,314856800.0
2015-06-22,31.919767,31.955,31.6,31.666697,28.692896,173942800.0
2022-12-27,131.148955,132.419998,129.639999,131.88849,131.687594,75843090.0
2021-09-15,150.390114,151.070007,146.910004,148.195547,146.901753,103826700.0
2017-01-17,29.773698,29.905001,29.702499,29.763138,27.811394,107272500.0
2022-12-30,128.113535,130.479996,127.730003,129.430896,129.232827,77154810.0


In [337]:
# Generate technical indicators using the TA-Lib library
from ta.utils import dropna
import ta
import pandas as pd

apple_data = dropna(apple_data)

# List of technical indicators to calculate
indicators = ['SMA', 'EMA', 'MACD', 'RSI', 'ADX', 'BollingerBands', 'Stochastic_Oscillator', 'Williams_R', 'Price_ROC', 'OBV']

# Create a new DataFrame to store the technical indicators
technical_indicators = pd.DataFrame(index=apple_data.index)

# Calculate technical indicators
technical_indicators['SMA7'] = ta.trend.SMAIndicator(close=apple_data['Close'], window=7).sma_indicator()
technical_indicators['SMA21'] = ta.trend.SMAIndicator(close=apple_data['Close'], window=21).sma_indicator()
technical_indicators['EMA7'] = ta.trend.EMAIndicator(close=apple_data['Close'], window=7).ema_indicator()
technical_indicators['EMA21'] = ta.trend.EMAIndicator(close=apple_data['Close'], window=21).ema_indicator()

macd = ta.trend.MACD(close=apple_data['Close']).macd()
macdsignal = ta.trend.MACD(close=apple_data['Close']).macd_signal()
technical_indicators['MACD'] = macd
technical_indicators['MACD_signal'] = macdsignal

technical_indicators['RSI'] = ta.momentum.RSIIndicator(close=apple_data['Close'], window=14).rsi()
technical_indicators['ADX'] = ta.trend.ADXIndicator(high=apple_data['High'], low=apple_data['Low'], close=apple_data['Close'], window=14).adx()

technical_indicators['BB_upper'], technical_indicators['BB_middle'], technical_indicators['BB_lower'] = ta.volatility.BollingerBands(close=apple_data['Close'], window=20).bollinger_mavg(), ta.volatility.BollingerBands(close=apple_data['Close'], window=20).bollinger_hband(), ta.volatility.BollingerBands(close=apple_data['Close'], window=20).bollinger_lband()

# Adding Stochastic Oscillator, Williams %R, Price Rate of Change, and On Balance Volume
# Stochastic Oscillator has a window of 2 weeks
technical_indicators['Stochastic_Oscillator'] = ta.momentum.StochasticOscillator(high=apple_data['High'], low=apple_data['Low'], close=apple_data['Close'], window=14).stoch()
# Williams %R has a window of 2 weeks
technical_indicators['Williams_R'] = ta.momentum.WilliamsRIndicator(high=apple_data['High'], low=apple_data['Low'], close=apple_data['Close'], lbp=14).williams_r()
technical_indicators['Price_ROC'] = ta.momentum.ROCIndicator(close=apple_data['Close'], window=12).roc()
technical_indicators['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=apple_data['Close'], volume=apple_data['Volume']).on_balance_volume()

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [338]:
technical_indicators = technical_indicators.dropna()
technical_indicators.reset_index(inplace=True)
technical_indicators.rename(columns={'index': 'Date'}, inplace=True)
technical_indicators.sample(10)

Unnamed: 0,Date,SMA7,SMA21,EMA7,EMA21,MACD,MACD_signal,RSI,ADX,BB_upper,BB_middle,BB_lower,Stochastic_Oscillator,Williams_R,Price_ROC,OBV
955,2017-03-16,34.819013,34.464001,34.828378,34.237236,0.718424,0.833464,79.503578,63.250659,34.500648,35.284268,33.717029,93.273052,-6.726948,2.532046,14124710000.0
749,2016-05-20,23.194754,23.96284,23.381602,24.02293,-0.731181,-0.817459,39.390327,40.708951,23.835588,26.024863,21.646313,73.813537,-26.186463,-0.92743,9198491000.0
1617,2019-10-31,61.026114,58.775979,60.818275,59.094159,1.634669,1.555684,66.363214,35.501211,58.974049,62.816077,55.13202,62.887884,-37.112116,3.140476,17215160000.0
1608,2019-10-18,58.341653,56.370237,58.249594,56.714575,1.316381,1.128804,68.188607,26.187822,56.425615,59.663116,53.188115,87.409391,-12.590609,4.761614,17031710000.0
748,2016-05-19,23.165112,24.116459,23.324118,24.069818,-0.792154,-0.839028,40.14888,41.645906,23.983279,26.456746,21.509812,78.280173,-21.719827,0.912397,9359239000.0
1309,2018-08-10,51.719444,49.181867,51.364926,49.655537,1.291118,0.930128,74.56886,28.21708,49.255101,52.991037,45.519164,95.233159,-4.766841,8.224686,15440380000.0
790,2016-07-20,24.584211,24.035841,24.624311,24.291285,0.161409,0.015032,64.520469,16.741898,24.048651,25.042766,23.054535,95.92085,-4.07915,4.534041,10478740000.0
408,2015-01-13,27.242936,27.635125,27.490439,27.690187,-0.159127,-0.1283,46.15177,23.080793,27.621336,28.815657,26.427016,47.007028,-52.992972,-2.808912,10566670000.0
1931,2021-02-01,139.014986,133.06013,136.808811,133.781566,2.936521,2.800398,49.44292,20.042708,133.02384,143.252316,122.795365,29.558505,-70.441495,2.662924,20265030000.0
1304,2018-08-03,48.867962,48.01231,49.273245,48.217829,0.70665,0.422606,76.290711,19.235042,48.096302,50.241541,45.951062,93.076989,-6.923011,8.160442,15028230000.0


In [339]:
# Perform normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
technical_indicators_scaled = pd.DataFrame(scaler.fit_transform(technical_indicators.drop(columns=['Date'])), columns=technical_indicators.drop(columns=['Date']).columns)

technical_indicators_scaled

Unnamed: 0,SMA7,SMA21,EMA7,EMA21,MACD,MACD_signal,RSI,ADX,BB_upper,BB_middle,BB_lower,Stochastic_Oscillator,Williams_R,Price_ROC,OBV
0,0.008931,0.006613,0.008844,0.004984,0.480682,0.470446,0.559746,0.113889,0.006848,0.005617,0.009070,0.793023,0.793023,0.491569,0.179297
1,0.009241,0.006722,0.009116,0.005166,0.481128,0.470447,0.540278,0.111718,0.006899,0.005674,0.009114,0.793713,0.793713,0.533452,0.163505
2,0.009531,0.006780,0.009367,0.005348,0.481522,0.470535,0.548897,0.095471,0.006910,0.005691,0.009117,0.833364,0.833364,0.611774,0.179407
3,0.009756,0.006776,0.009485,0.005488,0.481437,0.470587,0.529260,0.084182,0.006791,0.005371,0.009221,0.799208,0.799208,0.581601,0.164881
4,0.009767,0.006621,0.009354,0.005533,0.480423,0.470402,0.469230,0.070251,0.006643,0.005026,0.009289,0.692003,0.692003,0.566810,0.151192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2479,0.895449,0.881739,0.903294,0.897960,0.746333,0.710129,0.705482,0.424134,0.882334,0.892349,0.855396,0.924818,0.924818,0.653618,0.997710
2480,0.900191,0.885326,0.904629,0.901169,0.742302,0.720424,0.628806,0.419018,0.885756,0.896490,0.857968,0.766088,0.766088,0.596943,0.995449
2481,0.906186,0.888812,0.906781,0.904516,0.740297,0.728213,0.645335,0.414268,0.890061,0.899575,0.863517,0.826416,0.826416,0.580578,0.997582
2482,0.907538,0.892195,0.904671,0.906164,0.721845,0.730331,0.546825,0.396431,0.894255,0.897489,0.874465,0.568187,0.568187,0.565268,0.995481


In [340]:
# Create two copies of the technical_indicators DataFrame
technical_indicators_binary = technical_indicators.copy()
technical_indicators_signed = technical_indicators.copy()

# Add a new column called "Label" to each DataFrame, initialized with 0
technical_indicators_binary['Label'] = 0
technical_indicators_signed['Label'] = 0

# Define a function to label the data based on the fixed number of days (d)
def label_data_fixed_days(row_index, d, close_prices, binary=True):
    if row_index + d >= len(close_prices):
        return None
    current_price = close_prices.iloc[row_index]
    future_price = close_prices.iloc[row_index + d]
    if binary:
        return 1 if future_price > current_price else 0
    else:
        return 1 if future_price > current_price else -1

# Iterate over the rows of the DataFrame and apply the label_data_fixed_days function
d = 5  # Choose the appropriate number of days
for idx in range(len(technical_indicators)):
    binary_label = label_data_fixed_days(idx, d, apple_data['Close'], binary=True)
    signed_label = label_data_fixed_days(idx, d, apple_data['Close'], binary=False)
    if binary_label is not None and signed_label is not None:
        technical_indicators_binary.loc[technical_indicators_binary.index[idx], 'Label'] = binary_label
        technical_indicators_signed.loc[technical_indicators_signed.index[idx], 'Label'] = signed_label

# Remove the rows without a label (the last d rows)
technical_indicators_binary = technical_indicators_binary[:-d]
technical_indicators_signed = technical_indicators_signed[:-d]

print("Technical Indicators with Binary Labels:")
print(technical_indicators_binary.shape)
print("\nTechnical Indicators with Signed Labels:")
print(technical_indicators_signed.shape)

Technical Indicators with Binary Labels:
(2479, 17)

Technical Indicators with Signed Labels:
(2479, 17)


In [341]:
# Create correlation matrix
import pandas as pd

# Compute the correlation matrix
correlation_matrix = technical_indicators_signed.corr()

# Display the correlation between class labels ('Label') and features
label_correlation = correlation_matrix['Label']
print("Correlation between class labels and features:")
print(label_correlation)

Correlation between class labels and features:
SMA7                    -0.007301
SMA21                   -0.007031
EMA7                    -0.007274
EMA21                   -0.009710
MACD                     0.126475
MACD_signal              0.188533
RSI                      0.077504
ADX                      0.121267
BB_upper                -0.006917
BB_middle               -0.009790
BB_lower                -0.003621
Stochastic_Oscillator   -0.018629
Williams_R              -0.018629
Price_ROC               -0.030680
OBV                      0.030923
Label                    1.000000
Name: Label, dtype: float64


  correlation_matrix = technical_indicators_signed.corr()


In [342]:
# Save the data to CSV files
technical_indicators_binary.to_csv('/home/jovyan/technical_indicators_binary.csv', index=False)
technical_indicators_signed.to_csv('/home/jovyan/technical_indicators_signed.csv', index=False)