In [138]:
%%bash
pwd
pip install yfinance
pip install ta

/content
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [139]:
import yfinance as yf
import pandas as pd
import numpy as np
#2009-07-27 to 2020-06-11
ticker_data = yf.download(tickers = "MRK",  # list of tickers
            start="2009-06-09", end="2020-06-12",       # time period
            interval = "1d",       # trading interval
            ignore_tz = True,      # ignore timezone when aligning data from different exchanges?
            prepost = False) 
ticker_data.index = pd.DatetimeIndex(ticker_data.index).to_period('D')
print(ticker_data.shape)
ticker_data.head()

[*********************100%***********************]  1 of 1 completed
(2772, 6)


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-06-09,24.742367,24.933207,24.37023,24.541985,15.245304,12226702
2009-06-10,24.732824,24.933207,24.246183,24.417938,15.168242,13460198
2009-06-11,24.513359,25.353052,24.513359,24.990458,15.523888,17891037
2009-06-12,24.923664,25.830153,24.503817,24.885496,15.45869,31199170
2009-06-15,24.646948,24.80916,23.654579,23.816793,14.794823,28282586


In [140]:
import numpy as np
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from sklearn.metrics import mean_squared_error

def time_series_cross_validation(data, alpha_values):
    train_size = int(len(data) * 0.8)
    train, validation = data[:train_size], data[train_size:]
    
    best_alpha = None
    best_mse = np.inf
    
    for alpha in alpha_values:
        smoothing_model = SimpleExpSmoothing(train, initialization_method="heuristic").fit(smoothing_level=alpha, optimized=False)
        predictions = smoothing_model.forecast(len(validation))
        mse = mean_squared_error(validation, predictions)
        
        if mse < best_mse: 
            best_mse = mse
            best_alpha = alpha
            
    return best_alpha, best_mse

# Test different alpha values
alpha_values = np.linspace(0.0001, 1, 1000)

# Assume 'Close' column contains the closing prices of the stock
best_alpha, best_mse = time_series_cross_validation(ticker_data['Close'], alpha_values)

print(f"Best alpha value: {best_alpha}, with MSE: {best_mse}")

Best alpha value: 0.005104504504504504, with MSE: 343.8494785183558


In [141]:
# Choose the smoothing factor (alpha) based on the cross-validation results
alpha = best_alpha

def apply_smoothing(column, alpha):
    smoothing_model = SimpleExpSmoothing(column, initialization_method="heuristic")
    smoothing_model = smoothing_model.fit(smoothing_level=alpha, optimized=False)
    return smoothing_model.fittedvalues

# Apply exponential smoothing to all numeric columns
smoothed_data = ticker_data.select_dtypes(include=[np.number]).apply(apply_smoothing, alpha=alpha)

# Add the smoothed values to the DataFrame as new columns
for col in smoothed_data.columns:
    ticker_data[col] = smoothed_data[col]

In [142]:
from ta.utils import dropna
import ta
import pandas as pd

ticker_data = dropna(ticker_data)

# List of technical indicators to calculate
indicators = ['SMA', 'EMA', 'MACD', 'RSI', 'ADX', 'BollingerBands', 'Stochastic_Oscillator', 'Williams_R', 'Price_ROC', 'OBV']

# Create a new DataFrame to store the technical indicators
technical_indicators = pd.DataFrame(index=ticker_data.index)

# Calculate technical indicators
technical_indicators['SMA7'] = ta.trend.SMAIndicator(close=ticker_data['Close'], window=7).sma_indicator()
technical_indicators['SMA21'] = ta.trend.SMAIndicator(close=ticker_data['Close'], window=21).sma_indicator()
technical_indicators['EMA7'] = ta.trend.EMAIndicator(close=ticker_data['Close'], window=7).ema_indicator()
technical_indicators['EMA21'] = ta.trend.EMAIndicator(close=ticker_data['Close'], window=21).ema_indicator()

macd = ta.trend.MACD(close=ticker_data['Close']).macd()
macdsignal = ta.trend.MACD(close=ticker_data['Close']).macd_signal()
technical_indicators['MACD'] = macd
technical_indicators['MACD_signal'] = macdsignal

technical_indicators['RSI'] = ta.momentum.RSIIndicator(close=ticker_data['Close'], window=14).rsi()
technical_indicators['ADX'] = ta.trend.ADXIndicator(high=ticker_data['High'], low=ticker_data['Low'], close=ticker_data['Close'], window=14).adx()

technical_indicators['BB_upper'], technical_indicators['BB_middle'], technical_indicators['BB_lower'] = ta.volatility.BollingerBands(close=ticker_data['Close'], window=20).bollinger_mavg(), ta.volatility.BollingerBands(close=ticker_data['Close'], window=20).bollinger_hband(), ta.volatility.BollingerBands(close=ticker_data['Close'], window=20).bollinger_lband()

# Adding Stochastic Oscillator, Williams %R, Price Rate of Change, and On Balance Volume
# Stochastic Oscillator has a window of 14 days
technical_indicators['Stochastic_Oscillator'] = ta.momentum.StochasticOscillator(high=ticker_data['High'], low=ticker_data['Low'], close=ticker_data['Close'], window=14).stoch()
# Williams %R has a window of 14 days
technical_indicators['Williams_R'] = ta.momentum.WilliamsRIndicator(high=ticker_data['High'], low=ticker_data['Low'], close=ticker_data['Close'], lbp=14).williams_r()
technical_indicators['Price_ROC'] = ta.momentum.ROCIndicator(close=ticker_data['Close'], window=12).roc()
technical_indicators['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=ticker_data['Close'], volume=ticker_data['Volume']).on_balance_volume()


  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


In [143]:
# Merge technical indicators with sentiment analysis vectors
from google.colab import drive
drive.mount('/content/drive')
technical_indicators = technical_indicators.dropna()
sentiment_vectors = pd.read_csv('/content/drive/My Drive/sentiment_vectors.csv', index_col=0)
sentiment_vectors.index = pd.DatetimeIndex(sentiment_vectors.index)

# Convert Period objects to datetime objects
technical_indicators.index = technical_indicators.index.to_timestamp()

# Standardize date format for both DataFrames
technical_indicators.index = technical_indicators.index.strftime('%Y-%m-%d')
sentiment_vectors.index = sentiment_vectors.index.strftime('%Y-%m-%d')

# Merge the DataFrames on the index
feature_vectors = pd.merge(technical_indicators, sentiment_vectors, left_index=True, right_index=True, how='left')

# Fill missing values with forward-fill method
feature_vectors.fillna(method='ffill', inplace=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [144]:
print("Technical indicators date range:")
print(technical_indicators.index.min(), "to", technical_indicators.index.max())

print("Sentiment data date range:")
print(sentiment_vectors.index.min(), "to", sentiment_vectors.index.max())

feature_vectors.sample(10)

Technical indicators date range:
2009-07-27 to 2020-06-11
Sentiment data date range:
2009-07-27 to 2020-06-11


Unnamed: 0_level_0,SMA7,SMA21,EMA7,EMA21,MACD,MACD_signal,RSI,ADX,BB_upper,BB_middle,BB_lower,Stochastic_Oscillator,Williams_R,Price_ROC,OBV,numArticles,neutral,positive,negative
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2010-11-10,32.188972,32.086279,32.186377,32.082063,0.108022,0.111339,99.999765,99.988759,32.094384,32.267649,31.921119,59.640387,-40.359613,0.491622,5926503000.0,1.0,0.91393,0.037931,0.048138
2014-07-18,48.302202,48.027233,48.30003,48.024204,0.278421,0.280614,100.0,100.0,48.047318,48.501223,47.593414,71.042997,-28.957003,0.966212,16251800000.0,2.0,0.476522,0.048952,0.474525
2011-12-02,32.34951,32.327305,32.350616,32.336507,0.006246,3.9e-05,70.615729,42.681534,32.329181,32.37519,32.283172,53.870525,-46.129475,0.125555,6047778000.0,2.0,0.448772,0.491923,0.059305
2016-10-12,54.083094,53.876256,54.083635,53.871995,0.212872,0.214427,99.997499,99.883501,53.890794,54.232766,53.548821,68.137047,-31.862953,0.656363,19058270000.0,2.0,0.211409,0.632064,0.156528
2012-01-27,32.892534,32.745097,32.892543,32.761754,0.115897,0.103918,99.494733,91.441694,32.755013,32.996557,32.51347,66.732068,-33.267932,0.779163,6681918000.0,2.0,0.93062,0.022403,0.046977
2016-04-25,51.817126,51.773507,51.820081,51.803665,-0.009035,-0.03007,64.717487,59.574827,51.774045,51.844885,51.703205,57.950648,-42.049352,0.204822,17900850000.0,1.0,0.02859,0.018386,0.953024
2015-10-07,53.581461,53.776173,53.581783,53.743894,-0.130827,-0.105293,3.508894,77.617818,53.764808,54.071254,53.458362,35.08915,-64.91085,-0.681058,19183640000.0,2.0,0.44038,0.44104,0.11858
2009-12-31,27.769114,27.477375,27.767265,27.490043,0.265705,0.256512,99.999802,99.980229,27.498051,27.978685,27.017418,65.886969,-34.113031,1.812504,2234620000.0,1.0,0.870135,0.120457,0.009407
2018-11-29,60.608851,60.20596,60.609415,60.227742,0.365775,0.352612,99.999421,99.963891,60.233587,60.894287,59.572887,73.336564,-26.663436,1.169294,22202710000.0,5.0,0.487107,0.371084,0.141809
2019-03-29,65.218209,64.746384,65.21919,64.757894,0.44762,0.436704,99.999999,99.999892,64.779869,65.552756,64.006981,74.399606,-25.600394,1.268541,23141280000.0,2.0,0.588565,0.148991,0.262444


In [145]:
# Perform normalization
from sklearn.preprocessing import MinMaxScaler

feature_vectors = feature_vectors.dropna()
feature_vectors = feature_vectors.copy()
feature_vectors.reset_index(inplace=True)
feature_vectors.rename(columns={'index': 'Datetime'}, inplace=True)

scaler = MinMaxScaler()
feature_vectors_scaled = pd.DataFrame(scaler.fit_transform(feature_vectors.drop(columns=['Date'])), columns=feature_vectors.drop(columns=['Date']).columns)


In [147]:
# Save the data to CSV files
feature_vectors_scaled.to_csv('/content/drive/My Drive/feature_vectors_scaled_day_sentiment.csv', index=False)
ticker_data.to_csv('/content/drive/My Drive/ticker_data_day_sentiment.csv', index=False)