In [480]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA

import warnings

In [481]:
pd.set_option('display.max_columns', 100)

In [482]:
# Set the start and end date
start_date = '1993-01-01'
end_date = '2023-07-27'
 
# Set the ticker
sp = '^GSPC'
dji = '^DJI'
ru2000 = '^RUT' #Russel 2000
ru3000 = '^RUA' # Russel 3000 --> captures 98% of the stock market
 
# Get the data
def stock_data(ticker, start=start_date, end=end_date):
    return yf.download(ticker, start, end)

sp = stock_data(sp)
dji = stock_data(dji)
ru2000 = stock_data(ru2000)
ru3000 = stock_data(ru3000)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [483]:
sp.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-01-04,435.700012,437.320007,434.480011,435.380005,435.380005,201210000
1993-01-05,435.380005,435.399994,433.549988,434.339996,434.339996,240350000
1993-01-06,434.339996,435.170013,432.519989,434.519989,434.519989,295240000
1993-01-07,434.519989,435.459991,429.76001,430.730011,430.730011,304850000
1993-01-08,430.730011,430.730011,426.880005,429.049988,429.049988,263470000


In [484]:
sp.drop(columns=['High', 'Low', 'Close', 'Open'], inplace=True)

In [485]:
sp['pct_change_adj_close'] = sp['Adj Close'].pct_change()*100
sp['pct_change_volume'] = sp['Volume'].pct_change()*100

sp.head()

Unnamed: 0_level_0,Adj Close,Volume,pct_change_adj_close,pct_change_volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1993-01-04,435.380005,201210000,,
1993-01-05,434.339996,240350000,-0.238874,19.452314
1993-01-06,434.519989,295240000,0.041441,22.837529
1993-01-07,430.730011,304850000,-0.872222,3.254979
1993-01-08,429.049988,263470000,-0.390041,-13.573889


In [486]:
sp['lag_pct_change_adj_close'] = sp['pct_change_adj_close'].shift()
sp['lag_pct_change_volume'] = sp['pct_change_volume'].shift()

# sp['last_pct_change_adj_Close'] = sp['Adj Close'].shift().pct_change() # Alternatively
sp.head()

Unnamed: 0_level_0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1993-01-04,435.380005,201210000,,,,
1993-01-05,434.339996,240350000,-0.238874,19.452314,,
1993-01-06,434.519989,295240000,0.041441,22.837529,-0.238874,19.452314
1993-01-07,430.730011,304850000,-0.872222,3.254979,0.041441,22.837529
1993-01-08,429.049988,263470000,-0.390041,-13.573889,-0.872222,3.254979


In [487]:
sp['lag_ma5_pct_change_adj_close'] = sp['lag_pct_change_adj_close'].rolling(5).mean()
sp.head(10) 

Unnamed: 0_level_0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1993-01-04,435.380005,201210000,,,,,
1993-01-05,434.339996,240350000,-0.238874,19.452314,,,
1993-01-06,434.519989,295240000,0.041441,22.837529,-0.238874,19.452314,
1993-01-07,430.730011,304850000,-0.872222,3.254979,0.041441,22.837529,
1993-01-08,429.049988,263470000,-0.390041,-13.573889,-0.872222,3.254979,
1993-01-11,430.950012,217150000,0.442845,-17.580749,-0.390041,-13.573889,
1993-01-12,431.040009,239410000,0.020883,10.250979,0.442845,-17.580749,-0.20337
1993-01-13,433.029999,245360000,0.461672,2.485276,0.020883,10.250979,-0.151419
1993-01-14,435.940002,281040000,0.67201,14.541898,0.461672,2.485276,-0.067373
1993-01-15,437.149994,309720000,0.277559,10.204953,0.67201,14.541898,0.241474


In [488]:
fomc_topic = pd.read_pickle('../data/fomc_topic_modeling.pkl')
fomc_topic.drop(columns=['minutes_paragraphs', 'paragraphs_length', 'minutes_text',
       'text_length', 'topic_prob', 'topic_score', 'doc2bow'], inplace=True)

fomc_topic.head()

Unnamed: 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6
1993-02-03,0.021367,0.249447,0.285851,0.090775,0.317266,0.035295
1993-03-23,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546
1993-05-18,0.020222,0.059072,0.359659,0.10695,0.429912,0.024184
1993-07-07,0.02019,0.057558,0.407705,0.130867,0.360023,0.023656
1993-08-17,0.020382,0.057332,0.281115,0.125689,0.484803,0.030679


In [489]:
# day_of_week = meeting_datetime.strftime('%A')
fomc_topic['dayofweek'] = fomc_topic.index.strftime('%A')
fomc_topic['dayofweek'].value_counts()

dayofweek
Wednesday    135
Tuesday       99
Thursday       8
Friday         1
Sunday         1
Name: count, dtype: int64

In [490]:
fomc_topic.reset_index(names='date', inplace=True)

In [491]:
sunday_row = fomc_topic['dayofweek'] =='Sunday'
fomc_topic[sunday_row]

Unnamed: 0,date,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,dayofweek
217,2020-03-15,0.021008,0.140745,0.510809,0.021022,0.283044,0.023372,Sunday


In [492]:
fomc_topic.loc[sunday_row, 'date'] = fomc_topic.loc[sunday_row, 'date'] + pd.Timedelta(1, "d")
fomc_topic.loc[sunday_row, 'date']

217   2020-03-16
Name: date, dtype: datetime64[ns]

In [493]:
fomc_topic.loc[sunday_row, 'dayofweek'] = 'Monday'

In [494]:
fomc_topic['dayofweek'].value_counts()

dayofweek
Wednesday    135
Tuesday       99
Thursday       8
Friday         1
Monday         1
Name: count, dtype: int64

In [495]:
fomc_topic.set_index('date', inplace=True)

In [496]:
fomc_topic.head()

Unnamed: 0_level_0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,dayofweek
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1993-02-03,0.021367,0.249447,0.285851,0.090775,0.317266,0.035295,Wednesday
1993-03-23,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,Tuesday
1993-05-18,0.020222,0.059072,0.359659,0.10695,0.429912,0.024184,Tuesday
1993-07-07,0.02019,0.057558,0.407705,0.130867,0.360023,0.023656,Wednesday
1993-08-17,0.020382,0.057332,0.281115,0.125689,0.484803,0.030679,Tuesday


In [497]:
data = pd.merge(sp, fomc_topic, how='left', left_index=True, right_index=True).drop(columns='dayofweek')
data.tail()

Unnamed: 0_level_0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2023-07-20,4534.870117,3761770000,-0.67569,-8.598843,0.235791,0.627382,0.415526,,,,,,
2023-07-21,4536.339844,3570190000,0.032409,-5.092815,-0.67569,-8.598843,0.110985,,,,,,
2023-07-24,4554.640137,3856250000,0.403415,8.012459,0.032409,-5.092815,0.137955,,,,,,
2023-07-25,4567.459961,3812470000,0.281467,-1.1353,0.403415,8.012459,0.141531,,,,,,
2023-07-26,4566.75,3990290000,-0.015544,4.664168,0.281467,-1.1353,0.055479,,,,,,


In [498]:
print(sp.shape)
print(fomc.shape)

(7696, 7)
(244, 6)


In [499]:
data.isna().sum()

Adj Close                          0
Volume                             0
pct_change_adj_close               1
pct_change_volume                  1
lag_pct_change_adj_close           2
lag_pct_change_volume              2
lag_ma5_pct_change_adj_close       6
topic 1                         7452
topic 2                         7452
topic 3                         7452
topic 4                         7452
topic 5                         7452
topic 6                         7452
dtype: int64

In [500]:
data_filled = data.ffill()
# data_filled.dropna(inplace=True)

In [501]:
data_filled.head(5)

Unnamed: 0_level_0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1993-01-04,435.380005,201210000,,,,,,,,,,,
1993-01-05,434.339996,240350000,-0.238874,19.452314,,,,,,,,,
1993-01-06,434.519989,295240000,0.041441,22.837529,-0.238874,19.452314,,,,,,,
1993-01-07,430.730011,304850000,-0.872222,3.254979,0.041441,22.837529,,,,,,,
1993-01-08,429.049988,263470000,-0.390041,-13.573889,-0.872222,3.254979,,,,,,,


In [502]:
fomc_laughran_mcdonald = pd.read_pickle('../data/fomc_raw_laughran_mcdonald_sentiment.pkl')
fomc_laughran_mcdonald = fomc_laughran_mcdonald[['sentiment_mc']]
fomc_laughran_mcdonald.head()

Unnamed: 0,sentiment_mc
1993-02-03,1.30719
1993-03-23,1.11151
1993-05-18,1.486831
1993-07-07,2.200566
1993-08-17,2.022472


In [503]:
fomc_zstc_sentiment = pd.read_pickle('../data/fomc_raw_zstc_sentiment.pkl')
fomc_zstc_sentiment.drop(columns='fomc_text', inplace=True)
fomc_zstc_sentiment.head()

Unnamed: 0_level_0,sentiment
date,Unnamed: 1_level_1
1993-02-03,0.703742
1993-03-23,0.402003
1993-05-18,0.270261
1993-07-07,0.57182
1993-08-17,0.412569


In [504]:
data = pd.concat([data, fomc_laughran_mcdonald, fomc_zstc_sentiment], axis=1)
data.head()

Unnamed: 0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,sentiment_mc,sentiment
1993-01-04,435.380005,201210000.0,,,,,,,,,,,,,
1993-01-05,434.339996,240350000.0,-0.238874,19.452314,,,,,,,,,,,
1993-01-06,434.519989,295240000.0,0.041441,22.837529,-0.238874,19.452314,,,,,,,,,
1993-01-07,430.730011,304850000.0,-0.872222,3.254979,0.041441,22.837529,,,,,,,,,
1993-01-08,429.049988,263470000.0,-0.390041,-13.573889,-0.872222,3.254979,,,,,,,,,


In [505]:
fred = pd.read_pickle('../data/fred.pkl')
fred.head(3)

Unnamed: 0_level_0,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-01-01,2.66,7.3,3.442924,175108.0,6.6,,,,,,
1993-01-02,2.66,,,,,,,,,,
1993-01-03,2.66,,,,,,,,,,


In [506]:
fred.describe()

Unnamed: 0,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth
count,10207.0,366.0,366.0,365.0,365.0,7648.0,7705.0,7643.0,7668.0,365.0,121.0
mean,2.478277,5.710383,2.747473,369190.167123,3.90074,1.547496,19.705348,7.335381,109.208848,0.207468,0.610526
std,2.209135,1.786939,1.272248,127066.737926,1.7379,1.174596,8.178197,0.890454,13.812318,0.279168,1.218524
min,0.04,3.4,-0.286064,172306.0,0.62,-1.89,9.14,5.7076,75.72,-1.770548,-8.484336
25%,0.15,4.4,2.1005,275192.0,2.4,0.64,13.69,6.52935,102.6975,0.068489,0.364447
50%,1.89,5.4,2.542004,357331.0,3.85,1.55,17.92,7.029,109.5,0.203804,0.638435
75%,4.91,6.3,3.162344,448091.0,5.24,2.53,23.28,8.2772,117.9675,0.32769,0.943827
max,7.8,14.7,8.463917,692501.0,7.96,3.85,82.69,8.7409,149.82,1.376849,7.854483


In [507]:
(fred['10_year_treasury_yeild']==0).sum()

0

> there is no value equal to zero in the fred data. Therefore, we can create a sample that contain all days from 1993 to 2023 by using resample and then replace the zero values with nan again. 

In [508]:
fred = fred.resample('D').sum()
fred.head(3)

Unnamed: 0_level_0,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-01-01,2.66,7.3,3.442924,175108.0,6.6,0.0,0.0,0.0,0.0,0.0,0.0
1993-01-02,2.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1993-01-03,2.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [509]:
fred.replace(0, np.nan, inplace=True)
fred.head(3)

Unnamed: 0_level_0,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-01-01,2.66,7.3,3.442924,175108.0,6.6,,,,,,
1993-01-02,2.66,,,,,,,,,,
1993-01-03,2.66,,,,,,,,,,


In [510]:
fred.ffill(inplace=True)

In [511]:
fred.head(3)

Unnamed: 0_level_0,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-01-01,2.66,7.3,3.442924,175108.0,6.6,,,,,,
1993-01-02,2.66,7.3,3.442924,175108.0,6.6,,,,,,
1993-01-03,2.66,7.3,3.442924,175108.0,6.6,,,,,,


In [512]:
us_policy_uncertainty = pd.read_excel('../data/us_policy_uncertainty.xlsx', sheet_name='Main Index')
us_policy_uncertainty = us_policy_uncertainty.iloc[0:-1, :]
us_policy_uncertainty.head()

Unnamed: 0,Year,Month,Three_Component_Index,News_Based_Policy_Uncert_Index
0,1985,1.0,125.224739,103.748802
1,1985,2.0,99.020813,78.313202
2,1985,3.0,112.190506,100.761482
3,1985,4.0,102.811325,84.77887
4,1985,5.0,120.082726,98.053658


In [513]:
us_policy_uncertainty.dtypes

Year                               object
Month                             float64
Three_Component_Index             float64
News_Based_Policy_Uncert_Index    float64
dtype: object

In [514]:
us_policy_uncertainty['date']= us_policy_uncertainty['Year'] + \
                                '-' + \
                                us_policy_uncertainty['Month'].astype(int).astype(str) + \
                                '-1'
            
us_policy_uncertainty['date'] = pd.to_datetime(us_policy_uncertainty['date'])
us_policy_uncertainty.set_index('date', inplace=True)
us_policy_uncertainty.drop(columns=['Year', 'Month'], inplace=True)

In [515]:
us_policy_uncertainty = us_policy_uncertainty.resample('D').sum()
us_policy_uncertainty.head(3)

Unnamed: 0_level_0,Three_Component_Index,News_Based_Policy_Uncert_Index
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-01,125.224739,103.748802
1985-01-02,0.0,0.0
1985-01-03,0.0,0.0


In [516]:
us_policy_uncertainty.replace(0, np.nan, inplace=True)
us_policy_uncertainty.head(3)

Unnamed: 0_level_0,Three_Component_Index,News_Based_Policy_Uncert_Index
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-01,125.224739,103.748802
1985-01-02,,
1985-01-03,,


In [517]:
us_policy_uncertainty.ffill(inplace=True)
us_policy_uncertainty.head(3)

Unnamed: 0_level_0,Three_Component_Index,News_Based_Policy_Uncert_Index
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1985-01-01,125.224739,103.748802
1985-01-02,125.224739,103.748802
1985-01-03,125.224739,103.748802


In [518]:
data = pd.merge(data, fred, how='left', left_index=True, right_index=True) \
         .merge(us_policy_uncertainty, how='left', left_index=True, right_index=True)
data.ffill(inplace=True)
data.dropna(inplace=True)

In [519]:
data.head(3) 

Unnamed: 0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,sentiment_mc,sentiment,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth,Three_Component_Index,News_Based_Policy_Uncert_Index
1993-04-01,450.299988,234530000.0,-0.303324,-15.996275,-0.066373,20.762144,0.161433,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.31,7.1,4.09652,176749.0,5.97,3.1,13.02,5.7333,114.1,0.348918,0.582113,109.256836,89.651123
1993-04-02,441.390015,323330000.0,-1.978675,37.86296,-0.303324,-15.996275,-0.024659,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.13,7.1,4.09652,176749.0,5.97,3.19,14.5,5.7333,114.2,0.348918,0.582113,109.256836,89.651123
1993-04-05,442.290009,296080000.0,0.2039,-8.427922,-1.978675,37.86296,-0.282885,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.15,7.1,4.09652,176749.0,5.97,3.16,14.12,5.7333,113.6,0.348918,0.582113,109.256836,89.651123


In [520]:
data['price_increase'] = np.where(data['pct_change_adj_close']>=0, 1, 0)

In [521]:
data.head()

Unnamed: 0,Adj Close,Volume,pct_change_adj_close,pct_change_volume,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,sentiment_mc,sentiment,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth,Three_Component_Index,News_Based_Policy_Uncert_Index,price_increase
1993-04-01,450.299988,234530000.0,-0.303324,-15.996275,-0.066373,20.762144,0.161433,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.31,7.1,4.09652,176749.0,5.97,3.1,13.02,5.7333,114.1,0.348918,0.582113,109.256836,89.651123,0
1993-04-02,441.390015,323330000.0,-1.978675,37.86296,-0.303324,-15.996275,-0.024659,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.13,7.1,4.09652,176749.0,5.97,3.19,14.5,5.7333,114.2,0.348918,0.582113,109.256836,89.651123,0
1993-04-05,442.290009,296080000.0,0.2039,-8.427922,-1.978675,37.86296,-0.282885,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.15,7.1,4.09652,176749.0,5.97,3.16,14.12,5.7333,113.6,0.348918,0.582113,109.256836,89.651123,1
1993-04-06,441.160004,293680000.0,-0.25549,-0.810592,0.2039,-8.427922,-0.375652,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,2.97,7.1,4.09652,176749.0,5.97,3.1,14.24,5.7333,114.0,0.348918,0.582113,109.256836,89.651123,0
1993-04-07,442.730011,300000000.0,0.355882,2.152002,-0.25549,-0.810592,-0.479992,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,2.93,7.1,4.09652,176749.0,5.97,3.09,13.64,5.7333,113.7,0.348918,0.582113,109.256836,89.651123,1


In [523]:
data.drop(columns=['Adj Close', 'Volume', 'pct_change_adj_close', 'pct_change_volume'], inplace=True)

In [524]:
data.head(3)

Unnamed: 0,lag_pct_change_adj_close,lag_pct_change_volume,lag_ma5_pct_change_adj_close,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,sentiment_mc,sentiment,ffr,unemployment_rate,median_cpi,retail_sales,10_year_treasury_yeild,10year_3month_yield_spread,vix,us_china_exchange_rate,us_japan_exchange_rate,inflation,growth,Three_Component_Index,News_Based_Policy_Uncert_Index,price_increase
1993-04-01,-0.066373,20.762144,0.161433,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.31,7.1,4.09652,176749.0,5.97,3.1,13.02,5.7333,114.1,0.348918,0.582113,109.256836,89.651123,0
1993-04-02,-0.303324,-15.996275,-0.024659,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.13,7.1,4.09652,176749.0,5.97,3.19,14.5,5.7333,114.2,0.348918,0.582113,109.256836,89.651123,0
1993-04-05,-1.978675,37.86296,-0.282885,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546,1.11151,0.402003,3.15,7.1,4.09652,176749.0,5.97,3.16,14.12,5.7333,113.6,0.348918,0.582113,109.256836,89.651123,1


In [525]:
data.to_pickle('../data/full_data.pkl')