In [18]:
import pandas as pd
import numpy as np
import os as os
import yfinance as yf
import pandas_datareader.data as pdr


tickers = ['AAPL', 'MSI', 'GOOGL']

price_data = yf.download(tickers, start='2011-11-01', end='2022-12-31', interval='1mo')['Close']

#we have to download the data separately for Samsung as it is not in US prices:
samsung_data = yf.download('005930.KS', start='2011-11-01', end='2022-12-31', interval='1mo')['Close']

#fetching usd/krw historical exchange rate to convert samsung KWR stock price to USD stock price
exchange_rates = pdr.DataReader('DEXKOUS', 'fred', start='2011-11-01', end='2022-12-31')
exchange_rates = exchange_rates.resample('1ME').ffill().reindex(samsung_data.index, method='nearest')
price_data['SAMS'] = samsung_data / exchange_rates['DEXKOUS']
order = ['AAPL', 'SAMS', 'GOOGL', 'MSI']
price_data = price_data[order]



weights = [0.6823, 0.2520, 0.0390, 0.0267]
weighted_prices = price_data.multiply(weights, axis='columns')
portfolio_value = weighted_prices.sum(axis=1)
log_returns = np.log(portfolio_value / portfolio_value.shift(1)).dropna()
log_returns

[*********************100%%**********************]  3 of 3 completed
[*********************100%%**********************]  1 of 1 completed


Date
2011-12-01    0.051866
2012-01-01    0.078040
2012-02-01    0.146806
2012-03-01    0.083715
2012-04-01    0.001341
                ...   
2022-08-01   -0.031358
2022-09-01   -0.128034
2022-10-01    0.095568
2022-11-01   -0.017879
2022-12-01   -0.117456
Length: 133, dtype: float64

In [32]:
#Next, we will preprocess the other exogenous variables, which includes the CPI growth rate in the US, GDP growth rate, unemployment rate, and the Michigan Sentiment Consumer Index
class PreprocessCPIGrowth:
    def __init__(self, data):
        self.data = data

    def preprocess(self):
        self.data["Date"] = pd.to_datetime(self.data["DATE"], format='%Y/%m/%d')
        self.data = self.data.sort_values("Date")
        self.data = self.data[(self.data["Date"] >= '2011-12-01') & (self.data["Date"] <= '2022-12-31')]
        self.data = self.data.groupby(self.data["Date"].dt.to_period("M")).last()
        if 'DATE' in self.data.columns:
            self.data.drop(columns=['DATE'], inplace=True)
            self.data.reset_index(drop=True, inplace=True)
        self.data['Date'] = pd.to_datetime(self.data['Date'])
        self.data.set_index('Date', inplace=True)
        return self.data


cpigrowth = pd.read_csv("./tsdata/cpigrowth.csv")
cpigrowth_preprocessed = PreprocessCPIGrowth(cpigrowth)
cpigrowth = cpigrowth_preprocessed.preprocess()


In [34]:
class PreprocessUnemploymentRate:
    def __init__(self, data):
        self.data = data

    def preprocess(self):
        self.data["Date"] = pd.to_datetime(self.data["DATE"], format='%Y-%m-%d')
        self.data = self.data.sort_values("Date")
        self.data = self.data[(self.data["Date"] >= '2011-12-01') & (self.data["Date"] <= '2022-12-31')]
        self.data = self.data.groupby(self.data["Date"].dt.to_period("M")).last()
        self.data.drop(columns=["DATE", "Date"], inplace=True)
        return self.data

unemploymentrate = pd.read_csv("./tsdata/unemploymentrate.csv")
unemploymentrat_preprocessed = PreprocessUnemploymentRate(unemploymentrate)
unemploymentrate = unemploymentrat_preprocessed.preprocess()


In [28]:
class PreprocessMichiganSentiment:
    def __init__(self, data):
        self.data = data

    def preprocess(self):
        self.data["Date"] = pd.to_datetime(self.data["YYYY"].astype(str) + '-' + self.data["Month"], format='%Y-%B')
        self.data = self.data.sort_values("Date")
        self.data = self.data[(self.data["Date"] >= '2011-12-01') & (self.data["Date"] <= '2022-12-31')]
        self.data = self.data.groupby(self.data["Date"].dt.to_period("M")).last()
        self.data.reset_index(drop=True, inplace=True)
        self.data.drop(columns=['Month', 'YYYY'], inplace=True)

        self.data['Date'] = pd.to_datetime(self.data['Date'])
        self.data.set_index('Date', inplace=True)
        return self.data



michigansentiment = pd.read_csv("./tsdata/michigansentiment.csv")
michigansentiment_preprocessed = PreprocessMichiganSentiment(michigansentiment)
michigansentiment = michigansentiment_preprocessed.preprocess()


In [29]:
class PreprocessGDPGrowth:

    def __init__(self, data):
        self.data = data

    def preprocess(self):
        self.data = self.data.drop(columns = ["...1"])
        self.data["Date"] = pd.date_range(start="1992-01-01", periods=len(self.data), freq='ME')
        self.data = self.data[(self.data["Date"] >= '2011-12-01') & (self.data["Date"] <= '2022-12-31')]
        self.data = self.data[['Date', 'Monthly Real GDP Index']]

        self.data['Date'] = pd.to_datetime(self.data['Date'])
        self.data.set_index('Date', inplace=True)
        self.data['GDP Growth Rate'] = np.log(self.data['Monthly Real GDP Index'] / self.data['Monthly Real GDP Index'].shift(1))
        #self.data = self.data.iloc[1:]
        return self.data

gdpgrowth = pd.read_csv("./tsdata/gdpgrowth.csv")
gdpgrowth_preprocessed = PreprocessGDPGrowth(gdpgrowth)
gdpgrowth = gdpgrowth_preprocessed.preprocess()

Unnamed: 0_level_0,Monthly Real GDP Index,GDP Growth Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-12-31,17207.578455,
2012-01-31,17267.041564,0.003450
2012-02-29,17510.271157,0.013988
2012-03-31,17324.287582,-0.010678
2012-04-30,17412.941827,0.005104
...,...,...
2022-08-31,21980.002956,0.012468
2022-09-30,21869.882052,-0.005023
2022-10-31,21905.775870,0.001640
2022-11-30,22011.625365,0.004820


In [35]:
#Create final data frame
df = pd.DataFrame()

df.index = log_returns.index

sentiment = pd.read_csv("./Sentiment_data/sentiment_scores.csv")


df["returns"] = log_returns.values
#df["gdpgrowth"] = gdpgrowth["Monthly Real GDP Index"].values
#df["cpigrowth"] = cpigrowth["USACPALTT01CTGYM"].values
#df["unemp"] = unemploymentrate["UNRATE"].values
df["msci"] = michigansentiment["ICS_ALL"].values
df.to_csv('./tsdata/preprocessed_data.csv', index=True)