In [None]:
!pip install polars
!pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yfinance
  Downloading yfinance-0.1.90-py2.py3-none-any.whl (29 kB)
Collecting requests>=2.26
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.6 MB/s 
Installing collected packages: requests, yfinance
  Attempting uninstall: requests
    Found existing installation: requests 2.23.0
    Uninstalling requests-2.23.0:
      Successfully uninstalled requests-2.23.0
Successfully installed requests-2.28.1 yfinance-0.1.90


In [None]:
import polars as pl
import yfinance as yf
import pandas as pd
import concurrent.futures
import psutil
import time

In [None]:
# Date window handed to the downloader, plus the proxy setting (None here).
start_date, end_date = '2015-01-01', '2019-12-31'
proxy = None

# The 30 ticker symbols to download.
ticker_list = [
    'AXP', 'AMGN', 'AAPL', 'BA', 'CAT', 'CSCO',
    'CVX', 'GS', 'HD', 'HON', 'IBM', 'INTC',
    'JNJ', 'KO', 'JPM', 'MCD', 'MMM', 'MRK',
    'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'CRM',
    'VZ', 'V', 'WBA', 'WMT', 'DIS', 'DOW',
]

# Names of the technical-indicator columns requested from the feature engineer.
INDICATORS = [
    'macd',
    'boll_ub', 'boll_lb',
    'rsi_30', 'cci_30', 'dx_30',
    'close_30_sma', 'close_60_sma',
    'wr_30', 'atr_30',
    'supertrend',
    'mfi_30',
    'close_30_tema',
]

In [None]:
class YahooDownloader:
    """Download price data through ``yf.download`` and reshape it to long format.

    Attributes:
        start_date: passed to ``yf.download`` as ``start``
        end_date: passed to ``yf.download`` as ``end``
        ticker_list: tickers passed to ``yf.download``
    """

    # Maps the field names found in the downloaded frame to standardized names.
    _FIELD_NAMES = {
        'Adj Close': 'adjcp',
        'Close': 'close',
        'High': 'high',
        'Low': 'low',
        'Open': 'open',
        'Volume': 'volume',
    }
    # Column order of the long-format frame (``fetch_data`` appends 'day' afterwards).
    _OUTPUT_COLUMNS = ['date', 'tic', 'close', 'high', 'low', 'open', 'volume']

    def __init__(self, start_date: str, end_date: str, ticker_list: list):
        """
        Initialize the date range

        Args:
            start_date: starting date
            end_date: ending date
            ticker_list: list of stock tickers
        """
        self.start_date = start_date
        self.end_date = end_date
        self.ticker_list = ticker_list

    def fetch_data(self, proxy=None):
        """
        Fetch the data from Yahoo Finance and return it as a long pandas DataFrame.

        Args:
            proxy: proxy to download from

        Returns:
            DataFrame sorted by ('date', 'tic') with the columns
            date (``'%Y-%m-%d'`` string), tic, close, high, low, open, volume, day.
            ``close`` holds the adjusted close whenever the download provides one.

        Raises:
            ValueError: if the download lacks one of the expected price fields.
        """
        raw = yf.download(
            self.ticker_list, start=self.start_date, end=self.end_date, proxy=proxy
        )
        data_df = self._to_long_format(raw)
        # create day of the week column (monday = 0)
        data_df['day'] = data_df['date'].dt.dayofweek
        # convert date to standard string format, easy to filter
        data_df['date'] = data_df['date'].dt.strftime('%Y-%m-%d')
        # drop missing data
        data_df = data_df.dropna().reset_index(drop=True)
        print('Shape of DataFrame: ', data_df.shape)

        return data_df.sort_values(by=['date', 'tic']).reset_index(drop=True)

    def _to_long_format(self, raw):
        """Turn the wide downloaded frame into one row per (date, ticker)."""
        if isinstance(raw.columns, pd.MultiIndex):
            # Two-level columns: level 0 is the price field, level 1 the ticker.
            frames = []
            for tic in raw.columns.get_level_values(1).unique():
                per_ticker = raw.xs(tic, axis=1, level=1).copy()
                per_ticker.columns.name = None
                per_ticker['tic'] = tic
                frames.append(per_ticker)
            long_df = pd.concat(frames) if frames else pd.DataFrame(index=raw.index)
        else:
            # Flat field columns (yfinance returns these for a single ticker,
            # depending on its version): the only ticker is the requested one.
            long_df = raw.copy()
            long_df.columns.name = None
            tickers = self.ticker_list
            long_df['tic'] = tickers if isinstance(tickers, str) else tickers[0]

        long_df = (
            long_df.rename_axis('date')
            .reset_index()
            .rename(columns=self._FIELD_NAMES)
        )
        if 'adjcp' in long_df.columns:
            # use adjusted close price instead of close price
            long_df['close'] = long_df['adjcp']

        missing = [col for col in self._OUTPUT_COLUMNS if col not in long_df.columns]
        if missing:
            raise ValueError(f'downloaded data is missing expected columns: {missing}')

        # Selecting the output columns also drops the adjusted close column.
        long_df = long_df[self._OUTPUT_COLUMNS]
        # Prices and volume are returned as floats, as the previous melt-based reshape did.
        return long_df.astype({col: float for col in self._OUTPUT_COLUMNS[2:]})

    def select_equal_rows_stock(self, df):
        """
        Keep only the tickers whose row count is at least the mean row count per ticker.

        Args:
            df: stock dataframe with a 'tic' column

        Returns:
            the rows of ``df`` that belong to the retained tickers
        """
        counts = df.tic.value_counts()
        kept_tickers = counts[counts >= counts.mean()].index
        return df[df.tic.isin(kept_tickers)]


In [None]:
import datetime

import numpy as np
import pandas as pd
import polars as pl
# from multiprocessing.sharedctypes import Value


def load_dataset(*, file_name: str):
    """
    Read a csv file into a dataframe.

    :param file_name: (str) path of the csv file (keyword-only)
    :return: (df) the frame produced by ``pl.read_csv`` (polars)
    """
    dataset = pl.read_csv(file_name)
    return dataset


def data_split(df, start, end, target_date_col='date'):
    """
    Cut out the rows whose date falls in the half-open window [start, end).

    :param df: (df) pandas dataframe with ``target_date_col`` and 'tic' columns
    :param start: lower bound of the window (inclusive)
    :param end: upper bound of the window (exclusive)
    :param target_date_col: (str) name of the date column
    :return: (df) pandas dataframe sorted by date and ticker; rows that share a
        date share the same integer index label (0 for the first date, and so on)
    """
    dates = df[target_date_col]
    in_window = (dates >= start) & (dates < end)
    subset = df[in_window].sort_values([target_date_col, 'tic'], ignore_index=True)
    # factorize() numbers the distinct dates in order of appearance
    day_codes, _ = subset[target_date_col].factorize()
    subset.index = day_codes
    return subset


def convert_to_datetime(time):
    """
    Parse a time string of the form ``YYYY-MM-DDTHH:MM:SS`` into ``datetime.datetime``.

    Args:
        time: time represented as a string

    Returns:
        the parsed ``datetime.datetime``, or ``None`` when ``time`` is not a string
    """
    if not isinstance(time, str):
        return None
    return datetime.datetime.strptime(time, '%Y-%m-%dT%H:%M:%S')


class FeatureEngineer:
    """Provides methods for preprocessing the stock price data

    Attributes
    ----------
        use_technical_indicator : boolean
            use technical indicators or not
        tech_indicator_list : list
            a list of technical indicator names
        use_vix : boolean
            add the vix column or not
        use_turbulence : boolean
            use turbulence index or not
        user_defined_feature : boolean
            use user defined features or not

    Methods
    -------
    preprocess_data()
        main method to do the feature engineering

    """

    def __init__(
        self,
        use_technical_indicator=True,
        tech_indicator_list=None,
        use_vix=False,
        use_turbulence=False,
        user_defined_feature=False,
    ):
        """
        Initializes the configuration.

        Args:
            use_technical_indicator: Boolean
            tech_indicator_list: list of technical indicator names; ``None``
                (the default) selects a copy of the notebook-level ``INDICATORS``
            use_vix: Boolean
            use_turbulence: Boolean
            user_defined_feature: Boolean
        """
        if tech_indicator_list is None:
            # NOTE(review): the previous default was `config.INDICATORS`, but no `config`
            # is imported in the visible cells, so the class definition itself raised
            # NameError. The notebook-level INDICATORS list is used instead, resolved at
            # call time and copied so instances never share one mutable default.
            tech_indicator_list = list(INDICATORS)
        self.use_technical_indicator = use_technical_indicator
        self.tech_indicator_list = tech_indicator_list
        self.use_vix = use_vix
        self.use_turbulence = use_turbulence
        self.user_defined_feature = user_defined_feature

    def preprocess_data(self, df):
        """main method to do the feature engineering
        :param df: (df) source dataframe with 'date', 'tic' and 'close' columns
        :return: (df) pandas dataframe with the enabled feature columns added
        """
        # clean data
        df = self.clean_data(df)

        # add technical indicators using stockstats
        if self.use_technical_indicator:
            df = self.add_technical_indicator(df)
            print('Successfully added technical indicators')

        # add vix for multiple stock
        if self.use_vix:
            df = self.add_vix(df)
            print('Successfully added vix')

        # add turbulence index for multiple stock
        if self.use_turbulence:
            df = self.add_turbulence(df)
            print('Successfully added turbulence index')

        # add user defined feature
        if self.user_defined_feature:
            df = self.add_user_defined_feature(df)
            print('Successfully added user defined features')

        # fill the missing values at the beginning and the end
        # (ffill()/bfill() replace the deprecated fillna(method=...) spelling)
        return df.ffill().bfill()

    def clean_data(self, data):
        """
        clean the raw data
        deal with missing values
        reasons: stocks could be delisted, not incorporated at the time step
        :param data: (df) pandas dataframe; an object exposing ``to_pandas()``
            (e.g. a polars frame) is converted first
        :return: (df) pandas dataframe sorted by date and ticker, restricted to the
            tickers that have a close price on every date, indexed by date ordinal
        """
        if hasattr(data, 'to_pandas'):
            # duck-typed conversion, the rest of this class works on pandas frames
            data = data.to_pandas()
        # sort_values returns a new frame, so the caller's frame is left untouched
        df = data.sort_values(['date', 'tic'], ignore_index=True)
        df.index = df.date.factorize()[0]
        merged_closes = df.pivot_table(index='date', columns='tic', values='close')
        # a ticker with a missing close on any date is dropped entirely
        merged_closes = merged_closes.dropna(axis=1)
        return df[df.tic.isin(merged_closes.columns)]

    def add_technical_indicator(self, data):
        """
        calculate technical indicators
        use stockstats package to add technical indicators
        :param data: (df) pandas dataframe
        :return: (df) pandas dataframe with one column per indicator, sorted by
            date and ticker
        """
        df = data.sort_values(by=['tic', 'date'])
        # NOTE(review): `Sdf` is not imported in the visible cells, presumably it is
        # stockstats' StockDataFrame; confirm and add the import/install.
        stock = Sdf.retype(df.copy())
        unique_ticker = stock.tic.unique()

        for indicator in self.tech_indicator_list:
            # collect the per-ticker pieces and concatenate once
            # (DataFrame.append no longer exists in pandas 2.x)
            per_ticker = []
            for tic in unique_ticker:
                try:
                    temp_indicator = stock[stock.tic == tic][indicator]
                    temp_indicator = pd.DataFrame(temp_indicator)
                    temp_indicator['tic'] = tic
                    temp_indicator['date'] = df[df.tic == tic]['date'].to_list()
                    per_ticker.append(temp_indicator)
                except Exception as e:
                    # best effort: a ticker that fails is reported and skipped
                    print(e)
            if not per_ticker:
                print(f'indicator {indicator!r} could not be computed for any ticker; skipping')
                continue
            indicator_df = pd.concat(per_ticker, ignore_index=True)
            df = df.merge(
                indicator_df[['tic', 'date', indicator]], on=['tic', 'date'], how='left'
            )
        return df.sort_values(by=['date', 'tic'])

    def add_user_defined_feature(self, data):
        """
        add user defined features
        :param data: (df) pandas dataframe with 'tic' and 'close' columns
        :return: (df) pandas dataframe with a 'daily_return' column
        """
        df = data.copy()
        # Compute the return within each ticker: on a frame that interleaves tickers,
        # a plain pct_change would compare the closes of different stocks.
        # to_numpy() assigns by position, because the row index may repeat per date.
        df['daily_return'] = df.groupby('tic')['close'].pct_change(1).to_numpy()
        return df

    def add_vix(self, data):
        """
        add vix from yahoo finance
        :param data: (df) pandas dataframe
        :return: (df) pandas dataframe with a 'vix' column, sorted by date and ticker
        """
        # NOTE(review): yfinance presumably treats `end` as exclusive, so the last date
        # may get no vix value and be dropped by the inner merge below; confirm.
        df_vix = YahooDownloader(
            start_date=data.date.min(), end_date=data.date.max(), ticker_list=['^VIX']
        ).fetch_data()
        # rename() builds a new frame instead of assigning .columns on a slice
        vix = df_vix[['date', 'close']].rename(columns={'close': 'vix'})

        df = data.merge(vix, on='date')
        return df.sort_values(['date', 'tic']).reset_index(drop=True)

    def add_turbulence(self, data):
        """
        add turbulence index computed by ``calculate_turbulence``
        :param data: (df) pandas dataframe
        :return: (df) pandas dataframe with a 'turbulence' column, sorted by date
            and ticker
        """
        turbulence_index = self.calculate_turbulence(data)
        df = data.merge(turbulence_index, on='date')
        return df.sort_values(['date', 'tic']).reset_index(drop=True)

    def calculate_turbulence(self, data):
        """calculate the turbulence index from the close prices of all tickers

        :param data: (df) pandas dataframe with 'date', 'tic' and 'close' columns
        :return: (df) pandas dataframe with the columns 'date' and 'turbulence'
        :raises Exception: when the frame cannot be assembled, e.g. fewer dates
            than the one-year window
        """
        # can add other market assets
        # use returns to calculate turbulence
        df_price_pivot = data.pivot(index='date', columns='tic', values='close').pct_change()

        unique_date = data.date.unique()
        # start after a year; the same length is used for the rolling window below
        start = 252
        turbulence_index = [0] * start
        count = 0
        for i in range(start, len(unique_date)):
            current_price = df_price_pivot[df_price_pivot.index == unique_date[i]]
            # use one year rolling window to calculate covariance
            hist_price = df_price_pivot[
                (df_price_pivot.index < unique_date[i])
                & (df_price_pivot.index >= unique_date[i - start])
            ]
            # Drop tickers which have more missing values than the "oldest" ticker
            filtered_hist_price = hist_price.iloc[
                hist_price.isna().sum().min():
            ].dropna(axis=1)

            cov_temp = filtered_hist_price.cov()
            current_temp = current_price[list(filtered_hist_price)] - np.mean(
                filtered_hist_price, axis=0
            )

            # quadratic form of today's deviation with the pseudo-inverse covariance
            temp = current_temp.values.dot(np.linalg.pinv(cov_temp)).dot(
                current_temp.values.T
            )
            if temp > 0:
                count += 1
                # the first two positive values are recorded as 0
                turbulence_temp = temp[0][0] if count > 2 else 0
            else:
                turbulence_temp = 0
            turbulence_index.append(turbulence_temp)
        try:
            turbulence_index = pd.DataFrame(
                {'date': df_price_pivot.index, 'turbulence': turbulence_index}
            )
        except ValueError as err:
            raise Exception('Turbulence information could not be added.') from err
        return turbulence_index


In [None]:
# Download the configured tickers for the configured window as a long-format frame.
downloader = YahooDownloader(
    start_date=start_date,
    end_date=end_date,
    ticker_list=ticker_list,
)
data = downloader.fetch_data()

[*********************100%***********************]  30 of 30 completed
Shape of DataFrame:  (36651, 8)


In [None]:
# Index every row by the ordinal of its date (rows of one date share a label),
# then show the close prices as a date x ticker table.
date_codes = data.date.factorize()[0]
data.index = date_codes
data.pivot_table(values='close', index='date', columns='tic')

tic,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,24.603207,128.152267,82.486214,113.657219,72.959435,59.240002,21.631609,79.640038,87.376266,,...,42.481163,40.811428,43.636982,71.751961,87.615868,89.239807,62.743183,32.520531,60.070499,72.252914
2015-01-05,23.910097,126.629402,80.304810,112.870064,69.108185,58.169998,21.200697,76.456680,86.099388,,...,43.112556,40.436138,42.934418,71.410805,86.560539,87.769913,61.358200,32.250435,58.884918,72.042648
2015-01-06,23.912346,122.549767,78.593369,111.540627,68.663498,57.200001,21.192860,76.421288,85.642715,,...,44.806156,39.842632,42.681862,71.085495,85.787766,87.592796,60.962830,32.575935,59.035091,72.597778
2015-01-07,24.247646,126.829781,80.309853,113.272369,69.727562,56.930000,21.388725,76.357620,86.518822,,...,45.764374,40.348854,43.563503,71.458374,87.250237,88.487137,61.779617,32.365719,60.544727,74.523941
2015-01-08,25.179295,126.372917,81.448219,115.275261,70.442207,58.590000,21.553257,78.104935,87.413528,,...,46.685471,41.535843,44.569149,72.275574,89.061737,92.710945,62.608238,33.059410,61.295624,76.096863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,69.557091,222.372681,119.349228,335.553558,137.964401,163.740005,43.802269,104.443703,144.679993,47.014267,...,79.321320,153.031235,97.310539,116.018761,126.941963,282.666840,183.412933,53.355652,51.639957,113.459450
2019-12-24,69.623230,221.732162,119.588890,331.030457,137.016754,163.250000,43.510864,104.452377,145.289993,46.988598,...,79.061920,153.002075,97.407822,116.316017,126.569260,282.140045,183.893311,53.251373,51.445988,113.917000
2019-12-26,71.004593,221.338715,120.231232,327.968689,137.704224,164.509995,43.574615,104.678047,145.699997,47.262482,...,78.975456,154.256180,97.962234,116.316017,126.997864,283.203339,185.452164,53.260063,51.930912,113.926514
2019-12-27,70.977646,221.000168,120.020325,328.187408,137.760010,164.979996,43.501759,104.417664,145.750000,46.928688,...,79.113792,154.538147,98.798790,117.124146,127.473114,283.509796,185.677643,53.468616,52.036720,113.993240


In [None]:
# This reshape needs the raw wide download (columns: price field x ticker, index: Date).
# After the cells above, `data` is already the long frame returned by fetch_data, so the
# raw frame is fetched here explicitly instead of relying on leftover kernel state.
raw_prices = yf.download(ticker_list, start=start_date, end=end_date, proxy=proxy)
data = (
    raw_prices.unstack()
    .reset_index()
    .melt(id_vars=['Date', 'level_0', 'level_1'], value_vars=0)
)
data

Unnamed: 0,Date,level_0,level_1,variable,value
0,2015-01-02,Adj Close,AAPL,0,2.460321e+01
1,2015-01-05,Adj Close,AAPL,0,2.391009e+01
2,2015-01-06,Adj Close,AAPL,0,2.391234e+01
3,2015-01-07,Adj Close,AAPL,0,2.424764e+01
4,2015-01-08,Adj Close,AAPL,0,2.517930e+01
...,...,...,...,...,...
226255,2019-12-23,Volume,WMT,0,4.485800e+06
226256,2019-12-24,Volume,WMT,0,2.227400e+06
226257,2019-12-26,Volume,WMT,0,4.223800e+06
226258,2019-12-27,Volume,WMT,0,3.544000e+06


In [None]:
# Pivot back to one row per (Date, ticker) with one column per price field.
# rename() needs columns= here: a bare mapping is applied to the row index, which left
# the 'level_1' column unrenamed.
data = (
    data.pivot_table(values='value', index=['Date', 'level_1'], columns='level_0')
    .reset_index()
    .rename(columns={'level_1': 'tic'})
)

level_0,Date,level_1,Adj Close,Close,High,Low,Open,Volume
0,2015-01-02,AAPL,24.603207,27.332500,27.860001,26.837500,27.847500,212818400.0
1,2015-01-02,AMGN,128.152283,159.889999,162.589996,158.600006,160.160004,2605400.0
2,2015-01-02,AXP,82.486214,93.019997,93.940002,92.139999,93.169998,2437500.0
3,2015-01-02,BA,113.657234,129.949997,131.839996,129.089996,131.070007,4294200.0
4,2015-01-02,CAT,72.959427,91.879997,92.370003,90.660004,91.769997,3767900.0
...,...,...,...,...,...,...,...,...
36646,2019-12-30,UNH,281.479095,293.850006,296.540009,293.450012,296.049988,1511700.0
36647,2019-12-30,V,184.148178,187.830002,189.479996,187.119995,189.309998,4833600.0
36648,2019-12-30,VZ,53.190540,61.209999,61.689999,61.090000,61.650002,6765400.0
36649,2019-12-30,WBA,51.939724,58.910000,59.599998,58.810001,59.090000,3264500.0


In [None]:
# Display the current contents of `data`.
# NOTE(review): the saved output below has two header levels (price field, ticker), i.e. it
# shows the raw wide download rather than the reshaped frame; presumably this cell was
# executed out of order. Confirm with Restart & Run All.
data

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AAPL,AMGN,AXP,BA,CAT,CRM,CSCO,CVX,DIS,DOW,...,MRK,MSFT,NKE,PG,TRV,UNH,V,VZ,WBA,WMT
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-02,24.603207,128.152206,82.486252,113.657211,72.959435,59.240002,21.631603,79.639999,87.376274,,...,7415962,27913900,4985800,7251400,1270800,3060900,8389600,11421200,3938500,4501800
2015-01-05,23.910093,126.629379,80.304771,112.870049,69.108170,58.169998,21.200695,76.456688,86.099388,,...,16727338,39673900,6889200,8626100,1728700,4679000,12751200,18964500,4767900,6979000
2015-01-06,23.912348,122.549767,78.593346,111.540627,68.663498,57.200001,21.192865,76.421288,85.642700,,...,25453510,36447900,7576000,7791200,2350900,3468300,11070000,22950100,4881600,8205100
2015-01-07,24.247643,126.829811,80.309830,113.272369,69.727554,56.930000,21.388725,76.357628,86.518814,,...,19528956,29114100,7256000,5986600,1558200,3225800,9346800,20793600,5672100,8498400
2015-01-08,25.179295,126.372910,81.448235,115.275269,70.442238,58.590000,21.553255,78.104919,87.413544,,...,20038808,29645200,5978200,6823300,1941200,5346100,10443200,17617500,4083900,12713600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-23,69.557098,222.372665,119.349205,335.553558,137.964417,163.740005,43.802273,104.443703,144.679993,47.014271,...,7895946,17718200,6350500,6619500,1003000,2098900,5831700,9477500,4813200,4485800
2019-12-24,69.623230,221.732162,119.588905,331.030457,137.016739,163.250000,43.510864,104.452377,145.289993,46.988598,...,2343223,8989200,3328100,2022100,486200,714000,2420900,6640100,1475400,2227400
2019-12-26,71.004585,221.338715,120.231247,327.968689,137.704300,164.509995,43.574615,104.678047,145.699997,47.262482,...,3062675,14520600,3709900,4760400,927700,1050600,5237000,8117800,2736100,4223800
2019-12-27,70.977623,221.000168,120.020317,328.187408,137.760010,164.979996,43.501759,104.417656,145.750000,46.928680,...,3758966,18412800,5307500,5192000,614400,1547100,5448600,8166800,3647600,3544000
