In [16]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import date
import datetime
import time
from tqdm import tqdm
import yfinance as yf
from datetime import timedelta
from yahoo_earnings_calendar import YahooEarningsCalendar
import dateutil.parser
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, GRU, SimpleRNN, Dense, GlobalMaxPool1D
from tensorflow.keras.optimizers import SGD, Adam
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import twint
import nest_asyncio
import requests
import talib
pd.set_option('display.max_rows', 10000)

# Data Ideas

In [105]:
# TEST LSTM MODEL ONLY ON ONE STOCK FIRST. LIMITED TO 8 API CALLS PER MINUTE
# Sources
    # https://betterprogramming.pub/can-tweets-predict-stock-market-returns-using-python-ddba669a4efc
    # https://twelvedata.com/
    # https://www.alphavantage.co/documentation/
    # https://pypi.org/project/stockstats/
    # https://finnhub.io/docs/api/investment-themes-thematic-investing
    # http://finviz.com/
    
# Time frame 
    # D, W, M level 
    
# Look back
    # At least 6 months 
    
# Features 
    # Candlesticks (all) - get_candlesticks (0)
    # Volume
        # Relative - get_rvol (1)
        # Current - get_ohlcv (1)
    # SMA (10, 20, 50, 100, 200) - get_sma (0)
    # EMA - get_ema (1)
    # MACD - get_macd (1)
    # Patterns detection - captured by all technical indicators (28)
    # RSI - get_rsi (1)
    # Market news 
    # Number of days till next report 
        # CPI
        # Unemployment
        # Earnings
        # Fed meetings
    # Economic indicators (alpha vantage) - only allows 5 calls per minute
        # CPI - get_cpi (1)
        # Inflation - get_inflation (1)
        # Treasury Yield - get_treasury_yield (1)
        # Consumer sentiment - get_consumer_sentiment (1)
        # Interest rate - get_interest_rate (1)
    # VIX - get_vix (1)
    # S&P 500 price - get_spy (1)
    # Social sentiment - not necessary (priced in, captured by technical indicators, momenum, volume)
    # ATR - get_atr (1)
    # Gap - get_gap (0)
    # Fundamentals (70)
        # P/B
        # P/E
        # Forward p/e
        # P/S
        # Dividend yield 
        # Debt/equity
        # Institutional ownership
        # Float short 
        # Institutional transactions 
        # ROE
        # Year over year revenue growth 
        # Year over year earnings performance 
        # Analyst predictions 
        # Institutional owners(buffet, arkk etc) 
    # Support/resistance 
    # Bollinger bands - get_bb (1)
    # Sector
    # Market cap 
    # Difference from 
        # SMA - get_difference_from
        # Bollinger bands - get_percent_b
        # 10 day SMA relative to 20 day SMA etc. - get_difference_from
    # Sector performance - get_sector_perf (16)
    # sector trends - get_sector_slope (16)
    # Previous dependent variable - get_day_return
    # Price - get_ohlcv (1)
    # commodities prices (1) - get_commodities
        # gold, oil, milk, eggs, coffee, metals
    # Predictions
        # SMA
        # EMA
        # SES
        # SARIMAX
        # FB Prophet
    
# Feature engineering 
    # Lag terms 
    # Adstock/carryover 
        # What is the half life for each feature?
    # PCA 
    # min-max scale before pca - min_max_robust_scale()
    # box-cox transformation
    
# Dependent variable (next time period)
    # Up/down - binary
    # Up 2%/down - binary
    # Up 5%/down - binary
    # Candlesticks - binary
    # % return - continuous


In [17]:
api_url = 'https://www.alphavantage.co/query?function=EARNINGS_CALENDAR&symbol=IBM&horizon=12month&apikey=0QUFD5QZ18YFM807'
with requests.Session() as s:
    download = s.get(api_url)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    my_list = list(cr)
    for row in my_list:
        print(row)

['symbol', 'name', 'reportDate', 'fiscalDateEnding', 'estimate', 'currency']
['IBM', 'International Business Machines Corp', '2022-01-24', '2021-12-31', '3.4', 'USD']
['IBM', 'International Business Machines Corp', '2022-04-18', '2022-03-31', '', 'USD']
['IBM', 'International Business Machines Corp', '2022-07-18', '2022-06-30', '', 'USD']
['IBM', 'International Business Machines Corp', '2022-10-18', '2022-09-30', '', 'USD']


# Data Pull Class

In [67]:
class TimeSeriesData:
    """
    """
    def __init__(self, api_key, api_key_alpha, tickers, interval, start_dt, end_dt):
        self.api_key = api_key
        self.api_key_alpha = api_key_alpha
        self.tickers = tickers
        self.interval = interval
        self.start_dt = start_dt
        self.end_dt = end_dt
        
    def _org_df(self, df, ticker):
        """
        This is a private method organizes the output a data frame to have the ticker symbol as the first column and the rest
        are the remaining columns. 
        
        Parameters
        ----------
        df: DataFrame
            The dataframe to organize
        ticker: string
            The ticker to use as the first column 
        
        Returns
        -------
        df_out: DataFrame
            The organized dataframe
        """
        df_out = df.copy()
        df_out['ticker'] = ticker
        df_out = df_out[['ticker'] + df_out.drop(['ticker'], axis=1).columns.tolist()]
        return df_out
    
    def _call_api(self, api, params=''):
        """
        This method calls the necessary api to get the output of the desired features for every stock
    
        Returns
        -------
        df_final: DataFrame
            Contains all columns desired for each stock
        """
        df_final = pd.DataFrame()
        for i in self.tickers:
            api_url = f'https://api.twelvedata.com/{api}?symbol={i}&interval={self.interval}&order=ASC&start_date={self.start_dt}&end_date={self.end_dt}&{params}apikey={self.api_key}'
            data = requests.get(api_url).json()
            df = pd.DataFrame(data['values'])
            df = self._org_df(df, i)
            df_final = pd.concat([df_final, df])
        df_final = df_final.reset_index().drop(['index'], axis=1)
        return df_final
    
    def _call_api_alpha(self, api, params=''):
        """
        Calls the necessary alpha vantage api
        """
        api_url = f'https://www.alphavantage.co/query?function={api}&{params}apikey={self.api_key_alpha}'
        data = requests.get(api_url).json()
        df_out = pd.DataFrame(data['data'])
        df_out.columns = ['datetime', api]
        df_out.columns = df_out.columns.str.lower()
        df_out = df_out[(df_out['datetime'] >= self.start_dt) & (df_out['datetime'] <= self.end_dt)]
        df_out = df_out.sort_values(['datetime'], ascending=True)
        return df_out
        
    def get_ohlcv(self):
        """
        Get open, high, low, close, volume
        """
        df_final = self._call_api(api='time_series')
        return df_final
    
    def get_bb(self):
        """
        Get bollinger bands
        """
        params = 'ma_type=SMA&'
        df_final = self._call_api(api='bbands', params=params)
        return df_final
    
    def get_candlesticks(self, df):
        """
        Gets all candlesticks using ta-lib
        """
        cdl_df = df.copy()
        candle_names = talib.get_function_groups()['Pattern Recognition']
        op = cdl_df['open']
        hi = cdl_df['high']
        lo = cdl_df['low']
        cl = cdl_df['close']
        # create columns for each pattern
        for candle in candle_names:
            # below is same as;
            # df["CDL3LINESTRIKE"] = talib.CDL3LINESTRIKE(op, hi, lo, cl)
            cdl_df[candle] = getattr(talib, candle)(op, hi, lo, cl)
        return cdl_df
    
    def get_sma(self, df, size, price_col):
        """
        Gets the simple moving average based on specified size. Size can be a list of window sizes. 
        """
        df_out = df.copy()
        for i in size:
            df_out[f'sma_{i}'] = df_out.groupby('ticker').rolling(window = i)[price_col].mean().reset_index(drop=True)
            df_out[f'sma_{i}'] = pd.to_numeric(df_out[f'sma_{i}'])
            #df_out[f'sma_{i}'] = df_out[price_col].rolling(window = i).mean()
        return df_out
    
    def get_ema(self):
        """
        Gets the exponential moving average. 
        """
        params = 'time_period=9&'
        df_final = self._call_api(api='ema', params=params)
        return df_final
    
    def get_macd(self):
        """
        Get moving average convergence divergence
        """
        params = 'fast_period=12&slow_period=26&'
        df_final = self._call_api(api='macd', params=params)
        return df_final
    
    def get_rsi(self):
        """
        Get relative strength index
        """
        params = 'time_period=14&'
        df_final = self._call_api(api='rsi', params=params)
        return df_final
    
    def get_vix(self):
        """
        Get closing price of vix
        """
        api_url = f'https://api.twelvedata.com/time_series?symbol=VIX&interval={self.interval}&order=ASC&start_date={self.start_dt}&end_date={self.end_dt}&apikey={self.api_key}'
        data = requests.get(api_url).json()
        df = pd.DataFrame(data['values'])
        df = df[['datetime', 'close']]
        df = df.rename(columns={'close':'vix_close'})
        df = df.reset_index().drop(['index'], axis=1)
        return df
    
    def get_spy(self):
        """
        Get closing price of s&p 500
        """
        api_url = f'https://api.twelvedata.com/time_series?symbol=SPY&interval={self.interval}&order=ASC&start_date={self.start_dt}&end_date={self.end_dt}&apikey={self.api_key}'
        data = requests.get(api_url).json()
        df = pd.DataFrame(data['values'])
        df = df[['datetime', 'close']]
        df = df.rename(columns={'close':'spy_close'})
        df = df.reset_index().drop(['index'], axis=1)
        return df
    
    def get_atr(self):
        """
        Get average true range
        """
        params = 'time_period=14&'
        df_final = self._call_api(api='atr', params=params)
        return df_final
    
    def get_ad(self):
        """
        Get ad line. Chaikin A/D Line(AD) calculates the Advance/Decline of an asset. This indicator belongs to the group of
        Volume Indicators.
        """
        params = ''
        df_final = self._call_api(api='ad', params=params)
        return df_final
    
    def get_adosc(self):
        """
        Chaikin A/D Oscillator(ADOSC) is an indicator, which finds the relationship between increasing and
        decreasing volume with price fluctuations. The Chaikin Oscillator measures the momentum of the
        Accumulation/Distribution Line(ADL) using two Exponential Moving Averages of varying length to the line(MACD).
        """
        params = ''
        df_final = self._call_api(api='adosc', params=params)
        return df_final
    
    def get_adx(self):
        """
        Average Directional Index(ADX) is used to decide if the price trend is strong.
        """
        params = ''
        df_final = self._call_api(api='adx', params=params)
        return df_final
    
    def get_adxr(self):
        """
        Average Directional Movement Index Rating(ADXR) is a smoothed version of the ADX indicator. ADXR quantifies momentum
        change in the ADX.
        """
        params = ''
        df_final = self._call_api(api='adxr', params=params)
        return df_final
    
    def get_apo(self):
        """
        Absolute Price Oscillator(APO) calculates the difference between two price moving averages.
        """
        params = ''
        df_final = self._call_api(api='apo', params=params)
        return df_final
    
    def get_aroon(self):
        """
        Aroon Indicator(AROON) is used to identify if the price is trending. It can also spot the beginning of a new trend and
        its strength.
        """
        params = ''
        df_final = self._call_api(api='aroon', params=params)
        return df_final
    
    def get_aroonosc(self):
        """
        Aroon Oscillator(AROONOSC) uses classic Aroon(Aroon Up and Aroon down) to measure the strength of persisting trends and
        whether they will continue.
        """
        params = ''
        df_final = self._call_api(api='aroonosc', params=params)
        return df_final
    
    def get_bop(self):
        """
        Balance of Power(BOP) measures the relative strength between buyers and sellers by assessing the ability of move price
        to an extreme level.
        """
        params = ''
        df_final = self._call_api(api='bop', params=params)
        return df_final
    
    def get_cci(self):
        """
        Commodity Channel Index(CCI) is a universal indicator that can help to identify new trends and assess current critical
        conditions.
        """
        params = ''
        df_final = self._call_api(api='cci', params=params)
        return df_final
    
    def get_cmo(self):
        """
        Chande Momentum Oscillator(CMO) is used to show overbought and oversold conditions.
        """
        params = ''
        df_final = self._call_api(api='cmo', params=params)
        return df_final
    
    def get_coppock(self):
        """
        Coppock Curve(COPPOCK) is usually used to detect long-term trend changes, typically on monthly charts.
        """
        params = ''
        df_final = self._call_api(api='coppock', params=params)
        return df_final
    
    def get_crsi(self):
        """
        ConnorsRSI(CRSI) is used to show the oversold and overbought levels of the RSI values.
        """
        params = ''
        df_final = self._call_api(api='crsi', params=params)
        return df_final
    
    def get_dpo(self):
        """
        Detrended Price Oscillator(DPO) is used to separate price from the trend, in order to more clearly identify the length
        of cycles.
        """
        params = ''
        df_final = self._call_api(api='dpo', params=params)
        return df_final
    
    def get_dx(self):
        """
        Directional Movement Index(DX) identifies which direction the price is moving.
        """
        params = ''
        df_final = self._call_api(api='dx', params=params)
        return df_final
    
    def get_heikinashicandles(self):
        """
        Heikin-Ashi Candles(HEIKINASHICANDLES) translated from Japanese means "average bar". It can be used to detect market
        trends and predict future price fluctuations.
        """
        params = ''
        df_final = self._call_api(api='heikinashicandles', params=params)
        return df_final
    
    def get_ichimoku(self):
        """
        Get Ichimoku. Ichimoku Kinkō Hyō(ICHIMOKU) is a group of technical indicators that shows trend direction, momentum, and
        support & resistance levels. Overall it tends to improve the accuracy of forecasts.
        """
        params = '' # default
        df_final = self._call_api(api='ichimoku', params=params)
        return df_final
    
    def get_kama(self):
        """
        Kaufman's Adaptive Moving Average(KAMA) is a type of Moving Average(MA) that incorporates market noise and volatility.
        """
        params = ''
        df_final = self._call_api(api='kama', params=params)
        return df_final
    
    def get_keltner(self):
        """
        Keltner Channels(KELTNER) is a volatility indicator used to spot trend changes and accelerations.
        """
        params = ''
        df_final = self._call_api(api='keltner', params=params)
        return df_final

    def get_linearregslope(self):
        """
        Linear Regression Slope(LINEARREGSLOPE) calculates the slope for the linear regression trendline for each data point.
        """
        params = ''
        df_final = self._call_api(api='linearregslope', params=params)
        return df_final
    
    def get_macd_slope(self):
        """
        Moving Average Convergence Divergence Regression Slope(MACD_SLOPE) shows slopes of macd line, signal line, and
        histogram. A negative and rising slope shows improvement within a downtrend. A positive and falling slope shows
        deterioration within an uptrend. MACD has an unstable period of ~ 100.
        """
        params = ''
        df_final = self._call_api(api='macd_slope', params=params)
        return df_final
    
    def get_mfi(self):
        """
        Money Flow Index(MFI) is used to identify overbought and oversold levels in an asset. In some cases, it can be used to
        detect divergences, which might be a sign of upcoming trend changes.
        """
        params = ''
        df_final = self._call_api(api='mfi', params=params)
        return df_final
    
    def get_mom(self):
        """
        Momentum(MOM) compares the current price with the previous price N timeperiods ago.
        """
        params = ''
        df_final = self._call_api(api='mom', params=params)
        return df_final
    
    def get_obv(self):
        """
        On Balance Volume(OBV) is a momentum indicator, which uses volume flow to forecast upcoming price changes.
        """
        params = ''
        df_final = self._call_api(api='obv', params=params)
        return df_final
    
    def get_percent_b(self):
        """
        %B Indicator(PERCENT_B) measures the position of an asset price relative to upper and lower Bollinger Bands.
        """
        params = ''
        df_final = self._call_api(api='percent_b', params=params)
        return df_final
    
    def get_pivot_points_hl(self):
        """
        %B Indicator(PERCENT_B) measures the position of an asset price relative to upper and lower Bollinger Bands.
        """
        params = ''
        df_final = self._call_api(api='pivot_points_hl', params=params)
        return df_final
    
    def get_rvol(self):
        """
        Pivot Points (High/Low) (PIVOT_POINTS_HL) are typically used to foresee potential price reversals.
        """
        params = ''
        df_final = self._call_api(api='rvol', params=params)
        return df_final
    
    def get_stoch(self):
        """
        Stochastic Oscillator(STOCH) is used to decide if the price trend is strong.
        """
        params = ''
        df_final = self._call_api(api='stoch', params=params)
        return df_final
    
    def get_stochf(self):
        """
        Stochastic Fast(STOCHF) is more sensitive to price changes; therefore, it changes direction more quickly.
        """
        params = ''
        df_final = self._call_api(api='stochf', params=params)
        return df_final
    
    def get_stochrsi(self):
        """
        Stochastic RSI(STOCHRSI) as an independent indicator takes advantage of the STOCH and RSI indicators. It is used to
        determine overbought and oversold levels, as well as current market trends for an asset.
        """
        params = ''
        df_final = self._call_api(api='stochrsi', params=params)
        return df_final
    
    def get_commodities(self):
        """
        Get the price of the commodities index DJP
        """
        api_url = f'https://api.twelvedata.com/time_series?symbol=DJP&interval={self.interval}&order=ASC&start_date={self.start_dt}&end_date={self.end_dt}&apikey={self.api_key}'
        data = requests.get(api_url).json()
        df = pd.DataFrame(data['values'])
        df = df[['datetime', 'close']]
        df = df.rename(columns={'close':'djp_close'})
        df = df.reset_index().drop(['index'], axis=1)
        return df
    
    def get_sector_perf(self):
        """
        Get % increase/decrease of each sector from open
        """
        sectors = ['XLE', 'XLK', 'XLU', 'XLI', 'XLF', 'XLV', 'XLY', 'XLP', 'XLB', 'XHB', 'XME', 'XRT', 'XOP', 'XTL', 'XLC',
                   'XLRE']
        x = 0
        for i in sectors:
            x = x + 1
            api_url = f'https://api.twelvedata.com/time_series?symbol={i}&interval={self.interval}&order=ASC&start_date={self.start_dt}&end_date={self.end_dt}&apikey={self.api_key}'
            data = requests.get(api_url).json()
            df = pd.DataFrame(data['values'])
            df = df[['datetime', 'open', 'close']]
            df['open'] = pd.to_numeric(df['open'])
            df['close'] = pd.to_numeric(df['close'])
            df[f'{i}_change'] = (df['open'] - df['close'])/df['open']
            if x == 1:
                df_final = df[['datetime']]
                df_final = df_final.merge(df.drop(['open', 'close'], axis=1), on='datetime')
            else:
                df_final = df_final.merge(df.drop(['open', 'close'], axis=1), on='datetime')
        df_final = df_final.reset_index().drop(['index'], axis=1)
        return df_final
    
    def get_sector_slope(self):
        """
        Linear Regression Slope(LINEARREGSLOPE) calculates the slope for the linear regression trendline for each data point for
        each sector.
        """
        sectors = ['XLE', 'XLK', 'XLU', 'XLI', 'XLF', 'XLV', 'XLY', 'XLP', 'XLB', 'XHB', 'XME', 'XRT', 'XOP', 'XTL', 'XLC',
                   'XLRE']
        x = 0
        for i in sectors:
            x = x + 1
            api_url = f'https://api.twelvedata.com/linearregslope?symbol={i}&interval={self.interval}&order=ASC&start_date={self.start_dt}&end_date={self.end_dt}&apikey={self.api_key}'
            data = requests.get(api_url).json()
            df = pd.DataFrame(data['values'])
            df = df.rename(columns={'linearregslope':f'{i}_linearregslope'})
            if x == 1:
                df_final = df[['datetime']]
                df_final = df_final.merge(df, on='datetime')
            else:
                df_final = df_final.merge(df, on='datetime')
        df_final = df_final.reset_index().drop(['index'], axis=1)
        return df_final
    
    def get_difference_from(self, df):
        """
        Get the percent difference b/w price and each of the sma, ema, each of the sma with each other
        """
        df_out = df.copy()
        # difference between price and sma
        df_out['diff_from_sma10'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['close']) - pd.to_numeric(x['sma_10']))/pd.to_numeric(x['sma_10'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_from_sma20'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['close']) - pd.to_numeric(x['sma_20']))/pd.to_numeric(x['sma_20'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_from_sma50'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['close']) - pd.to_numeric(x['sma_50']))/pd.to_numeric(x['sma_50'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_from_sma100'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['close']) - pd.to_numeric(x['sma_100']))/pd.to_numeric(x['sma_100'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_from_sma200'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['close']) - pd.to_numeric(x['sma_200']))/pd.to_numeric(x['sma_200'])).reset_index(drop=True)).flatten().tolist()
        
        # difference between price and ema
        df_out['diff_from_ema'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['close']) - pd.to_numeric(x['ema']))/pd.to_numeric(x['ema'])).reset_index(drop=True)).flatten().tolist()
        
        # difference between each sma with each other
        df_out['diff_sma10_sma20'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_10']) - pd.to_numeric(x['sma_20']))/pd.to_numeric(x['sma_20'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma10_sma50'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_10']) - pd.to_numeric(x['sma_50']))/pd.to_numeric(x['sma_50'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma10_sma100'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_10']) - pd.to_numeric(x['sma_100']))/pd.to_numeric(x['sma_100'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma10_sma200'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_10']) - pd.to_numeric(x['sma_200']))/pd.to_numeric(x['sma_200'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma20_sma50'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_20']) - pd.to_numeric(x['sma_50']))/pd.to_numeric(x['sma_50'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma20_sma100'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_20']) - pd.to_numeric(x['sma_100']))/pd.to_numeric(x['sma_100'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma20_sma200'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_20']) - pd.to_numeric(x['sma_200']))/pd.to_numeric(x['sma_200'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma50_sma100'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_50']) - pd.to_numeric(x['sma_100']))/pd.to_numeric(x['sma_100'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma50_sma200'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_50']) - pd.to_numeric(x['sma_200']))/pd.to_numeric(x['sma_200'])).reset_index(drop=True)).flatten().tolist()
        df_out['diff_sma100_sma200'] = np.array(df_out.groupby('ticker').apply(lambda x: (pd.to_numeric(x['sma_100']) - pd.to_numeric(x['sma_200']))/pd.to_numeric(x['sma_200'])).reset_index(drop=True)).flatten().tolist()
        
        return df_out
    
    def get_gap(self, df):
        """
        Get the difference between the previous close and current open
        """
        df_out = df.copy()
        df_out['close'] = pd.to_numeric(df_out['close'])
        df_out['open'] = pd.to_numeric(df_out['open'])
        df_out['previous_close'] = df_out.groupby('ticker')['close'].shift()
        df_out['gap'] = np.array(df_out.groupby('ticker').apply(lambda x: (x['open'] - x['previous_close'])/x['previous_close']).reset_index(drop=True)).flatten().tolist()
        df_out = df_out.drop(['previous_close'], axis=1)
        return df_out
    
    def get_day_return(self, df):
        """
        Get the percent difference b/w the open and close price of the same day
        """
        df_out = df.copy()
        df_out['day_return'] = (pd.to_numeric(df_out['open']) - pd.to_numeric(df_out['close']))/pd.to_numeric(df_out['open'])
        return df_out
    
    def get_cpi(self):
        """
        Alpha Vantage - get monthly CPI
        """
        params = 'interval=monthly&'
        df_final = self._call_api_alpha(api='CPI', params=params)
        return df_final
    
    def get_inflation(self):
        """
        Alpha Vantage - get monthly inflation expectation
        """
        params = ''
        df_final = self._call_api_alpha(api='INFLATION_EXPECTATION', params=params)
        return df_final
    
    def get_treasury_yield(self):
        """
        Alpha Vantage - get daily treasury yield
        """
        params = 'interval=daily&'
        df_final = self._call_api_alpha(api='TREASURY_YIELD', params=params)
        return df_final
    
    def get_consumer_sentiment(self):
        """
        Alpha Vantage - get monthly consumer sentiment as measured by the Surveys of Consumers by University of Michigan
        (Consumer Sentiment © [UMCSENT]), retrieved from FRED, Federal Reserve Bank of St. Louis
        """
        params = ''
        df_final = self._call_api_alpha(api='CONSUMER_SENTIMENT', params=params)
        return df_final
    
    def get_interest_rate(self):
        """
        Alpha Vantage - get daily federal funds interest rate
        """
        params = 'interval=daily&'
        df_final = self._call_api_alpha(api='FEDERAL_FUNDS_RATE', params=params)
        return df_final
    
    def get_all_data(self):
        #df_final = self.get_ohlcv().merge(self.get_bb(), on=['ticker', 'datetime']).merge(
        #    self.get_ema(), on=['ticker', 'datetime']).merge(self.get_macd(), on=['ticker', 'datetime']).merge(
        #    self.get_rsi(), on=['ticker', 'datetime']).merge(self.get_atr(), on=['ticker', 'datetime']).merge(
        #    self.get_ichimoku(), on=['ticker', 'datetime']).merge(self.get_vix(), on='datetime', how='left').merge(
        #    self.get_spy(), on='datetime', how='left').merge(self.get_sector_perf(), on='datetime', how='left')
        df_final = self.get_ohlcv().merge(self.get_bb(), on=['ticker', 'datetime']).merge(
            self.get_ema(), on=['ticker', 'datetime']).merge(self.get_macd(), on=['ticker', 'datetime']).merge(
            self.get_rsi(), on=['ticker', 'datetime']).merge(self.get_atr(), on=['ticker', 'datetime']).merge(
            self.get_ichimoku(), on=['ticker', 'datetime']).merge(self.get_vix(), on='datetime', how='left').merge(
            self.get_spy(), on='datetime', how='left').merge(self.get_cpi(), on='datetime', how='left').merge(
            self.get_inflation(), on='datetime', how='left').merge(self.get_consumer_sentiment(), on='datetime', how='left')
        df_final['cpi'].interpolate(method ='pad', limit_direction ='backward', inplace=True)
        df_final['inflation_expectation'].interpolate(method ='pad', limit_direction ='backward', inplace=True)
        df_final['consumer_sentiment'].interpolate(method ='pad', limit_direction ='backward', inplace=True)
        df_final = self.get_candlesticks(df_final)
        df_final = self.get_sma(df_final, size=[10, 20, 50, 100, 200], price_col='close')
        df_final = self.get_difference_from(df_final)
        df_final = self.get_day_return(df_final)
        df_final['prior_day_return'] = df_final.groupby('ticker')['day_return'].shift(1)
        df_final['next_day_return'] = df_final.groupby('ticker')['day_return'].shift(-1)
        df_final = df_final.sort_values(['ticker', 'datetime'], ascending=[True, True])
        df_final = df_final.reset_index().drop(['index'], axis=1)
        df_final.columns = df_final.columns.str.lower()
        return df_final
        
    





In [29]:
class FeatureEngineering:
    """
    """
    def _scaler(self, X):
        transformer = RobustScaler().fit(X)
        return pd.DataFrame(transformer.transform(X))
    
    def min_max_robust_scale(self, df, feats):
        """
        Performs robust (insensitive to outliers) scaling of features by ticker symbol
        """
        df_out = df.copy()
        df_out[feats] = df_out.groupby('ticker')[feats].apply(self._scaler).reset_index(drop=True)
        return df_out
    
    def do_pca(self, df):
        pass

In [61]:
api_key = ''
api_key_alpha = ''
tickers = ['AAPL']
interval = '1day'
start_dt = '2021-01-01'
end_dt = '2021-12-31'

tsd = TimeSeriesData(api_key=api_key, api_key_alpha=api_key_alpha, tickers=tickers, interval=interval, start_dt=start_dt, end_dt=end_dt)
df = tsd.get_all_data()
df

Unnamed: 0,ticker,datetime,open,high,low,close,volume,upper_band,middle_band,lower_band,...,diff_sma10_sma200,diff_sma20_sma50,diff_sma20_sma100,diff_sma20_sma200,diff_sma50_sma100,diff_sma50_sma200,diff_sma100_sma200,day_return,prior_day_return,next_day_return
0,AAPL,2021-02-10,136.48,136.99001,134.39999,135.39,73046600,144.24874,135.273,126.29726,...,,,,,,,,0.007987,,0.005666
1,AAPL,2021-02-11,135.89999,136.39,133.77,135.13,64280000,144.23406,135.485,126.73594,...,,,,,,,,0.005666,0.007987,-0.007592
2,AAPL,2021-02-12,134.35001,135.53,133.69,135.37,60145100,144.02294,135.808,127.59306,...,,,,,,,,-0.007592,0.005666,0.016975
3,AAPL,2021-02-16,135.49001,136.00999,132.78999,133.19,80576300,143.42235,136.1105,128.79865,...,,,,,,,,0.016975,-0.007592,0.003124
4,AAPL,2021-02-17,131.25,132.22,129.47,130.84,97918500,142.9852,136.261,129.5368,...,,,,,,,,0.003124,0.016975,-0.003947
5,AAPL,2021-02-18,129.2,130.0,127.41,129.71001,96856700,143.22765,136.145,129.06235,...,,,,,,,,-0.003947,0.003124,0.002841
6,AAPL,2021-02-19,130.24001,130.71001,128.8,129.87,87668800,143.37418,135.795,128.21583,...,,,,,,,,0.002841,-0.003947,0.002841
7,AAPL,2021-02-19,130.24001,130.71001,128.8,129.87,87668800,143.37418,135.795,128.21583,...,,,,,,,,0.002841,0.002841,0.015702
8,AAPL,2021-02-22,128.00999,129.72,125.6,126.0,103916400,143.67256,135.1415,126.61044,...,,,,,,,,0.015702,0.002841,-0.016968
9,AAPL,2021-02-23,123.76,126.71,118.39,125.86,158273000,142.94856,134.2885,125.62844,...,,,,,,,,-0.016968,0.015702,-0.003282


In [56]:
df.columns.tolist()

['ticker',
 'datetime',
 'open',
 'high',
 'low',
 'close',
 'volume',
 'upper_band',
 'middle_band',
 'lower_band',
 'ema',
 'macd',
 'macd_signal',
 'macd_hist',
 'rsi',
 'atr',
 'tenkan_sen',
 'kijun_sen',
 'senkou_span_a',
 'senkou_span_b',
 'chikou_span',
 'vix_close',
 'spy_close',
 'cpi',
 'inflation_expectation',
 'consumer_sentiment',
 'cdl2crows',
 'cdl3blackcrows',
 'cdl3inside',
 'cdl3linestrike',
 'cdl3outside',
 'cdl3starsinsouth',
 'cdl3whitesoldiers',
 'cdlabandonedbaby',
 'cdladvanceblock',
 'cdlbelthold',
 'cdlbreakaway',
 'cdlclosingmarubozu',
 'cdlconcealbabyswall',
 'cdlcounterattack',
 'cdldarkcloudcover',
 'cdldoji',
 'cdldojistar',
 'cdldragonflydoji',
 'cdlengulfing',
 'cdleveningdojistar',
 'cdleveningstar',
 'cdlgapsidesidewhite',
 'cdlgravestonedoji',
 'cdlhammer',
 'cdlhangingman',
 'cdlharami',
 'cdlharamicross',
 'cdlhighwave',
 'cdlhikkake',
 'cdlhikkakemod',
 'cdlhomingpigeon',
 'cdlidentical3crows',
 'cdlinneck',
 'cdlinvertedhammer',
 'cdlkicking',
 '

In [3]:
api_key = '4513da89573e4b9d87fce1afb935b1f0'
tickers = ['AAPL']
interval = '1day'
start_dt = '2021-01-01'
end_dt = '2021-12-31'
#api_url = f'https://api.twelvedata.com/earnings?symbol=AAPL&interval={interval}&order=ASC&start_date={start_dt}&end_date={end_dt}&apikey={api_key}'
api_url = f'https://api.twelvedata.com/earnings?symbol=AAPL&order=ASC&start_date={start_dt}&end_date={end_dt}&apikey={api_key}'
data = requests.get(api_url).json()
df = pd.DataFrame(data['values'])
df['ticker'] = 'AAPL'
df

KeyError: 'values'

In [104]:
tsd = TimeSeriesData(api_key=api_key, tickers=tickers, interval=interval, start_dt=start_dt, end_dt=end_dt)
chk = tsd.get_gap(df)
chk

Unnamed: 0,datetime,open,high,low,close,volume,ticker,gap
0,2021-01-04,133.52,133.61,126.76,129.41,143301900,AAPL,
1,2021-01-05,128.89,131.74001,128.42999,131.00999,97664900,AAPL,-0.004018
2,2021-01-06,127.72,131.05,126.38,126.6,155088000,AAPL,-0.025113
3,2021-01-07,128.36,131.63,127.86,130.92,109578200,AAPL,0.013902
4,2021-01-08,132.42999,132.63,130.23,132.05,105158200,AAPL,0.011534
5,2021-01-11,129.19,130.17,128.5,128.98,100384500,AAPL,-0.021658
6,2021-01-12,128.5,129.69,126.86,128.8,91951100,AAPL,-0.003722
7,2021-01-13,128.75999,131.45,128.49001,130.89,88636800,AAPL,-0.000311
8,2021-01-14,130.8,131.0,128.75999,128.91,90221800,AAPL,-0.000688
9,2021-01-15,128.78,130.22,127.0,127.14,111598500,AAPL,-0.001008


In [11]:
pd.DataFrame(data['values'])

Unnamed: 0,datetime,open,high,low,close,volume
0,2021-01-04,36.64,36.71,35.37,35.38,4967013
1,2021-01-05,35.4,35.68,35.31,35.34,4432400
2,2021-01-06,35.35,35.395,34.97,35.17,7383479
3,2021-01-07,35.25,35.4,35.015,35.28,3433837
4,2021-01-08,35.37,35.77,35.32,35.66,3419871
5,2021-01-11,35.47,35.63,34.955,35.06,4050012
6,2021-01-12,34.99,35.13,34.65,35.09,4372475
7,2021-01-13,35.07,35.63,35.07,35.57,3024381
8,2021-01-14,35.67,36.02,35.5,35.79,3792654
9,2021-01-15,35.71,36.4061,35.63,36.34,4212957
