Michael Ricardo DS 2500 Project Trading torch

In [3]:
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import numpy as np
import pandas_ta as ta

In [4]:
def fetch_ticker_data(ticker, years_ago= 10):
    """
    Gets the market data for a given date and ticker.
    Fetches from yfinance library.

    Args:
        years ago (int): Representing the most recent day for stock entries 
        ticker(str): Representing the offical company stock ticker
    Returns
        df (DataFrame): DataFrame with Stock pricing data and history data
    """
    most_recent = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    start_date = '2014-12-01'
    
    
    stock_data = yf.Ticker(ticker)
    returning_data = stock_data.history(start = start_date, end= most_recent)

    return returning_data

In [5]:
'''
#13 week, 5 year, and 10 year indexes, will be going by their tickers for this project
tickers = ['^IRX', '^FVX', '^TNX']
df_master_bonds = pd.DataFrame()
for ticker in tickers:
    data = fetch_ticker_data(ticker)
    data = data.drop(columns=['Open', 'Dividends', 'Stock Splits', 'Volume'])
    data.index = pd.to_datetime(data.index).normalize()
    df_master_bonds[f'High_{ticker}'] = data['High']
    df_master_bonds[f'Low_{ticker}'] = data['Low']
    df_master_bonds[f'Close_{ticker}'] = data['Close']
'''

"\n#13 week, 5 year, and 10 year indexes, will be going by their tickers for this project\ntickers = ['^IRX', '^FVX', '^TNX']\ndf_master_bonds = pd.DataFrame()\nfor ticker in tickers:\n    data = fetch_ticker_data(ticker)\n    data = data.drop(columns=['Open', 'Dividends', 'Stock Splits', 'Volume'])\n    data.index = pd.to_datetime(data.index).normalize()\n    df_master_bonds[f'High_{ticker}'] = data['High']\n    df_master_bonds[f'Low_{ticker}'] = data['Low']\n    df_master_bonds[f'Close_{ticker}'] = data['Close']\n"

In [6]:
#Data for the IRX, Thirteen_Week
ticker = '^IRX'
Thirteen_Week = fetch_ticker_data(ticker)
Thirteen_Week = Thirteen_Week.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume', 'High', 'Low'])
Thirteen_Week.index = pd.to_datetime(Thirteen_Week.index).date
Thirteen_Week = Thirteen_Week.rename(columns={'Close': '13_week_close'})
Thirteen_Week


Unnamed: 0,13_week_close
2014-12-01,0.007
2014-12-02,0.015
2014-12-03,0.005
2014-12-04,0.013
2014-12-05,0.010
...,...
2024-11-21,4.413
2024-11-22,4.415
2024-11-25,4.405
2024-11-26,4.395


In [7]:
#Data for the FVX, Five Year
ticker = '^FVX'
Five_Year = fetch_ticker_data(ticker)
Five_Year = Five_Year.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume'])
Five_Year.index = pd.to_datetime(Five_Year.index).date
Five_Year = Five_Year.rename(columns={'High': '5_year_high', 'Low': '5_year_low', 'Close': '5_year_close'})

In [8]:
#Data for the TNX, Ten Year
ticker = '^TNX'
Ten_Year = fetch_ticker_data(ticker)
Ten_Year = Ten_Year.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume'])
Ten_Year.index = pd.to_datetime(Ten_Year.index).date
Ten_Year = Ten_Year.rename(columns={'High': '10_year_high', 'Low': '10_year_low', 'Close': '10_year_close'})

Adding variables not related to security itself: 

In [9]:
#Data for the S&P 500 & VIX, these will be used as variables 
ticker = '^GSPC'
SP500 = fetch_ticker_data(ticker)
SP500 = SP500.drop(columns=['Open', 'High', 'Low', 'Dividends', 'Stock Splits', 'Volume'])
SP500.index = pd.to_datetime(SP500.index).date
SP500 = SP500.rename(columns={'Close': 'S&P500_close'})

ticker = '^VIX'
VIX = fetch_ticker_data(ticker)
VIX = VIX.drop(columns=['Open', 'High', 'Low', 'Dividends', 'Stock Splits', 'Volume'])
VIX.index = pd.to_datetime(VIX.index).date
VIX = VIX.rename(columns={'Close': 'VIX_close'})

In [10]:
def external_csv(csv):
    '''
    Pulling in a csv, setting the 'Date column as the index and making it a proper datetime object
    Args:
        csv (CSV) - Data collection with Date column along with pricing data 
    Returns
        df (DataFrame): DataFrame     
    '''
    csv_read = pd.read_csv(csv)
    csv_read['Date'] = csv_read['Date'].str.replace('/', '-')
    csv_read['Date'] = pd.to_datetime(csv_read['Date'])
    csv_read = csv_read.set_index('Date')

    return csv_read

In [11]:
#CPI data CSV
cpi_csv = 'CPI_Data_10.csv'
cpi = external_csv(cpi_csv)
cpi = cpi.rename(columns={'Close': 'CIP_close'})

In [12]:
#Data for the 1 year Treasury Bill (Pulled in via CSV)
one_year_csv = '1year_bond_master.csv'
One_Year = external_csv(one_year_csv)
One_Year = One_Year.drop(columns='Open')

In [13]:
#Further cleaning needed for the 1 year, flipping the rows, eliminating % sign within each
One_Year = One_Year.iloc[::-1] 

titles = ['High', 'Low','Close']
for title in titles:
    One_Year[title] = One_Year[title].str.replace('%', '')

One_Year = One_Year.rename(columns={'High': '1_year_high', 'Low': '1_year_low', 'Close': '1_year_close'})
One_Year

Unnamed: 0_level_0,1_year_high,1_year_low,1_year_close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2014-12-01,0.13,0.11,0.12
2014-12-02,0.14,0.11,0.12
2014-12-03,0.14,0.11,0.13
2014-12-04,0.14,0.12,0.13
2014-12-05,0.16,0.12,0.15
...,...,...,...
2024-11-20,4.37,4.29,4.37
2024-11-21,4.39,4.34,4.39
2024-11-22,4.41,4.36,4.41
2024-11-25,4.41,4.36,4.37


Section will be for data curration and preperation for regression 

In [14]:
def security_editor(df, close_col, high_col,low_col):
    """
    Gaining volatility and moving index metrics from the bond, bill, or note functioning as df. Yield Volatilioty represents the historical volatility for an asset over a given
    time though its Standerd deviation. Simple moving average measures the average closing price for a week & month of an asset. 

    Args:
        df (dataframe): Specific asset df imported with Close, Volume, High, and Low data for each day of market activity 
    Returns
        df(dataframe): Added ATR. SMA_7 and SMA_30 cols for given securuity. Removing high and low cols
    """
    df['Yield_STDEV'] = ta.stdev(df[close_col], length = 14)
    df = df.drop(columns=[high_col, low_col])
 
    df['SMA_7'] = ta.sma(df[close_col], length=10) 
    df['SMA_30'] = ta.sma(df[close_col], length=50)

    return df

In [15]:
df = Ten_Year
close_col = '10_year_close'
high_col = '10_year_high'
low_col = '10_year_low'
Ten_Year_Complete = security_editor(Ten_Year, close_col, high_col, low_col)
print(Ten_Year)

            10_year_high  10_year_low  10_year_close  Yield_STDEV
2014-12-01         2.218        2.155          2.218          NaN
2014-12-02         2.289        2.243          2.285          NaN
2014-12-03         2.307        2.278          2.287          NaN
2014-12-04         2.298        2.253          2.257          NaN
2014-12-05         2.331        2.252          2.307          NaN
...                  ...          ...            ...          ...
2024-11-21         4.438        4.377          4.432     0.057928
2024-11-22         4.430        4.383          4.410     0.054405
2024-11-25         4.332        4.261          4.265     0.058041
2024-11-26         4.324        4.281          4.302     0.060994
2024-11-27         4.275        4.227          4.242     0.070597

[2515 rows x 4 columns]


In [16]:

def combine(df_first, dfs):
    """
    Combining the data from other indexes to be used as variabels
    Args:
        df_main (dataframe): Main secuirty which will have the others cols added
        df_1 (dataframe): Sub index which will be added to df_main, will be used for S&P 500
        df_2 (dataframe): Sub index which will be added to df_main, will be used for VIx 
    Returns:
        df_mained: Combined df_main with df1 & df2
    """
    for df in dfs:
        df_first = pd.concat([df_first, df], axis=1)
    return df_first


In [17]:
#merging the dfs from above into the 10 year bond
dfs = [SP500, VIX, Five_Year, cpi, One_Year]
df_first = Ten_Year_Complete
final_df = combine(df_first, dfs)
final_df = final_df.dropna(subset=['10_year_close', '5_year_close'])
final_df

Unnamed: 0,10_year_close,Yield_STDEV,SMA_7,SMA_30,S&P500_close,VIX_close,5_year_high,5_year_low,5_year_close,CIP_close,1_year_high,1_year_low,1_year_close
2014-12-01,2.218,,,,2053.439941,14.160000,1.521,1.454,1.521,236.252,0.13,0.11,0.12
2014-12-02,2.285,,,,2066.550049,12.850000,1.591,1.554,1.590,,0.14,0.11,0.12
2014-12-03,2.287,,,,2074.330078,12.500000,1.621,1.585,1.608,,0.14,0.11,0.13
2014-12-04,2.257,,,,2071.919922,12.380000,1.623,1.579,1.587,,0.14,0.12,0.13
2014-12-05,2.307,,,,2075.370117,11.890000,1.697,1.589,1.682,,0.16,0.12,0.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-21,4.432,0.057928,4.3974,4.09002,5948.709961,16.870001,4.314,4.243,4.305,,4.39,4.34,4.39
2024-11-22,4.410,0.054405,4.4078,4.10522,5969.339844,15.240000,4.311,4.264,4.298,,4.41,4.36,4.41
2024-11-25,4.265,0.058041,4.4035,4.11810,5987.370117,14.600000,4.231,4.170,4.172,,4.41,4.36,4.37
2024-11-26,4.302,0.060994,4.3905,4.13130,6021.629883,14.100000,4.224,4.172,4.190,,4.40,4.34,4.36


In [18]:
def line_of_best_fit(X, y):
    """ 
    Finds line of best fit based off of a set of vectors
    Args:
        X (array): can be either 1-d or 2-d
        Y (array): can be either 1-d or 2-d
    Returns:
        p (array): 1d array, giving line and slope
    """

    if X.ndim == 1:
        X = add_bias_column(X).T
    else:
        X = X.T
    XTXinv = np.linalg.inv(np.matmul(X, X.T))
    p = np.matmul(XTXinv, np.matmul(X, y))
    return p

    

Regressions:

In [None]:
def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")
    return Xnew

In [None]:
from sklearn.metrics import r2_score
def linreg_predict(Xnew, ynew, m):
    """Predicts vals for a given array, and returns dct of mse and r^2
    Args:
        Xnew: (an array, either 1-d or 2-d which includes all the $p$ predictor features, not including bias term)
        ynew: (a 1-d array which includes all corresponding response values to `Xnew`)
        m: (a 1-d array of length $p+1$ which contains the coefficients from the `line_of_best_fit` function)
    Returns:
        dct: dictionary, dct of mse and r^2 vals
    """

    dct = {}
    
    
    dct["ypreds"] = np.matmul(Xnew, m)
    dct["resids"] = ynew  - dct["ypreds"] 
    dct["mse"] = (dct["resids"] ** 2).mean()
    dct["r2"] = r2_score(ynew, dct["ypreds"])


In [None]:
# normalizing all X var columns
