Michael Ricardo DS 2500 Project Trading torch

In [602]:
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import numpy as np
import pandas_ta as ta
from sklearn.model_selection import train_test_split

In [603]:
def fetch_ticker_data(ticker):
    """
    Gets the market data for a given date and ticker.
    Fetches from yfinance library.

    Args:
        years ago (int): Representing the most recent day for stock entries 
        ticker(str): Representing the offical company stock ticker
    Returns
        df (DataFrame): DataFrame with Stock pricing data and history data
    """
    most_recent = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    start_date = '2014-12-01'
    
    
    stock_data = yf.Ticker(ticker)
    returning_data = stock_data.history(start = start_date, end= most_recent)

    return returning_data

In [604]:
'''
#13 week, 5 year, and 10 year indexes, will be going by their tickers for this project
tickers = ['^IRX', '^FVX', '^TNX']
df_master_bonds = pd.DataFrame()
for ticker in tickers:
    data = fetch_ticker_data(ticker)
    data = data.drop(columns=['Open', 'Dividends', 'Stock Splits', 'Volume'])
    data.index = pd.to_datetime(data.index).normalize()
    df_master_bonds[f'High_{ticker}'] = data['High']
    df_master_bonds[f'Low_{ticker}'] = data['Low']
    df_master_bonds[f'Close_{ticker}'] = data['Close']
'''

"\n#13 week, 5 year, and 10 year indexes, will be going by their tickers for this project\ntickers = ['^IRX', '^FVX', '^TNX']\ndf_master_bonds = pd.DataFrame()\nfor ticker in tickers:\n    data = fetch_ticker_data(ticker)\n    data = data.drop(columns=['Open', 'Dividends', 'Stock Splits', 'Volume'])\n    data.index = pd.to_datetime(data.index).normalize()\n    df_master_bonds[f'High_{ticker}'] = data['High']\n    df_master_bonds[f'Low_{ticker}'] = data['Low']\n    df_master_bonds[f'Close_{ticker}'] = data['Close']\n"

In [605]:
#Data for the IRX, Thirteen_Week
ticker = '^IRX'
Thirteen_Week = fetch_ticker_data(ticker)
Thirteen_Week = Thirteen_Week.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume', 'High', 'Low'])
Thirteen_Week.index = pd.to_datetime(Thirteen_Week.index).date
Thirteen_Week = Thirteen_Week.rename(columns={'Close': '13_week_close'})
Thirteen_Week


Unnamed: 0,13_week_close
2014-12-01,0.007
2014-12-02,0.015
2014-12-03,0.005
2014-12-04,0.013
2014-12-05,0.010
...,...
2024-11-22,4.415
2024-11-25,4.405
2024-11-26,4.395
2024-11-27,4.385


In [606]:
#Data for the FVX, Five Year
ticker = '^FVX'
Five_Year = fetch_ticker_data(ticker)
Five_Year = Five_Year.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume', 'High', 'Low'])
Five_Year.index = pd.to_datetime(Five_Year.index).date
Five_Year = Five_Year.rename(columns={'Close': '5_year_close'})

In [607]:
#Data for the TNX, Ten Year
ticker = '^TNX'
Ten_Year = fetch_ticker_data(ticker)
Ten_Year = Ten_Year.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume'])
Ten_Year.index = pd.to_datetime(Ten_Year.index).date
Ten_Year = Ten_Year.rename(columns={'High': '10_year_high', 'Low': '10_year_low', 'Close': '10_year_close'})

Adding variables not related to security itself: 

In [608]:
#Data for the S&P 500 & VIX, these will be used as variables 
ticker = '^GSPC'
SP500 = fetch_ticker_data(ticker)
SP500 = SP500.drop(columns=['Open', 'High', 'Low', 'Dividends', 'Stock Splits', 'Volume'])
SP500.index = pd.to_datetime(SP500.index).date
SP500 = SP500.rename(columns={'Close': 'S&P500_close'})

ticker = '^VIX'
VIX = fetch_ticker_data(ticker)
VIX = VIX.drop(columns=['Open', 'High', 'Low', 'Dividends', 'Stock Splits', 'Volume'])
VIX.index = pd.to_datetime(VIX.index).date
VIX = VIX.rename(columns={'Close': 'VIX_close'})

In [609]:
def external_csv(csv):
    '''
    Pulling in a csv, setting the 'Date column as the index and making it a proper datetime object
    Args:
        csv (CSV) - Data collection with Date column along with pricing data 
    Returns
        df (DataFrame): DataFrame     
    '''
    csv_read = pd.read_csv(csv)
    csv_read['Date'] = csv_read['Date'].str.replace('/', '-')
    csv_read['Date'] = pd.to_datetime(csv_read['Date'])
    csv_read = csv_read.set_index('Date')

    return csv_read

In [610]:
#CPI data CSV
cpi_csv = 'CPI_Data_10.csv'
cpi = external_csv(cpi_csv)
cpi = cpi.rename(columns={'Close': 'CIP_close'})

In [611]:
#Data for the 1 year Treasury Bill (Pulled in via CSV)
one_year_csv = '1year_bond_master.csv'
One_Year = external_csv(one_year_csv)
One_Year = One_Year.drop(columns='Open')

In [612]:
#Further cleaning needed for the 1 year, flipping the rows, eliminating % sign within each
One_Year = One_Year.iloc[::-1] 
One_Year = One_Year.drop(columns=['High', 'Low'])
One_Year['Close'] = One_Year['Close'].str.replace('%', '')
    
One_Year = One_Year.rename(columns={'Close': '1_year_close'})
One_Year

Unnamed: 0_level_0,1_year_close
Date,Unnamed: 1_level_1
2014-12-01,0.12
2014-12-02,0.12
2014-12-03,0.13
2014-12-04,0.13
2014-12-05,0.15
...,...
2024-11-20,4.37
2024-11-21,4.39
2024-11-22,4.41
2024-11-25,4.37


Section will be for data curration and preperation for regression 

In [613]:
def security_editor(df, close_col, high_col,low_col):
    """
    Gaining volatility and moving index metrics from the bond, bill, or note functioning as df. Yield Volatilioty represents the historical volatility for an asset over a given
    time though its Standerd deviation. Simple moving average measures the average closing price for a week & month of an asset. 

    Args:
        df (dataframe): Specific asset df imported with Close, Volume, High, and Low data for each day of market activity 
    Returns
        df(dataframe): Added ATR. SMA_7 and SMA_30 cols for given securuity. Removing high and low cols
    """
    df['Yield_STDEV'] = ta.stdev(df[close_col], length = 14)
    df = df.drop(columns=[high_col, low_col])
 
    df['SMA_7'] = ta.sma(df[close_col], length=10) 
    df['SMA_30'] = ta.sma(df[close_col], length=50)

    return df

In [614]:
df = Ten_Year
close_col = '10_year_close'
high_col = '10_year_high'
low_col = '10_year_low'
Ten_Year_Complete = security_editor(Ten_Year, close_col, high_col, low_col)
print(Ten_Year)

            10_year_high  10_year_low  10_year_close  Yield_STDEV
2014-12-01         2.218        2.155          2.218          NaN
2014-12-02         2.289        2.243          2.285          NaN
2014-12-03         2.307        2.278          2.287          NaN
2014-12-04         2.298        2.253          2.257          NaN
2014-12-05         2.331        2.252          2.307          NaN
...                  ...          ...            ...          ...
2024-11-22         4.430        4.383          4.410     0.054405
2024-11-25         4.332        4.261          4.265     0.058041
2024-11-26         4.324        4.281          4.302     0.060994
2024-11-27         4.275        4.227          4.242     0.070597
2024-11-29         4.225        4.172          4.178     0.086214

[2516 rows x 4 columns]


In [615]:

def combine(df_first, dfs):
    """
    Combining the data from other indexes to be used as variabels
    Args:
        df_main (dataframe): Main secuirty which will have the others cols added
        df_1 (dataframe): Sub index which will be added to df_main, will be used for S&P 500
        df_2 (dataframe): Sub index which will be added to df_main, will be used for VIx 
    Returns:
        df_mained: Combined df_main with df1 & df2
    """
    for df in dfs:
        df_first = pd.concat([df_first, df], axis=1)
    return df_first


In [616]:
#merging the dfs from above into the 10 year bond
dfs = [SP500, VIX, Five_Year, cpi, One_Year]
df_first = Ten_Year_Complete
final_df = combine(df_first, dfs)
final_df = final_df.dropna(subset=['10_year_close' , 'SMA_7', 'SMA_30', '10_year_close', 'S&P500_close', '1_year_close'])
final_df

Unnamed: 0,10_year_close,Yield_STDEV,SMA_7,SMA_30,S&P500_close,VIX_close,5_year_close,CIP_close,1_year_close
2015-02-11,1.988,0.106079,1.8356,2.01794,2068.530029,16.959999,1.514,,0.23
2015-02-12,1.986,0.114389,1.8591,2.01330,2088.479980,15.340000,1.497,,0.23
2015-02-13,2.021,0.124403,1.8937,2.00802,2096.989990,14.690000,1.516,,0.23
2015-02-17,2.145,0.146688,1.9409,2.00518,2100.340088,15.800000,1.619,,0.22
2015-02-18,2.066,0.148306,1.9695,2.00136,2099.679932,15.450000,1.520,,0.21
...,...,...,...,...,...,...,...,...,...
2024-11-20,4.406,0.056236,4.3883,4.07498,5917.109863,17.160000,4.275,,4.37
2024-11-21,4.432,0.057928,4.3974,4.09002,5948.709961,16.870001,4.305,,4.39
2024-11-22,4.410,0.054405,4.4078,4.10522,5969.339844,15.240000,4.298,,4.41
2024-11-25,4.265,0.058041,4.4035,4.11810,5987.370117,14.600000,4.172,,4.37


Regressions:

In [617]:
def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")
    return Xnew

In [618]:
from sklearn.metrics import mean_squared_error, r2_score
def linreg_predict(features, dependent, df):
    """Predicts vals for a given array, and returns dct of mse and r^2
    Args:
        Xnew: (an array, either 1-d or 2-d which includes all the $p$ predictor features, not including bias term)
        ynew: (a 1-d array which includes all corresponding response values to `Xnew`)
        m: (a 1-d array of length $p+1$ which contains the coefficients from the `line_of_best_fit` function)
    Returns:
        dct: dictionary, dct of mse and r^2 vals
    """

    features_df = df[features]
    target_series = df[dependent]
    X_train, X_test, y_train, y_test = train_test_split(features_df, target_series, random_state=30, test_size = 0.3)
    linear_regression_model = LinearRegression()
    linear_regression_model.fit(X_train, y_train)
    
    
    dct = {}
    
    
    dct["ypreds"] = linear_regression_model.predict(X_test)
    dct["resids"] = y_test - dct["ypreds"]
    dct["mse"] = mean_squared_error(y_test, dct["ypreds"])
    dct["r2"] = r2_score(y_test, dct["ypreds"])

    return dct

In [619]:
m = linreg_predict(features=['S&P500_close', 'VIX_close', '1_year_close', 'Yield_STDEV'], dependent='10_year_close', df=final_df)

In [620]:
m

{'ypreds': array([3.6353012 , 1.88123544, 1.33609753, 1.42054326, 1.4500577 ,
        2.74552262, 2.30150827, 2.72002148, 2.80628704, 3.07815151,
        1.43568223, 4.36560813, 2.56287935, 1.43469046, 2.75992583,
        1.25417467, 2.8921958 , 1.9280435 , 2.32435489, 4.16646449,
        2.84658799, 2.0146929 , 4.19697044, 1.62637254, 1.79561785,
        1.82704043, 1.22425759, 1.41996607, 1.70457835, 2.93054772,
        1.30994849, 1.79022626, 2.56695658, 1.92665389, 1.60749403,
        1.70153593, 4.29250457, 2.90544148, 2.83181586, 2.80972692,
        2.31442246, 1.84008474, 2.2889507 , 2.39336757, 2.22258515,
        1.93389155, 2.28847499, 1.80007605, 2.2242807 , 1.44482573,
        1.37894458, 1.3288882 , 4.07145102, 2.39741002, 2.52942525,
        1.25651544, 1.44202456, 1.26079208, 1.91361269, 4.18725601,
        4.04538321, 1.44196776, 1.87128916, 1.47612173, 2.80418805,
        4.19092008, 1.41229848, 4.03370269, 1.38610791, 4.44248509,
        2.66048958, 4.15575134, 2.8795

In [None]:
def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")

    return Xnew

In [None]:
def line_of_best_fit(X, y):
    '''
    This function computes the line of best fit by providing the coefficients for the slope(s) 
    and intercept using the Ordinary Least Squares (OLS) method. It achieves this by projecting 
    the values of y onto the span of X (the space defined by the independent variables).
    
    X represents out input data, a numpy array, and acts as out indepedet variable. y represents the data we 
    are trying to predict, a numpy array of the same length as X, and acts as our as a dependent variable.
    
    Output = Coefficants as an array, first elm being the intercept accounted for with the 'add_bias_column' 
    and second being the intercept. Can be summarized as the line of best foit for predicted y values on X.
    '''
    proper_X = add_bias_column(X)
    
    vector = np.matmul(np.linalg.inv(np.matmul(proper_X.T, proper_X)), np.matmul(proper_X.T, y))
       
    return vector

In [None]:
def linreg_predict(Xnew, ynew, m):
    '''
    This function is used to gage the predicted values, residual values, MSE (Mean Squared Error), and r2_score to analysze a
    line of best fit projecttion,
    
    Xnew is either a 1D or 2D array which is the data used as 'predictor fatures' in predicitng y. Does not contain a bias column
    Ynew is a 1d array and can be summrized as the values given by multiple Xnew and m
    M is a 1d arrat represented as the found coeffeicint line of best fit values for the given data within the lin_of_best function
    
    Output = dct with ypreds, resids, mse, and r2 as a measurement of the current predicted values of y compared to the true
    values. 
    
    '''
    proper_X = add_bias_column(Xnew)
    
    ypreds = np.matmul(proper_X, m)
    
    resids = ynew - ypreds
    
    mse = np.mean(resids**2)
    
    r2 = r2_score(ynew, ypreds)
    
    final_dct = {
        'ypreds': ypreds,
        'resids': resids,
        'mse': mse,
        'r2': r2        
    }
    
    
    return final_dct

In [None]:
mercy_x

In [None]:
crossval = train_test_split(mercy_x, 
                            mercy_y,
                            test_size=0.3)

Xtrain, Xtest, ytrain, ytest = crossval