Michael Ricardo DS 2500 Project Trading torch

In [159]:
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta
import yfinance as yf
import pandas as pd
import numpy as np
import pandas_ta as ta
from sklearn.model_selection import train_test_split

In [160]:
def fetch_ticker_data(ticker):
    """
    Gets the market data for a given date and ticker.
    Fetches from yfinance library.

    Args:
        years ago (int): Representing the most recent day for stock entries 
        ticker(str): Representing the offical company stock ticker
    Returns
        df (DataFrame): DataFrame with Stock pricing data and history data
    """
    most_recent = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
    start_date = '2014-12-01'
    
    
    stock_data = yf.Ticker(ticker)
    returning_data = stock_data.history(start = start_date, end= most_recent)
    
    returning_data.index = returning_data.index.tz_localize(None).normalize()

    return returning_data

In [161]:
#Data for the IRX, Thirteen_Week
ticker = '^IRX'
Thirteen_Week = fetch_ticker_data(ticker)
Thirteen_Week = Thirteen_Week.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume', 'High', 'Low'])
Thirteen_Week = Thirteen_Week.rename(columns={'Close': '13_week_close'})

In [162]:
#Data for the FVX, Five Year
ticker = '^FVX'
Five_Year = fetch_ticker_data(ticker)
Five_Year = Five_Year.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume', 'High', 'Low'])
Five_Year = Five_Year.rename(columns={'Close': '5_year_close'})



In [163]:
#Data for the TNX, Ten Year
ticker = '^TNX'
Ten_Year = fetch_ticker_data(ticker)
Ten_Year = Ten_Year.drop(columns=['Open','Dividends', 'Stock Splits', 'Volume'])
Ten_Year = Ten_Year.rename(columns={'High': '10_year_high', 'Low': '10_year_low', 'Close': '10_year_close'})

Adding variables not related to security itself: 

In [164]:
#Data for the S&P 500 & VIX, these will be used as variables 
ticker = '^GSPC'
SP500 = fetch_ticker_data(ticker)
SP500 = SP500.drop(columns=['Open', 'High', 'Low', 'Dividends', 'Stock Splits', 'Volume'])
SP500 = SP500.rename(columns={'Close': 'S&P500_close'})

ticker = '^VIX'
VIX = fetch_ticker_data(ticker)
VIX = VIX.drop(columns=['Open', 'High', 'Low', 'Dividends', 'Stock Splits', 'Volume'])
VIX = VIX.rename(columns={'Close': 'VIX_close'})


In [165]:
def external_csv(csv):
    '''
    Pulling in a csv, setting the 'Date column as the index and making it a proper datetime object
    Args:
        csv (CSV) - Data collection with Date column along with pricing data 
    Returns
        df (DataFrame): DataFrame     
    '''
    csv_read = pd.read_csv(csv)
    
    return csv_read

In [166]:
#CPI data CSV
cpi_csv = 'CPI_Data_10.csv'
cpi = external_csv(cpi_csv)
cpi = cpi.rename(columns={'Close': 'CIP_close'})
cpi = cpi.set_index('Date')

In [167]:
one_year_csv = '1year_bond_master.csv'
One_Year = external_csv(one_year_csv)
One_Year = One_Year.drop(columns=['Open', 'High', 'Low'])

In [168]:
#Further cleaning needed for the 1 year, flipping the rows, eliminating % sign within each
One_Year = One_Year.iloc[::-1] 
One_Year['Close'] = One_Year['Close'].str.rstrip('%').astype(float)

One_Year = One_Year.rename(columns={'Close': '1_year_close'})
One_Year = One_Year.set_index('Date')

Section will be for data curration and preperation for regression 

In [169]:
def security_editor(df, close_col, high_col,low_col):
    """
    Gaining volatility and moving index metrics from the bond, bill, or note functioning as df. Yield Volatilioty represents the historical volatility for an asset over a given
    time though its Standerd deviation. Simple moving average measures the average closing price for a week & month of an asset. 

    Args:
        df (dataframe): Specific asset df imported with Close, Volume, High, and Low data for each day of market activity 
    Returns
        df(dataframe): Added ATR. SMA_7 and SMA_30 cols for given securuity. Removing high and low cols
    """
    df['Yield_STDEV'] = ta.stdev(df[close_col], length = 14)
    df = df.drop(columns=[high_col, low_col])
 
    df['SMA_7'] = ta.sma(df[close_col], length=10) 
    df['SMA_30'] = ta.sma(df[close_col], length=50)

    return df

In [170]:
df = Ten_Year
close_col = '10_year_close'
high_col = '10_year_high'
low_col = '10_year_low'
Ten_Year_Complete = security_editor(Ten_Year, close_col, high_col, low_col)
print(Ten_Year)

            10_year_high  10_year_low  10_year_close  Yield_STDEV
Date                                                             
2014-12-01         2.218        2.155          2.218          NaN
2014-12-02         2.289        2.243          2.285          NaN
2014-12-03         2.307        2.278          2.287          NaN
2014-12-04         2.298        2.253          2.257          NaN
2014-12-05         2.331        2.252          2.307          NaN
...                  ...          ...            ...          ...
2024-11-22         4.430        4.383          4.410     0.054405
2024-11-25         4.332        4.261          4.265     0.058041
2024-11-26         4.324        4.281          4.302     0.060994
2024-11-27         4.275        4.227          4.242     0.070597
2024-11-29         4.225        4.172          4.178     0.086214

[2516 rows x 4 columns]


In [171]:

def combine(df_first, dfs):
    """
    Combining the data from other indexes to be used as variabels
    Args:
        df_main (dataframe): Main secuirty which will have the others cols added
        df_1 (dataframe): Sub index which will be added to df_main, will be used for S&P 500
        df_2 (dataframe): Sub index which will be added to df_main, will be used for VIx 
    Returns:
        df_mained: Combined df_main with df1 & df2
    """
    for df in dfs:
        # Ensure indices are aligned and consistent
        df.index = pd.to_datetime(df.index)
        df_first = pd.concat([df_first, df], axis=1)
    return df_first


In [172]:
One_Year.index

Index(['12/1/2014', '12/2/2014', '12/3/2014', '12/4/2014', '12/5/2014',
       '12/8/2014', '12/9/2014', '12/10/2014', '12/11/2014', '12/12/2014',
       ...
       '11/13/2024', '11/14/2024', '11/15/2024', '11/18/2024', '11/19/2024',
       '11/20/2024', '11/21/2024', '11/22/2024', '11/25/2024', '11/26/2024'],
      dtype='object', name='Date', length=2529)

In [189]:
#merging the dfs from above into the 10 year bond
dfs = [SP500, VIX, Five_Year, One_Year, cpi]
df_first = Ten_Year_Complete
final_df = combine(df_first, dfs)
final_df
final_df.to_csv('Imrpvoed.csv', index=False)

In [174]:
final_df = final_df.dropna(subset=['10_year_close' , 'SMA_7', 'SMA_30', '10_year_close', 'S&P500_close','1_year_close'])
final_df

Unnamed: 0_level_0,10_year_close,Yield_STDEV,SMA_7,SMA_30,S&P500_close,VIX_close,5_year_close,1_year_close,CIP_close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-02-11,1.988,0.106079,1.8356,2.01794,2068.530029,16.959999,1.514,0.23,
2015-02-12,1.986,0.114389,1.8591,2.01330,2088.479980,15.340000,1.497,0.23,
2015-02-13,2.021,0.124403,1.8937,2.00802,2096.989990,14.690000,1.516,0.23,
2015-02-17,2.145,0.146688,1.9409,2.00518,2100.340088,15.800000,1.619,0.22,
2015-02-18,2.066,0.148306,1.9695,2.00136,2099.679932,15.450000,1.520,0.21,
...,...,...,...,...,...,...,...,...,...
2024-11-20,4.406,0.056236,4.3883,4.07498,5917.109863,17.160000,4.275,4.37,
2024-11-21,4.432,0.057928,4.3974,4.09002,5948.709961,16.870001,4.305,4.39,
2024-11-22,4.410,0.054405,4.4078,4.10522,5969.339844,15.240000,4.298,4.41,
2024-11-25,4.265,0.058041,4.4035,4.11810,5987.370117,14.600000,4.172,4.37,


Regressions:

In [175]:
def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")
    return Xnew

In [176]:
from sklearn.metrics import mean_squared_error, r2_score
def linreg_predict(features, dependent, df):
    """Predicts vals for a given array, and returns dct of mse and r^2
    Args:
        Xnew: (an array, either 1-d or 2-d which includes all the $p$ predictor features, not including bias term)
        ynew: (a 1-d array which includes all corresponding response values to `Xnew`)
        m: (a 1-d array of length $p+1$ which contains the coefficients from the `line_of_best_fit` function)
    Returns:
        dct: dictionary, dct of mse and r^2 vals
    """

    features_df = df[features]
    target_series = df[dependent]
    X_train, X_test, y_train, y_test = train_test_split(features_df, target_series, random_state=30, test_size = 0.3)
    linear_regression_model = LinearRegression()
    linear_regression_model.fit(X_train, y_train)
    
    predictions = linear_regression_model.predict(X_test)
    rmse = np.sqrt(((predictions - y_test) ** 2).mean())
    
    residuals = y_test - predictions
    dct = {}
    
    
    dct["ypreds"] = predictions
    dct["resids"] = residuals
    dct["mse"] = rmse
    dct["r2"] = r2_score(y_test, dct["ypreds"])

    return dct

In [177]:
m = linreg_predict(features=['S&P500_close', 'VIX_close', 'Yield_STDEV', 'SMA_30'], dependent='10_year_close', df=final_df)

In [178]:
m

{'ypreds': array([3.94501332, 1.85601911, 0.68300104, 1.48050895, 1.56588654,
        2.90282188, 1.69465615, 2.48640625, 2.68185777, 3.01643832,
        0.71823999, 3.8082107 , 2.4602743 , 1.58989422, 2.91176967,
        0.68719917, 2.90469411, 1.63478031, 2.25692415, 4.13797537,
        2.63140085, 2.44537097, 4.53512918, 2.11073762, 1.76431468,
        1.81580619, 0.84399627, 1.65493677, 2.10998972, 2.95333176,
        1.60951258, 1.7664259 , 2.92561908, 1.61912668, 2.18900026,
        2.06204495, 3.92082083, 2.971322  , 2.71183592, 3.01860787,
        1.88767544, 1.82055504, 2.16682614, 2.30243682, 2.25892921,
        1.92583165, 2.3659616 , 2.15475526, 2.6108255 , 1.50253043,
        1.65375325, 1.07125011, 3.77794969, 2.77909285, 2.61602762,
        0.71132803, 1.49047465, 0.87997738, 1.61785863, 4.51323552,
        3.68764082, 1.45673791, 1.58075143, 1.63822034, 2.67683816,
        4.49678129, 1.42516915, 3.72865085, 1.45428637, 4.27981285,
        2.90798422, 4.24945122, 3.0696

In [179]:
def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")

    return Xnew

In [180]:
def line_of_best_fit(X, y):
    '''
    This function computes the line of best fit by providing the coefficients for the slope(s) 
    and intercept using the Ordinary Least Squares (OLS) method. It achieves this by projecting 
    the values of y onto the span of X (the space defined by the independent variables).
    
    X represents out input data, a numpy array, and acts as out indepedet variable. y represents the data we 
    are trying to predict, a numpy array of the same length as X, and acts as our as a dependent variable.
    
    Output = Coefficants as an array, first elm being the intercept accounted for with the 'add_bias_column' 
    and second being the intercept. Can be summarized as the line of best foit for predicted y values on X.
    '''
    proper_X = add_bias_column(X)
    
    vector = np.matmul(np.linalg.inv(np.matmul(proper_X.T, proper_X)), np.matmul(proper_X.T, y))
       
    return vector

In [181]:
def linreg_predict(Xnew, ynew, m):
    '''
    This function is used to gage the predicted values, residual values, MSE (Mean Squared Error), and r2_score to analysze a
    line of best fit projecttion,
    
    Xnew is either a 1D or 2D array which is the data used as 'predictor fatures' in predicitng y. Does not contain a bias column
    Ynew is a 1d array and can be summrized as the values given by multiple Xnew and m
    M is a 1d arrat represented as the found coeffeicint line of best fit values for the given data within the lin_of_best function
    
    Output = dct with ypreds, resids, mse, and r2 as a measurement of the current predicted values of y compared to the true
    values. 
    
    '''
    proper_X = add_bias_column(Xnew)
    
    ypreds = np.matmul(proper_X, m)
    
    resids = ynew - ypreds
    
    mse = np.mean(resids**2)
    
    r2 = r2_score(ynew, ypreds)
    
    final_dct = {
        'ypreds': ypreds,
        'resids': resids,
        'mse': mse,
        'r2': r2        
    }
    
    
    return final_dct

In [182]:
mercy_x = final_df[['S&P500_close', 'VIX_close', 'Yield_STDEV', 'SMA_30', '1_year_close']]
mercy_y = final_df['10_year_close']

In [183]:
crossval = train_test_split(mercy_x, 
                            mercy_y,
                            test_size=0.3)

Xtrain, Xtest, ytrain, ytest = crossval
Xtrain

Unnamed: 0_level_0,S&P500_close,VIX_close,Yield_STDEV,SMA_30,1_year_close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-07-24,2469.909912,9.430000,0.051157,2.24508,1.23
2015-07-09,2051.310059,19.969999,0.076937,2.27192,0.23
2023-08-04,4478.029785,17.100000,0.127024,3.83298,5.34
2018-10-24,2656.100098,25.230000,0.035274,3.02336,2.64
2019-08-16,2888.679932,18.469999,0.163275,1.97044,1.72
...,...,...,...,...,...
2020-03-02,3090.229980,33.419998,0.180729,1.67294,0.94
2016-04-14,2082.780029,13.720000,0.047653,1.81432,0.54
2020-07-31,3271.120117,24.459999,0.029114,0.67046,0.12
2016-07-13,2152.429932,13.040000,0.097539,1.66898,0.50


In [184]:
line_of_best_mercy_train = line_of_best_fit(Xtrain, ytrain)

In [185]:
predicted_vals_mercy = linreg_predict(Xtest, ytest, line_of_best_mercy_train)

In [186]:
print(f"Mse: {predicted_vals_mercy['mse']} R2: {predicted_vals_mercy['r2']}")

Mse: 0.04712106741486919 R2: 0.9596727982661492


In [187]:
import statsmodels.api as sm

Xtrain_with_bias = sm.add_constant(Xtrain)

model = sm.OLS(ytrain, Xtrain_with_bias)  # Ordinary Least Squares
results = model.fit()

# Predictions
y_pred = results.predict(Xtrain_with_bias)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(ytrain, y_pred))

# Get R-squared from the results
r2 = results.rsquared

print("RMSE:", rmse)
print("R-squared:", r2)
print(results.summary())

RMSE: 0.21067326954927654
R-squared: 0.9582556633772341
                            OLS Regression Results                            
Dep. Variable:          10_year_close   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.958
Method:                 Least Squares   F-statistic:                     7837.
Date:                Mon, 02 Dec 2024   Prob (F-statistic):               0.00
Time:                        07:35:14   Log-Likelihood:                 237.26
No. Observations:                1713   AIC:                            -462.5
Df Residuals:                    1707   BIC:                            -429.9
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------

In [188]:

from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Split your data
Xtrain, Xtest, ytrain, ytest = train_test_split(mercy_x, mercy_y, test_size=0.3, random_state=42)

# Add bias column
Xtrain_with_bias = sm.add_constant(Xtrain)
Xtest_with_bias = sm.add_constant(Xtest)

# Fit model on training data
model = sm.OLS(ytrain, Xtrain_with_bias).fit()

# Predictions on train and test data
ytrain_pred = model.predict(Xtrain_with_bias)
ytest_pred = model.predict(Xtest_with_bias)

# Calculate RMSE and R^2
train_rmse = np.sqrt(mean_squared_error(ytrain, ytrain_pred))
test_rmse = np.sqrt(mean_squared_error(ytest, ytest_pred))
train_r2 = r2_score(ytrain, ytrain_pred)
test_r2 = r2_score(ytest, ytest_pred)

print("Train RMSE:", train_rmse, "Test RMSE:", test_rmse)
print("Train R^2:", train_r2, "Test R^2:", test_r2)

Train RMSE: 0.2103676420530439 Test RMSE: 0.21793399990643275
Train R^2: 0.9597712504834242 Test R^2: 0.9561509700113353
