### Using Logistic Regression Algorithm to calculate the percentage of observations that is correctly predicted by the model (hit rate) for the 10yr Treasury Bond.

In [57]:
# Import the required modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [58]:
# Read in the transaction_fraud_data.csv file into a PandasDataFrame.
ten_year_data = pd.read_csv(Path("../Resources/Bond Data/treasury_yield_10yr.csv"), index_col='Date', infer_datetime_format=True, parse_dates=True)

#ten_year_data = ten_year_data.set_index('Date')

# Review the DataFrame
ten_year_data.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-10-31,2.37,2.379,2.369,2.376,2.376,0.0
2017-11-01,2.398,2.401,2.349,2.376,2.376,0.0
2017-11-02,2.363,2.37,2.336,2.347,2.347,0.0
2017-11-03,2.351,2.361,2.324,2.343,2.343,0.0
2017-11-05,,,,,,
2017-11-06,2.325,2.333,2.315,2.32,2.32,0.0
2017-11-07,2.327,2.329,2.304,2.307,2.307,0.0
2017-11-08,2.315,2.327,2.306,2.325,2.325,0.0
2017-11-09,2.333,2.345,2.315,2.331,2.331,0.0
2017-11-10,2.368,2.406,2.366,2.4,2.4,0.0


In [59]:
ten_year_data = ten_year_data.drop(columns=["Open", "High", "Low", "Close", "Volume"])

ten_year_data.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2017-10-31,2.376
2017-11-01,2.376
2017-11-02,2.347
2017-11-03,2.343
2017-11-05,


In [60]:
ten_year_data = ten_year_data.dropna()

ten_year_data.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2017-10-31,2.376
2017-11-01,2.376
2017-11-02,2.347
2017-11-03,2.343
2017-11-06,2.32


In [61]:
lags = 5

for i in range(0, lags):
	ten_year_data['Lag_' + str(i+1)] = ten_year_data['Adj Close'].shift(i+1)

ten_year_data['Returns'] = ten_year_data['Adj Close'].pct_change()

ten_year_data.head()

Unnamed: 0_level_0,Adj Close,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,2.376,,,,,,
2017-11-01,2.376,2.376,,,,,0.0
2017-11-02,2.347,2.376,2.376,,,,-0.012205
2017-11-03,2.343,2.347,2.376,2.376,,,-0.001704
2017-11-06,2.32,2.343,2.347,2.376,2.376,,-0.009816


In [62]:
# create the lagged percentage returns coluns
for i in range(0, lags):
	ten_year_data['Lag_' + str(i+1)] = ten_year_data["Lag_" + str(i+1)].pct_change()
	ten_year_data.fillna(0, inplace=True)
    
ten_year_data.head(10)

Unnamed: 0_level_0,Adj Close,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,2.376,0.0,0.0,0.0,0.0,0.0,0.0
2017-11-01,2.376,0.0,0.0,0.0,0.0,0.0,0.0
2017-11-02,2.347,0.0,inf,0.0,0.0,0.0,-0.012205
2017-11-03,2.343,-0.012205,0.0,inf,0.0,0.0,-0.001704
2017-11-06,2.32,-0.001704,-0.012205,0.0,inf,0.0,-0.009816
2017-11-07,2.307,-0.009816,-0.001704,-0.012205,0.0,inf,-0.005603
2017-11-08,2.325,-0.005603,-0.009816,-0.001704,-0.012205,0.0,0.007802
2017-11-09,2.331,0.007802,-0.005603,-0.009816,-0.001704,-0.012205,0.002581
2017-11-10,2.4,0.002581,0.007802,-0.005603,-0.009816,-0.001704,0.029601
2017-11-13,2.4,0.029601,0.002581,0.007802,-0.005603,-0.009816,0.0


In [63]:
# convert returns to the sign of direction
ten_year_data['Direction'] = np.sign(ten_year_data['Returns'])

ten_year_data = ten_year_data.dropna()
ten_year_data = ten_year_data.drop(ten_year_data.index[0:5])

##remove rows with any values that are not infinite
ten_year_data = ten_year_data[np.isfinite(ten_year_data).all(1)]

ten_year_data

Unnamed: 0_level_0,Adj Close,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns,Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-11-08,2.325,-0.005603,-0.009816,-0.001704,-0.012205,0.000000,0.007802,1.0
2017-11-09,2.331,0.007802,-0.005603,-0.009816,-0.001704,-0.012205,0.002581,1.0
2017-11-10,2.400,0.002581,0.007802,-0.005603,-0.009816,-0.001704,0.029601,1.0
2017-11-13,2.400,0.029601,0.002581,0.007802,-0.005603,-0.009816,0.000000,0.0
2017-11-14,2.381,0.000000,0.029601,0.002581,0.007802,-0.005603,-0.007917,-1.0
...,...,...,...,...,...,...,...,...
2022-10-24,4.234,-0.003076,0.023988,0.032266,-0.004234,0.001247,0.004985,1.0
2022-10-25,4.108,0.004985,-0.003076,0.023988,0.032266,-0.004234,-0.029759,-1.0
2022-10-26,4.015,-0.029759,0.004985,-0.003076,0.023988,0.032266,-0.022639,-1.0
2022-10-27,3.937,-0.022639,-0.029759,0.004985,-0.003076,0.023988,-0.019427,-1.0


In [68]:
cpi_data = pd.read_csv(Path("../Resources/reformatted_cpi.csv"))

cpi_data['Date'] = (cpi_data['Year'].astype(str)) + "-" + (cpi_data['level_1'])

cpi_data['Date'] = pd.to_datetime(cpi_data['Date'])

cpi_data = cpi_data.drop(columns=['Year', 'level_1'])

cpi_data = cpi_data.set_index('Date')

cpi_data

Unnamed: 0_level_0,0
Date,Unnamed: 1_level_1
2012-01-01,226.665
2012-02-01,227.663
2012-03-01,229.392
2012-04-01,230.085
2012-05-01,229.815
...,...
2022-05-01,292.296
2022-06-01,296.311
2022-07-01,296.276
2022-08-01,296.171


In [94]:
# Transforming monthly CPI data into daily CPI variable

start_date = cpi_data.index.min() - pd.DateOffset(day=1)
end_date = cpi_data.index.max() + pd.DateOffset(day=31)

dates = pd.date_range(start_date, end_date, freq='D')
dates.name = 'date'
cpi_data = cpi_data.reindex(dates, method='ffill')

cpi_data.head()

Unnamed: 0_level_0,0
date,Unnamed: 1_level_1
2012-01-01,226.665
2012-01-02,226.665
2012-01-03,226.665
2012-01-04,226.665
2012-01-05,226.665


In [128]:
unemployment_data = pd.read_csv(Path("../Resources/Unemployment_in_thousands_2012_2022.csv"), index_col='Year')

unemployment_data = unemployment_data.iloc[:,0:12].stack().reset_index(level = 1)

unemployment_data['Date'] = (unemployment_data.index.astype(str)) + "-" + (unemployment_data['level_1'])

unemployment_data['Date'] = pd.to_datetime(unemployment_data['Date'])

unemployment_data = unemployment_data.set_index('Date')

unemployment_data = unemployment_data.drop(columns=['level_1'])

unemployment_data.head()

Unnamed: 0_level_0,0
Date,Unnamed: 1_level_1
2012-01-01,12797.0
2012-02-01,12813.0
2012-03-01,12713.0
2012-04-01,12646.0
2012-05-01,12660.0


In [132]:
# Transforming monthly CPI data into daily CPI variable

start_date = cpi_data.index.min() - pd.DateOffset(day=1)
end_date = cpi_data.index.max() + pd.DateOffset(day=31)

dates = pd.date_range(start_date, end_date, freq='D')
dates.name = 'date'
unemployment_data = unemployment_data.reindex(dates, method='ffill')

unemployment_data.head()

Unnamed: 0_level_0,0
date,Unnamed: 1_level_1
2012-01-01,12797.0
2012-01-02,12797.0
2012-01-03,12797.0
2012-01-04,12797.0
2012-01-05,12797.0


In [143]:
#ten_year_data['cpi'] = 0
#
#for index in cpi_data:
#    if ten_year_data.index == cpi_data.index:
#        ten_year_data['cpi'] = cpi_data[0]
#        continue
#
#cpi_data.head()

combined_df = pd.concat([ten_year_data, cpi_data, unemployment_data], axis=1)

combined_df = combined_df.dropna()

combined_df = combined_df.drop(columns=['Lag_3', 'Lag_4', 'Lag_5', 'cpi'])

combined_df.columns = ['Adj Close', 'Lag_1', 'Lag_2', 'Returns', 'Direction', 'CPI', 'Unemployment'] 

combined_df.head()

Unnamed: 0,Adj Close,Lag_1,Lag_2,Returns,Direction,CPI,Unemployment
2017-11-08,2.325,-0.005603,-0.009816,0.007802,1.0,246.669,6774.0
2017-11-09,2.331,0.007802,-0.005603,0.002581,1.0,246.669,6774.0
2017-11-10,2.4,0.002581,0.007802,0.029601,1.0,246.669,6774.0
2017-11-13,2.4,0.029601,0.002581,0.0,0.0,246.669,6774.0
2017-11-14,2.381,0.0,0.029601,-0.007917,-1.0,246.669,6774.0


In [148]:
from sklearn.preprocessing import StandardScaler

# Scaling the numeric columns
combined_df_scaled = StandardScaler().fit_transform(combined_df[["Adj Close", "Lag_1", "Lag_2", "Returns", "CPI", "Unemployment"]])

# Creating a DataFrame with with the scaled data
combined_transformed = pd.DataFrame(combined_df_scaled, columns=["Adj Close", "Lag_1", "Lag_2", "Returns", "CPI", "Unemployment"])

# Display sample data
combined_transformed.head()

Unnamed: 0,Adj Close,Lag_1,Lag_2,Returns,CPI,Unemployment
0,0.412938,-0.170319,-0.276057,0.167083,-1.212549,-0.347645
1,0.420185,0.167515,-0.169886,0.035498,-1.212549,-0.347645
2,0.503526,0.035925,0.167947,0.716402,-1.212549,-0.347645
3,0.503526,0.716856,0.036357,-0.029534,-1.212549,-0.347645
4,0.480577,-0.029109,0.717285,-0.229031,-1.212549,-0.347645


In [150]:
X = combined_transformed
y = combined_df["Direction"]

## Split into training and testing windows
#from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

#start_test = pd.Timestamp("2017-11-14")
#
#X_train = X[X.index < start_test]
#X_test = X[X.index >= start_test]
#y_train = y[y.index < start_test]
#y_test = y[y.index >= start_test]

In [151]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [152]:
lr_accuracy = accuracy_score(y_test, y_pred)
lr_accuracy

0.987012987012987

In [153]:
pred = (1.0 + (y_pred == y_test))/2.0
hit_rate = np.mean(pred)
print('Logistic Regression - Hit Rate of 10 yr Treasury Yield Bond: {:.4f}'.format(hit_rate))

Logistic Regression - Hit Rate of 10 yr Treasury Yield Bond: 0.9935
