In [13]:
# Import the required modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [14]:
# Read in the transaction_fraud_data.csv file into a PandasDataFrame.
thirty_year_data = pd.read_csv(Path("../Resources/treasury_yield_30yr.csv"), index_col='Date', infer_datetime_format=True, parse_dates=True)

#ten_year_data = ten_year_data.set_index('Date')

# Review the DataFrame
thirty_year_data.head(10)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-10-31,2.878,2.886,2.87,2.875,2.875,0.0
2017-11-01,2.897,2.899,2.837,2.862,2.862,0.0
2017-11-02,2.848,2.856,2.822,2.829,2.829,0.0
2017-11-03,2.828,2.838,2.809,2.822,2.822,0.0
2017-11-05,,,,,,
2017-11-06,2.804,2.812,2.792,2.797,2.797,0.0
2017-11-07,2.804,2.804,2.768,2.769,2.769,0.0
2017-11-08,2.777,2.789,2.767,2.784,2.784,0.0
2017-11-09,2.809,2.821,2.792,2.806,2.806,0.0
2017-11-10,2.85,2.89,2.848,2.88,2.88,0.0


In [15]:
thirty_year_data = thirty_year_data.drop(columns=["Open", "High", "Low", "Close", "Volume"])

thirty_year_data.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2017-10-31,2.875
2017-11-01,2.862
2017-11-02,2.829
2017-11-03,2.822
2017-11-05,


In [16]:
thirty_year_data = thirty_year_data.dropna()

thirty_year_data.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2017-10-31,2.875
2017-11-01,2.862
2017-11-02,2.829
2017-11-03,2.822
2017-11-06,2.797


In [17]:
lags = 5

for i in range(0, lags):
	thirty_year_data['Lag_' + str(i+1)] = thirty_year_data['Adj Close'].shift(i+1)

thirty_year_data['Returns'] = thirty_year_data['Adj Close'].pct_change()

thirty_year_data.head()

Unnamed: 0_level_0,Adj Close,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,2.875,,,,,,
2017-11-01,2.862,2.875,,,,,-0.004522
2017-11-02,2.829,2.862,2.875,,,,-0.01153
2017-11-03,2.822,2.829,2.862,2.875,,,-0.002474
2017-11-06,2.797,2.822,2.829,2.862,2.875,,-0.008859


In [18]:
# create the lagged percentage returns coluns
for i in range(0, lags):
	thirty_year_data['Lag_' + str(i+1)] = thirty_year_data["Lag_" + str(i+1)].pct_change()
	thirty_year_data.fillna(0, inplace=True)
    
thirty_year_data.head(10)

Unnamed: 0_level_0,Adj Close,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-10-31,2.875,0.0,0.0,0.0,0.0,0.0,0.0
2017-11-01,2.862,0.0,0.0,0.0,0.0,0.0,-0.004522
2017-11-02,2.829,-0.004522,inf,0.0,0.0,0.0,-0.01153
2017-11-03,2.822,-0.01153,-0.004522,inf,0.0,0.0,-0.002474
2017-11-06,2.797,-0.002474,-0.01153,-0.004522,inf,0.0,-0.008859
2017-11-07,2.769,-0.008859,-0.002474,-0.01153,-0.004522,inf,-0.010011
2017-11-08,2.784,-0.010011,-0.008859,-0.002474,-0.01153,-0.004522,0.005417
2017-11-09,2.806,0.005417,-0.010011,-0.008859,-0.002474,-0.01153,0.007902
2017-11-10,2.88,0.007902,0.005417,-0.010011,-0.008859,-0.002474,0.026372
2017-11-13,2.869,0.026372,0.007902,0.005417,-0.010011,-0.008859,-0.003819


In [19]:
# convert returns to the sign of direction
thirty_year_data['Direction'] = np.sign(thirty_year_data['Returns'])

thirty_year_data = thirty_year_data.dropna()
thirty_year_data = thirty_year_data.drop(thirty_year_data.index[0:5])

##remove rows with any values that are not infinite
thirty_year_data = thirty_year_data[np.isfinite(thirty_year_data).all(1)]

thirty_year_data

Unnamed: 0_level_0,Adj Close,Lag_1,Lag_2,Lag_3,Lag_4,Lag_5,Returns,Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-11-08,2.784,-0.010011,-0.008859,-0.002474,-0.011530,-0.004522,0.005417,1.0
2017-11-09,2.806,0.005417,-0.010011,-0.008859,-0.002474,-0.011530,0.007902,1.0
2017-11-10,2.880,0.007902,0.005417,-0.010011,-0.008859,-0.002474,0.026372,1.0
2017-11-13,2.869,0.026372,0.007902,0.005417,-0.010011,-0.008859,-0.003819,-1.0
2017-11-14,2.839,-0.003819,0.026372,0.007902,0.005417,-0.010011,-0.010457,-1.0
...,...,...,...,...,...,...,...,...
2022-10-24,4.363,0.021110,0.021813,0.026113,0.001494,0.009301,0.013473,1.0
2022-10-25,4.263,0.013473,0.021110,0.021813,0.026113,0.001494,-0.022920,-1.0
2022-10-26,4.163,-0.022920,0.013473,0.021110,0.021813,0.026113,-0.023458,-1.0
2022-10-27,4.092,-0.023458,-0.022920,0.013473,0.021110,0.021813,-0.017055,-1.0


In [20]:
X = thirty_year_data[["Lag_1", "Lag_2"]]
y = thirty_year_data["Direction"]

## Split into training and testing windows
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

start_test = pd.Timestamp("2017-11-14")

X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]

In [21]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [22]:
lr_accuracy = accuracy_score(y_test, y_pred)
lr_accuracy

0.4935897435897436

In [23]:
pred = (1.0 + (y_pred == y_test))/2.0
hit_rate = np.mean(pred)
print('Logistic Regression - Hit Rate of 30 yr Treasury Yield Bond: {:.4f}'.format(hit_rate))

Logistic Regression - Hit Rate of 30 yr Treasury Yield Bond: 0.7468
