### Regression Analysis: Seasonal Effects with Sklearn Linear Regression

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.io as pio
pio.renderers.default = 'notebook_connected'
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
%matplotlib inline
from _functions import *

# Load the yen futures data with the "Date" column as a parsed datetime index.
# `parse_dates=True` is what parses the index; the deprecated
# `infer_datetime_format` flag is not needed and is omitted.
yen_futures = pd.read_csv(
    Path("Data/yen.csv"), index_col="Date", parse_dates=True
)

# Keep 1990 onwards. `.copy()` detaches the slice from the frame it was taken
# from, so the column assignment below writes to an independent frame instead
# of risking pandas' SettingWithCopyWarning.
yen_futures = yen_futures.loc["1990-01-01":, :].copy()

# Percentage change of the settle price, scaled by 100. Infinite values (which
# arise when the previous price is zero) are mapped to NaN.
# NaN rows are left in place here: calling `.dropna()` on the Series before
# assigning it back to the column would have no effect, because column
# assignment re-aligns on the frame's index and restores the NaN entries.
yen_futures['Returns'] = (
    (yen_futures['Settle'].pct_change() * 100).replace([np.inf, -np.inf], np.nan)
)

# Split into train/test features and targets. `train_test` is provided by the
# star import of `_functions`; presumably it builds the lagged-return feature
# and drops NaN rows -- TODO confirm against `_functions.py`.
X_train, X_test, Y_train, Y_test = train_test(yen_futures)

### Returns

### Linear Regression Model

In [4]:
# Fit a scikit-learn linear regression using just the training set (X_train, Y_train):
model = LinearRegression()
model.fit(X_train, Y_train)

# Predict the target for the held-out test features.
predictions = model.predict(X_test)

# Assemble actual y data (Y_test) and predicted y data (from just above) into
# two columns of one dataframe:
Results = Y_test.to_frame()
Results['Predicted_return'] = predictions

# Plot the LAST 20 test observations (tail of the frame): actual vs. predicted.
# The predicted series is drawn against its own secondary y-axis, so the two
# traces do not share a scale -- the axis titles below make that explicit.
results_tail = Results.tail(20)
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=results_tail.index, y=results_tail['Returns'], name="Actual Returns"),
    row=1, col=1, secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=results_tail.index, y=results_tail['Predicted_return'], name="Predicted_Return"),
    row=1, col=1, secondary_y=True,
)

# A figure should stand on its own: label it and both y-axes.
fig.update_layout(title_text="Actual vs. Predicted Returns (last 20 test observations)")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Actual Returns", secondary_y=False)
fig.update_yaxes(title_text="Predicted Return", secondary_y=True)
fig.show()

### Out-of-Sample Performance

In [4]:
# Out-of-sample error: compare the actual test targets with the model's
# predictions for the same rows.
test_actual = Results['Returns']
test_predicted = Results['Predicted_return']

# Mean squared error (MSE) over the test set ...
mse = mean_squared_error(y_true=test_actual, y_pred=test_predicted)

# ... and its square root, the root-mean-squared error (RMSE).
out_of_sample_rmse = np.sqrt(mse)
print(f'Out of sample root mean squared error is {out_of_sample_rmse}')

Out of sample root mean squared error is 0.41545437184712763


### In-Sample Performance

In [5]:
# In-sample frame: the training targets ("y") in one column and the model's
# predictions for the same training rows in a second column.
prediction_results = Y_train.to_frame().assign(Predictions=model.predict(X_train))

# In-sample mean squared error (for comparison to out-of-sample).
in_sample_mse = mean_squared_error(
    y_true=prediction_results['Returns'],
    y_pred=prediction_results['Predictions'],
)

# In-sample root mean squared error (for comparison to out-of-sample).
in_sample_rmse = np.sqrt(in_sample_mse)
print(f'in sample root mean squared error (RMSE) is: {in_sample_rmse}')

in sample root mean squared error (RMSE) is: 0.5962037920929946


# Conclusions

#### Answer: Both the out-of-sample RMSE (≈ 0.415) and the in-sample RMSE (≈ 0.596) are on the higher side; note that the out-of-sample error is lower than the in-sample error. Further improvements to the model would be needed before its outputs can be relied on.