### Regression Analysis: Seasonal Effects with Sklearn Linear Regression

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.io as pio
pio.renderers.default = 'iframe_connected'
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
%matplotlib inline

# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration
yen_futures = pd.read_csv(
    Path("Data/yen.csv"), index_col="Date", infer_datetime_format=True, parse_dates=True
)
# Trim the dataset to begin on January 1st, 1990
yen_futures = yen_futures.loc["1990-01-01":, :]
# Create a series using "Settle" price percentage returns, drop any nan"s, and check the results:
# (Make sure to multiply the pct_change() results by 100)
yen_futures['Returns'] = (yen_futures['Settle'].pct_change()*100)
# In this case, you may have to replace inf, -inf values with np.nan"s
yen_futures['Returns'] = yen_futures['Returns'].replace(-np.inf, np.nan).dropna()
type(yen_futures['Returns'])

pandas.core.series.Series

### Returns

In [2]:
# Create a lagged return using the shift function
yen_futures['Lagged_returns'] = yen_futures.Returns.shift()
yen_futures.dropna(inplace=True)
# Create a train/test split for the data using 2018-2019 for testing and the rest for training
train = yen_futures[:'2017']
test = yen_futures['2018':]

# Create four dataframes:
# X_train (training set using just the independent variables), X_test (test set of of just the independent variables)
X_train = train.Lagged_returns.to_frame().dropna()
X_test = test.Lagged_returns.to_frame().dropna()
# Y_train (training set using just the "y" variable, i.e., "Futures Return"), Y_test (test set of just the "y" variable):
Y_train = train.Returns.dropna()
Y_test = test.Returns.dropna()
print(f'{X_train.tail(1)} CHECK: train dataset should end in DEC 2017')
print(f'{X_test.tail(1)} CHECK: test data should end in OCT 2019')
print(f'{Y_train.tail(1)} CHECK: train dataset should end in DEC 2017')
print(f'{Y_test.tail(1)} CHECK: test data should end in OCT 2019')
X_train

            Lagged_returns
Date                      
2017-12-29        0.180221 CHECK: train dataset should end in DEC 2017
            Lagged_returns
Date                      
2019-10-15        0.151335 CHECK: test data should end in OCT 2019
Date
2017-12-29    0.224871
Name: Returns, dtype: float64 CHECK: train dataset should end in DEC 2017
Date
2019-10-15   -0.469509
Name: Returns, dtype: float64 CHECK: test data should end in OCT 2019


Unnamed: 0_level_0,Lagged_returns
Date,Unnamed: 1_level_1
2014-02-18,0.409123
2014-02-19,-0.427829
2014-02-20,-0.020460
2014-02-21,0.020465
2014-02-24,-0.204604
...,...
2017-12-22,0.084621
2017-12-26,0.118370
2017-12-27,0.118230
2017-12-28,-0.151830


### Linear Regression Model

In [3]:
# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
model = LinearRegression()
model.fit(X_train, Y_train)
predictions = model.predict(X_test)
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
Results = Y_test.to_frame()
Results['Predicted_return'] = predictions

# Plot the first 20 predictions vs the true values
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=Results.index[-20:], y=Results['Returns'].tail(20), name="Actual Returns"),
    row=1, col=1, secondary_y=False)

fig.add_trace(
    go.Scatter(x=Results.index[-20:], y=Results['Predicted_return'].tail(20), name="Predicted_Return"),
    row=1, col=1, secondary_y=True,
)
fig.show()

### Out-of-Sample Performance

In [4]:
# Calculate the mean_squared_error (MSE) on actual versus predicted test "y" 
mse = mean_squared_error(
    Results['Returns'],
    Results['Predicted_return']
)
# Using that mean-squared-error, calculate the root-mean-squared error (RMSE):
out_of_sample_rmse = np.sqrt(mse)
print(f'Out of sample root mean squared error is {out_of_sample_rmse}')

Out of sample root mean squared error is 0.41545437184712763


### In-Sample Performance

In [5]:
# Construct a dataframe using just the "y" training data:
prediction_results = Y_train.to_frame()

# Add a column of "in-sample" predictions to that dataframe:  
prediction_results['Predictions'] = model.predict(X_train)

# Calculate in-sample mean_squared_error (for comparison to out-of-sample)
in_sample_mse = mean_squared_error(
    prediction_results['Returns'],
    prediction_results['Predictions']
)

# Calculate in-sample root mean_squared_error (for comparison to out-of-sample)
in_sample_rmse = np.sqrt(in_sample_mse)
print(f'in sample root mean squared error (RMSE) is: {in_sample_rmse}')

in sample root mean squared error (RMSE) is: 0.5962037920929946


# Conclusions

#### Answer: Both out of sample and in sample root mean squared error is on the higher side. More improvements to the model would be needed in order to rely on the outputs.