In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline

# Regression Analysis: Seasonal Effects with Sklearn Linear Regression
In this notebook, you will build a SKLearn linear regression model to predict Yen futures ("settle") returns with *lagged* Yen futures returns. 

In [2]:
# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration
yen_futures = pd.read_csv(
    Path("Data/yen.csv"), index_col="Date", infer_datetime_format=True, parse_dates=True
)
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1976-08-02,3398.0,3401.0,3398.0,3401.0,,3401.0,2.0,1.0
1976-08-03,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-04,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-05,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-06,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0


In [3]:
# Trim the dataset to begin on January 1st, 1990
yen_futures = yen_futures.loc["1990-01-01":, :]
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1990-01-02,6954.0,6954.0,6835.0,6847.0,,6847.0,48336.0,51473.0
1990-01-03,6877.0,6910.0,6865.0,6887.0,,6887.0,38206.0,53860.0
1990-01-04,6937.0,7030.0,6924.0,7008.0,,7008.0,49649.0,55699.0
1990-01-05,6952.0,6985.0,6942.0,6950.0,,6950.0,29944.0,53111.0
1990-01-08,6936.0,6972.0,6936.0,6959.0,,6959.0,19763.0,52072.0


# Data Preparation

### Returns

In [4]:
# Create a series using "Settle" price percentage returns, drop any nan"s, and check the results:
# (Make sure to multiply the pct_change() results by 100)
yen_futures['Returns'] = (yen_futures['Settle'].pct_change()*100)
# In this case, you may have to replace inf, -inf values with np.nan"s
yen_futures['Returns'] = yen_futures['Returns'].replace(-np.inf, np.nan).dropna()
type(yen_futures['Returns'])

pandas.core.series.Series

### Lagged Returns 

In [5]:
# Create a lagged return using the shift function
yen_futures['Lagged_returns'] = yen_futures.Returns.shift()
yen_futures.dropna(inplace=True)
yen_futures.head(3)

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest,Returns,Lagged_returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-02-18,9831.0,9865.0,9734.0,9775.0,42.0,9775.0,203495.0,196924.0,-0.427829,0.409123
2014-02-19,9768.0,9825.0,9760.0,9773.0,2.0,9773.0,129508.0,197197.0,-0.02046,-0.427829
2014-02-20,9774.0,9837.0,9765.0,9775.0,2.0,9775.0,160202.0,198280.0,0.020465,-0.02046


y = return_series
X = lagged_return.to_frame()
X.head()

### Train Test Split

In [6]:
# Create a train/test split for the data using 2018-2019 for testing and the rest for training
train = yen_futures[:'2017']
test = yen_futures['2018':]

In [7]:
# Create four dataframes:
# X_train (training set using just the independent variables), X_test (test set of of just the independent variables)
X_train = train.Lagged_returns.to_frame().dropna()
X_test = test.Lagged_returns.to_frame().dropna()
# Y_train (training set using just the "y" variable, i.e., "Futures Return"), Y_test (test set of just the "y" variable):
Y_train = train.Returns.dropna()
Y_test = test.Returns.dropna()
print(f'{X_train.tail(1)} CHECK: train dataset should end in DEC 2017')
print(f'{X_test.tail(1)} CHECK: test data should end in OCT 2019')
print(f'{Y_train.tail(1)} CHECK: train dataset should end in DEC 2017')
print(f'{Y_test.tail(1)} CHECK: test data should end in OCT 2019')
X_train

            Lagged_returns
Date                      
2017-12-29        0.180221 CHECK: train dataset should end in DEC 2017
            Lagged_returns
Date                      
2019-10-15        0.151335 CHECK: test data should end in OCT 2019
Date
2017-12-29    0.224871
Name: Returns, dtype: float64 CHECK: train dataset should end in DEC 2017
Date
2019-10-15   -0.469509
Name: Returns, dtype: float64 CHECK: test data should end in OCT 2019


Unnamed: 0_level_0,Lagged_returns
Date,Unnamed: 1_level_1
2014-02-18,0.409123
2014-02-19,-0.427829
2014-02-20,-0.020460
2014-02-21,0.020465
2014-02-24,-0.204604
...,...
2017-12-22,0.084621
2017-12-26,0.118370
2017-12-27,0.118230
2017-12-28,-0.151830


# Linear Regression Model

In [8]:
# Create a Linear Regression model and fit it to the training data
from sklearn.linear_model import LinearRegression

# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
model = LinearRegression()
model.fit(X_train, Y_train)

LinearRegression()

# Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

In [9]:
# Make a prediction of "y" values using just the test dataset
predictions = model.predict(X_test)

In [10]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
Results = Y_test.to_frame()
Results['Predicted_return'] = predictions
Results

Unnamed: 0_level_0,Returns,Predicted_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,0.297285,-0.009599
2018-01-03,-0.240479,-0.010033
2018-01-04,-0.213028,-0.006807
2018-01-05,-0.353933,-0.006971
2018-01-08,0.062017,-0.006126
...,...,...
2019-10-09,-0.410601,-0.009275
2019-10-10,-0.369458,-0.005786
2019-10-11,-0.564304,-0.006033
2019-10-14,0.151335,-0.004864


In [11]:
Results.tail(20)

Unnamed: 0_level_0,Returns,Predicted_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-17,0.404858,-0.009288
2019-09-18,-0.145161,-0.010679
2019-09-19,0.333818,-0.007378
2019-09-20,0.246847,-0.010252
2019-09-23,0.192709,-0.009731
2019-09-24,0.395363,-0.009406
2019-09-25,-0.739716,-0.010622
2019-09-27,-0.058975,-0.008249
2019-09-30,-0.241403,-0.007896
2019-10-01,0.328028,-0.006801


In [12]:
# Plot the first 20 predictions vs the true values
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=Results.index[-20:], y=Results['Returns'].tail(20), name="Actual Returns"),
    row=1, col=1, secondary_y=False)

fig.add_trace(
    go.Scatter(x=Results.index[-20:], y=Results['Predicted_return'].tail(20), name="Predicted_Return"),
    row=1, col=1, secondary_y=True,
)
fig.show()

# Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)

In [13]:
from sklearn.metrics import mean_squared_error
# Calculate the mean_squared_error (MSE) on actual versus predicted test "y" 
mse = mean_squared_error(
    Results['Returns'],
    Results['Predicted_return']
)
# Using that mean-squared-error, calculate the root-mean-squared error (RMSE):
out_of_sample_rmse = np.sqrt(mse)
print(f'Out of sample root mean squared error is {out_of_sample_rmse}')

Out of sample root mean squared error is 0.41545437184712763


# In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

In [14]:
# Construct a dataframe using just the "y" training data:
prediction_results = Y_train.to_frame()

# Add a column of "in-sample" predictions to that dataframe:  
prediction_results['Predictions'] = model.predict(X_train)

# Calculate in-sample mean_squared_error (for comparison to out-of-sample)
in_sample_mse = mean_squared_error(
    prediction_results['Returns'],
    prediction_results['Predictions']
)

# Calculate in-sample root mean_squared_error (for comparison to out-of-sample)
in_sample_rmse = np.sqrt(in_sample_mse)
print(f'in sample root mean squared error (RMSE) is: {in_sample_rmse}')

in sample root mean squared error (RMSE) is: 0.5962037920929946


# Conclusions

YOUR CONCLUSIONS HERE!

## Answer: Both out of sample and in sample root mean squared error is on the higher side. More improvements to the model would be needed in order to rely on the outputs.