In [26]:
import numpy as np
import pandas as pd
from pathlib import Path
%matplotlib inline

# Regression Analysis: Seasonal Effects with Sklearn Linear Regression
In this notebook, you will build a SKLearn linear regression model to predict Yen futures ("settle") returns with *lagged* Yen futures returns. 

In [27]:
# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration
yen_futures = pd.read_csv(
    Path("yen.csv"), index_col="Date", infer_datetime_format=True, parse_dates=True
)
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1976-08-02,3398.0,3401.0,3398.0,3401.0,,3401.0,2.0,1.0
1976-08-03,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-04,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-05,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0
1976-08-06,3401.0,3401.0,3401.0,3401.0,,3401.0,0.0,1.0


In [28]:
# Trim the dataset to begin on January 1st, 1990
yen_futures = yen_futures.loc["1990-01-01":, :]
yen_futures.head()

Unnamed: 0_level_0,Open,High,Low,Last,Change,Settle,Volume,Previous Day Open Interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1990-01-02,6954.0,6954.0,6835.0,6847.0,,6847.0,48336.0,51473.0
1990-01-03,6877.0,6910.0,6865.0,6887.0,,6887.0,38206.0,53860.0
1990-01-04,6937.0,7030.0,6924.0,7008.0,,7008.0,49649.0,55699.0
1990-01-05,6952.0,6985.0,6942.0,6950.0,,6950.0,29944.0,53111.0
1990-01-08,6936.0,6972.0,6936.0,6959.0,,6959.0,19763.0,52072.0


# Data Preparation

### Returns

In [29]:
# Create a series using "Settle" price percentage returns, drop any nan"s, and check the results:
yensettle = yen_futures['Settle']

yensettle= yensettle.pct_change()*100


# yensettle= yensettle.replace([yensettle.inf(), -yensettle.inf()], np.nan)
# (Make sure to multiply the pct_change() results by 100)
# In this case, you may have to replace inf, -inf values with np.nan"s
# YOUR CODE HERE!

### Lagged Returns 

In [30]:
# Create a lagged return using the shift function
yen_futures['settlelag'] = yensettle.shift()
yensettle.dropna(inplace=True)

### Train Test Split

In [31]:
# Create a train/test split for the data using 2018-2019 for testing and the rest for training
train = yen_futures.loc['2008':'2018']
test = yen_futures.loc['2018':'2019']

In [32]:
# Create four dataframes:
# X_train (training set using just the independent variables), X_test (test set of of just the independent variables)
# Y_train (training set using just the "y" variable, i.e., "Futures Return"), Y_test (test set of just the "y" variable):
xtrain = train['settlelag'].to_frame()
xtest = test['settlelag'].to_frame()
ytrain = train['Settle'].to_frame()
ytest = test['Settle'].to_frame()

In [33]:
xtrain.head()

Unnamed: 0_level_0,settlelag
Date,Unnamed: 1_level_1
2008-01-02,0.861683
2008-01-03,2.152446
2008-01-04,0.08689
2008-01-07,0.781335
2008-01-08,-0.527619


In [34]:
ytrain.head()

Unnamed: 0_level_0,Settle
Date,Unnamed: 1_level_1
2008-01-02,9207.0
2008-01-03,9215.0
2008-01-04,9287.0
2008-01-07,9238.0
2008-01-08,9206.0


# Linear Regression Model

In [35]:
# Create a Linear Regression model and fit it to the training data
from sklearn.linear_model import LinearRegression

# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
model= LinearRegression()
model.fit(xtrain, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

In [36]:
# Make a prediction of "y" values using just the test dataset
predictions = model.predict(xtest)

In [37]:
# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
Results= ytest
Results['prediction'] = predictions

In [38]:
# Plot the first 20 predictions vs the true values
Results.head()

Unnamed: 0_level_0,Settle,prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-02,8940.5,10213.987288
2018-01-03,8919.0,10217.444541
2018-01-04,8900.0,10191.770311
2018-01-05,8868.5,10193.080862
2018-01-08,8874.0,10186.353732


# Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)

In [42]:
from sklearn.metrics import mean_squared_error
# Calculate the mean_squared_error (MSE) on actual versus predicted test "y" 
mse = mean_squared_error(
    Results["Settle"],
    Results["prediction"]
)

# Using that mean-squared-error, calculate the root-mean-squared error (RMSE):
rmse = np.sqrt(mse)
print(f"Out-of-Sample Root Mean Squared Error (RMSE): {rmse}")

Out-of-Sample Root Mean Squared Error (RMSE): 1082.935028978282


# In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

In [48]:
# Construct a dataframe using just the "y" training data:
insampleresults = ytrain

# Add a column of "in-sample" predictions to that dataframe:  
insampleresults['insample predictions'] = model.predict(xtrain)

# Calculate in-sample mean_squared_error (for comparison to out-of-sample)
insample_mse = mean_squared_error(
    insampleresults["Settle"],
    insampleresults["insample predictions"])


# Calculate in-sample root mean_squared_error (for comparison to out-of-sample)
insample_rmse = np.sqrt(insample_mse)
print(f"In-sample Root Mean Squared Error (RMSE): {insample_rmse}")

In-sample Root Mean Squared Error (RMSE): 1439.204683766347


# Conclusions

YOUR CONCLUSIONS HERE!