# Stock Prediction Homework

This homework is related to the stock_prediction tutorial.  For trading days in 1/1/2016-7/31/2021, you will test a simple trading strategy based on the predictions of the linear regression and the KNN models.  The trading strategy will be as follows: go long MSFT when the model predicts a gain, go short MSFT when the model predicts a loss.  You will compare the performance of these strategies to a buy and hold (long only) strategy.

To complete the assignment, do the following:

#1) Add 2-day lag and 3-day lag MSFT returns to the feature set, call them `msft_lag_2` and `msft_lag_3`, respectively.

#2) Calculate the percent of days that the models are correct.  Also, calculate the percent of days that the long-only strategy is correct.

#3) Calculate the annualized Sharpe Ratio (avg return / stdev returns) of the three strategies: linear regression, KNN, and long-only.

In [23]:
import numpy as np
import pandas as pd
import yfinance as yf
yf.pdr_override()
from pandas_datareader import data as pdr
import sklearn

stock_tickers = ['MSFT', 'IBM', 'GOOGL'] # define tickers
df_stock = pdr.get_data_yahoo(stock_tickers, start='2005-01-01', end='2021-07-31') # grab the data
df_stock = df_stock['Adj Close'] # select only the adjusted close price
df_stock.columns = df_stock.columns.str.lower() # clean-up column names
df_stock.rename_axis('trade_date', inplace=True) # clean-up index name
df_stock.rename_axis('', axis=1, inplace=True) # clean-up index name

currency_tickers = ['DEXJPUS', 'DEXUSUK']
df_currency = pdr.get_data_fred(currency_tickers, start='2005-01-01', end='2021-07-31')
df_currency = df_currency
df_currency.columns = df_currency.columns.str.lower()
df_currency.rename_axis('trade_date', inplace=True)
df_currency.rename_axis('', axis=1, inplace=True)

index_tickers = ['SPY', 'DIA', '^VIX'] 
df_index = pdr.get_data_yahoo(index_tickers, start='2005-01-01', end='2021-07-31')
df_index = df_index['Adj Close']
df_index.columns = df_index.columns.str.lower().str.replace('^', '')
df_index.rename_axis('trade_date', inplace=True)
df_index.rename_axis('', axis=1, inplace=True)
df_index

df_data = \
    (
    df_stock
        .merge(df_index, how='left', left_index=True, right_index=True) # join currency data
        .merge(df_currency, how='left', left_index=True, right_index=True) # join index data
        .dropna()
        .assign(msft = lambda df: df['msft'].pct_change())   # percent change
        .assign(msft_lag_0 = lambda df: df['msft'].shift(0)) #
        .assign(msft_lag_1 = lambda df: df['msft'].shift(1)) #
        .assign(ibm = lambda df: df['ibm'].pct_change())     #
        .assign(googl = lambda df: df['googl'].pct_change()) #
        .assign(spy = lambda df: df['spy'].pct_change())     #
        .assign(dia = lambda df: df['dia'].pct_change())     #
        .assign(vix = lambda df: df['vix'].diff())           # absolute change
        .assign(dexjpus = lambda df: df['dexjpus'].pct_change()) # percent change
        .assign(dexusuk = lambda df: df['dexusuk'].pct_change()) #
        .dropna()
    )

# Add 2-day and 3-day lagged returns for MSFT
df_data['msft_lag_2'] = df_data['msft'].shift(2)
df_data['msft_lag_3'] = df_data['msft'].shift(3)

df_data.head()

     

[*********************100%%**********************]  3 of 3 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  3 of 3 completed


  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


Unnamed: 0_level_0,googl,ibm,msft,dia,spy,vix,dexjpus,dexusuk,msft_lag_0,msft_lag_1,msft_lag_2,msft_lag_3
trade_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2005-01-05,-0.00509,-0.002069,-0.002236,-0.005551,-0.006901,0.110001,-0.003069,0.002177,-0.002236,0.00374,,
2005-01-06,-0.025632,-0.003109,-0.00112,0.003122,0.005084,-0.51,0.00885,-0.00657,-0.00112,-0.002236,,
2005-01-07,0.028109,-0.004366,-0.00299,-0.001886,-0.001433,-0.09,0.000572,-0.002613,-0.00299,-0.00112,-0.002236,
2005-01-10,0.006242,-0.001044,0.004874,0.003402,0.004728,-0.26,-0.005813,0.00262,0.004874,-0.00299,-0.00112,-0.002236
2005-01-11,-0.007793,-0.007107,-0.002612,-0.006404,-0.006891,-0.04,-0.008627,0.0024,-0.002612,0.004874,-0.00299,-0.00112


### 2. 

In [31]:
df_train = df_data.query('trade_date < "2016-01-01"')
df_test = df_data.query('trade_date >= "2016-01-01"')

In [40]:
X_train = df_train.drop(columns=['msft'])[0:len(df_train)-1].dropna()
y_train = df_train[['msft']][1:len(df_train)][3:]

In [42]:
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)
print("LR R^2:", linear_regression.score(X_train, y_train))
np.round(linear_regression.coef_, 3)

LR R^2: 0.021352475057364262


array([[ 0.004, -0.025,  0.294, -0.434,  0.   ,  0.113, -0.001,  0.031,
        -0.027,  0.021, -0.055]])

In [43]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train, y_train)
print("KNN R^2:", knn.score(X_train, y_train))


KNN R^2: 0.11683868914980078


In [67]:
X_test = df_test.drop(columns=['msft'])[0:len(df_test)-1]
y_test = df_test[['msft']][1:len(df_test)]     

In [71]:
# Calculate predictions for Linear Regression
y_pred_lr = linear_regression.predict(X_test)
y_test_binary = np.where(y_test > 0, 1, 0)  # 1 if actual return is positive
y_pred_binary_lr = np.where(y_pred_lr > 0, 1, 0)  # 1 if predicted return is positive

# KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train > 0)  # Train KNN on binary output
y_pred_knn = knn_model.predict(X_test)

# Long-only strategy (always long, i.e., predict positive return every day)
y_pred_long_only = np.ones_like(y_test_binary)  # Always predict 1 (positive return)

# Calculate accuracy (percentage of correct predictions)
accuracy_lr = np.mean(y_test_binary == y_pred_binary_lr)  # Linear Regression accuracy
accuracy_knn = np.mean(y_test_binary == y_pred_knn)
accuracy_long_only = np.mean(y_test_binary == y_pred_long_only)  # Long-only accuracy

# Print the results
print(f"Linear Regression Accuracy: {accuracy_lr:.3f}")
print(f"KNN Accuracy: {accuracy_knn:.3f}")
print(f"Long-Only Strategy Accuracy: {accuracy_long_only:.3f}")

Linear Regression Accuracy: 0.526
KNN Accuracy: 0.499
Long-Only Strategy Accuracy: 0.553


  return self._fit(X, y)


### 3. Calculate daily returns based on the strategy predictions

In [72]:
# Linear Regression Strategy: Go long if prediction is positive, go short if negative
returns_lr = np.where(y_pred_binary_lr == 1, y_test.values.flatten(), -y_test.values.flatten())

# KNN Strategy: Go long if prediction is positive, go short if negative
returns_knn = np.where(y_pred_knn == 1, y_test.values.flatten(), -y_test.values.flatten())

# Long-only Strategy: Always long
returns_long_only = y_test.values.flatten()  # Since always long, the return is just the actual return

# Calculate the average returns and standard deviation of returns for each strategy
avg_return_lr = np.mean(returns_lr)
std_return_lr = np.std(returns_lr)

avg_return_knn = np.mean(returns_knn)
std_return_knn = np.std(returns_knn)

avg_return_long_only = np.mean(returns_long_only)
std_return_long_only = np.std(returns_long_only)

# Annualized Sharpe Ratio (assuming 252 trading days)
sharpe_ratio_lr = (avg_return_lr / std_return_lr) * np.sqrt(252)
sharpe_ratio_knn = (avg_return_knn / std_return_knn) * np.sqrt(252)
sharpe_ratio_long_only = (avg_return_long_only / std_return_long_only) * np.sqrt(252)

# Print the results
print(f"Linear Regression Sharpe Ratio: {sharpe_ratio_lr:.3f}")
print(f"KNN Sharpe Ratio: {sharpe_ratio_knn:.3f}")
print(f"Long-Only Strategy Sharpe Ratio: {sharpe_ratio_long_only:.3f}")


Linear Regression Sharpe Ratio: 0.229
KNN Sharpe Ratio: 0.569
Long-Only Strategy Sharpe Ratio: 1.308


### We should just long only 