This question should be answered using the Weekly data set, which
is part of the ISLP package. This data is similar in nature to the
Smarket data from this chapter’s lab, except that it contains 1, 089
weekly returns for 21 years, from the beginning of 1990 to the end of
2010.

In [2]:
!pip install ISLP

Collecting ISLP
  Downloading ISLP-0.4.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lifelines (from ISLP)
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pygam (from ISLP)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pytorch-lightning (from ISLP)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics (from ISLP)
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting autograd-gamma>=0.3 (from lifelines->ISLP)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting formulaic>=0.2.2 (from lifelines->ISLP)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting scipy>=0.9 (from ISLP)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m660.2 kB/s[0m eta [3

(e) Repeat (d) using LDA.

In [4]:

from ISLP import load_data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score
import statsmodels.api as sm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Load the Weekly dataset
weekly_data = load_data('Weekly')

# Define the predictors and the response variable
X = weekly_data[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
weekly_data['Direction_Numeric'] = weekly_data['Direction'].map({'Up': 1, 'Down': 0})

# Check for NaN values and replace with a suitable value (e.g., 0)
weekly_data['Direction_Numeric'].fillna(0, inplace=True)

# Create a boolean mask for the training data (1990-2008)
train_mask = weekly_data.Year <= 2008

# Create a boolean mask for the test data (2009-2010)
test_mask = weekly_data.Year >= 2009

# Split the data into training and testing sets (same as in part (d))
X_train = weekly_data.loc[train_mask, ['Lag2']]
y_train = weekly_data.loc[train_mask, 'Direction_Numeric']
X_test = weekly_data.loc[test_mask, ['Lag2']]
y_test = weekly_data.loc[test_mask, 'Direction_Numeric']

# Fit the LDA model using the training data
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_lda = lda_model.predict(X_test)

# Calculate the confusion matrix for the test data
cm_lda = confusion_matrix(y_test, y_pred_lda)
print("Confusion Matrix (Held-out data, LDA):")
print(cm_lda)

# Calculate the accuracy for the test data
accuracy_lda = accuracy_score(y_test, y_pred_lda)
print("Overall Accuracy (Held-out data, LDA):", accuracy_lda)

Confusion Matrix (Held-out data, LDA):
[[ 9 34]
 [ 5 56]]
Overall Accuracy (Held-out data, LDA): 0.625


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  weekly_data['Direction_Numeric'].fillna(0, inplace=True)


(f) Repeat (d) using QDA.

In [5]:
# Split the data into training and testing sets (same as in part (d))
X_train = weekly_data.loc[train_mask, ['Lag2']]
y_train = weekly_data.loc[train_mask, 'Direction_Numeric']
X_test = weekly_data.loc[test_mask, ['Lag2']]
y_test = weekly_data.loc[test_mask, 'Direction_Numeric']

# Fit the QDA model using the training data
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_qda = qda_model.predict(X_test)

# Calculate the confusion matrix for the test data
cm_qda = confusion_matrix(y_test, y_pred_qda)
print("Confusion Matrix (Held-out data, QDA):")
print(cm_qda)

# Calculate the accuracy for the test data
accuracy_qda = accuracy_score(y_test, y_pred_qda)
print("Overall Accuracy (Held-out data, QDA):", accuracy_qda)

Confusion Matrix (Held-out data, QDA):
[[ 0 43]
 [ 0 61]]
Overall Accuracy (Held-out data, QDA): 0.5865384615384616


(g) Repeat (d) using KNN with K = 1.

In [6]:
# Split the data into training and testing sets (same as in part (d))
X_train = weekly_data.loc[train_mask, ['Lag2']]
y_train = weekly_data.loc[train_mask, 'Direction_Numeric']
X_test = weekly_data.loc[test_mask, ['Lag2']]
y_test = weekly_data.loc[test_mask, 'Direction_Numeric']

# Fit the KNN model with K=1 using the training data
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_knn = knn_model.predict(X_test)

# Calculate the confusion matrix for the test data
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix (Held-out data, KNN with K=1):")
print(cm_knn)

# Calculate the accuracy for the test data
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print("Overall Accuracy (Held-out data, KNN with K=1):", accuracy_knn)

Confusion Matrix (Held-out data, KNN with K=1):
[[22 21]
 [32 29]]
Overall Accuracy (Held-out data, KNN with K=1): 0.49038461538461536


(h) Repeat (d) using naive Bayes.

In [7]:
# Split the data into training and testing sets (same as in part (d))
X_train = weekly_data.loc[train_mask, ['Lag2']]
y_train = weekly_data.loc[train_mask, 'Direction_Numeric']
X_test = weekly_data.loc[test_mask, ['Lag2']]
y_test = weekly_data.loc[test_mask, 'Direction_Numeric']

# Fit the Naive Bayes model using the training data
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_nb = nb_model.predict(X_test)

# Calculate the confusion matrix for the test data
cm_nb = confusion_matrix(y_test, y_pred_nb)
print("Confusion Matrix (Held-out data, Naive Bayes):")
print(cm_nb)

# Calculate the accuracy for the test data
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Overall Accuracy (Held-out data, Naive Bayes):", accuracy_nb)

Confusion Matrix (Held-out data, Naive Bayes):
[[ 0 43]
 [ 0 61]]
Overall Accuracy (Held-out data, Naive Bayes): 0.5865384615384616


(i) Which of these methods appears to provide the best results on
this data?

LDA and Logistic Regression

(j) Experiment with different combinations of predictors, including
possible transformations and interactions, for each of the
methods. Report the variables, method, and associated confusion
matrix that appears to provide the best results on the held
out data. Note that you should also experiment with values for
K in the KNN classifier.

In [8]:
# Load the Weekly dataset
weekly_data = load_data('Weekly')

# Create a numeric 'Direction' column
weekly_data['Direction_Numeric'] = weekly_data['Direction'].map({'Up': 1, 'Down': 0})

# Split the data into training and testing sets (using 2009-2010 as the test set)
train_mask = weekly_data.Year <= 2008
test_mask = weekly_data.Year >= 2009
X_train = weekly_data.loc[train_mask, ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
y_train = weekly_data.loc[train_mask, 'Direction_Numeric']
X_test = weekly_data.loc[test_mask, ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']]
y_test = weekly_data.loc[test_mask, 'Direction_Numeric']


def evaluate_model(model, X_train, y_train, X_test, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  cm = confusion_matrix(y_test, y_pred)
  accuracy = accuracy_score(y_test, y_pred)
  return cm, accuracy


# Experiment with different methods and predictors
results = {}

# Logistic Regression
model_log = sm.Logit(y_train, sm.add_constant(X_train)).fit()
y_pred_prob = model_log.predict(sm.add_constant(X_test))
y_pred = (y_pred_prob > 0.5).astype(int)
cm_log = confusion_matrix(y_test, y_pred)
accuracy_log = accuracy_score(y_test, y_pred)
results['Logistic Regression (Lag1-Lag5, Volume)'] = (cm_log, accuracy_log)


# LDA
lda_model = LinearDiscriminantAnalysis()
cm_lda, accuracy_lda = evaluate_model(lda_model, X_train, y_train, X_test, y_test)
results['LDA (Lag1-Lag5, Volume)'] = (cm_lda, accuracy_lda)

# QDA
qda_model = QuadraticDiscriminantAnalysis()
cm_qda, accuracy_qda = evaluate_model(qda_model, X_train, y_train, X_test, y_test)
results['QDA (Lag1-Lag5, Volume)'] = (cm_qda, accuracy_qda)

# KNN
for k in [1, 3, 5, 7]:
  knn_model = KNeighborsClassifier(n_neighbors=k)
  cm_knn, accuracy_knn = evaluate_model(knn_model, X_train, y_train, X_test, y_test)
  results[f'KNN (k={k}, Lag1-Lag5, Volume)'] = (cm_knn, accuracy_knn)

# Naive Bayes
nb_model = GaussianNB()
cm_nb, accuracy_nb = evaluate_model(nb_model, X_train, y_train, X_test, y_test)
results['Naive Bayes (Lag1-Lag5, Volume)'] = (cm_nb, accuracy_nb)


# Find the best performing model
best_model = max(results, key=lambda k: results[k][1])
print(f"Best Model: {best_model}")
print(f"Confusion Matrix: \n{results[best_model][0]}")
print(f"Accuracy: {results[best_model][1]}")

Optimization terminated successfully.
         Current function value: 0.681388
         Iterations 4
Best Model: KNN (k=7, Lag1-Lag5, Volume)
Confusion Matrix: 
[[19 24]
 [26 35]]
Accuracy: 0.5192307692307693
