In [1]:
# Install the ISLP package
!pip install ISLP

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from ISLP import load_data
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import itertools

# Pull the Weekly data set through ISLP's loader.
Weekly = load_data('Weekly')

# (d) Hold-out split by calendar year: rows with Year <= 2008 are used for
# fitting, rows with Year > 2008 are kept aside for evaluation.
train_data = Weekly.loc[Weekly['Year'] <= 2008]
test_data = Weekly.loc[Weekly['Year'] > 2008]

# Lag2 is the only predictor here; selecting with a list keeps each design
# matrix a one-column DataFrame rather than a Series.
X_train = train_data.loc[:, ['Lag2']]
X_test = test_data.loc[:, ['Lag2']]

# Recode the response: the label 'Up' becomes 1, every other label becomes 0.
y_train = train_data['Direction'].apply(lambda label: int(label == 'Up'))
y_test = test_data['Direction'].apply(lambda label: int(label == 'Up'))

# Logistic Regression (d): model the recoded Direction from Lag2 plus the
# constant column that add_constant prepends, using the training rows only.
logit_model = sm.Logit(endog=y_train, exog=sm.add_constant(X_train)).fit()

# Score the test rows, then label a row 1 when its predicted value is
# strictly greater than 0.5 and 0 otherwise.
logit_test_probabilities = logit_model.predict(sm.add_constant(X_test))
logit_predictions = (logit_test_probabilities > 0.5).astype(int)

# Evaluate on the held-out rows: confusion matrix first, then accuracy.
cm_logit = confusion_matrix(y_true=y_test, y_pred=logit_predictions)
print("Confusion Matrix (Test Data - Logistic Regression):\n", cm_logit)
accuracy_logit = accuracy_score(y_true=y_test, y_pred=logit_predictions)
print("\nOverall Accuracy (Test Data - Logistic Regression):", accuracy_logit)

# (e) Linear discriminant analysis on the same predictor and the same split.
# fit() hands back the estimator, so construction and fitting are chained.
lda_model = LinearDiscriminantAnalysis().fit(X_train, y_train)
lda_predictions = lda_model.predict(X_test)

# Evaluate on the held-out rows: confusion matrix first, then accuracy.
cm_lda = confusion_matrix(y_true=y_test, y_pred=lda_predictions)
print("Confusion Matrix (Test Data - LDA):\n", cm_lda)
accuracy_lda = accuracy_score(y_true=y_test, y_pred=lda_predictions)
print("\nOverall Accuracy (Test Data - LDA):", accuracy_lda)

# (f) Quadratic discriminant analysis on the same predictor and split.
# fit() hands back the estimator, so construction and fitting are chained.
qda_model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
qda_predictions = qda_model.predict(X_test)

# Evaluate on the held-out rows: confusion matrix first, then accuracy.
cm_qda = confusion_matrix(y_true=y_test, y_pred=qda_predictions)
print("Confusion Matrix (Test Data - QDA):\n", cm_qda)
accuracy_qda = accuracy_score(y_true=y_test, y_pred=qda_predictions)
print("\nOverall Accuracy (Test Data - QDA):", accuracy_qda)

# (g) K-nearest neighbours with a single neighbour (K = 1), same predictor
# and split. fit() hands back the estimator, so the calls are chained.
knn_model = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

# Evaluate on the held-out rows: confusion matrix first, then accuracy.
cm_knn = confusion_matrix(y_true=y_test, y_pred=knn_predictions)
print("Confusion Matrix (Test Data - KNN):\n", cm_knn)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=knn_predictions)
print("\nOverall Accuracy (Test Data - KNN):", accuracy_knn)

# (h) Gaussian naive Bayes on the same predictor and split.
# fit() hands back the estimator, so construction and fitting are chained.
nb_model = GaussianNB().fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

# Evaluate on the held-out rows: confusion matrix first, then accuracy.
cm_nb = confusion_matrix(y_true=y_test, y_pred=nb_predictions)
print("Confusion Matrix (Test Data - Naive Bayes):\n", cm_nb)
accuracy_nb = accuracy_score(y_true=y_test, y_pred=nb_predictions)
print("\nOverall Accuracy (Test Data - Naive Bayes):", accuracy_nb)

# (i) Comparing overall accuracy of the models
# Collect the test-set accuracies under their display labels so the listing
# and the final conclusion are both driven by the same computed numbers.
model_accuracies = {
    "Logistic Regression": accuracy_logit,
    "LDA": accuracy_lda,
    "QDA": accuracy_qda,
    "KNN (K=1)": accuracy_knn,
    "Naive Bayes": accuracy_nb,
}

print("Comparing the overall accuracy of the models on the test data:")
for model_label, model_accuracy in model_accuracies.items():
    print(f"- {model_label}: {model_accuracy}")

# Derive the winner(s) from the scores instead of hard-coding the answer, so
# the printed conclusion cannot drift from the numbers listed above. Every
# accuracy is computed against the same y_test, so models with the same number
# of correct predictions produce identical floats and exact comparison is
# enough to detect ties.
top_accuracy = max(model_accuracies.values())
best_methods = " and ".join(
    model_label
    for model_label, model_accuracy in model_accuracies.items()
    if model_accuracy == top_accuracy
)

print(f"\nThe method with the highest accuracy on the test data is: {best_methods}.")

# (j) Exhaustive KNN search over predictor subsets and neighbour counts.
# Candidate predictors are the five lag columns; the test frame selects the
# same columns as the training frame.
X_train_full = train_data[['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5']]
X_test_full = test_data[list(X_train_full.columns)]

# Every non-empty subset of the candidate columns, smallest subsets first.
predictor_combinations = [
    subset
    for size in range(1, len(X_train_full.columns) + 1)
    for subset in itertools.combinations(X_train_full.columns, size)
]

# Running record of the best configuration seen so far.
best_accuracy, best_combination, best_k = 0, None, None

# product() varies K fastest, so each subset is tried with K = 1, 3, 5, 7
# before moving on to the next subset.
for combination, k in itertools.product(predictor_combinations, [1, 3, 5, 7]):
    chosen_columns = list(combination)
    X_train_subset = X_train_full[chosen_columns]
    X_test_subset = X_test_full[chosen_columns]

    knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train_subset, y_train)
    knn_predictions = knn_model.predict(X_test_subset)
    accuracy = accuracy_score(y_test, knn_predictions)

    # Only a strict improvement replaces the record, so on ties the earliest
    # configuration encountered is the one that is kept.
    if accuracy <= best_accuracy:
        continue
    best_accuracy, best_combination, best_k = accuracy, combination, k

print(f"Best Accuracy: {best_accuracy}")
print(f"Best Predictor Combination: {best_combination}")
print(f"Best K: {best_k}")


Collecting ISLP
  Downloading ISLP-0.4.0-py3-none-any.whl.metadata (7.0 kB)
Collecting lifelines (from ISLP)
  Downloading lifelines-0.30.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pygam (from ISLP)
  Downloading pygam-0.9.1-py3-none-any.whl.metadata (7.1 kB)
Collecting pytorch-lightning (from ISLP)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics (from ISLP)
  Downloading torchmetrics-1.5.1-py3-none-any.whl.metadata (20 kB)
Collecting autograd-gamma>=0.3 (from lifelines->ISLP)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... done
Collecting formulaic>=0.2.2 (from lifelines->ISLP)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting scipy>=0.9 (from ISLP)
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m1.3 MB/s[0m eta [36m