In [3]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, accuracy_score
import statsmodels.api as sm

# Load the Weekly dataset from the ISLR package
# ISLR data is available from statsmodels library, which includes it
data = sm.datasets.get_rdataset("Weekly", "ISLR").data

# Show the first few rows of the data
print(data.head())

# Assuming 'data' is a DataFrame that contains the stock market data
# Define the training and testing sets
train_data = data[(data['Year'] >= 1990) & (data['Year'] <= 2008)]
test_data = data[(data['Year'] == 2009) | (data['Year'] == 2010)]

# Extract the predictor and the target variable
X_train = train_data[['Lag2']]
y_train = train_data['Direction']  # Assuming 'Direction' is the target variable
X_test = test_data[['Lag2']]
y_test = test_data['Direction']

# Fit the LDA model
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lda_model.predict(X_test)

# Compute the confusion matrix and the accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Overall Accuracy: {accuracy:.4f}")

   Year   Lag1   Lag2   Lag3   Lag4   Lag5    Volume  Today Direction
0  1990  0.816  1.572 -3.936 -0.229 -3.484  0.154976 -0.270      Down
1  1990 -0.270  0.816  1.572 -3.936 -0.229  0.148574 -2.576      Down
2  1990 -2.576 -0.270  0.816  1.572 -3.936  0.159837  3.514        Up
3  1990  3.514 -2.576 -0.270  0.816  1.572  0.161630  0.712        Up
4  1990  0.712  3.514 -2.576 -0.270  0.816  0.153728  1.178        Up
Confusion Matrix:
[[ 9 34]
 [ 5 56]]
Overall Accuracy: 0.6250


In [5]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Fit the QDA model
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = qda_model.predict(X_test)

# Compute the confusion matrix and the accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Overall Accuracy: {accuracy:.4f}")

Confusion Matrix:
[[ 0 43]
 [ 0 61]]
Overall Accuracy: 0.5865


In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit the KNN model with K=1
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_model.predict(X_test)

# Compute the confusion matrix and the accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Overall Accuracy: {accuracy:.4f}")

Confusion Matrix:
[[22 21]
 [30 31]]
Overall Accuracy: 0.5096


In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit the Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_model.predict(X_test)

# Compute the confusion matrix and the accuracy
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Overall Accuracy: {accuracy:.4f}")

Confusion Matrix:
[[ 0 43]
 [ 0 61]]
Overall Accuracy: 0.5865


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from itertools import combinations
import numpy as np

# Define the data and the test period
train_data = data[(data['Year'] >= 1990) & (data['Year'] <= 2008)]
test_data = data[(data['Year'] == 2009) | (data['Year'] == 2010)]

# Define possible predictors and transformations
predictor_options = ['Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume']
transformations = {
    'Lag1_sq': lambda df: df['Lag1'] ** 2,
    'Lag2_sq': lambda df: df['Lag2'] ** 2,
    'Lag1_Lag2': lambda df: df['Lag1'] * df['Lag2']
}

# Generate all combinations of predictors and transformations
predictor_combinations = []
for i in range(1, len(predictor_options) + 1):
    for combo in combinations(predictor_options, i):
        predictor_combinations.append(list(combo))

# Add transformations to the training and test sets
for name, transform in transformations.items():
    train_data[name] = transform(train_data)
    test_data[name] = transform(test_data)
    predictor_combinations.append([name])

# Define methods and results storage
methods = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'Naive Bayes': GaussianNB()
}
knn_results = {}

# Iterate over each model and predictor combination
best_result = {'method': None, 'predictors': None, 'confusion_matrix': None, 'accuracy': 0}

for predictors in predictor_combinations:
    X_train = train_data[predictors]
    y_train = train_data['Direction']
    X_test = test_data[predictors]
    y_test = test_data['Direction']
    
    # Try each method except KNN
    for method_name, model in methods.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Update best result if accuracy is higher
        if accuracy > best_result['accuracy']:
            best_result.update({
                'method': method_name,
                'predictors': predictors,
                'confusion_matrix': confusion_matrix(y_test, y_pred),
                'accuracy': accuracy
            })

# Experiment with KNN for different values of K
for k in range(1, 21):  # Testing K values from 1 to 20
    knn_model = KNeighborsClassifier(n_neighbors=k)
    for predictors in predictor_combinations:
        X_train = train_data[predictors]
        y_train = train_data['Direction']
        X_test = test_data[predictors]
        y_test = test_data['Direction']
        
        knn_model.fit(X_train, y_train)
        y_pred = knn_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Update best result if accuracy is higher
        if accuracy > best_result['accuracy']:
            best_result.update({
                'method': f'KNN (K={k})',
                'predictors': predictors,
                'confusion_matrix': confusion_matrix(y_test, y_pred),
                'accuracy': accuracy
            })

# Report the best model
print("Best Model:")
print(f"Method: {best_result['method']}")
print(f"Predictors: {best_result['predictors']}")
print("Confusion Matrix:")
print(best_result['confusion_matrix'])
print(f"Overall Accuracy: {best_result['accuracy']:.4f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[name] = transform(train_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[name] = transform(test_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[name] = transform(train_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[ro

Best Model:
Method: Logistic Regression
Predictors: ['Lag2']
Confusion Matrix:
[[ 9 34]
 [ 5 56]]
Overall Accuracy: 0.6250
