In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV


dataset_path = 'jpm_class_data.csv'
df = pd.read_csv('jpm_class_data.csv')

# Extract dataset name from the file path
dataset_name = dataset_path.split('/')[-1].split('.')[0] 


In [2]:
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,MACD,RSI,MA,EMA,SMA,STD,Upper,Lower,PatternClass
0,2000-01-03,49.833332,50.250000,48.083332,48.583332,12019200,,,,,,,,,4
1,2000-01-04,47.083332,47.458332,46.125000,47.250000,11723400,,,47.916666,47.916666,47.916666,0.666666,49.249998,46.583334,6
2,2000-01-05,46.833332,48.375000,46.000000,46.958332,8714550,,0.000000,47.104166,47.277777,47.104166,0.145834,47.395834,46.812498,0
3,2000-01-06,46.750000,48.625000,46.500000,47.625000,8369250,,45.070470,47.291666,47.509259,47.291666,0.333334,47.958334,46.624998,0
4,2000-01-07,48.416668,49.000000,47.333332,48.500000,6571950,,74.838720,48.062500,48.169753,48.062500,0.437500,48.937500,47.187500,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5782,2022-12-23,130.580002,131.440002,129.639999,131.279999,5091900,0.145534,57.304556,130.970001,131.177310,130.970001,0.309998,131.589996,130.350006,6
5783,2022-12-27,131.570007,132.220001,130.550003,131.740005,5411000,0.142811,70.977241,131.510002,131.552440,131.510002,0.230003,131.970009,131.049995,0
5784,2022-12-28,131.850006,133.410004,131.009995,132.460007,8827700,0.196487,85.506519,132.100006,132.157485,132.100006,0.360001,132.820007,131.380005,0
5785,2022-12-29,132.929993,133.270004,132.309998,133.220001,6585200,0.296928,92.953548,132.840004,132.865829,132.840004,0.379997,133.599998,132.080009,6


In [3]:


# Load your OHLC dataset with indicators
# Replace 'your_dataset.csv' with the actual file path or DataFrame variable


# Assuming 'PatternClass' column is already defined in your dataset

# Remove rows where PatternClass is 0 (No Pattern)
df = df[df['PatternClass'] != 0]

# Convert 'Date' to datetime and extract relevant features
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Select features (X) and target variable (y)
X = df.drop(['PatternClass', 'Date', 'Open', 'High', 'Low', 'Close','Volume'], axis=1)
selected_features = ['PatternClass', 'Open', 'High', 'Low', 'Close','Volume']  # Replace with your actual feature names
X = df[selected_features]
y = df['PatternClass']
print("Feature Columns:", X.columns)

# Handle missing values by filling with mean
X = X.fillna(X.mean())

# Check for and handle infinite values
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X = X.fillna(0)  # You can customize the filling strategy based on your data

# Encode any categorical variables if necessary
label_encoder = LabelEncoder()
X_encoded = X.apply(lambda col: label_encoder.fit_transform(col.astype(str)) if col.dtype == 'O' else col)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=2)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the classifiers
rf_classifier = RandomForestClassifier(random_state=100, n_estimators=100)
svm_classifier = SVC(random_state=100)
logreg_classifier = LogisticRegression(random_state=100)
dt_classifier = DecisionTreeClassifier(random_state=100)
knn_classifier = KNeighborsClassifier()
nb_classifier = GaussianNB()
extra_trees_classifier = ExtraTreesClassifier(random_state=100, n_estimators=100)  # Added Extra Trees Classifier

# List of classifiers
classifiers = [rf_classifier, svm_classifier, logreg_classifier, dt_classifier, knn_classifier, nb_classifier, extra_trees_classifier]
classifier_names = ['Random Forest', 'SVM', 'Logistic Regression', 'Decision Tree', 'KNN', 'Naive Bayes', 'Extra Trees']

# Initialize DataFrame to store results
results_df = pd.DataFrame(columns=['Classifier', 'Accuracy', 'F1 Score'])

# Loop through each classifier
for classifier, classifier_name in zip(classifiers, classifier_names):
    # Train the classifier
    classifier.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test_scaled)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Append results to the DataFrame
    results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)

# Display the results and name of dataset
print(f"Dataset Name: {dataset_name}")
print()
print(results_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Month'] = df['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_i

Feature Columns: Index(['PatternClass', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')


  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Reca

Dataset Name: jpm_class_data

            Classifier  Accuracy  F1 Score  Precision    Recall
0        Random Forest  0.997888  0.997758   0.997898  0.997888
1                  SVM  0.996832  0.996727   0.996852  0.996832
2  Logistic Regression  0.995776  0.995584   0.995814  0.995776
3        Decision Tree  1.000000  1.000000   1.000000  1.000000
4                  KNN  0.993664  0.993341   0.993706  0.993664
5          Naive Bayes  1.000000  1.000000   1.000000  1.000000
6          Extra Trees  1.000000  1.000000   1.000000  1.000000


  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
