In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV


dataset_path = 'GOOGLclass_data.csv'
df = pd.read_csv('GOOGLclass_data.csv')

# Extract dataset name from the file path
dataset_name = dataset_path.split('/')[-1].split('.')[0] 


In [6]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,MACD,RSI,MA,EMA,SMA,STD,Upper,Lower,PatternClass
0,2004-08-20,2.527778,2.729730,2.515015,2.710460,2.710460,456686856,,,2.610735,2.610735,2.610735,0.099725,2.810184,2.411286,6
1,2004-08-24,2.783784,2.792793,2.591842,2.624374,2.624374,304946748,,49.999895,2.681056,2.648051,2.681056,0.056682,2.794420,2.567692,4
2,2004-08-26,2.626376,2.701451,2.619119,2.700450,2.700450,141897960,,76.107414,2.676551,2.684006,2.676551,0.023898,2.724348,2.628754,6
3,2004-08-27,2.705205,2.718218,2.644895,2.656406,2.656406,124235640,,43.674203,2.678428,2.665606,2.678428,0.022022,2.722472,2.634384,4
4,2004-08-30,2.634635,2.639890,2.552803,2.552803,2.552803,103935960,,14.534659,2.604604,2.590404,2.604604,0.051801,2.708207,2.501002,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2647,2022-12-22,88.160004,88.540001,86.320000,87.760002,87.760002,27658300,-2.030253,16.638351,88.670002,88.313583,88.670002,0.910000,90.490002,86.850002,4
2648,2022-12-23,87.110001,89.550003,87.070000,89.230003,89.230003,23003000,-2.062129,60.285579,88.495003,88.924530,88.495003,0.735001,89.965004,87.025002,6
2649,2022-12-27,88.800003,88.940002,87.010002,87.389999,87.389999,20097300,-2.210383,26.089129,88.310001,87.901510,88.310001,0.920002,90.150005,86.469997,4
2650,2022-12-29,86.620003,88.849998,86.610001,88.449997,88.449997,23333500,-2.346211,67.284891,87.234997,87.849054,87.234997,1.215000,89.664997,84.804996,6


In [8]:


# Load your OHLC dataset with indicators
# Replace 'your_dataset.csv' with the actual file path or DataFrame variable


# Assuming 'PatternClass' column is already defined in your dataset

# Remove rows where PatternClass is 0 (No Pattern)
df = df[df['PatternClass'] != 0]

# Convert 'Date' to datetime and extract relevant features
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Select features (X) and target variable (y)
X = df.drop(['PatternClass', 'Date', 'Open', 'High', 'Low', 'Close','Volume','Adj Close'], axis=1)
selected_features = ['PatternClass', 'Open', 'High', 'Low', 'Close','Volume','Adj Close']  # Replace with your actual feature names
X = df[selected_features]

y = df['PatternClass']
print("Feature Columns:", X.columns)

# Handle missing values by filling with mean
X = X.fillna(X.mean())

# Check for and handle infinite values
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X = X.fillna(0)  # You can customize the filling strategy based on your data

# Encode any categorical variables if necessary
label_encoder = LabelEncoder()
X_encoded = X.apply(lambda col: label_encoder.fit_transform(col.astype(str)) if col.dtype == 'O' else col)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3, random_state=2)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the classifiers
rf_classifier = RandomForestClassifier(random_state=100, n_estimators=100)
svm_classifier = SVC(random_state=100)
logreg_classifier = LogisticRegression(random_state=100)
dt_classifier = DecisionTreeClassifier(random_state=100)
knn_classifier = KNeighborsClassifier()
nb_classifier = GaussianNB()
extra_trees_classifier = ExtraTreesClassifier(random_state=100, n_estimators=100)  # Added Extra Trees Classifier

# List of classifiers
classifiers = [rf_classifier, svm_classifier, logreg_classifier, dt_classifier, knn_classifier, nb_classifier, extra_trees_classifier]
classifier_names = ['Random Forest', 'SVM', 'Logistic Regression', 'Decision Tree', 'KNN', 'Naive Bayes', 'Extra Trees']

# Initialize DataFrame to store results
results_df = pd.DataFrame(columns=['Classifier', 'Accuracy', 'F1 Score'])

# Loop through each classifier
for classifier, classifier_name in zip(classifiers, classifier_names):
    # Train the classifier
    classifier.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred = classifier.predict(X_test_scaled)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Append results to the DataFrame
    results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)

# Display the results and name of dataset
print(f"Dataset Name: {dataset_name}")
print()
print(results_df)



Feature Columns: Index(['PatternClass', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'], dtype='object')


  _warn_prf(average, modifier, msg_start, len(result))
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  _warn_prf(average, modifier, msg_start, len(result))
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))
  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Rec

Dataset Name: GOOGLclass_data

            Classifier  Accuracy  F1 Score  Precision    Recall
0        Random Forest  0.998744  0.998116   0.997491  0.998744
1                  SVM  1.000000  1.000000   1.000000  1.000000
2  Logistic Regression  0.998744  0.998116   0.997491  0.998744
3        Decision Tree  1.000000  1.000000   1.000000  1.000000
4                  KNN  0.994975  0.994274   0.993738  0.994975
5          Naive Bayes  1.000000  1.000000   1.000000  1.000000
6          Extra Trees  1.000000  1.000000   1.000000  1.000000


  results_df = results_df.append({'Classifier': classifier_name, 'Accuracy': accuracy, 'F1 Score': f1,'Precision': precision, 'Recall': recall}, ignore_index=True)
