In [8]:
import os
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.metrics import mean_absolute_error, root_mean_squared_error

import warnings
from sklearn.exceptions import UndefinedMetricWarning
import numpy as np
from tqdm import tqdm  # Corrected import
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix

def load_data(file_path):
    data = pd.read_csv(file_path)
    data['Date'] = pd.to_datetime(data['Date'])
    return data

def filter_predictions(data, predictions_column):
    return data[data[predictions_column] != 0]

def calculate_additional_metrics(data, predictions_column):
    def safe_mean(df):
        return df.mean() if not df.empty else float('nan')  # or use 0 as a default

    correct_up = data[(data[predictions_column] == 1) & (data['percent_change_Close'] > 0)]['percent_change_Close']
    correct_down = data[(data[predictions_column] == -1) & (data['percent_change_Close'] < 0)]['percent_change_Close']
    wrong_up = data[(data[predictions_column] == 1) & (data['percent_change_Close'] < 0)]['percent_change_Close']
    wrong_down = data[(data[predictions_column] == -1) & (data['percent_change_Close'] > 0)]['percent_change_Close']
    up_diff = safe_mean(correct_up) - safe_mean(wrong_up)
    down_diff = safe_mean(correct_down) - safe_mean(wrong_down)
    positive_preds = sum(data[predictions_column] == 1)
    negative_preds = sum(data[predictions_column] == -1)
    non_predictions = sum(data[predictions_column] == 0)
    
    return {
        "correct_up_avg": safe_mean(correct_up) * 100,  # Multiplied by 100
        "wrong_up_avg": safe_mean(wrong_up) * 100,  # Multiplied by 100
        "up_diff": up_diff * 100,  # Multiplied by 100
        "correct_down_avg": safe_mean(correct_down) * 100,  # Multiplied by 100
        "wrong_down_avg": safe_mean(wrong_down) * 100,  # Multiplied by 100
        "down_diff": down_diff * 100,  # Multiplied by 100
        "positive_preds": positive_preds,
        "negative_preds": negative_preds,
        "non_predictions": non_predictions
    }


def plot_distributions(extended_metrics_list):
    correct_up_values = [item["correct_up_stats"]["mean"] for item in extended_metrics_list if item["correct_up_stats"]["mean"] is not float('nan')]
    wrong_up_values = [item["wrong_up_stats"]["mean"] for item in extended_metrics_list if item["wrong_up_stats"]["mean"] is not float('nan')]
    correct_down_values = [item["correct_down_stats"]["mean"] for item in extended_metrics_list if item["correct_down_stats"]["mean"] is not float('nan')]
    wrong_down_values = [item["wrong_down_stats"]["mean"] for item in extended_metrics_list if item["wrong_down_stats"]["mean"] is not float('nan')]
    fig = make_subplots(rows=2, cols=2, subplot_titles=("Correct Up", "Wrong Up", "Correct Down", "Wrong Down"))
    bin_size = 0.1
    fig.add_trace(go.Histogram(x=correct_up_values, xbins=dict(start=np.floor(min(correct_up_values)), end=np.ceil(max(correct_up_values)), size=bin_size), name='Correct Up'), row=1, col=1)
    fig.add_trace(go.Histogram(x=wrong_up_values, xbins=dict(start=np.floor(min(wrong_up_values)), end=np.ceil(max(wrong_up_values)), size=bin_size), name='Wrong Up'), row=1, col=2)
    fig.add_trace(go.Histogram(x=correct_down_values, xbins=dict(start=np.floor(min(correct_down_values)), end=np.ceil(max(correct_down_values)), size=bin_size), name='Correct Down'), row=2, col=1)
    fig.add_trace(go.Histogram(x=wrong_down_values, xbins=dict(start=np.floor(min(wrong_down_values)), end=np.ceil(max(wrong_down_values)), size=bin_size), name='Wrong Down'), row=2, col=2)
    fig.update_layout(template="plotly_dark")
    fig.update_layout(title_text="Distribution of Prediction Metrics", height=900, width=1200)
    fig.show()


def evaluate_magnitude_prediction(data):
    data['Absolute_Percent_Change'] = data['percent_change_Close'].abs()
    mae = mean_absolute_error(data['Absolute_Percent_Change'], data['MagnitudePrediction'])
    rmse = root_mean_squared_error(data['Absolute_Percent_Change'], data['MagnitudePrediction'])
    correlation = data[['Absolute_Percent_Change', 'MagnitudePrediction']].corr().iloc[0, 1]
    return mae, rmse, correlation


def calculate_extended_metrics(data, predictions_column):
    def safe_stats(series):
        capped_series = series.clip(lower=-2.5, upper=2.5) * 100  # Multiplied by 100
        return {
            "mean": round(capped_series.mean(), 2) if not capped_series.empty else float('nan'),
            "max": round(capped_series.max(), 2) if not capped_series.empty else float('nan'),
            "min": round(capped_series.min(), 2) if not capped_series.empty else float('nan'),
            "std": round(capped_series.std(), 2) if not capped_series.empty else float('nan')
        }

    correct_up = data[(data[predictions_column] == 1) & (data['percent_change_Close'] > 0)]['percent_change_Close']
    correct_down = data[(data[predictions_column] == -1) & (data['percent_change_Close'] < 0)]['percent_change_Close']
    wrong_up = data[(data[predictions_column] == 1) & (data['percent_change_Close'] < 0)]['percent_change_Close']
    wrong_down = data[(data[predictions_column] == -1) & (data['percent_change_Close'] > 0)]['percent_change_Close']

    return {
        "correct_up_stats": safe_stats(correct_up),
        "correct_down_stats": safe_stats(correct_down),
        "wrong_up_stats": safe_stats(wrong_up),
        "wrong_down_stats": safe_stats(wrong_down),
        "positive_preds": sum(data[predictions_column] == 1),
        "negative_preds": sum(data[predictions_column] == -1),
        "non_predictions": sum(data[predictions_column] == 0)
    }


def evaluate_model(data, predictions_column):
    filtered_data = data[data['percent_change_Close'] != 0]
    actual = filtered_data['percent_change_Close'].apply(lambda x: 1 if x > 0 else -1)
    predicted = filtered_data[predictions_column]
    accuracy = accuracy_score(actual, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(actual, predicted, average='weighted', zero_division=1)
    return accuracy, (precision, recall, f1)


def print_average_extended_metrics(extended_metrics_list):
    print("\n")
    header = "{:<20} {:>10} {:>10} {:>10} {:>10}".format("Metric", "Mean", "Max", "Min", "Std Dev")
    print(header)
    print("-" * 65)  # Separator

    for metric in ["correct_up_stats", "wrong_up_stats", "correct_down_stats", "wrong_down_stats"]:
        mean_metric = {
            k: round(np.nanmean([d[metric][k] for d in extended_metrics_list]), 2)
            for k in ["mean", "max", "min", "std"]
        }
        row = "{:<20} {:>10} {:>10} {:>10} {:>10}".format(
            metric,
            mean_metric["mean"],
            mean_metric["max"],
            mean_metric["min"],
            mean_metric["std"]
        )
        print(row)

def evaluate_model(data, predictions_column):
    label_map = {1: 'Up', 0: 'Unsure', -1: 'Down'}
    actual_labels = data['percent_change_Close'].apply(lambda x: 'Up' if x > 0 else ('Down' if x < 0 else 'Unsure'))
    predicted_labels = data[predictions_column].map(label_map)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(actual_labels, predicted_labels, average='macro', zero_division=0)
    cm = confusion_matrix(actual_labels, predicted_labels, labels=["Up", "Down", "Unsure"])
    print("Confusion Matrix:\n", cm)
    return accuracy, (precision, recall, f1)

def process_directory(directory, predictions_column='Prediction', min_distribution_threshold=0.0):
    accuracies = []
    up_accuracies = []
    down_accuracies = []
    extended_metrics_list = []
    metrics_list = []
    detailed_metrics = []
    skipped_fileCounter = 0
    counted_files = 0
    totalfiles = 0
    maes, rmses, correlations = [], [], []
    unsure_accuracy = []

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for filename in tqdm(os.listdir(directory), desc="Processing Files"):
            if filename.endswith(".csv") and filename != "log_file_name.csv":
                totalfiles += 1
                file_path = os.path.join(directory, filename)
                data = load_data(file_path)
    
                # Skip the first 400 rows
                if len(data) > 400:
                    data = data.iloc[400:]
                else:
                    skipped_fileCounter += 1
                    continue
                unsure_correct = filtered_data[(filtered_data[predictions_column] == 0) & (filtered_data['percent_change_Close'] == 0)]
                total_unsure = sum(filtered_data['percent_change_Close'] == 0)
                unsure_acc = len(unsure_correct) / total_unsure if total_unsure != 0 else 0
                unsure_accuracy.append(unsure_acc)

                mae, rmse, corr = evaluate_magnitude_prediction(data)
                maes.append(mae)
                rmses.append(rmse)
                correlations.append(corr)

                category_counts = data[predictions_column].value_counts(normalize=True)
                if all(count >= min_distribution_threshold for count in category_counts):
                    filtered_data = filter_predictions(data, predictions_column)
                    accuracy, metrics = evaluate_model(filtered_data, predictions_column)
                    accuracies.append(accuracy)
                    metrics_list.append(metrics)
                    
                    # Calculate category-specific accuracies
                    up_correct = len(filtered_data[(filtered_data[predictions_column] == 1) & (filtered_data['percent_change_Close'] > 0)])
                    down_correct = len(filtered_data[(filtered_data[predictions_column] == -1) & (filtered_data['percent_change_Close'] < 0)])
                    unsure_correct = len(filtered_data[(filtered_data[predictions_column] == 0) & (filtered_data['percent_change_Close'] == 0)])
                    total_up = sum(filtered_data['percent_change_Close'] > 0)
                    total_down = sum(filtered_data['percent_change_Close'] < 0)
                    total_unsure = sum(filtered_data['percent_change_Close'] == 0)
                    up_accuracies.append(up_correct / total_up if total_up != 0 else 0)
                    down_accuracies.append(down_correct / total_down if total_down != 0 else 0)

                    extended_metrics = calculate_extended_metrics(filtered_data, predictions_column)
                    extended_metrics_list.append(extended_metrics)
                    additional_metrics = calculate_additional_metrics(filtered_data, predictions_column)
                    detailed_metrics.append(additional_metrics)
                    counted_files += 1
                else:
                    skipped_fileCounter += 1



    avg_up_accuracy = np.mean(up_accuracies)
    avg_down_accuracy = np.mean(down_accuracies)
    avg_mae = np.mean(maes)
    avg_rmse = np.mean(rmses)
    avg_corr = np.mean(correlations)
    print("Regressor Model Performance:")
    print(f"Average MAE across all files: {avg_mae:.3f}")
    print(f"Average RMSE across all files: {avg_rmse:.3f}")
    print(f"Average Correlation across all files: {avg_corr:.3f}")
    print("Average Classifier Model Performance with Distribution Filtering:")
    print(f"Average Up Accuracy: {avg_up_accuracy*100:.3f}")
    print(f"Average Down Accuracy: {avg_down_accuracy*100:.3f}")
    MarketUp = 0.499569
    MarketDown = 0.500430
    Better_than_Random_Up = avg_up_accuracy - MarketUp
    Better_than_Random_Down = avg_down_accuracy - MarketDown
    print(f"Better than Random Down: {Better_than_Random_Down*100:.3f}")
    print(f"Better than Random Up: {Better_than_Random_Up*100:.3f}")
    print(f"Total files: {totalfiles}")
    print(f"Skipped {skipped_fileCounter} files due to inadequate distribution across categories, Processed {counted_files} files.")
    print(f"Skipped percentage: {skipped_fileCounter/totalfiles*100:.2f}%")

    for metric in ["correct_up_avg", "correct_down_avg", "wrong_up_avg", "wrong_down_avg", "positive_preds", "negative_preds", "non_predictions"]:
        avg_metric = np.nanmean([d[metric] for d in detailed_metrics])
        print(f"Average {metric}: {avg_metric:.3f}")

    print_average_extended_metrics(extended_metrics_list)
    plot_distributions(extended_metrics_list)

if __name__ == "__main__":
    directory = "Data/RFpredictions"
    process_directory(directory)


Processing Files:   0%|          | 0/5751 [00:00<?, ?it/s]

Processing Files:   0%|          | 0/5751 [00:00<?, ?it/s]


UnboundLocalError: local variable 'filtered_data' referenced before assignment