In [8]:
# Data Manipulation and Analysis
import pandas as pd
import numpy as np

# Data Visualization
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Statistical Analysis
from scipy import stats
import statsmodels.api as sm

# Machine Learning and Data Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Parallel and Asynchronous Programming
import multiprocessing
import asyncio
import joblib

# Miscellaneous
from scipy import signal
import os
import datetime
import logging
import pandas as pd
import os
import datetime
import logging
from sklearn.preprocessing import RobustScaler



class Config:
    def __init__(self):
        self.input_dir = "Data/IndicatorData"
        self.output_dir = "Data/ScaledData"
        self.log_file = "Data/ScaledData/_ScalingErrors.log"
        self.scaling_methods_file = "__ScalingMethods.csv"  # Add this line

    def setup_logging(self):
        logging.basicConfig(filename=self.log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        logging.info(f"Logging started at {datetime.datetime.now()}")

###===================================( Data Preprocessing )===================================###
###===================================( Data Preprocessing )===================================###
###===================================( Data Preprocessing )===================================###
        





def load_scaling_methods(config):
    scaling_methods_file = config.scaling_methods_file  # File in the workspace folder
    try:
        scaling_methods_df = pd.read_csv(scaling_methods_file)
        scaling_methods = {}
        for _, row in scaling_methods_df.iterrows():
            scaling_methods[row['Column']] = {
                'ScalingMethod': row['ScalingMethod'],
                'LogTransform': row['LogTransform'] == 'True',
                'Detrend': row['Detrend']
            }
        logging.info("Successfully loaded scaling methods.")
        return scaling_methods
    except Exception as e:
        logging.error(f"Error loading scaling methods: {e}")
        return {}



def outlyer_squasher(df, percentile1=0.999, percentile2=0.001):
    try:
        for col in df.select_dtypes(include=['float64', 'int64']).columns:
            lower_quantile = df[col].quantile(percentile2)  # Get the lower quantile
            upper_quantile = df[col].quantile(percentile1)  # Get the upper quantile
            df[col] = df[col].clip(lower=lower_quantile, upper=upper_quantile)
    except Exception as e:
        logging.error(f"Error squashing outliers: {e}")
    return df




def handle_inf_and_scale_dynamic(df, scaling_methods, window_ratio=0.1, min_window_size=100):
    try:
        # Replace inf/-inf with NaN
        df.replace([np.inf, -np.inf], np.nan, inplace=True)

        # Forward fill to handle NaNs
        df.fillna(method='ffill', inplace=True)

        window_size = max(int(len(df) * window_ratio), min_window_size)
        numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

        for start in range(0, len(df), window_size):
            end = min(start + window_size, len(df))
            df_subset = df.iloc[start:end]
            numeric_cols = df_subset.select_dtypes(include=[np.number]).columns

            for col in numeric_cols:
                # Check if column has a specified scaling method
                if col in scaling_methods:
                    # Log transform if specified
                    if scaling_methods[col]['LogTransform']:
                        df = apply_log_transform(df, col)

                    # Detrend if specified
                    if scaling_methods[col]['Detrend']:
                        df = detrend_data(df, col, scaling_methods[col]['Detrend'])

                scaler = get_scaler(scaling_methods[col]['ScalingMethod'])

        logging.info("Dataframe dynamic scaling with inf handling complete.")
    except Exception as e:
        logging.error(f"Error in dynamic scaling of dataframe: {e}")

    return df



def get_scaler(scaling_method):
    if scaling_method == 'StandardScaler':
        return StandardScaler()
    elif scaling_method == 'MinMaxScaler':
        return MinMaxScaler()
    elif scaling_method == 'RobustScaler':
        return RobustScaler()
    else:
        logging.error(f"No scaling method needed")
        return None


def apply_log_transform(df, column):
    try:
        df[column] = np.log1p(df[column])
    except Exception as e:
        logging.error(f"Error applying log transform to column {column}: {e}")

    return df

def detrend_data(df, column, method):


    try:
        if method == 'Differencing':
            df[column] = df[column].diff().fillna(df[column])
        elif method == 'ScipyDetrend':
            df[column] = signal.detrend(df[column])
    except Exception as e:
        logging.error(f"Error detrending column {column}: {e}")
    return df

def interpolated(df):
    numeric_df = df.select_dtypes(include=[np.number])
    numeric_df = numeric_df.interpolate(method='linear', limit_direction='both')
    df[numeric_df.columns] = numeric_df
    return df


def save_dataframe(df, file_path):
    config = Config()  # Instantiate a Config object
    file_name = file_path.split("/")[-1].split("_")[0] + "_scaled.csv"
    file_path = os.path.join(config.output_dir, file_name)  # Use the instance's output_dir
    df.to_csv(file_path, index=False)
    logging.info(f"File {file_name} has been saved to {config.output_dir}")




def process_dataframe(df, file_path, scaling_methods):
    df = outlyer_squasher(df)
    df = interpolated(df)
    df = handle_inf_and_scale_dynamic(df, scaling_methods)
    df = interpolated(df)
    save_dataframe(df, file_path)


def main():
    config = Config()
    config.setup_logging()

    # Load the scaling methods
    scaling_methods = load_scaling_methods(config)

    # Clear all files ending in .csv in the output directory
    for file in os.listdir(config.output_dir):
        if file.endswith(".csv"):
            file_path = os.path.join(config.output_dir, file)
            os.remove(file_path)
    logging.info(f"Files in {config.output_dir} have been cleared.")

    for file in os.listdir(config.input_dir):
        if file.endswith(".csv"):
            file_path = os.path.join(config.input_dir, file)
            try:
                df = pd.read_csv(file_path)
                process_dataframe(df, file_path, scaling_methods)
                # Optionally, save processed data to the output directory
                ##df.to_csv(os.path.join(config.output_dir, file), index=False)
            except Exception as e:
                logging.error(f"Error processing file {file}: {e}")

if __name__ == "__main__":
    main()

  df.fillna(method='ffill', inplace=True)
  diff_b_a = subtract(b, a)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='ffill', inplace=True)
