# Import Required Libraries
Import necessary libraries such as os, logging, and traceback.

In [35]:
# Import necessary libraries
import os
import logging
import traceback

# Configure logging to display information and error messages
logging.basicConfig(level=logging.INFO)

# Config
Define the Config class that holds configuration paths and settings.

In [36]:
# Define the Config class that holds configuration paths and settings
class Config:
    """Hold the filesystem locations used by the preprocessing run.

    Both locations can be overridden per instance; calling ``Config()`` with
    no arguments keeps the original workstation paths.

    Args:
        data_bd_path (str): Location of the database / source data.
        preprocessed_path (str): Location where preprocessed data is saved.
    """

    def __init__(
        self,
        *,
        data_bd_path=r"C:\Users\yo\Desktop\jarvis_data\databd",
        preprocessed_path=r"C:\Users\yo\Desktop\jarvis_data\preprocessed",
    ):
        # Path to the database
        self.DATA_BD_PATH = data_bd_path
        # Path to save the preprocessed data
        self.CURRENT_PREPROCESSED_PATH = preprocessed_path

# Define AdvancedDataPreprocessor Class
Define the AdvancedDataPreprocessor class, which processes the configured data paths (currently a sequential placeholder for the planned parallel logic) and saves the results.

In [37]:
# Define the AdvancedDataPreprocessor class
class AdvancedDataPreprocessor:
    """Produce one result entry per configured data path and write them out.

    The processing step is a sequential placeholder: ``batch_size`` and
    ``max_workers`` are stored (and ``max_workers`` is logged), but neither
    drives any batched or parallel work yet.
    """

    def __init__(self, config):
        """
        Initialize the preprocessor with the given configuration.

        Args:
        config (dict): Configuration dictionary with the keys 'data_paths'
            (iterable of paths to process), 'batch_size' and 'max_workers'.

        Raises:
        KeyError: If any of the three keys is missing from ``config``.
        """
        self.data_paths = config['data_paths']
        self.batch_size = config['batch_size']
        self.max_workers = config['max_workers']
        self.results = []

    def process_files_parallel(self):
        """
        Record a "Processed <path>" entry for every configured data path.

        Placeholder for the real parallel logic: paths are visited one by one
        and each entry is appended to ``self.results``, so calling this method
        again adds further entries to the same list.
        """
        # Lazy %-style arguments keep the logging calls consistent with each other
        logging.info("Processing files in parallel with %s workers...", self.max_workers)
        # Simulate processing
        for path in self.data_paths:
            logging.info("Processing file: %s", path)
            self.results.append(f"Processed {path}")

    def save_results(self, save_path):
        """
        Write the collected results to ``save_path``, one per line, as UTF-8.

        Args:
        save_path (str): File to write. An existing file is overwritten and
            missing parent directories are created.

        Raises:
        OSError: If the parent directory cannot be created or the file
            cannot be opened for writing.
        """
        logging.info("Saving results to %s", save_path)

        # Create the destination directory so a fresh machine does not fail
        # with FileNotFoundError; a bare filename has no parent to create.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Explicit encoding: the platform default may be unable to encode
        # non-ASCII characters that appear in processed paths.
        with open(save_path, 'w', encoding='utf-8') as f:
            f.writelines(result + "\n" for result in self.results)

# Initialize Configuration and Preprocessor
Initialize the Config instance, set up the preprocessor configuration dictionary, then run the preprocessor and save its results.

In [38]:
# Build the configuration, run the preprocessor once, and persist its output

# Config supplies the input and output locations used below
config_instance = Config()

# Settings handed to AdvancedDataPreprocessor
preprocessor_config = {
    'data_paths': [config_instance.DATA_BD_PATH],  # Single input location taken from Config
    'batch_size': 32,  # Fixed batch size
    'max_workers': os.cpu_count() or 4,  # Uses 4 when os.cpu_count() gives a falsy value
}

# Construct the preprocessor from the settings above
preprocessor = AdvancedDataPreprocessor(config=preprocessor_config)

# Run the processing step over every configured path
preprocessor.process_files_parallel()

# Write the collected results to the output location from Config
preprocessor.save_results(save_path=config_instance.CURRENT_PREPROCESSED_PATH)

# Report that the run finished
logging.info("Preprocesamiento completado exitosamente")

INFO:root:Processing files in parallel with 12 workers...
INFO:root:Processing file: C:\Users\yo\Desktop\jarvis_data\databd
INFO:root:Saving results to C:\Users\yo\Desktop\jarvis_data\preprocessed
INFO:root:Preprocesamiento completado exitosamente


# Execute Preprocessing
Initialize the preprocessor, execute it using the process_files_parallel method, and save the results.

In [39]:
# Full preprocessing run: configure, process, save, report

# Fresh Config instance providing the data and output paths
config_instance = Config()

# Dictionary of settings expected by AdvancedDataPreprocessor
preprocessor_config = {
    'data_paths': [config_instance.DATA_BD_PATH],  # Paths to process (one entry)
    'batch_size': 32,  # Batch size setting
    'max_workers': os.cpu_count() or 4,  # Worker count; 4 if os.cpu_count() is falsy
}

# New preprocessor bound to these settings
preprocessor = AdvancedDataPreprocessor(config=preprocessor_config)

# Processing step
preprocessor.process_files_parallel()

# Persist the results at the configured output path
preprocessor.save_results(save_path=config_instance.CURRENT_PREPROCESSED_PATH)

# Completion message
logging.info("Preprocesamiento completado exitosamente")

INFO:root:Processing files in parallel with 12 workers...
INFO:root:Processing file: C:\Users\yo\Desktop\jarvis_data\databd
INFO:root:Saving results to C:\Users\yo\Desktop\jarvis_data\preprocessed
INFO:root:Preprocesamiento completado exitosamente


# Save Results
Save the preprocessing results to the specified path using the save_results method.

In [40]:
# Persist the results held by the existing preprocessor

# Reuses the `preprocessor` and `config_instance` objects created in an earlier cell
preprocessor.save_results(save_path=config_instance.CURRENT_PREPROCESSED_PATH)

# Completion message
logging.info("Preprocesamiento completado exitosamente")

INFO:root:Saving results to C:\Users\yo\Desktop\jarvis_data\preprocessed
INFO:root:Preprocesamiento completado exitosamente


# Handle Exceptions
Handle any exceptions that occur during preprocessing and log the errors.

In [41]:
# Handle Exceptions

# Run the whole pipeline and record the outcome in `success` instead of
# letting an error propagate out of the cell.
try:
    # Initialize Configuration and Preprocessor
    config_instance = Config()
    preprocessor_config = {
        'data_paths': [config_instance.DATA_BD_PATH],
        'batch_size': 32,
        'max_workers': os.cpu_count() or 4,
    }

    # Initialize and execute preprocessor
    preprocessor = AdvancedDataPreprocessor(preprocessor_config)
    preprocessor.process_files_parallel()

    # Save results
    preprocessor.save_results(config_instance.CURRENT_PREPROCESSED_PATH)
except Exception as e:
    # logging.exception emits the message and the traceback through the
    # logging handlers, so the traceback lands wherever the logs are routed
    # rather than being printed separately to stderr.
    logging.exception("Error en el preprocesamiento: %s", e)
    success = False
else:
    # Reached only when every step above completed without raising
    logging.info("Preprocesamiento completado exitosamente")
    success = True

INFO:root:Processing files in parallel with 12 workers...
INFO:root:Processing file: C:\Users\yo\Desktop\jarvis_data\databd
INFO:root:Saving results to C:\Users\yo\Desktop\jarvis_data\preprocessed
INFO:root:Preprocesamiento completado exitosamente


# Run Main Function
Define and run the main function to execute the preprocessing script.

In [42]:
# Run Main Function

# Define the main function to execute the preprocessing script
def main():
    """Run the preprocessing pipeline end to end.

    Builds a Config, runs AdvancedDataPreprocessor over its data path and
    saves the results to the configured output path.

    Returns:
        bool: True when every step completed; False when any step raised an
        Exception, in which case the error and its traceback are logged.
    """
    try:
        # Initialize Configuration and Preprocessor
        config_instance = Config()
        preprocessor_config = {
            'data_paths': [config_instance.DATA_BD_PATH],
            'batch_size': 32,
            'max_workers': os.cpu_count() or 4,
        }

        # Initialize and execute preprocessor
        preprocessor = AdvancedDataPreprocessor(preprocessor_config)
        preprocessor.process_files_parallel()

        # Save results
        preprocessor.save_results(config_instance.CURRENT_PREPROCESSED_PATH)
    except Exception as e:
        # Top-level boundary: logging.exception emits the message and the
        # traceback through the logging handlers, so the traceback lands
        # wherever the logs are routed rather than only on stderr.
        logging.exception("Error en el preprocesamiento: %s", e)
        return False

    logging.info("Preprocesamiento completado exitosamente")
    return True


# Run the main function if this script is executed directly
if __name__ == "__main__":
    main()

INFO:root:Processing files in parallel with 12 workers...
INFO:root:Processing file: C:\Users\yo\Desktop\jarvis_data\databd
INFO:root:Saving results to C:\Users\yo\Desktop\jarvis_data\preprocessed
INFO:root:Preprocesamiento completado exitosamente
