# EXCEPTIONS

In [1]:
import logging
import os
import pandas as pd

# Set up logging configuration
logging.basicConfig(level=logging.INFO)

class IncompatibleActionError(Exception):
    """
    Exception raised when the action is not supported.

    Attributes:
        action_type: The rejected action name, kept so handlers can inspect it
            without parsing the message.
    """
    def __init__(self, action_type: str):
        super().__init__(f"Action '{action_type}' is not supported.")
        self.action_type = action_type

    def __reduce__(self):
        # The default reduce re-invokes __init__ with the already formatted
        # message, which would nest the message inside itself on copy/pickle.
        return (type(self), (self.action_type,))

class IncompatibleFormatError(Exception):
    """
    Exception raised when the file format is not supported.

    Attributes:
        file_format: The rejected format name, kept so handlers can inspect it
            without parsing the message.
    """
    def __init__(self, file_format: str):
        super().__init__(f"Format '{file_format}' is not supported.")
        self.file_format = file_format

    def __reduce__(self):
        # The default reduce re-invokes __init__ with the already formatted
        # message, which would nest the message inside itself on copy/pickle.
        return (type(self), (self.file_format,))

class IncompatibleProcessingError(Exception):
    """Exception raised when the processing is not supported."""
    def __init__(self):
        super().__init__("Processing is not supported. The file was neither saved nor processed.")

    def __reduce__(self):
        # __init__ takes no arguments, but the default reduce would pass the
        # message back to it and fail with TypeError on copy/pickle.
        return (type(self), ())

class SaveDatasetError(Exception):
    """
    Custom exception for errors when saving a DataFrame in a specific format.

    Attributes:
        file_format: The format that was being written when the error occurred.
        original_exception: The underlying exception that caused the failure.
    """
    def __init__(self, file_format, original_exception):
        super().__init__(f"Error saving the file in format '{file_format}': {original_exception}")
        self.file_format = file_format
        self.original_exception = original_exception

    def __reduce__(self):
        # The default reduce passes only the formatted message to __init__,
        # which needs two arguments, so copy/pickle would fail with TypeError.
        return (type(self), (self.file_format, self.original_exception))

class LoadDatasetError(Exception):
    """
    Custom exception for errors when loading a file in a specific format.

    Attributes:
        file_path: The path of the file that could not be loaded.
        file_format: The format that was used to read the file.
        original_exception: The underlying exception that caused the failure.
    """
    def __init__(self, file_path, file_format, original_exception):
        super().__init__(f"Error loading the file '{file_path}' with format '{file_format}': {original_exception}")
        self.file_path = file_path
        self.file_format = file_format
        self.original_exception = original_exception

    def __reduce__(self):
        # The default reduce passes only the formatted message to __init__,
        # which needs three arguments, so copy/pickle would fail with TypeError.
        return (type(self), (self.file_path, self.file_format, self.original_exception))

# MAIN CLASS

In [2]:
class PandasDatasetProcessor:
    """
    Stateless helpers for saving, loading and partitioning pandas DataFrames.

    Every method is a static method; the class only acts as a namespace.
    """

    @staticmethod
    def create_directory_if_not_exists(path: str) -> None:
        """
        Creates the directory (including missing parents) if the path does not exist.

        Args:
            path (str): Directory path to check.

        Returns:
            None
        """
        if os.path.exists(path):
            return
        # exist_ok=True avoids a FileExistsError if something else creates the
        # directory between the check above and this call.
        os.makedirs(path, exist_ok=True)
        print(f"Directorio '{path}' creado.")

    @staticmethod
    def compatible_formats(action_type: str, file_format: str, dataset: pd.DataFrame = None):
        """
        Returns the callables available for an action type and the one matching a format.

        Args:
            action_type (str): The action type, either 'write' or 'read'.
            file_format (str): The file format, such as 'csv', 'json', 'xml', etc.
                The lookup is case-insensitive.
            dataset (pd.DataFrame, optional): The DataFrame the 'write' callables
                operate on. When it is None, the 'write' callables do nothing
                and return None.

        Returns:
            tuple: ``(actions, handler)`` where ``actions`` is the dict mapping
            format name to callable for ``action_type`` (empty if the action is
            unknown) and ``handler`` is the callable for ``file_format`` or None
            if that format is not available for the action. Each callable takes
            the file path as its single argument.
        """
        compatible_formats = {
            'write': {
                'orc': lambda filename: dataset.to_orc(filename) if dataset is not None else None,
                'parquet': lambda filename: dataset.to_parquet(filename) if dataset is not None else None,
                'xml': lambda filename: dataset.to_xml(filename, index=False) if dataset is not None else None,
                'json': lambda filename: dataset.to_json(filename, orient='records', lines=False, indent=4) if dataset is not None else None,
                'html': lambda filename: dataset.to_html(filename, index=False, border=1) if dataset is not None else None,
                'hdf5': lambda filename: dataset.to_hdf(filename, key='df', mode='w') if dataset is not None else None,
                'csv': lambda filename: dataset.to_csv(filename, encoding='utf-8', index=False) if dataset is not None else None,
                'xlsx': lambda filename: dataset.to_excel(filename, engine='openpyxl', index=False) if dataset is not None else None
            },
            'read': {
                'orc': pd.read_orc,
                'parquet': pd.read_parquet,
                'xml': pd.read_xml,
                'json': pd.read_json,
                'html': pd.read_html,
                'csv': pd.read_csv,
                'hdf5': pd.read_hdf,
                'xlsx': pd.read_excel
            }
        }

        compatible_action_type = compatible_formats.get(action_type, {})
        # A non-string format (e.g. None) can never match, so report it as
        # unsupported instead of failing on .lower().
        format_key = file_format.lower() if isinstance(file_format, str) else None
        compatible_file_format = compatible_action_type.get(format_key, None)

        return compatible_action_type, compatible_file_format

    @staticmethod
    def save_dataset(dataset: pd.DataFrame, action_type: str, file_format: str, path: str = '.', base_filename: str = 'output_file') -> None:
        """
        Saves a DataFrame in the specified file format.

        The file is written to ``<path>/<base_filename>.<file_format lowercased>``;
        the directory is created when it does not exist.

        Args:
            dataset (pd.DataFrame): The DataFrame to save.
            action_type (str): The type of action; must be 'write'.
            file_format (str): The file format, such as 'csv', 'json', 'xml', etc.
            path (str): The directory path where the file will be saved (default is the current directory).
            base_filename (str): The base name for the file (default is 'output_file').

        Raises:
            SaveDatasetError: If no dataset is given or an error occurs while saving the file.
            IncompatibleProcessingError: If neither the action nor the format is supported.
            IncompatibleActionError: If the action type is unknown or is not 'write'.
            IncompatibleFormatError: If the file format is not supported.
        """
        action_handlers, writer = PandasDatasetProcessor.compatible_formats(action_type, file_format, dataset)

        if not action_handlers:
            # An unknown action yields an empty table, so the format lookup above
            # is always None. Probe the 'write' table to learn whether the format
            # itself is known and report the precise problem.
            _, known_writer = PandasDatasetProcessor.compatible_formats('write', file_format, dataset)
            if known_writer is None:
                raise IncompatibleProcessingError()
            raise IncompatibleActionError(action_type)
        if writer is None:
            raise IncompatibleFormatError(file_format)
        if action_type != 'write':
            # 'read' is a known action, but its callables load files; calling
            # one here would read the target path instead of saving to it.
            raise IncompatibleActionError(action_type)
        if dataset is None:
            # The writers silently do nothing without a dataset, which would
            # otherwise be reported as a successful save.
            raise SaveDatasetError(file_format, ValueError("No dataset was provided to save."))

        PandasDatasetProcessor.create_directory_if_not_exists(path)
        file_name = os.path.join(path, f'{base_filename}.{file_format.lower()}')

        try:
            writer(file_name)  # Save the file in the specified format
        except Exception as e:
            raise SaveDatasetError(file_format, e) from e
        print(f"File Saved as {file_name}")

    @staticmethod
    def load_dataset(file_path: str) -> pd.DataFrame:
        """
        Loads a dataset from a file, determining the format from the file extension.

        For 'html' files pandas returns one DataFrame per table found; only the
        first table is returned here so the result is always a single DataFrame.

        Args:
            file_path (str): The full path of the file to load.

        Returns:
            pd.DataFrame: The loaded dataset as a DataFrame.

        Raises:
            LoadDatasetError: If an error occurs while attempting to load the file.
            IncompatibleFormatError: If the file format is not compatible.
        """
        # splitext only inspects the last path component, so a dot in a directory
        # name or a file without extension is not mistaken for a format.
        file_format = os.path.splitext(file_path)[1].lstrip('.')
        _, reader = PandasDatasetProcessor.compatible_formats('read', file_format)

        if reader is None:
            # Raise a custom exception if the format is not compatible
            raise IncompatibleFormatError(file_format)

        try:
            # Read the file using the corresponding method
            dataset = reader(file_path)
        except Exception as e:
            # Raise a custom exception if an error occurs during loading
            raise LoadDatasetError(file_path, file_format, e) from e

        if isinstance(dataset, list):
            # pd.read_html yields a list of DataFrames (it raises when no table
            # is found, so the list is never empty); honour the declared return type.
            dataset = dataset[0]

        print(f"File '{file_path}' successfully loaded as {file_format}.")
        return dataset

    @staticmethod
    def generate_partitioned_datasets(dataset: pd.DataFrame, num_parts: int) -> list:
        """
        Divides a DataFrame into 'num_parts' consecutive partitions of near-equal size.

        When the rows cannot be divided exactly, the first ``len(dataset) % num_parts``
        partitions each receive one extra row. If 'num_parts' exceeds the number of
        rows, the trailing partitions are empty DataFrames.

        Args:
            dataset (pd.DataFrame): The DataFrame to be divided.
            num_parts (int): The number of partitions; must be at least 1.

        Returns:
            list: A list of 'num_parts' DataFrames sliced in order from the original DataFrame.

        Raises:
            ValueError: If 'num_parts' is smaller than 1.
        """
        if num_parts < 1:
            raise ValueError(f"num_parts must be a positive integer, got {num_parts}.")

        # Base size of every partition and the number of rows left over
        base_partition_size, remainder = divmod(len(dataset), num_parts)

        partitions = []
        start_idx = 0

        for i in range(num_parts):
            # The first 'remainder' partitions absorb one leftover row each
            end_idx = start_idx + base_partition_size + (1 if i < remainder else 0)
            partitions.append(dataset.iloc[start_idx:end_idx])
            start_idx = end_idx

        return partitions

# TEST

## CREATE FILES FROM DATAFRAME

In [3]:
file_locations = []

dataset_1 = pd.read_csv('https://raw.githubusercontent.com/JorgeCardona/data-collection-json-csv-sql/refs/heads/main/csv/flight_logs_part_1.csv')
dataset_2 = pd.read_csv('https://raw.githubusercontent.com/JorgeCardona/data-collection-json-csv-sql/refs/heads/main/csv/flight_logs_part_2.csv')

# Example usage
# Each dataset is saved once per format under './data/dataset_<n>/'.
file_formats = ['orc', 'parquet', 'xml', 'json', 'html', 'csv', 'hdf5', 'xlsx']
datasets = [dataset_1, dataset_2]

action_type = 'write'
for index_dataset, dataset in enumerate(datasets):
    for index_file, file_format in enumerate(file_formats):
        # Save the current dataset in the current format
        path = f'./data/dataset_{index_dataset+1}'
        base_filename = f'sample_dataset_{index_file+1}'

        file_location = f"{path}/{base_filename}.{file_format}"

        # Pass the loop's dataset (not always dataset_1) so each folder holds its own data
        PandasDatasetProcessor.save_dataset(dataset=dataset, action_type=action_type, file_format=file_format, path=path, base_filename=base_filename)

        # Record the location only once the file has actually been written
        file_locations.append(file_location)

File Saved as ./data/dataset_1/sample_dataset_1.orc
File Saved as ./data/dataset_1/sample_dataset_2.parquet
File Saved as ./data/dataset_1/sample_dataset_3.xml
File Saved as ./data/dataset_1/sample_dataset_4.json
File Saved as ./data/dataset_1/sample_dataset_5.html
File Saved as ./data/dataset_1/sample_dataset_6.csv
File Saved as ./data/dataset_1/sample_dataset_7.hdf5
File Saved as ./data/dataset_1/sample_dataset_8.xlsx
File Saved as ./data/dataset_2/sample_dataset_1.orc
File Saved as ./data/dataset_2/sample_dataset_2.parquet
File Saved as ./data/dataset_2/sample_dataset_3.xml
File Saved as ./data/dataset_2/sample_dataset_4.json
File Saved as ./data/dataset_2/sample_dataset_5.html
File Saved as ./data/dataset_2/sample_dataset_6.csv
File Saved as ./data/dataset_2/sample_dataset_7.hdf5
File Saved as ./data/dataset_2/sample_dataset_8.xlsx


## CREATE DATAFRAMES FROM FILES

In [4]:
# Read back every file written by the previous cell; the format is inferred
# from each file's extension and the loaded DataFrame is discarded.
for saved_file in file_locations:
    PandasDatasetProcessor.load_dataset(saved_file)

File './data/dataset_1/sample_dataset_1.orc' successfully loaded as orc.
File './data/dataset_1/sample_dataset_2.parquet' successfully loaded as parquet.
File './data/dataset_1/sample_dataset_3.xml' successfully loaded as xml.
File './data/dataset_1/sample_dataset_4.json' successfully loaded as json.
File './data/dataset_1/sample_dataset_5.html' successfully loaded as html.
File './data/dataset_1/sample_dataset_6.csv' successfully loaded as csv.
File './data/dataset_1/sample_dataset_7.hdf5' successfully loaded as hdf5.
File './data/dataset_1/sample_dataset_8.xlsx' successfully loaded as xlsx.
File './data/dataset_2/sample_dataset_1.orc' successfully loaded as orc.
File './data/dataset_2/sample_dataset_2.parquet' successfully loaded as parquet.
File './data/dataset_2/sample_dataset_3.xml' successfully loaded as xml.
File './data/dataset_2/sample_dataset_4.json' successfully loaded as json.
File './data/dataset_2/sample_dataset_5.html' successfully loaded as html.
File './data/dataset_2/

# SPLIT DATAFRAME INTO MULTIPLE DATAFRAMES

In [5]:
# Split dataset_2 into 7 partitions. Kept as a bare expression so the notebook
# displays the resulting list of DataFrames as the cell output.
PandasDatasetProcessor.generate_partitioned_datasets(dataset_2, 7)

[     flight_id  flight_number departure_airport departure_country  \
 0            1           1978               CFQ           Germany   
 1            2           2337               ONG            France   
 2            3           7588               TEX           Nigeria   
 3            4           7545               ORB             Japan   
 4            5           4553               WEW          Thailand   
 ..         ...            ...               ...               ...   
 710        711           9736               KUC              Peru   
 711        712           8381               AML             China   
 712        713           5505               LIO             China   
 713        714           9372               LOT            Mexico   
 714        715           1258               LVA           Finland   
 
      departure_time arrival_country arrival_date  flight_duration  \
 0    6/7/2023 04:42        Colombia   27/12/2022            14.18   
 1    6/7/2023 17: