<a href="https://colab.research.google.com/github/Masta-cynat/OscarWebScraper/blob/main/Cleaning%20ETL%20Function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def locate_and_ingest_chunks(output_prefix, file_extension='.csv'):
    """
    Find chunk files in the current working directory, clean each one, and merge them.

    A file counts as a chunk when its name starts with ``output_prefix`` and ends
    with ``file_extension``. Each chunk is read as a ';'-separated CSV, rows that
    contain any missing value are dropped, and the remaining rows are stacked in
    sorted filename order under a fresh 0..n-1 index.

    Parameters:
    output_prefix (str): The prefix used when creating the chunk files.
    file_extension (str): The file extension for the chunk files (default is '.csv').

    Returns:
    pd.DataFrame: The consolidated DataFrame after loading and processing all chunk
        files, or None when no file matches the prefix and extension.
    """
    # os.listdir() without an argument lists the current working directory.
    # NOTE: sorting is lexicographic, so 'chunk_10.csv' is processed before
    # 'chunk_2.csv' unless the chunk numbers are zero-padded.
    chunk_files = sorted(
        name for name in os.listdir()
        if name.startswith(output_prefix) and name.endswith(file_extension)
    )

    if not chunk_files:
        print("No chunk files found with the specified prefix.")
        return None

    print(f"Found {len(chunk_files)} chunk files.")

    # Collect the cleaned chunks and concatenate once at the end. Calling
    # pd.concat inside the loop copies every previously accumulated row on each
    # iteration, which is quadratic in the total number of rows.
    cleaned_chunks = []

    # ETL (Extract, Transform, Load) for each chunk file
    for chunk_file in chunk_files:
        print(f"Processing file: {chunk_file}")

        # Extract: read the chunk file into a DataFrame
        chunk_df = pd.read_csv(chunk_file, sep=';', low_memory=True)

        # Transform: drop rows with missing values
        # (add further custom transformations here if needed)
        cleaned_chunks.append(chunk_df.dropna())

    # Load: stack all transformed chunks into one DataFrame with a new index
    consolidated_df = pd.concat(cleaned_chunks, ignore_index=True)

    print(f"Consolidated data contains {consolidated_df.shape[0]} rows and {consolidated_df.shape[1]} columns.")

    return consolidated_df


def save_consolidated_file(consolidated_df, output_file):
    """
    Write the consolidated DataFrame to one ';'-separated CSV file.

    The DataFrame index is not written to the file. A confirmation line naming
    the destination is printed once the write has finished.

    Parameters:
    consolidated_df (pd.DataFrame): The consolidated DataFrame.
    output_file (str): The name of the output file to save the consolidated data.
    """
    # Keep the same ';' separator that the chunk files use and skip the index column
    csv_options = {'sep': ';', 'index': False}
    consolidated_df.to_csv(output_file, **csv_options)

    confirmation = f"Saved the consolidated data to {output_file}"
    print(confirmation)

# Example run: merge every chunk file and write the result to a single CSV
consolidated_output_file = 'consolidated_lifebear.csv'  # Destination file for the merged data
output_prefix = 'chunk'  # Filename prefix shared by the chunk files

# Gather and clean the chunk files; the result is None when nothing matched
consolidated_df = locate_and_ingest_chunks(output_prefix)

# Only write the output file when there is data to save
if consolidated_df is not None:
    save_consolidated_file(consolidated_df, consolidated_output_file)
