In [1]:
# Import required libraries
import os
import re
import pandas as pd

# Folder locations, both resolved relative to the notebook's working directory.
# input_folder  -> raw Excel exports to be read
# output_folder -> destination for the per-column CSV files
parent_dir = ".."
input_folder = os.path.join(parent_dir, "input_files", "raw_batch_fugro")
output_folder = os.path.join(parent_dir, "output_files", "output_sheets")

# Make sure the destination exists before anything is written to it;
# exist_ok=True turns this into a no-op when the folder is already there.
os.makedirs(output_folder, exist_ok=True)

print(f"Input folder: {input_folder}")
print(f"Output folder: {output_folder}")

Input folder: ..\input_files\raw_batch_fugro
Output folder: ..\output_files\output_sheets


In [2]:
# Function to sanitize column names for filenames
def sanitize_column_name(col_name):
    """Turn a column label into a string that is safe to use in a filename.

    Parameters
    ----------
    col_name : object
        The column label. Labels read from a spreadsheet are not guaranteed
        to be strings (numeric or date headers are possible), so the value is
        converted with ``str()`` before any regex is applied.

    Returns
    -------
    str
        The label with every character other than word characters (``\\w``:
        letters, digits, underscore) and hyphens replaced by ``_``, runs of
        underscores collapsed to one, and leading/trailing underscores
        removed. May be an empty string if the label contains no word
        characters or hyphens at all.
    """
    # re.sub() only accepts str/bytes; coerce so non-string labels don't raise TypeError
    text = str(col_name)
    # Replace everything except word characters and hyphens with underscores
    safe_name = re.sub(r'[^\w\-]', '_', text)
    # Collapse runs of underscores into a single underscore
    safe_name = re.sub(r'__+', '_', safe_name)
    # Strip leading/trailing underscores
    return safe_name.strip('_')

In [3]:
# Convert every Excel workbook in the input folder into one CSV per data column.
# Each CSV has two columns: 'Timestamp' (from the sheet's 'Time' column) and
# 'head' (the values of one data column), with rows lacking a value removed.
#
# sorted() keeps the processing order, and therefore the log, deterministic;
# os.listdir() returns entries in arbitrary order.
for filename in sorted(os.listdir(input_folder)):
    # Skip Excel's "~$..." lock files (present while a workbook is open; they
    # are not readable workbooks) and match the extension case-insensitively
    # so that e.g. "DATA.XLSX" is not silently ignored.
    if filename.startswith("~$") or not filename.lower().endswith((".xlsx", ".xls")):
        continue

    file_path = os.path.join(input_folder, filename)
    print(f" Processing file: {filename}")

    try:
        # Read the Excel file, skipping the metadata rows above the header
        df = pd.read_excel(file_path, skiprows=6)

        # Drop the first row (polluted) once per workbook rather than once per column
        df = df.iloc[1:]

        # Original Excel filename (without extension), used as a prefix so that
        # identically named columns from different workbooks do not overwrite
        # each other's CSV in the shared output folder.
        base_name = os.path.splitext(filename)[0]

        for col in df.columns:
            if col == 'Time':
                continue

            # Create sub-DataFrame holding the time axis and this one series
            df_single = df[['Time', col]].copy()

            # Rename columns
            df_single.columns = ['Timestamp', 'head']

            # Drop rows where 'head' is NaN
            df_single = df_single.dropna(subset=['head'])

            # Sanitize the column label for use in a filename; str() guards
            # against non-string labels (numeric/date headers).
            safe_col_name = sanitize_column_name(str(col))

            # Include original Excel filename (without extension) in output CSV name
            output_file = os.path.join(output_folder, f"{base_name}_{safe_col_name}.csv")

            # Save to CSV
            df_single.to_csv(output_file, index=False)

        print(f"Finished: {filename}")

    except Exception as e:
        # Best-effort batch: report the failing workbook and continue with the next one
        print(f"❌ Error processing {filename}: {e}")

print("🎉 All files processed.")

 Processing file: Fugro_Hhw_geavanceerd_4424.xlsx
Finished: Fugro_Hhw_geavanceerd_4424.xlsx
 Processing file: Fugro_Hhw_normaal.xlsx
Finished: Fugro_Hhw_normaal.xlsx
 Processing file: Fugro_Hoorn_4423.xlsx
Finished: Fugro_Hoorn_4423.xlsx
 Processing file: Fugro_Hoorn_Zuiderdijk.xlsx
Finished: Fugro_Hoorn_Zuiderdijk.xlsx
🎉 All files processed.
