In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input and output directories
input_directory = "../formatted_data/"
output_directory = "../cleaned_data/"

# Create a folder for cleaned data
os.makedirs(output_directory, exist_ok=True)

# Function to detect and remove outliers using IQR
def remove_outliers(df, x_col, y_col):
    # Calculate IQR for x and y columns
    Q1_x = df[x_col].quantile(0.25)
    Q3_x = df[x_col].quantile(0.75)
    IQR_x = Q3_x - Q1_x

    Q1_y = df[y_col].quantile(0.25)
    Q3_y = df[y_col].quantile(0.75)
    IQR_y = Q3_y - Q1_y

    # Define bounds for x and y
    lower_bound_x = Q1_x - 1.5 * IQR_x
    upper_bound_x = Q3_x + 1.5 * IQR_x

    lower_bound_y = Q1_y - 1.5 * IQR_y
    upper_bound_y = Q3_y + 1.5 * IQR_y

    # Filter data within the bounds
    filtered_df = df[(df[x_col] >= lower_bound_x) & (df[x_col] <= upper_bound_x) &
                     (df[y_col] >= lower_bound_y) & (df[y_col] <= upper_bound_y)]
    return filtered_df

# Process each file in the folder
for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_directory, file_name)

        # Read the CSV file with the correct delimiter
        df = pd.read_csv(file_path, delimiter=';')

        # Ensure the relevant columns are numeric
        df['Acceleration voltage U_B / V'] = pd.to_numeric(df['Acceleration voltage U_B / V'], errors='coerce')
        df['Collector current I_A / nA'] = pd.to_numeric(df['Collector current I_A / nA'], errors='coerce')

        # Drop rows with NaN values in the relevant columns
        df = df.dropna(subset=['Acceleration voltage U_B / V', 'Collector current I_A / nA'])

        # Remove outliers
        cleaned_df = remove_outliers(df, 'Acceleration voltage U_B / V', 'Collector current I_A / nA')
        
        # Save the cleaned data
        cleaned_file_path = os.path.join(output_directory, file_name)
        cleaned_df.to_csv(cleaned_file_path, index=False)
        
        # Read all cleaned CSV data for verification
        df = pd.read_csv(cleaned_file_path)
        print(f"Processed file: {file_name}")
        print(df)

print("Data cleaning completed. Cleaned files is saved in the 'cleaned_data.ipynb' folder.")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Processed file: 1.0 7.0 1.csv
     Time t / s  Voltage U_A1 / V  Voltage U_B1 / V  \
0         0.100            -0.075             0.030   
1         0.201             0.195             0.055   
2         0.299             0.270             0.085   
3         0.400             0.315             0.115   
4         0.501             0.345             0.145   
..          ...               ...               ...   
394      39.500             0.915             7.905   
395      39.600             0.900             7.905   
396      39.700             0.900             7.905   
397      39.799             0.900             7.905   
398      39.900             0.900             7.905   

     Collector current I_A / nA  Acceleration voltage U_B / V  
0                        -0.075                          0.30  
1                         0.195                          0.55  
2                         0.270                          0.85  
3                         0.315                      