In [9]:
import os
import pandas as pd
import numpy as np

# Input and output directories
input_directory = "formatted_data"
output_directory = "cleaned_data"

# Create a folder for cleaned data
os.makedirs(output_directory, exist_ok=True)

# Z-score calculation function for rolling windows
def zscore(series, window, threshold=3, return_all=False):
    # Calculate rolling mean and std
    avg = series.rolling(window=window, center=True).mean()
    std = series.rolling(window=window, center=True).std()
    
    # Z-score calculation
    z = (series - avg) / std
    
    # Identify outliers: absolute z-score greater than threshold
    mask = np.abs(z) <= threshold  # Boolean mask for non-outliers
    
    if return_all:
        return z, avg, std, mask
    return mask

# Function to clean data using rolling z-score
def clean_data_with_rolling_zscore(df, x_col, y_col, window, threshold):
    # Ensure the columns are numeric
    df[x_col] = pd.to_numeric(df[x_col], errors='coerce')
    df[y_col] = pd.to_numeric(df[y_col], errors='coerce')
    
    # Drop NaN values
    df = df.dropna(subset=[x_col, y_col])
    
    # Apply rolling z-score method to the y_col
    mask = zscore(df[y_col], window=window, threshold=threshold)
    
    # Filter the data using the mask
    filtered_df = df[mask]
    return filtered_df

# Process each file in the folder
for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_directory, file_name)

        # Read the CSV file with the correct delimiter
        df = pd.read_csv(file_path, delimiter=';')

        # Clean the data with rolling z-score
        cleaned_df = clean_data_with_rolling_zscore(
            df, 
            x_col='Acceleration voltage U_B / V', 
            y_col='Collector current I_A / nA', 
            window=50, 
            threshold=3
        )
        
        # Save the cleaned data
        cleaned_file_path = os.path.join(output_directory, file_name)
        cleaned_df.to_csv(cleaned_file_path, index=False)

        # Read all cleaned CSV data for verification
        df = pd.read_csv(cleaned_file_path)
        print(f"Processed file: {file_name}")
        print(df)

print("Data cleaning completed. Cleaned files are saved in the 'cleaned_data' folder.")


Processed file: 1.5 6.0 2.csv
     Time t / s  Voltage U_A1 / V  Voltage U_B1 / V  \
0         2.499             0.405             0.730   
1         2.600             0.405             0.760   
2         2.700             0.420             0.790   
3         2.799             0.420             0.815   
4         2.900             0.435             0.845   
..          ...               ...               ...   
246      27.100            12.855             7.480   
247      27.201            12.855             7.510   
248      27.300            12.855             7.535   
249      27.399            12.855             7.560   
250      27.499            12.855             7.585   

     Collector current I_A / nA  Acceleration voltage U_B / V  
0                         0.405                          7.30  
1                         0.405                          7.60  
2                         0.420                          7.90  
3                         0.420                      