In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input and output directories
input_directory = "formatted_data"
output_directory = "cleaned_data"

# Create a folder for cleaned data
os.makedirs(output_directory, exist_ok=True)

# Function to detect and remove outliers using IQR
def remove_outliers(df, x_col, y_col):
    # Calculate IQR for x and y columns
    Q1_x = df[x_col].quantile(0.25)
    Q3_x = df[x_col].quantile(0.75)
    IQR_x = Q3_x - Q1_x

    Q1_y = df[y_col].quantile(0.25)
    Q3_y = df[y_col].quantile(0.75)
    IQR_y = Q3_y - Q1_y

    # Define bounds for x and y
    lower_bound_x = Q1_x - 1.5 * IQR_x
    upper_bound_x = Q3_x + 1.5 * IQR_x

    lower_bound_y = Q1_y - 1.5 * IQR_y
    upper_bound_y = Q3_y + 1.5 * IQR_y

    # Filter data within the bounds
    filtered_df = df[(df[x_col] >= lower_bound_x) & (df[x_col] <= upper_bound_x) &
                     (df[y_col] >= lower_bound_y) & (df[y_col] <= upper_bound_y)]
    return filtered_df

# Process each file in the folder
for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_directory, file_name)

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Extract relevant columns
        x_col = "Acceleration voltage U_B / V"
        y_col = "Collector current I_A / nA"

        # Ensure the relevant columns are numeric
        df[x_col] = pd.to_numeric(df[x_col], errors='coerce')
        df[y_col] = pd.to_numeric(df[y_col], errors='coerce')

        # Drop rows with NaN values in the relevant columns
        df = df.dropna(subset=[x_col, y_col])

        # Remove outliers
        cleaned_df = remove_outliers(df, x_col, y_col)

        # Save the cleaned data
        cleaned_file_path = os.path.join(output_directory, file_name)
        cleaned_df.to_csv(cleaned_file_path, index=False)

        # Plot the cleaned data
        plt.figure()
        plt.plot(cleaned_df[x_col], cleaned_df[y_col], label="Cleaned Data", marker="o", linestyle="none")
        plt.title(f"Cleaned Data: {file_name}")
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.legend()
        plt.grid(True)
        plt.close()

print("Data cleaning completed. Cleaned files and plots are saved in the 'cleaned_data' folder.")


KeyError: 'Acceleration voltage U_B / V'

In [10]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input and output directories
input_directory = "formatted_data"
output_directory = "cleaned_data"

# Create a folder for cleaned data
os.makedirs(output_directory, exist_ok=True)

# Function to detect and remove outliers using IQR
def remove_outliers(df, x_col, y_col):
    # Calculate IQR for x and y columns
    Q1_x = df[x_col].quantile(0.25)
    Q3_x = df[x_col].quantile(0.75)
    IQR_x = Q3_x - Q1_x

    Q1_y = df[y_col].quantile(0.25)
    Q3_y = df[y_col].quantile(0.75)
    IQR_y = Q3_y - Q1_y

    # Define bounds for x and y
    lower_bound_x = Q1_x - 1.5 * IQR_x
    upper_bound_x = Q3_x + 1.5 * IQR_x

    lower_bound_y = Q1_y - 1.5 * IQR_y
    upper_bound_y = Q3_y + 1.5 * IQR_y

    # Filter data within the bounds
    filtered_df = df[(df[x_col] >= lower_bound_x) & (df[x_col] <= upper_bound_x) &
                     (df[y_col] >= lower_bound_y) & (df[y_col] <= upper_bound_y)]
    return filtered_df

# Process each file in the folder
for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_directory, file_name)

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Extract relevant columns
        x_col = "Acceleration voltage U_B / V"
        y_col = "Collector current I_A / nA"

        # Ensure the relevant columns are numeric
        df['Acceleration voltage U_B / V'] = pd.to_numeric(df['Acceleration voltage U_B / V'], errors='coerce')
        df['Collector current I_A / nA'] = pd.to_numeric(df['Collector current I_A / nA'], errors='coerce')

        # Drop rows with NaN values in the relevant columns
        df = df.dropna(subset=['Acceleration voltage U_B / V', 'Collector current I_A / nA'])

        # Remove outliers
        cleaned_df = remove_outliers(df, 'Acceleration voltage U_B / V', 'Collector current I_A / nA')

        # Save the cleaned data
        cleaned_file_path = os.path.join(output_directory, file_name)
        cleaned_df.to_csv(cleaned_file_path, index=False)

        # Plot the cleaned data
        plt.figure()
        plt.plot(cleaned_df['Acceleration voltage U_B / V'], cleaned_df['Collector current I_A / nA'], label="Cleaned Data", marker="o", linestyle="none")
        plt.title(f"Cleaned Data: {file_name}")
        plt.xlabel('Acceleration voltage U_B / V')
        plt.ylabel('Collector current I_A / nA')
        plt.legend()
        plt.grid(True)
        plt.close()

print("Data cleaning completed. Cleaned files and plots are saved in the 'cleaned_data' folder.")


KeyError: 'Acceleration voltage U_B / V'

In [11]:
# debugging

import pandas as pd

# Input and output directories
input_directory = "formatted_data"

for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_directory, file_name)

        # Read the CSV file
        df = pd.read_csv(file_path)
print(df.columns)

Index(['Time t / s;Voltage U_A1 / V;Voltage U_B1 / V;Collector current I_A / nA;Acceleration voltage U_B / V'], dtype='object')


In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input and output directories
input_directory = "formatted_data"
output_directory = "cleaned_data"

# Create a folder for cleaned data
os.makedirs(output_directory, exist_ok=True)

# Function to detect and remove outliers using IQR
def remove_outliers(df, x_col, y_col):
    # Calculate IQR for x and y columns
    Q1_x = df[x_col].quantile(0.25)
    Q3_x = df[x_col].quantile(0.75)
    IQR_x = Q3_x - Q1_x

    Q1_y = df[y_col].quantile(0.25)
    Q3_y = df[y_col].quantile(0.75)
    IQR_y = Q3_y - Q1_y

    # Define bounds for x and y
    lower_bound_x = Q1_x - 1.5 * IQR_x
    upper_bound_x = Q3_x + 1.5 * IQR_x

    lower_bound_y = Q1_y - 1.5 * IQR_y
    upper_bound_y = Q3_y + 1.5 * IQR_y

    # Filter data within the bounds
    filtered_df = df[(df[x_col] >= lower_bound_x) & (df[x_col] <= upper_bound_x) &
                     (df[y_col] >= lower_bound_y) & (df[y_col] <= upper_bound_y)]
    return filtered_df

# Process each file in the folder
for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(input_directory, file_name)

        # Read the CSV file
        df = pd.read_csv(file_path)

        # Extract relevant columns
        x_col = 'Acceleration voltage UB / V'
        y_col = 'Collector current IA / nA'

        # Check if columns exist in DataFrame
        if x_col in df.columns and y_col in df.columns:
            # Ensure the relevant columns are numeric
            df[x_col] = pd.to_numeric(df[x_col], errors='coerce')
            df[y_col] = pd.to_numeric(df[y_col], errors='coerce')

            # Drop rows with NaN values in the relevant columns
            df = df.dropna(subset=[x_col, y_col])

            # Remove outliers
            cleaned_df = remove_outliers(df, x_col, y_col)

            # Save the cleaned data
            cleaned_file_path = os.path.join(output_directory, file_name)
            cleaned_df.to_csv(cleaned_file_path, index=False)

            # Plot the cleaned data
            plt.figure()
            plt.plot(cleaned_df[x_col], cleaned_df[y_col], label="Cleaned Data", marker="o", linestyle="none")
            plt.title(f"Cleaned Data: {file_name}")
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.legend()
            plt.grid(True)
            plt.close()
        else:
            print(f"Columns '{x_col}' or '{y_col}' are missing in {file_name}")


Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.5 6.0 2.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.5 6.0 1.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.0 7.0 3.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.0 7.0 2.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.0 7.0 1.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 2.0 7.0 1.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.5 8.0 1.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.5 8.0 2.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 2.0 7.0 3.csv
Columns 'Acceleration voltage UB / V' or 'Collector current IA / nA' are missing in 1.5 6.0 3.csv
Columns 'Acceleratio