In [2]:
import os
import pandas as pd

# Set the input and output directories
input_dir = r"/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data"
output_dir = r"/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/transformed_data"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Relevant columns to clean
relevant_columns = [
    "rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity",
    "orientation", "x", "y", "width_edge_mean", "length_edge_mean",
    "transformed_x", "transformed_y", "centroids", "reference_point"
]

# Function to check object validity (processing one object at a time)
def is_object_valid(overdrivable, underdrivable, measurement, movement):
    """
    Determine if an object is valid based on the given conditions:
    1. Sum of overdrivable + underdrivable ≤ 50
    2. Measurement is 'measured'
    3. Movement is either 0 or 'moved'
    """
    if pd.isna(overdrivable) or pd.isna(underdrivable) or pd.isna(measurement) or pd.isna(movement):
        return False

    return (
        (int(overdrivable) + int(underdrivable) <= 50) and  
        (measurement == (0, "measured") or measurement == "measured") and          
        (movement == (0, "moved") or movement == 0 or movement == "moved")               
    )

# Cleaning function (keeps tuples intact)
def clean_invalid_objects(row):
    overdrivable = row.get("overdrivable", [])
    underdrivable = row.get("underdrivable", [])
    measurement = row.get("status_measurement", [])
    movement = row.get("status_movement", [])
    
    # Ensure all values are lists
    overdrivable = overdrivable if isinstance(overdrivable, list) else [overdrivable]
    underdrivable = underdrivable if isinstance(underdrivable, list) else [underdrivable]
    measurement = measurement if isinstance(measurement, list) else [measurement]
    movement = movement if isinstance(movement, list) else [movement]

    num_objects = max(len(overdrivable), len(underdrivable), len(measurement), len(movement))
    
    # Ensure each list is padded to the same length
    overdrivable += [0] * (num_objects - len(overdrivable))
    underdrivable += [0] * (num_objects - len(underdrivable))
    measurement += ["unknown"] * (num_objects - len(measurement))
    movement += ["unknown"] * (num_objects - len(movement))

    # Create a validity mask by processing each object separately
    valid_mask = []
    for i in range(num_objects):
        try:
            valid_mask.append(is_object_valid(overdrivable[i], underdrivable[i], measurement[i], movement[i]))
        except Exception as e:
            print(f"Error processing object {i}: {e}")
            valid_mask.append(False)  # Mark invalid in case of error

    # Apply filtering to all relevant columns
    for col in relevant_columns + ["overdrivable", "underdrivable", "status_measurement", "status_movement"]:
        if col in row:
            col_data = row[col]

            # Ensure column is a list
            if not isinstance(col_data, list):
                col_data = [col_data]

            # Pad missing values
            if col in ["overdrivable", "underdrivable", "rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity"]:
                col_data += [0] * (num_objects - len(col_data))
            else:
                col_data += ["unknown"] * (num_objects - len(col_data))

            # Apply valid_mask (keep tuples intact)
            row[col] = [col_data[i] for i in range(num_objects) if valid_mask[i]]

    return row

# Loop through all files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):  # Process only pickle files
        file_path = os.path.join(input_dir, filename)
        df = pd.read_pickle(file_path)

        # Apply cleaning function to all rows
        df_cleaned = df.apply(clean_invalid_objects, axis=1)

        # Save the cleaned DataFrame
        output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + "_cleaned.p")
        df_cleaned.to_pickle(output_file)
        print(f"✅ Cleaned data saved to: {output_file}")


Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Error processing object 0: The truth value of an

KeyboardInterrupt: 

In [9]:
import os
import pandas as pd

# Set the input directory (update this if needed)
input_dir = r"/home/q674749/workspace/thesis_work/rat25-15.4.1/frr40_objects"

# Get a sample file (first .p file in the directory)
sample_file = None
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):
        sample_file = os.path.join(input_dir, filename)
        break  # Pick only the first file

if sample_file:
    print(f"📂 Loading sample file: {sample_file}")

    # Load the dataset
    df = pd.read_pickle(sample_file)

    # Print first few rows
    print("\n🔹 Sample Rows:")
    print(df.head())

    # Print column names and their data types
    print("\n🔹 Column Data Types:")
    print(df.dtypes)

else:
    print("⚠️ No .p files found in the input directory!")


📂 Loading sample file: /home/q674749/workspace/thesis_work/rat25-15.4.1/frr40_objects/Pickle_files_29_frr40_objects_9.p

🔹 Sample Rows:
                   cRC  length  counter    dataID  \
0 -5363298961942027009   15695        0  16777388   
1  1587984294188916074   15695        1  16777388   
2 -1973946296667397292   15695        2  16777388   
3 -4010516217464220941   15695        3  16777388   
4 -4541213732728831157   15695        4  16777388   

  header.privateExtendedQualifier  header.origin.x  header.origin.y  \
0        (0, NormalOperationMode)            3.497          -0.3127   
1        (0, NormalOperationMode)            3.497          -0.3127   
2        (0, NormalOperationMode)            3.497          -0.3127   
3        (0, NormalOperationMode)            3.514          -0.3000   
4        (0, NormalOperationMode)            3.514          -0.3000   

   header.origin.z  header.origin.roll  header.origin.roll_std_dev  ...  \
0            0.889                 0.0     

In [10]:
import pickle

with open('/home/q674749/workspace/thesis_work/rat25-15.4.1/frr40_objects/Pickle_files_05_frr40_objects_0.p', 'rb') as f:
    data = pickle.load(f)

# Check the column names
print("Column Names:")
print(list(data.columns))

# Check the data types
print("\nData Types:")
print(data.dtypes)

# Check the first few rows of data
print("\nData Sample:")
print(data.head())

Column Names:
['cRC', 'length', 'counter', 'dataID', 'header.privateExtendedQualifier', 'header.origin.x', 'header.origin.y', 'header.origin.z', 'header.origin.roll', 'header.origin.roll_std_dev', 'header.origin.pitch', 'header.origin.pitch_std_dev', 'header.origin.yaw', 'header.origin.yaw_std_dev', 'header.timestampDAQ.fractional_seconds', 'header.timestampDAQ.seconds', 'header.timestampDAQ.sync_status', 'header.timestampSP_start.fractional_seconds', 'header.timestampSP_start.seconds', 'header.timestampSP_start.sync_status', 'header.timestampSP_end.fractional_seconds', 'header.timestampSP_end.seconds', 'header.timestampSP_end.sync_status', 'header.integrityFrontRadar', 'number_of_objects', 'qualifier', 'BMWcodecAssesment', 'BMWcodecClass', 'BMWdrivingDirection', 'BMWFuncRelev', 'height_edge_mean', 'orientation_deg', 'z_original', 'vabs_kmh', 'timestamp_daq', 'timestamp', 'timestamp_logger', 'cycletime_logger', 'timestampLoggerFirstMessage', 'pitch_deg', 'yaw_deg', 'cycletime_DAQ', 'DA

In [13]:
import pickle

with open('/home/q674749/workspace/thesis_work/rat25-15.4.1/frr40_objects/Pickle_files_05_frr40_objects_0.p', 'rb') as f:
    data = pickle.load(f)

# Print sample values from 'status_measurement' column
print("Sample values from 'status_measurement' column:")
print(data['status_measurement'].sample(5))

# Print sample values from 'status_movement' column
print("\nSample values from 'status_movement' column:")
print(data['status_movement'].sample(5))

Sample values from 'status_measurement' column:
69                                                  NaN
20    [(0, measured), (0, measured), (0, measured), ...
41                                                  NaN
60                                                  NaN
1                                                   NaN
Name: status_measurement, dtype: object

Sample values from 'status_movement' column:
25    NaN
28    NaN
64    NaN
35    NaN
59    NaN
Name: status_movement, dtype: object


In [20]:
import os
import pandas as pd
import numpy as np

# Set the input directory
input_dir = r"/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data"

# List of columns to analyze
columns_to_analyze = [
    "rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity", "radialVelocityDomainMax",
    "orientation", "x", "y", "width_edge_mean", "length_edge_mean",
    "status_measurement", "status_movement", "overdrivable", "underdrivable",
    "RotationRates.yawRateVehicleBody.value", "Velocity.SpeedCog.SpeedCog",
    "header.origin.x", "header.origin.y", "header.origin.z",
    "header.origin.roll", "header.origin.pitch", "header.origin.yaw", "reference_point",
    "RotationRates.yawRateVehicleBody.value", "Velocity.SpeedCog.SpeedCog"
]

# Pick a random .p file from the input directory
sample_file = None
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):
        sample_file = os.path.join(input_dir, filename)
        break  # Use the first file found

if sample_file:
    print(f"📂 Loading sample file: {sample_file}")

    # Load the dataset
    df = pd.read_pickle(sample_file)

    # Ensure the columns exist in the dataset
    available_columns = [col for col in columns_to_analyze if col in df.columns]

    # Print column data types
    print("\n🔹 Column Data Types:")
    print(df[available_columns].dtypes)

    # Print random values from each available column
    print("\n🔹 Random Sample Data from Each Column:")
    for col in available_columns:
        print(f"\n📌 Column: {col}")
        print(df[col].sample(min(5, len(df))).to_list())  # Print up to 5 random values

else:
    print("⚠️ No .p files found in the input directory!")


📂 Loading sample file: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data/combined_Pickle_files_05_frr40_detections_32.p

🔹 Column Data Types:
rcs                         object
distance                    object
angleAzimuth                object
angleElevation              object
radialVelocity              object
radialVelocityDomainMax    float32
orientation                 object
x                           object
y                           object
width_edge_mean             object
length_edge_mean            object
status_measurement          object
status_movement             object
overdrivable                object
underdrivable               object
header.origin.x            float32
header.origin.y            float32
header.origin.z            float32
header.origin.roll         float32
header.origin.pitch        float32
header.origin.yaw          float32
reference_point             object
dtype: object

🔹 Random Sample Data from Each Column:

📌 Column: 

In [16]:
import os
import pandas as pd

# Set the input directory
input_dir = r"/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data"

# Columns to check
columns_to_check = ["status_measurement", "status_movement"]

# Pick a random .p file from the input directory
sample_file = None
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):
        sample_file = os.path.join(input_dir, filename)
        break  # Use the first file found

if sample_file:
    print(f"📂 Loading sample file: {sample_file}")

    # Load the dataset
    df = pd.read_pickle(sample_file)

    # Ensure the columns exist in the dataset
    available_columns = [col for col in columns_to_check if col in df.columns]

    if available_columns:
        # Print column data types
        print("\n🔹 Column Data Types:")
        print(df[available_columns].dtypes)

        # Print several rows from each column
        print("\n🔹 Sample Data from Selected Columns:")
        print(df[available_columns].sample(n=min(10, len(df))))  # Show up to 10 random rows

    else:
        print("⚠️ Requested columns not found in the dataset!")

else:
    print("⚠️ No .p files found in the input directory!")


📂 Loading sample file: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data/combined_Pickle_files_05_frr40_detections_32.p

🔹 Column Data Types:
status_measurement    object
status_movement       object
dtype: object

🔹 Sample Data from Selected Columns:
                                   status_measurement  \
11  [(0, measured), (0, measured), (0, measured), ...   
13  [(0, measured), (0, measured), (0, measured), ...   
14  [(0, measured), (0, measured), (0, measured), ...   
18  [(0, measured), (0, measured), (0, measured), ...   
4                                                 NaN   
5   [(0, measured), (0, measured), (0, measured), ...   
6   [(0, measured), (0, measured), (0, measured), ...   
12  [(0, measured), (0, measured), (0, measured), ...   
9   [(0, measured), (0, measured), (0, measured), ...   
7   [(0, measured), (0, measured), (0, measured), ...   

                                      status_movement  
11  [(1, stationary), (1, stationary), (1

In [24]:
import os
import pandas as pd
import numpy as np

# Set the input and output directories
input_dir = "/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data"
output_dir = "/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Relevant columns to clean
relevant_columns = [
    "rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity",
    "orientation", "x", "y", "width_edge_mean", "length_edge_mean",
    "status_measurement", "status_movement", "overdrivable", "underdrivable",
     "reference_point", 
]

# Function to check object validity (handling NaN values)
def is_object_valid(overdrivable, underdrivable, measurement, movement):
    """
    Determine if an object is valid based on the given conditions:
    1. Sum of overdrivable + underdrivable ≤ 50
    2. Measurement is 'measured'
    3. Movement is either 0 or 'moved'
    """
    if pd.isna(overdrivable) or pd.isna(underdrivable) or pd.isna(measurement) or pd.isna(movement):
        return False

    # Extract actual values from tuples
    if isinstance(measurement, tuple):
        measurement = measurement[1]
    if isinstance(movement, tuple):
        movement = movement[1]

    return (
        (overdrivable + underdrivable <= 50) and  
        (measurement == "measured") and          
        (movement in [0, "moved"])               
    )

# Function to replace NaN values inside lists
def replace_nan_in_list(data, default_value):
    """
    Replaces NaN values inside lists with a default value.
    If `data` is not a list, it is converted to a list.
    """
    if isinstance(data, list):  
        return [default_value if pd.isna(item) else item for item in data]
    elif isinstance(data, (np.ndarray, pd.Series)):  # Handle arrays
        return [default_value if pd.isna(x) else x for x in data.tolist()]
    elif pd.isna(data):  # If it's a scalar NaN
        return [default_value]
    return [data]  # Convert single values into lists

# Cleaning function (handles NaNs in all columns)
def clean_invalid_objects(row):
    """
    Cleans invalid objects in a row while maintaining data integrity.
    """
    # Extract relevant validation columns
    overdrivable = replace_nan_in_list(row.get("overdrivable", []), 0)
    underdrivable = replace_nan_in_list(row.get("underdrivable", []), 0)
    measurement = replace_nan_in_list(row.get("status_measurement", []), (0, "unknown"))
    movement = replace_nan_in_list(row.get("status_movement", []), (0, "unknown"))

    # If lists are empty after processing, replace with default values
    if not measurement or all(pd.isna(x) for x in measurement):
        measurement = [(0, "unknown")]
    if not movement or all(pd.isna(x) for x in movement):
        movement = [(0, "unknown")]

    # Find the maximum number of objects in the row
    num_objects = max(len(overdrivable), len(underdrivable), len(measurement), len(movement))

    # Pad shorter columns to match `num_objects`
    overdrivable += [0] * (num_objects - len(overdrivable))
    underdrivable += [0] * (num_objects - len(underdrivable))
    measurement += [(0, "unknown")] * (num_objects - len(measurement))
    movement += [(0, "unknown")] * (num_objects - len(movement))

    # Create a validity mask for filtering objects
    valid_mask = [
        is_object_valid(
            overdrivable[i], underdrivable[i], measurement[i], movement[i]
        )
        for i in range(num_objects)
    ]

    # Apply filtering and handle NaNs in other columns
    for col in relevant_columns + ["overdrivable", "underdrivable", "status_measurement", "status_movement"]:
        if col in row:
            col_data = row[col]

            # **Fix**: Ensure `col_data` is treated correctly
            if not isinstance(col_data, list):
                col_data = replace_nan_in_list(col_data, 0 if col in ["rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity"] else "unknown")

            # Replace NaNs inside the list
            default_value = 0 if col in ["rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity"] else "unknown"
            col_data = replace_nan_in_list(col_data, default_value)

            # Pad missing values
            if col in ["overdrivable", "underdrivable", "rcs", "distance", "angleAzimuth", "angleElevation", "radialVelocity"]:
                col_data += [0] * (num_objects - len(col_data))
            else:
                col_data += ["unknown"] * (num_objects - len(col_data))

            # Apply filtering while maintaining list structure
            row[col] = [col_data[i] for i in range(num_objects) if valid_mask[i]]

    return row

# Loop through all files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):  # Process only pickle files
        file_path = os.path.join(input_dir, filename)
        df = pd.read_pickle(file_path)

        # Apply cleaning function to all rows
        df_cleaned = df.apply(clean_invalid_objects, axis=1)

        # Save the cleaned DataFrame
        output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + "_cleaned.p")
        df_cleaned.to_pickle(output_file)
        print(f"✅ Cleaned data saved to: {output_file}")


✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_05_frr40_detections_32_cleaned.p
✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_05_frr40_detections_17_cleaned.p
✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_22_frr40_detections_35_cleaned.p
✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_22_frr40_detections_31_cleaned.p
✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_19_frr40_detections_11_cleaned.p
✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_11_set1_frr40_detections_15_cleaned.p
✅ Cleaned data saved to: /home/q674749/workspace/thesis_work/rat2

In [29]:
import os
import pandas as pd
import numpy as np

# Set the input directory
input_dir = "/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data"

# Pick a sample file
sample_file = None
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):
        sample_file = os.path.join(input_dir, filename)
        break

if sample_file:
    print(f"📂 Checking Sample File: {sample_file}")
    
    # Load the dataset
    df = pd.read_pickle(sample_file)

    # Print column names and data types
    print("\n🔹 Column Data Types:")
    print(df.dtypes)

    # Function to safely print list-based data
    def safe_print(value):
        """ Ensures data inside lists is correctly displayed """
        if isinstance(value, list):
            return [repr(item.tolist() if isinstance(item, np.ndarray) else item) for item in value]
        elif isinstance(value, np.ndarray):
            return repr(value.tolist())
        else:
            return repr(value)

    # Print raw sample rows
    print("\n🔹 Raw Data from Random Rows:")
    sample_rows = df.sample(n=5)  # Pick 5 random rows
    for idx, row in sample_rows.iterrows():
        print(f"\n📌 Row Index: {idx}")
        for col in df.columns:
            print(f"  {col}: {safe_print(row[col])}")

else:
    print("⚠️ No .p files found in the input directory!")


📂 Checking Sample File: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/combined_data/combined_Pickle_files_05_frr40_detections_32.p

🔹 Column Data Types:
timestamp                  float64
rcs                         object
distance                    object
angleAzimuth                object
angleElevation              object
radialVelocity              object
radialVelocityDomainMax    float32
orientation                 object
x                           object
y                           object
width_edge_mean             object
length_edge_mean            object
status_measurement          object
status_movement             object
overdrivable                object
underdrivable               object
header.origin.x            float32
header.origin.y            float32
header.origin.z            float32
header.origin.roll         float32
header.origin.pitch        float32
header.origin.yaw          float32
reference_point             object
yaw_rate                   fl

In [28]:
import os
import pandas as pd
import numpy as np

# Set the input directory
input_dir = "/home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects"

# Pick a sample file
sample_file = None
for filename in os.listdir(input_dir):
    if filename.endswith(".p"):
        sample_file = os.path.join(input_dir, filename)
        break

if sample_file:
    print(f"📂 Checking Sample File: {sample_file}")
    
    # Load the dataset
    df = pd.read_pickle(sample_file)

    # Print column names and data types
    print("\n🔹 Column Data Types:")
    print(df.dtypes)

    # Function to correctly display list-based data
    def safe_print(value):
        """ Converts NumPy arrays, tuples, and lists into readable format """
        if isinstance(value, np.ndarray):
            return value.tolist()  # Convert NumPy arrays to lists
        elif isinstance(value, list):
            return [safe_print(item) for item in value]  # Handle nested structures
        elif isinstance(value, tuple):
            return tuple(safe_print(item) for item in value)  # Handle tuples inside lists
        else:
            return value  # Leave other values unchanged

    # Print raw sample rows with correctly displayed data
    print("\n🔹 Raw Data from Random Rows:")
    sample_rows = df.sample(n=5)  # Pick 5 random rows
    for idx, row in sample_rows.iterrows():
        print(f"\n📌 Row Index: {idx}")
        for col in df.columns:
            print(f"  {col}: {safe_print(row[col])}")

else:
    print("⚠️ No .p files found in the input directory!")


📂 Checking Sample File: /home/q674749/workspace/thesis_work/rat25-15.4.1/processed/cleaned_objects/combined_Pickle_files_29_frr40_detections_16_cleaned.p

🔹 Column Data Types:
timestamp                  float64
rcs                         object
distance                    object
angleAzimuth                object
angleElevation              object
radialVelocity              object
radialVelocityDomainMax    float64
orientation                 object
x                           object
y                           object
width_edge_mean             object
length_edge_mean            object
status_measurement          object
status_movement             object
overdrivable                object
underdrivable               object
header.origin.x            float64
header.origin.y            float64
header.origin.z            float64
header.origin.roll         float64
header.origin.pitch        float64
header.origin.yaw          float64
reference_point             object
yaw_rate           