# In [None]:  (notebook cell prompt left over from the export)
import pandas as pd
import numpy as np
import re

# ---------- 1. Load dataset ----------
# Point this at the CSV file you want to clean
file_path = "robot_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)


# ---------- 2. Handle mixed types & symbolic artifacts ----------
# Matches one number written in scientific notation, e.g. "1e-05" or "-2.5E+3".
_SCI_NOTATION_RE = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)[eE][-+]?\d+')


def clean_numeric(value):
    """Convert a raw cell to a float, discarding symbols around the number.

    Args:
        value: Any scalar cell value (string, number, or missing marker).

    Returns:
        The parsed float, or ``np.nan`` when the value is missing or no
        parsable number is left after cleaning.
    """
    if pd.isna(value):
        return np.nan

    text = str(value)

    # Scientific notation must be parsed before stripping: the character
    # filter below removes "e"/"E", which would turn "1.5e3" into 1.53 and
    # str(1e-05) == "1e-05" into the unparsable "1-05".
    sci_match = _SCI_NOTATION_RE.search(text)
    if sci_match:
        return float(sci_match.group())

    # Plain numbers: keep only digits, "." and "-", then let float() decide.
    cleaned = re.sub(r'[^0-9.\-]', '', text)
    try:
        return float(cleaned)
    except ValueError:
        return np.nan

# Select every column whose name mentions one of the signal keywords below
# (edit the tuple so it matches the columns of your dataset)
feature_cols = [
    name
    for name in df.columns
    if any(keyword in name for keyword in ("JOINT", "POSITION", "VELOCITY", "CURRENT"))
]

# Run the scalar cleaner over each selected column, one column at a time
for col in feature_cols:
    df[col] = df[col].apply(clean_numeric)

print("\nAfter cleaning symbols and casting to float.")
print(df[feature_cols].head())

# ---------- 3. Handle missing values ----------
def _ffill_short_gaps(column, max_gap=2):
    """Forward-fill only the NaN runs of at most `max_gap` consecutive rows.

    Longer runs are left untouched so they can be interpolated from the last
    real observation. A plain ``ffill(limit=max_gap)`` would instead hold the
    previous value for the first `max_gap` rows of every gap, however wide.

    Args:
        column: A pandas Series of numeric values.
        max_gap: Longest run of consecutive NaNs that is forward-filled.

    Returns:
        A Series with the same index as `column`.
    """
    missing = column.isna()
    # A new run starts wherever the missing flag differs from the previous row.
    run_id = missing.astype(int).diff().ne(0).cumsum()
    run_length = missing.groupby(run_id).transform("size")
    short_gap = missing & (run_length <= max_gap)
    return column.where(~short_gap, column.ffill())


# Short gaps (1–2 steps): forward fill
for col in feature_cols:
    df[col] = _ffill_short_gaps(df[col], max_gap=2)

# Wider gaps: linear interpolation
df[feature_cols] = df[feature_cols].interpolate(method="linear")

# Drop rows with too many missing values across joints.
# `subset` restricts the count to the feature columns; without it, non-feature
# columns (ids, timestamps, ...) would count toward the threshold.
df = df.dropna(thresh=len(feature_cols) - 1, subset=feature_cols, axis=0)

print("\nAfter handling missing values.")
print(df[feature_cols].head())

# ---------- 4. Integrity check ----------
# Gather every feature column whose dtype is still not numeric
non_numeric_cols = [
    name
    for name in feature_cols
    if not pd.api.types.is_numeric_dtype(df[name])
]
if not non_numeric_cols:
    print(" All feature columns are now numeric and consistent.")
else:
    print(f" Warning: Non-numeric values remain in {non_numeric_cols}")

# ---------- 5. Save cleaned dataset ----------
# Write the cleaned frame to disk, leaving the row index out of the file
cleaned_file = "robot_data_cleaned.csv"
df.to_csv(path_or_buf=cleaned_file, index=False)
print(f"\nCleaned dataset saved to {cleaned_file}")