Connected to Python 3.12.8

In [5]:
import pandas as pd
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [6]:
# Locate the repository root: the parent of the directory this code runs from.
try:
    # Script execution: start from the directory holding this file.
    _start_dir = os.path.dirname(__file__)
except NameError:
    # Jupyter/Colab define no __file__, so start from the working directory
    # (assumes the notebook lives in notebooks/, one level below the root).
    _start_dir = os.curdir

REPO_ROOT = os.path.abspath(os.path.join(_start_dir, os.pardir))

In [7]:
# Both data folders sit under <REPO_ROOT>/data; only the leaf name differs.
RAW_PATH, PROCESSED_PATH = (
    os.path.join(REPO_ROOT, "data", leaf) for leaf in ("raw", "processed")
)

# Create the output folder up front (no error if it is already there).
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [8]:
# Load just the drive11 file and preview its first rows.
drive11_df = pd.read_csv(
    os.path.join(RAW_PATH, "drive11.csv"),
    index_col=False,
)
print("Drive11 head:", drive11_df.head(), sep="\n")

Drive11 head:
   ENGINE_RUN_TINE ()  ENGINE_RPM ()  VEHICLE_SPEED ()  THROTTLE ()  \
0                 0.0            0.0               0.0    17.647058   
1                 0.0            0.0               0.0    17.647058   
2                 0.0            0.0               0.0    17.254902   
3                 0.0            0.0               0.0    17.254902   
4                 0.0            0.0               0.0    17.254902   

   ENGINE_LOAD ()  COOLANT_TEMPERATURE ()  LONG_TERM_FUEL_TRIM_BANK_1 ()  \
0             0.0                    78.0                       -2.34375   
1             0.0                    78.0                       -2.34375   
2             0.0                    78.0                       -2.34375   
3             0.0                    78.0                       -2.34375   
4             0.0                    78.0                       -2.34375   

   SHORT_TERM_FUEL_TRIM_BANK_1 ()  INTAKE_MANIFOLD_PRESSURE ()  FUEL_TANK ()  \
0                     

In [9]:
# Dataset name -> CSV file name; every file is simply "<name>.csv".
datasets = {
    stem: f"{stem}.csv"
    for stem in ("drive11", "idle30", "live20", "long12", "ufpe1")
}

# Make sure the processed-data folder is present before anything is written to it.
os.makedirs(name=PROCESSED_PATH, exist_ok=True)

In [10]:
def clean_column(col):
    """
    Normalize a raw column label into a lowercase, underscore-separated name.

    Steps, in order:
    - convert the label to ``str`` so non-string labels (e.g. ints) are accepted
    - strip surrounding whitespace
    - remove ' ()'
    - remove any remaining '(' or ')'
    - trim again and collapse every run of whitespace into a single '_'
    - lowercase everything

    Parameters
    ----------
    col : object
        Raw column label, typically a string such as ``'ENGINE_RPM ()'``.

    Returns
    -------
    str
        The cleaned name, e.g. ``'engine_rpm'``.
    """
    col = str(col).strip()
    col = col.replace(" ()", "")
    col = col.replace("(", "").replace(")", "")
    # split() with no argument drops leading/trailing whitespace and treats a
    # whitespace run as one separator, so whitespace left behind by the
    # parenthesis removal cannot turn into stray or doubled underscores.
    return "_".join(col.split()).lower()

# Announce the start of the run; the trailing "\n" in the message leaves a
# blank line after the banner in the cell output.
print("Starting dataset preparation...\n")

Starting dataset preparation...



In [11]:
# Clean every file listed in `datasets`: read it from RAW_PATH, normalize the
# column names, report basic diagnostics, drop fully-empty rows, and write the
# result to PROCESSED_PATH as "<name>_clean.csv".
for name, file in datasets.items():
    print("\n------------------------------------------")
    print(f"Processing: {name} ({file})")
    print("------------------------------------------")

    # 1. Load dataset. index_col=False is important: it tells pandas not to
    #    use the first column as the index.
    df = pd.read_csv(os.path.join(RAW_PATH, file), index_col=False)

    # 2. Show original columns
    print("\nOriginal columns:")
    print(df.columns.tolist())

    # 3. Clean column names
    df.columns = [clean_column(c) for c in df.columns]

    # 4. Print cleaned columns
    print("\nCleaned columns:")
    print(df.columns.tolist())

    # 5. Show shape and dtypes
    print("\nShape:", df.shape)
    print("\nData types:")
    print(df.dtypes)

    # 6. Check for missing values (reported before any rows are dropped)
    print("\nMissing values per column:")
    print(df.isna().sum())

    # 7. Remove fully empty rows (rows where every column is NaN)
    empty_rows = df.isna().all(axis=1).sum()
    if empty_rows > 0:
        print(f"\nRemoving {empty_rows} fully-empty rows...")
        df = df.dropna(how="all")

    # 8. Show sample rows
    print("\nFirst 5 rows:")
    print(df.head())

    # 9. Save cleaned dataset; index=False keeps the row index out of the CSV
    output_path = os.path.join(PROCESSED_PATH, f"{name}_clean.csv")
    df.to_csv(output_path, index=False)

    print(f"\nSaved cleaned dataset → {output_path}")

print("\nAll datasets processed successfully! 🚀")


------------------------------------------
Processing: drive11 (drive11.csv)
------------------------------------------

Original columns:
['ENGINE_RUN_TINE ()', 'ENGINE_RPM ()', 'VEHICLE_SPEED ()', 'THROTTLE ()', 'ENGINE_LOAD ()', 'COOLANT_TEMPERATURE ()', 'LONG_TERM_FUEL_TRIM_BANK_1 ()', 'SHORT_TERM_FUEL_TRIM_BANK_1 ()', 'INTAKE_MANIFOLD_PRESSURE ()', 'FUEL_TANK ()', 'ABSOLUTE_THROTTLE_B ()', 'PEDAL_D ()', 'PEDAL_E ()', 'COMMANDED_THROTTLE_ACTUATOR ()', 'FUEL_AIR_COMMANDED_EQUIV_RATIO ()', 'ABSOLUTE_BAROMETRIC_PRESSURE ()', 'RELATIVE_THROTTLE_POSITION ()', 'INTAKE_AIR_TEMP ()', 'TIMING_ADVANCE ()', 'CATALYST_TEMPERATURE_BANK1_SENSOR1 ()', 'CATALYST_TEMPERATURE_BANK1_SENSOR2 ()', 'CONTROL_MODULE_VOLTAGE ()', 'COMMANDED_EVAPORATIVE_PURGE ()', 'TIME_RUN_WITH_MIL_ON ()', 'TIME_SINCE_TROUBLE_CODES_CLEARED ()', 'DISTANCE_TRAVELED_WITH_MIL_ON ()', 'WARM_UPS_SINCE_CODES_CLEARED ()']

Cleaned columns:
['engine_run_tine', 'engine_rpm', 'vehicle_speed', 'throttle', 'engine_load', 'coolant_temp