In [5]:
import numpy as np
import pandas as pd
from datetime import datetime
import json

def extract(filepath, data_type, rul_path=None):
    col_engid = "engine_id"
    col_cycno = "cycle_number"
    cols_setting = [f"setting_{i}" for i in range(1, 4)]
    cols_sensor = [f"sensor_{i}" for i in range(1, 22)]
    cols_placeholder = ["empty_1", "empty_2"]
    columns = [col_engid, col_cycno] + cols_setting + cols_sensor + cols_placeholder

    df = pd.read_table(filepath, sep=" ", header=None, names=columns).drop(cols_placeholder, axis=1)
    src_file = {"data": filepath}

    if data_type == "train":
        df = pd.merge(df, df.groupby("engine_id").agg(last_cycle_number=("cycle_number", "max")), on="engine_id", how="left")
        df["rul"] = df["last_cycle_number"] - df["cycle_number"]
        df = df.drop("last_cycle_number", axis=1)

    elif data_type == "test":
        if rul_path is not None:
            df_rul = pd.read_table(rul_path, header=None, names=["rul_last"])
            df_rul["engine_id"] = df_rul.index + 1
            df = pd.merge(df, df_rul, on="engine_id", how="left")
            df = pd.merge(df, df.groupby("engine_id").agg(last_cycle_number=("cycle_number", "max")), on="engine_id", how="left")
            df["rul"] = df["rul_last"] + (df["last_cycle_number"] - df["cycle_number"])
            df = df.drop(["rul_last", "last_cycle_number"], axis=1)

        else:
            df["rul"] = np.nan
        
        src_file["rul"] = rul_path

    df["ingested_at"] = pd.to_datetime(datetime.now(), errors="coerce")
    df["source_file"] = json.dumps(src_file)
    return df

# df = extract("../../data/raw/train_FD001.txt", "train")
# df = extract("../../data/raw/test_FD001.txt", "test")
df = extract("../../data/raw/test_FD001.txt", "test", "../../data/raw/RUL_FD001.txt")

df

Unnamed: 0,engine_id,cycle_number,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,rul,ingested_at,source_file
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,8.4052,0.03,392,2388,100.0,38.86,23.3735,142,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,8.3803,0.03,393,2388,100.0,39.02,23.3916,141,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,8.4441,0.03,393,2388,100.0,39.08,23.4166,140,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
3,1,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,...,8.3917,0.03,391,2388,100.0,39.00,23.3737,139,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
4,1,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,...,8.4031,0.03,390,2388,100.0,38.99,23.4130,138,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,100,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,...,8.4715,0.03,394,2388,100.0,38.65,23.1974,24,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
13092,100,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,...,8.4512,0.03,395,2388,100.0,38.57,23.2771,23,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
13093,100,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,...,8.4569,0.03,395,2388,100.0,38.62,23.2051,22,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
13094,100,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,...,8.4711,0.03,395,2388,100.0,38.66,23.2699,21,2025-11-12 21:04:34.708185,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."


In [9]:
def quick_validate(df):
    if "engine_id" not in df.columns:
        raise ValueError(f"Missing required columns: engine_id")
    df["ingested_at"] = pd.to_datetime(datetime.now(), errors="coerce")
    if df["ingested_at"].isna().any():
        raise ValueError("Some ingested_at values could not be parsed")
    # ensure sensors numeric
    sensor_cols = [c for c in df.columns if c.startswith("sensor_")]
    df[sensor_cols] = df[sensor_cols].apply(pd.to_numeric, errors="coerce")
    return df

quick_validate(df)

Unnamed: 0,engine_id,cycle_number,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,rul,ingested_at,source_file
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,8.4052,0.03,392,2388,100.0,38.86,23.3735,142,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,8.3803,0.03,393,2388,100.0,39.02,23.3916,141,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,8.4441,0.03,393,2388,100.0,39.08,23.4166,140,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
3,1,4,0.0042,0.0000,100.0,518.67,642.44,1584.12,1406.42,14.62,...,8.3917,0.03,391,2388,100.0,39.00,23.3737,139,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
4,1,5,0.0014,0.0000,100.0,518.67,642.51,1587.19,1401.92,14.62,...,8.4031,0.03,390,2388,100.0,38.99,23.4130,138,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13091,100,194,0.0049,0.0000,100.0,518.67,643.24,1599.45,1415.79,14.62,...,8.4715,0.03,394,2388,100.0,38.65,23.1974,24,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
13092,100,195,-0.0011,-0.0001,100.0,518.67,643.22,1595.69,1422.05,14.62,...,8.4512,0.03,395,2388,100.0,38.57,23.2771,23,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
13093,100,196,-0.0006,-0.0003,100.0,518.67,643.44,1593.15,1406.82,14.62,...,8.4569,0.03,395,2388,100.0,38.62,23.2051,22,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
13094,100,197,-0.0038,0.0001,100.0,518.67,643.26,1594.99,1419.36,14.62,...,8.4711,0.03,395,2388,100.0,38.66,23.2699,21,2025-11-12 21:07:23.994850,"{""data"": ""../../data/raw/test_FD001.txt"", ""rul..."
