# **.tdms to .parquet**

## Reading TDMS files from GCP (National Instruments data)

In [5]:
import sys
!{sys.executable} -m pip install nptdms



In [6]:
from nptdms import TdmsFile

# df = pd.read_csv('/content/drive/MyDrive/Emotor_short/0Nm_BPFI_03.tdms') # This caused a UnicodeDecodeError

with TdmsFile.open('/home/jhord/code/JKaly-prog/ML-Emotor/data/0Nm_BPFI_03.tdms') as tdms_file:
    # You can inspect the TDMS file structure:
    # print(tdms_file)

    # To get a DataFrame from a specific group or channel, you need to specify it.
    # For example, to get all data as a DataFrame:
    # (Note: This might need adjustment based on your specific TDMS file structure)
    df_03 = tdms_file.as_dataframe()

    # Display the first few rows of the DataFrame
    display(df_03.head())

    # You might want to access specific groups or channels like this:
    # group = tdms_file['Measured Data']
    # channel = group['Voltage']
    # channel_data = channel.data
    # df = pd.DataFrame({'Voltage': channel_data})

    print(df_03.shape)

Unnamed: 0,/'Log'/'cDAQ9185-1F486B5Mod1/ai0',/'Log'/'cDAQ9185-1F486B5Mod1/ai1',/'Log'/'cDAQ9185-1F486B5Mod2/ai0',/'Log'/'cDAQ9185-1F486B5Mod2/ai2',/'Log'/'cDAQ9185-1F486B5Mod2/ai3'
0,27.607992,28.217591,1.894377,0.949463,-2.165271
1,27.607992,28.217591,2.128889,0.926111,-2.309938
2,27.607992,28.217591,2.373001,0.882154,-2.526938
3,27.607992,28.217591,2.087747,1.193974,-2.444867
4,27.607992,28.217591,2.393572,1.125291,-2.718899


(1536492, 5)


## Reading .tdms files
TDMS Pipeline ‚Üí DataFrame. Read .tdms files from Google Cloud Storage (GCS) without saving them to disk

In [9]:
import io
import numpy as np
import pandas as pd
from tqdm import tqdm
from nptdms import TdmsFile
from google.cloud import storage

BUCKET_NAME = "emotor-dataset-raw"
INPUT_PREFIX = "current,temp/"          # OJO: tiene coma
OUTPUT_PREFIX = "current_temp_short/"   # destino

client = storage.Client()
bucket = client.bucket(BUCKET_NAME)


def list_tdms_files(prefix: str) -> list[str]:
    """
    List all TDMS files stored under a given prefix in a Google Cloud Storage bucket.

    This function scans a GCS bucket using the provided prefix (virtual folder)
    and returns the full blob paths of all files with a `.tdms` extension.

    Parameters
    ----------
    prefix : str
        GCS prefix (folder-like path) where TDMS files are located.
        Example: "current,temp/"

    Returns
    -------
    list[str]
        List of blob names (full paths) corresponding to `.tdms` files found
        under the given prefix.
    """
    blobs = client.list_blobs(BUCKET_NAME, prefix=prefix)
    return [b.name for b in blobs if b.name.lower().endswith(".tdms")]


def tdms_bytes_to_dataframe(tdms_bytes: bytes) -> pd.DataFrame:

    """
    Convert a TDMS file loaded in memory (bytes) into a pandas DataFrame.

    The function reads a TDMS file directly from bytes (without writing to disk),
    extracts all channels from all groups, and flattens channel data to 1D arrays.
    Column names are generated using the format: "<group_name>/<channel_name>".

    If channels have different lengths, all columns are truncated to the minimum
    length to ensure a rectangular DataFrame and avoid alignment errors.

    Parameters
    ----------
    tdms_bytes : bytes
        Raw bytes of a TDMS file, typically obtained from cloud storage
        (e.g., GCS blob.download_as_bytes()).

    Returns
    -------
    pandas.DataFrame
        DataFrame containing one column per TDMS channel.
        Returns an empty DataFrame if no channels are found.
    """

    tdms = TdmsFile.read(io.BytesIO(tdms_bytes))

    data = {}
    lengths = []

    for group in tdms.groups():
        for ch in group.channels():
            arr = np.asarray(ch[:])
            if arr.ndim > 1:
                arr = arr.reshape(-1)  # a 1D

            col = f"{group.name}/{ch.name}"
            data[col] = arr
            lengths.append(len(arr))

    if not data:
        return pd.DataFrame()

    # Alinear longitudes (por seguridad): truncar todo al m√≠nimo
    min_len = min(lengths)
    for k in list(data.keys()):
        data[k] = data[k][:min_len]

    df = pd.DataFrame(data)
    return df

## Convert .tdms to .parquet
This function includes uploading the `.parquet` file to the cloud. It does not upload empty Parquet files and prevents overwriting if `overwrite=False`.

In [34]:
STATUS_OK = "ok"
STATUS_SKIPPED = "skipped_exists"
STATUS_FAILED = "failed"

def convert_one_tdms_to_parquet_gcs(tdms_blob_name: str, overwrite: bool = False) -> str:
    """
    Convert a single TDMS file stored in Google Cloud Storage (GCS) into Parquet and upload it back to GCS.

    Workflow:
    1) Build output parquet path using OUTPUT_PREFIX and the TDMS base filename.
    2) Skip conversion if output exists and overwrite=False.
    3) Download TDMS bytes from GCS into memory.
    4) Parse TDMS bytes into a pandas DataFrame using `tdms_bytes_to_dataframe`.
    5) Validate that the DataFrame is not empty (guard rail).
    6) Serialize DataFrame to Parquet in-memory using a BytesIO buffer.
    7) Upload the resulting Parquet file to GCS.

    Parameters
    ----------
    tdms_blob_name : str
        Full blob name (path) to the input TDMS file in GCS.
        Example: "current,temp/0Nm_BPFI_03.tdms"
    overwrite : bool, default False
        If False, the function will skip conversion when the output parquet already exists.
        If True, the output parquet will be overwritten.

    Returns
    -------
    - "ok"              ‚Üí converted and uploaded successfully
    - "skipped_exists"  ‚Üí output already existed and overwrite=False
    - raises Exception  ‚Üí real failure

    Raises
    ------
    ValueError
        If the generated DataFrame is empty (0 rows or 0 columns). In that case, the parquet
        will NOT be uploaded.
    google.api_core.exceptions.GoogleAPIError
        If download or upload operations fail (raised by the GCS client libraries).
    """

    base_name = tdms_blob_name.split("/")[-1].replace(".tdms", ".parquet")
    out_blob_name = f"{OUTPUT_PREFIX}{base_name}"

    out_blob = bucket.blob(out_blob_name)
    if out_blob.exists() and not overwrite:
        print(f"‚è≠Ô∏è  Ya existe, salto: {out_blob_name}")
        return STATUS_SKIPPED

    print(f"\nüìÑ Procesando: gs://{BUCKET_NAME}/{tdms_blob_name}")

    # Download TDMS
    in_blob = bucket.blob(tdms_blob_name)
    tdms_bytes = in_blob.download_as_bytes()

    # Convert
    df = tdms_bytes_to_dataframe(tdms_bytes)

    # Guard rails: no subir parquets vac√≠os
    if df.shape[0] == 0 or df.shape[1] == 0:
        raise ValueError(f"DataFrame vac√≠o (df.shape={df.shape})")

    print(f"‚úÖ DF: shape={df.shape} cols={df.shape[1]}")

    # Write parquet to memory
    buf = io.BytesIO()
    df.to_parquet(buf, index=False, engine="pyarrow")
    buf.seek(0)

    # Upload
    out_blob.upload_from_file(buf, content_type="application/octet-stream")
    print(f"‚òÅÔ∏è  Subido: gs://{BUCKET_NAME}/{out_blob_name}")

    return STATUS_OK

In [None]:
def run_batch(overwrite: bool = False) -> list[str]:

    """
    Batch-convert all TDMS files found under INPUT_PREFIX in GCS into Parquet and upload them to OUTPUT_PREFIX.

    This function:
    - Lists TDMS blobs under INPUT_PREFIX via `list_tdms_files`.
    - Iterates through each file with a progress bar (tqdm).
    - Converts each TDMS to Parquet using `convert_one_tdms_to_parquet_gcs`.
    - Catches exceptions per-file so the batch continues even if one file fails.

    Parameters
    ----------
    overwrite : bool, default False
        If False, skip files whose parquet outputs already exist.
        If True, overwrite existing parquet outputs.

    Returns
    -------
    list[str]
        List of TDMS blob names that failed to convert or upload.
        Files skipped because output already existed are NOT included.
    """

    tdms_files = list_tdms_files(INPUT_PREFIX)
    print(f"TDMS encontrados: {len(tdms_files)} en '{INPUT_PREFIX}'")

    failed_files: list[str] = []

    for f in tqdm(tdms_files, desc="Convirtiendo TDMS ‚Üí Parquet"):
        try:
            status = convert_one_tdms_to_parquet_gcs(f, overwrite=overwrite)

            # Solo para logging opcional (no afecta failed_files)
            if status == STATUS_SKIPPED:
                continue

        except Exception as e:
            print(f"‚ö†Ô∏è Error con {f}: {e}\n")
            failed_files.append(f)

    print("\n Resumen batch")
    print(f" - Total TDMS:      {len(tdms_files)}")
    print(f" - Fallidos reales: {len(failed_files)}")

    if failed_files:
        print(" - Archivos con error:")
        for f in failed_files:
            print("   -", f)

    return failed_files

In [38]:
if __name__ == "__main__":
    failed_files = run_batch(overwrite=False)
    print("\nüìÑ failed_files =")
    print(failed_files)

üîé TDMS encontrados: 45 en 'current,temp/'


Convirtiendo TDMS ‚Üí Parquet:   2%|‚ñè         | 1/45 [00:01<01:18,  1.79s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_BPFI_03.parquet


Convirtiendo TDMS ‚Üí Parquet:   4%|‚ñç         | 2/45 [00:02<00:42,  1.01it/s]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_BPFI_10.parquet


Convirtiendo TDMS ‚Üí Parquet:   7%|‚ñã         | 3/45 [00:03<00:48,  1.15s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_BPFI_30.parquet

üìÑ Procesando: gs://emotor-dataset-raw/current,temp/0Nm_BPFO_03.tdms


Convirtiendo TDMS ‚Üí Parquet:   9%|‚ñâ         | 4/45 [00:10<02:18,  3.37s/it]

‚ö†Ô∏è Error con current,temp/0Nm_BPFO_03.tdms: DataFrame vac√≠o (df.shape=(0, 5))


üìÑ Procesando: gs://emotor-dataset-raw/current,temp/0Nm_BPFO_10.tdms


Convirtiendo TDMS ‚Üí Parquet:  11%|‚ñà         | 5/45 [00:16<02:59,  4.50s/it]

‚ö†Ô∏è Error con current,temp/0Nm_BPFO_10.tdms: DataFrame vac√≠o (df.shape=(0, 5))


üìÑ Procesando: gs://emotor-dataset-raw/current,temp/0Nm_BPFO_30.tdms


Convirtiendo TDMS ‚Üí Parquet:  13%|‚ñà‚ñé        | 6/45 [00:20<02:49,  4.35s/it]

‚ö†Ô∏è Error con current,temp/0Nm_BPFO_30.tdms: DataFrame vac√≠o (df.shape=(0, 5))



Convirtiendo TDMS ‚Üí Parquet:  16%|‚ñà‚ñå        | 7/45 [00:21<01:55,  3.04s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Misalign_01.parquet


Convirtiendo TDMS ‚Üí Parquet:  18%|‚ñà‚ñä        | 8/45 [00:22<01:33,  2.52s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Misalign_03.parquet


Convirtiendo TDMS ‚Üí Parquet:  20%|‚ñà‚ñà        | 9/45 [00:22<01:06,  1.84s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Misalign_05.parquet


Convirtiendo TDMS ‚Üí Parquet:  22%|‚ñà‚ñà‚ñè       | 10/45 [00:24<00:59,  1.71s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Normal.parquet


Convirtiendo TDMS ‚Üí Parquet:  24%|‚ñà‚ñà‚ñç       | 11/45 [00:24<00:45,  1.32s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Unbalance_0583mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  27%|‚ñà‚ñà‚ñã       | 12/45 [00:26<00:42,  1.30s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Unbalance_1169mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  29%|‚ñà‚ñà‚ñâ       | 13/45 [00:26<00:33,  1.05s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Unbalance_1751mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  31%|‚ñà‚ñà‚ñà       | 14/45 [00:27<00:33,  1.09s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Unbalance_2239mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  33%|‚ñà‚ñà‚ñà‚ñé      | 15/45 [00:28<00:27,  1.10it/s]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/0Nm_Unbalance_3318mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  36%|‚ñà‚ñà‚ñà‚ñå      | 16/45 [00:29<00:29,  1.02s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_BPFI_03.parquet


Convirtiendo TDMS ‚Üí Parquet:  38%|‚ñà‚ñà‚ñà‚ñä      | 17/45 [00:30<00:24,  1.13it/s]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_BPFI_10.parquet


Convirtiendo TDMS ‚Üí Parquet:  40%|‚ñà‚ñà‚ñà‚ñà      | 18/45 [00:31<00:26,  1.02it/s]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_BPFI_30.parquet

üìÑ Procesando: gs://emotor-dataset-raw/current,temp/2Nm_BPFO_03.tdms


Convirtiendo TDMS ‚Üí Parquet:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 19/45 [00:37<01:08,  2.65s/it]

‚ö†Ô∏è Error con current,temp/2Nm_BPFO_03.tdms: DataFrame vac√≠o (df.shape=(0, 5))


üìÑ Procesando: gs://emotor-dataset-raw/current,temp/2Nm_BPFO_10.tdms


Convirtiendo TDMS ‚Üí Parquet:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 20/45 [00:44<01:36,  3.86s/it]

‚ö†Ô∏è Error con current,temp/2Nm_BPFO_10.tdms: DataFrame vac√≠o (df.shape=(0, 5))


üìÑ Procesando: gs://emotor-dataset-raw/current,temp/2Nm_BPFO_30.tdms


Convirtiendo TDMS ‚Üí Parquet:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 21/45 [00:51<01:54,  4.77s/it]

‚ö†Ô∏è Error con current,temp/2Nm_BPFO_30.tdms: DataFrame vac√≠o (df.shape=(0, 5))



Convirtiendo TDMS ‚Üí Parquet:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 22/45 [00:51<01:19,  3.46s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Misalign_01.parquet
‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Misalign_03.parquet
‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Misalign_05.parquet


Convirtiendo TDMS ‚Üí Parquet:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 25/45 [00:52<00:32,  1.63s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Normal.parquet


Convirtiendo TDMS ‚Üí Parquet:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 26/45 [00:52<00:26,  1.40s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Unbalance_0583mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 27/45 [00:54<00:24,  1.39s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Unbalance_1169mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 28/45 [00:54<00:19,  1.13s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Unbalance_1751mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 29/45 [00:56<00:19,  1.21s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Unbalance_2239mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 30/45 [00:56<00:14,  1.00it/s]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/2Nm_Unbalance_3318mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 31/45 [00:57<00:15,  1.09s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_BPFI_03.parquet


Convirtiendo TDMS ‚Üí Parquet:  71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 32/45 [00:58<00:11,  1.13it/s]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_BPFI_10.parquet


Convirtiendo TDMS ‚Üí Parquet:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 33/45 [00:59<00:12,  1.04s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_BPFI_30.parquet

üìÑ Procesando: gs://emotor-dataset-raw/current,temp/4Nm_BPFO_03.tdms


Convirtiendo TDMS ‚Üí Parquet:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 34/45 [01:08<00:35,  3.25s/it]

‚ö†Ô∏è Error con current,temp/4Nm_BPFO_03.tdms: DataFrame vac√≠o (df.shape=(0, 5))


üìÑ Procesando: gs://emotor-dataset-raw/current,temp/4Nm_BPFO_10.tdms


Convirtiendo TDMS ‚Üí Parquet:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 35/45 [01:14<00:41,  4.17s/it]

‚ö†Ô∏è Error con current,temp/4Nm_BPFO_10.tdms: DataFrame vac√≠o (df.shape=(0, 5))


üìÑ Procesando: gs://emotor-dataset-raw/current,temp/4Nm_BPFO_30.tdms


Convirtiendo TDMS ‚Üí Parquet:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 36/45 [01:20<00:43,  4.81s/it]

‚ö†Ô∏è Error con current,temp/4Nm_BPFO_30.tdms: DataFrame vac√≠o (df.shape=(0, 5))



Convirtiendo TDMS ‚Üí Parquet:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 37/45 [01:21<00:28,  3.54s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Misalign_01.parquet
‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Misalign_03.parquet
‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Misalign_05.parquet


Convirtiendo TDMS ‚Üí Parquet:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 40/45 [01:21<00:08,  1.64s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Normal.parquet


Convirtiendo TDMS ‚Üí Parquet:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 41/45 [01:22<00:05,  1.42s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Unbalance_0583mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 42/45 [01:23<00:04,  1.37s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Unbalance_1169mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 43/45 [01:24<00:02,  1.20s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Unbalance_1751mg.parquet


Convirtiendo TDMS ‚Üí Parquet:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 44/45 [01:25<00:01,  1.17s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Unbalance_2239mg.parquet


Convirtiendo TDMS ‚Üí Parquet: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45/45 [01:26<00:00,  1.92s/it]

‚è≠Ô∏è  Ya existe, salto: current_temp_short/4Nm_Unbalance_3318mg.parquet

üìä Resumen batch
 - Total TDMS:      45
 - Fallidos reales: 9
‚ùå Archivos con error:
   - current,temp/0Nm_BPFO_03.tdms
   - current,temp/0Nm_BPFO_10.tdms
   - current,temp/0Nm_BPFO_30.tdms
   - current,temp/2Nm_BPFO_03.tdms
   - current,temp/2Nm_BPFO_10.tdms
   - current,temp/2Nm_BPFO_30.tdms
   - current,temp/4Nm_BPFO_03.tdms
   - current,temp/4Nm_BPFO_10.tdms
   - current,temp/4Nm_BPFO_30.tdms

üìÑ failed_files =
['current,temp/0Nm_BPFO_03.tdms', 'current,temp/0Nm_BPFO_10.tdms', 'current,temp/0Nm_BPFO_30.tdms', 'current,temp/2Nm_BPFO_03.tdms', 'current,temp/2Nm_BPFO_10.tdms', 'current,temp/2Nm_BPFO_30.tdms', 'current,temp/4Nm_BPFO_03.tdms', 'current,temp/4Nm_BPFO_10.tdms', 'current,temp/4Nm_BPFO_30.tdms']





In [37]:
print(failed_files)

['current,temp/0Nm_BPFO_03.tdms', 'current,temp/0Nm_BPFO_10.tdms', 'current,temp/0Nm_BPFO_30.tdms', 'current,temp/2Nm_BPFO_03.tdms', 'current,temp/2Nm_BPFO_10.tdms', 'current,temp/2Nm_BPFO_30.tdms', 'current,temp/4Nm_BPFO_03.tdms', 'current,temp/4Nm_BPFO_10.tdms', 'current,temp/4Nm_BPFO_30.tdms']


## Review failed_files

In [None]:
BUCKET_NAME = "emotor-dataset-raw"

client = storage.Client()
bucket = client.bucket(BUCKET_NAME)


def build_df_replacing_v8_from_gcs(tdms_blob_name: str) -> dict:
    """
    Download a TDMS from GCS, build a DataFrame aligning channels by index,
    and replace any channel with dtype '|V8' with NaNs (float64).

    Returns a dict with:
      - 'df': pandas.DataFrame
      - 'v8_channels': list[str]
      - 'lengths': dict[str, int]
      - 'min_len': int
      - 'max_len': int
    """
    blob = bucket.blob(tdms_blob_name)

    tdms_bytes = blob.download_as_bytes()
    tdms = TdmsFile(io.BytesIO(tdms_bytes))

    data = {}
    v8_channels = []
    lengths = {}

    for g in tdms.groups():
        for ch in g.channels():
            arr = ch[:]
            col = f"{g.name}/{ch.name}"
            lengths[col] = len(arr)

            if getattr(arr, "dtype", None) == "|V8":
                v8_channels.append(col)
                # Serie de NaN con el mismo largo del canal
                data[col] = pd.Series(np.nan, index=np.arange(len(arr)), dtype="float64")
            else:
                # Convertimos a num√©rico seguro (strings/bytes raros -> NaN)
                s = pd.to_numeric(pd.Series(arr), errors="coerce")
                data[col] = s.reset_index(drop=True)

    # ‚úÖ Alinea por √≠ndice autom√°ticamente; rellena con NaN donde falte data
    df = pd.DataFrame(data)

    min_len = min(lengths.values()) if lengths else 0
    max_len = max(lengths.values()) if lengths else 0

    return {
        "df": df,
        "v8_channels": v8_channels,
        "lengths": lengths,
        "min_len": min_len,
        "max_len": max_len,
    }


# =========================
# Aplicar a failed_files
# =========================

# Asume que failed_files ya existe (devuelto por run_batch)
# failed_files = [...]

rescued_results = {}   # blob_name -> dict(resultados)
still_failed = []      # por si alguno falla incluso con este m√©todo

for blob_name in failed_files:
    try:
        print(f"\nüìÑ Re-procesando (rescue): gs://{BUCKET_NAME}/{blob_name}")

        res = build_df_replacing_v8_from_gcs(blob_name)
        df = res["df"]
        v8_channels = res["v8_channels"]

        print("‚úÖ DataFrame creado:", df.shape)
        print("üìè Longitudes (min/max):", res["min_len"], "/", res["max_len"])
        print(f"‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): {len(v8_channels)}")

        for c in v8_channels[:20]:
            print(" -", c)
        if len(v8_channels) > 20:
            print(f" ... ({len(v8_channels)-20} m√°s)")

        # Guarda resultados (por si luego quieres convertir a parquet)
        rescued_results[blob_name] = res

        # (Opcional) mostrar head
        display(df.head() if "display" in globals() else df.head())

    except Exception as e:
        print(f"‚ùå Sigue fallando {blob_name}: {e}")
        still_failed.append(blob_name)

print("\n RESUMEN RESCUE")
print(f" - failed_files (entrada): {len(failed_files)}")
print(f" - rescatados (DF creado): {len(rescued_results)}")
print(f" - siguen fallando:        {len(still_failed)}")
if still_failed:
    print("‚ùå still_failed =")
    print(still_failed)



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/0Nm_BPFO_03.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,28.993472,29.995407,-1.185821,,
1,28.993472,29.995407,-1.137821,,
2,28.993472,29.995407,-0.945823,,
3,28.993472,29.995407,-0.951309,,
4,28.993472,29.995407,-0.930738,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/0Nm_BPFO_10.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,28.66123,29.024126,-2.850719,,
1,28.66123,29.024126,-2.784891,,
2,28.66123,29.024126,-2.710834,,
3,28.66123,29.024126,-2.554493,,
4,28.66123,29.024126,-2.607978,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/0Nm_BPFO_30.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,27.566317,29.283572,3.276764,,
1,27.566317,29.283572,3.264421,,
2,27.566317,29.283572,3.619617,,
3,27.566317,29.283572,3.367277,,
4,27.566317,29.283572,3.076537,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/2Nm_BPFO_03.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,31.345948,31.595729,3.638817,,
1,31.345948,31.595729,3.870586,,
2,31.345948,31.595729,3.320649,,
3,31.345948,31.595729,3.035395,,
4,31.345948,31.595729,3.012081,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/2Nm_BPFO_10.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,29.501608,29.380241,3.52499,,
1,29.501608,29.380241,3.22465,,
2,29.501608,29.380241,3.494819,,
3,29.501608,29.380241,3.712874,,
4,29.501608,29.380241,3.487962,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/2Nm_BPFO_30.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,30.26951,31.082377,-2.87129,,
1,30.26951,31.082377,-3.039974,,
2,30.26951,31.082377,-2.860318,,
3,30.26951,31.082377,-2.701234,,
4,30.26951,31.082377,-2.764319,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/4Nm_BPFO_03.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,32.544393,33.115993,-3.854594,,
1,32.544393,33.115993,-3.369113,,
2,32.544393,33.115993,-3.75311,,
3,32.544393,33.115993,-3.810709,,
4,32.544393,33.115993,-3.810709,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/4Nm_BPFO_10.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,31.414113,31.734289,-3.44317,,
1,31.414113,31.734289,-3.711967,,
2,31.414113,31.734289,-3.692767,,
3,31.414113,31.734289,-3.672196,,
4,31.414113,31.734289,-3.990364,,



üìÑ Re-procesando (rescue): gs://emotor-dataset-raw/current,temp/4Nm_BPFO_30.tdms
‚úÖ DataFrame creado: (1536492, 5)
üìè Longitudes (min/max): 0 / 1536492
‚ö†Ô∏è Canales reemplazados (|V8 ‚Üí NaN): 2
 - Log/cDAQ9185-1F486B5Mod2/ai2
 - Log/cDAQ9185-1F486B5Mod2/ai3


Unnamed: 0,Log/cDAQ9185-1F486B5Mod1/ai0,Log/cDAQ9185-1F486B5Mod1/ai1,Log/cDAQ9185-1F486B5Mod2/ai0,Log/cDAQ9185-1F486B5Mod2/ai2,Log/cDAQ9185-1F486B5Mod2/ai3
0,31.570303,32.498502,3.302821,,
1,31.570303,32.498502,2.966824,,
2,31.570303,32.498502,2.688427,,
3,31.570303,32.498502,2.722712,,
4,31.570303,32.498502,2.441572,,



üìå RESUMEN RESCUE
 - failed_files (entrada): 9
 - rescatados (DF creado): 9
 - siguen fallando:        0


In [44]:
# OUTPUT_PREFIX = "current_temp_short/"    # ya declarado arriba

rescued_uploaded = []
rescued_failed_upload = []

for blob_name, res in rescued_results.items():
    try:
        df = res["df"]

        if df.shape[0] == 0 or df.shape[1] == 0:
            raise ValueError(f"DF vac√≠o luego del rescue (shape={df.shape})")

        base_name = blob_name.split("/")[-1].replace(".tdms", ".parquet")
        out_blob_name = f"{OUTPUT_PREFIX}{base_name}"

        print(f"\n‚òÅÔ∏è Subiendo rescue parquet: gs://{BUCKET_NAME}/{out_blob_name}")

        buf = io.BytesIO()
        df.to_parquet(buf, index=False, engine="pyarrow")
        buf.seek(0)

        out_blob = bucket.blob(out_blob_name)
        out_blob.upload_from_file(buf, content_type="application/octet-stream")

        print(f"‚úÖ Rescue subido OK: {out_blob_name}")
        rescued_uploaded.append(blob_name)

    except Exception as e:
        print(f"‚ùå Error subiendo rescue {blob_name}: {e}")
        rescued_failed_upload.append(blob_name)



‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/current_temp_short/0Nm_BPFO_03.parquet
‚úÖ Rescue subido OK: current_temp_short/0Nm_BPFO_03.parquet

‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/current_temp_short/0Nm_BPFO_10.parquet
‚úÖ Rescue subido OK: current_temp_short/0Nm_BPFO_10.parquet

‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/current_temp_short/0Nm_BPFO_30.parquet
‚úÖ Rescue subido OK: current_temp_short/0Nm_BPFO_30.parquet

‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/current_temp_short/2Nm_BPFO_03.parquet
‚úÖ Rescue subido OK: current_temp_short/2Nm_BPFO_03.parquet

‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/current_temp_short/2Nm_BPFO_10.parquet
‚úÖ Rescue subido OK: current_temp_short/2Nm_BPFO_10.parquet

‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/current_temp_short/2Nm_BPFO_30.parquet
‚úÖ Rescue subido OK: current_temp_short/2Nm_BPFO_30.parquet

‚òÅÔ∏è Subiendo rescue parquet: gs://emotor-dataset-raw/c