It's time to finally do actual work with data! If you are here to reproduce code, make sure you run the build_manifest.py first and have generated the `manifest.json` file.

In [None]:
import pandas as pd
import sys
from pathlib import Path

ROOT = Path.cwd().parents[0]
sys.path.insert(0, str(ROOT))

from src.utils import seed_everything
seed_everything(1337)

# Get filepaths from manifest
manifest = pd.read_json("../data/manifest.json")

df = pd.read_csv("../" + manifest.loc["path", "filelist"])
manifest

NameError: name 'args' is not defined

In [6]:
# Before I forget, I will add a column label_hfref which is 1 when EF <= 40 else 0
# We follow the clinical definition of hfref.
df["label_hfref"] = (df["EF"] <= 40).astype("int8")
df.columns
df.head()


Unnamed: 0,FileName,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,label_hfref
0,0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL,0
1,0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN,0
2,0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN,0
3,0X10075961BC11C88E,54.545097,33.143084,72.91421,112,112,55,122,TRAIN,0
4,0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL,1


## Quality Control

### 0. Check missing values
and do imputations/removals if necessary

In [7]:
df.isna().sum()

FileName          0
EF                0
ESV               0
EDV               0
FrameHeight       0
FrameWidth        0
FPS               0
NumberOfFrames    0
Split             0
label_hfref       0
dtype: int64

### 1. Verify existence of videos according to given data

In [None]:
video_list = df["FileName"]

data_dir = Path("../" + manifest.loc["root", "videos"])

missing_count = 0
missing_indices = []
missing_files = []

# Loop through every video in list and make sure they actually exist.
for index, video in video_list.items():
    
    video = video + ".avi"  #Include file suffix
    
    # skip NaN / empty strings safely
    if not isinstance(video, str) or not video.strip():
        missing_count += 1
        missing_indices.append(index)
        missing_files.append(video)
        continue

    path = data_dir / video  # each video path

    if not path.is_file():
        missing_count += 1
        missing_indices.append(index)
        missing_files.append(str(path))

print(len(video_list))
print("Missing:", missing_count)
print("Missing indices:", missing_indices)
print("Missing files:", missing_files)

if missing_count == 0:
    print("\nAll clear!")
else:
    # drop missing indices
    df_e = df.drop(index=missing_indices)
    print("Missing files detected. Please verify before continuing.")


10030
Missing: 0
Missing indices: []
Missing files: []

All clear!


### 2. Verify video metadata (size, frames)

May take up to 5 minutes

The below function serves the purpose of verifying:
- Video existence
- Correct frame count and FPS
- Whether all videos have 112 x 112 resolution

In [30]:
from pathlib import Path
import cv2
import pandas as pd
import numpy as np
import hashlib

def _sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

def verify_avis_against_df(
    df: pd.DataFrame,
    base_dir,
    file_col="FileName",
    expected_fps_col="FPS",
    expected_frames_col="NumberOfFrames",
    fps_tol=0.25,          # tolerance for fps floats (tweak if needed)
    default_ext=".avi",
    expected_width=112,
    expected_height=112,
    id_col=None,           # Name of ID Column
    compute_sha256=False, 
):
    base_dir = Path(base_dir)
    count = 0
    rows = []
    
    for _, r in df.iterrows():
        name = str(r[file_col])
        # add .avi if missing
        if not name.lower().endswith(default_ext):
            name = name + default_ext

        # Video File Path
        p = base_dir / name

        exists = p.is_file()
        size_bytes = p.stat().st_size if exists else 0
        size_ok = exists and size_bytes > 0

        opened = False
        actual_fps = np.nan
        actual_frames = np.nan
        actual_width = np.nan
        actual_height = np.nan
        sha256 = None

        if size_ok:
            if compute_sha256:
                # exact byte-for-byte identity
                sha256 = _sha256_file(p)
                
            cap = cv2.VideoCapture(str(p))
            opened = cap.isOpened()
            if opened:
                actual_fps = float(cap.get(cv2.CAP_PROP_FPS))
                actual_frames = int(round(cap.get(cv2.CAP_PROP_FRAME_COUNT)))
                actual_width = int(round(cap.get(cv2.CAP_PROP_FRAME_WIDTH)))
                actual_height = int(round(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
                count += 1
            cap.release()
            

        exp_fps = float(r[expected_fps_col])
        exp_frames = int(r[expected_frames_col])

        fps_ok = opened and np.isfinite(actual_fps) and (abs(actual_fps - exp_fps) <= fps_tol)
        frames_ok = opened and np.isfinite(actual_frames) and (int(actual_frames) == exp_frames)

        resolution_ok = (
            opened
            and np.isfinite(actual_width) and np.isfinite(actual_height)
            and int(actual_width) == int(expected_width)
            and int(actual_height) == int(expected_height)
        )
        
        ok = size_ok and opened and fps_ok and frames_ok and resolution_ok

        row_out = {
            file_col: r[file_col],
            "path": str(p),
            "exists": exists,
            "size_bytes": size_bytes,
            "opened": opened,
            "expected_fps": exp_fps,
            "actual_fps": actual_fps,
            "expected_frames": exp_frames,
            "actual_frames": actual_frames,
            "expected_width": int(expected_width),
            "actual_width": actual_width,
            "expected_height": int(expected_height),
            "actual_height": actual_height,
            "size_ok": size_ok,
            "fps_ok": fps_ok,
            "frames_ok": frames_ok,
            "resolution_ok": resolution_ok,
            "sha256": sha256,
            "ok": ok,
        }
        
        if id_col is not None:
            row_out[id_col] = r[id_col]

        rows.append(row_out)

    report = pd.DataFrame(rows)
    return report, count

Next, we verify that all videos are unique, i.e. no duplicates under different names. We do this by comparing the SHA256 codes of the videos (The actual code that extracts SHA256 is in the previous block).

In [32]:
def find_identical_videos_within_id(report: pd.DataFrame, id_col: str, hash_col: str = "sha256") -> pd.DataFrame:
    # Only consider rows that actually produced a hash
    x = report[report[hash_col].notna()].copy()

    # keep=False marks all members of a duplicate group
    x["identical_within_id"] = x.duplicated(subset=[id_col, hash_col], keep=False)

    # Return only the duplicate groups (sorted for readability)
    dupes = x[x["identical_within_id"]].sort_values([id_col, hash_col, "path"])
    return dupes


In [33]:
report, count = verify_avis_against_df(
    df,
    base_dir=data_dir,
    id_col="FileName",
    compute_sha256=True
)

dupes_within_id = find_identical_videos_within_id(report, id_col="FileName")


If all videos are within expectations, then the below should return empty dataframes.

In [36]:
print(dupes_within_id)
print(report[report["ok"] != True])

Empty DataFrame
Columns: [FileName, path, exists, size_bytes, opened, expected_fps, actual_fps, expected_frames, actual_frames, expected_width, actual_width, expected_height, actual_height, size_ok, fps_ok, frames_ok, resolution_ok, sha256, ok, identical_within_id]
Index: []
Empty DataFrame
Columns: [FileName, path, exists, size_bytes, opened, expected_fps, actual_fps, expected_frames, actual_frames, expected_width, actual_width, expected_height, actual_height, size_ok, fps_ok, frames_ok, resolution_ok, sha256, ok]
Index: []


We also make sure ejection fraction is within expectations.

In [39]:
weird_EF = df[(df["EF"] > 100) | df["EF"]< 0 ]
print(weird_EF)

Empty DataFrame
Columns: [FileName, EF, ESV, EDV, FrameHeight, FrameWidth, FPS, NumberOfFrames, Split, label_hfref]
Index: []


Final check, we take a random sample of 200 videos and make sure we can actually open them. Specifically we will try opening the beginning, middle, and last frames of the video.

I expect good results given how clean the data is, but it never hurts to re-check!

In [19]:
import os
import random

def decoding_sanity_check(
    df: pd.DataFrame,
    video_root: str,
    filename_col: str = "FileName",
    sample_n: int = 50,
    seed: int = 42,
    # probe a few points in the video (start/middle/end by default)
    probe_fracs=(0.0, 0.5, 0.9),
    # when probing, allow a few sequential reads in case the exact seek lands oddly
    reads_per_probe: int = 2,
    require_nonblank: bool = True,
):
    """
    Randomly sample videos and confirm we can decode frames at multiple positions.
    This is a stronger corruption/codec check than just reading the first frame.

    Returns:
      report_df: per-video results including how many probes succeeded
      missing_files: list of missing paths (not sampled)
    """
    if filename_col not in df.columns:
        raise ValueError(f"Column '{filename_col}' not found in df columns: {list(df.columns)}")

    # Build candidate list of paths
    candidates = []
    for fn in df[filename_col].dropna().astype(str).tolist():
        path = os.path.join(video_root, fn)
        if not os.path.splitext(path)[1]:
            path = path + ".avi"
        candidates.append(path)

    missing = [p for p in candidates if not os.path.exists(p)]
    existing = [p for p in candidates if os.path.exists(p)]
    if len(existing) == 0:
        raise RuntimeError("No existing videos found to sample from. Check video_root / filenames.")

    random.seed(seed)
    sample_n = min(sample_n, len(existing))
    sampled = random.sample(existing, sample_n)

    results = []
    for path in sampled:
        cap = cv2.VideoCapture(path)
        row = {
            "path": path,
            "opened": False,
            "frame_count": None,
            "probes_ok": 0,
            "probes_total": len(probe_fracs),
            "decoded": False,
            "reason": "",
        }

        if not cap.isOpened():
            row["reason"] = "cap_not_opened"
            results.append(row)
            continue

        row["opened"] = True
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        row["frame_count"] = frame_count

        if frame_count <= 0:
            # Some containers/backends may not report frame count reliably; still try without seeking.
            frame_count = 0

        probe_fail_reasons = []

        for frac in probe_fracs:
            # Pick a target frame index (if we know frame_count)
            if frame_count > 0:
                target = int(frac * (frame_count - 1))
                target = max(0, min(target, frame_count - 1))
                cap.set(cv2.CAP_PROP_POS_FRAMES, target)
            else:
                # Fallback: just continue sequentially
                target = None

            ok_this_probe = False
            last_reason = "read_failed"

            for _ in range(reads_per_probe):
                ok, frame = cap.read()
                if not ok or frame is None:
                    last_reason = "read_failed"
                    continue
                if frame.size == 0:
                    last_reason = "empty_frame"
                    continue
                if require_nonblank and np.std(frame) == 0:
                    last_reason = "blank_frame_std0"
                    continue

                ok_this_probe = True
                break

            if ok_this_probe:
                row["probes_ok"] += 1
            else:
                where = f"frac={frac}"
                if target is not None:
                    where += f",target={target}"
                probe_fail_reasons.append(f"{where}:{last_reason}")

        cap.release()

        row["decoded"] = (row["probes_ok"] == row["probes_total"])
        row["reason"] = "" if row["decoded"] else ";".join(probe_fail_reasons)[:500]
        results.append(row)

    report = pd.DataFrame(results)
    failed = report[~report["decoded"]]

    print(f"Decoding sanity check: {sample_n} sampled")
    print(f"Missing files (not sampled): {len(missing)}")
    print(f"Decode failures in sample: {len(failed)}")
    if len(failed) > 0:
        display(failed[["path", "frame_count", "probes_ok", "probes_total", "reason"]].head(20))

    return report, missing




In [20]:
report_df, missing_files = decoding_sanity_check(
    df,
    video_root=data_dir,
    sample_n=200,
    probe_fracs=(0.0, 0.5, 0.9),
    reads_per_probe=2
)
report_df.to_csv("decoding_probe_report.csv", index=False)

Decoding sanity check: 200 sampled
Missing files (not sampled): 0
Decode failures in sample: 0
