In [11]:
import subprocess
import tempfile
import json
import os
import pandas as pd

def run_actionlint_on_string(yaml_text: str):
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".yml",
        delete=False,
        encoding="utf-8"
    ) as f:
        f.write(yaml_text)
        temp_path = f.name

    try:
        result = subprocess.run(
            [
                "actionlint",
                "-format",
                "{{json .}}",
                temp_path
            ],
            capture_output=True,
            text=True,
            shell=False
        )

        stdout = (result.stdout or "").strip()
        stderr = (result.stderr or "").strip()

        diagnostics = None
        raw_output = None

        if stdout:
            try:
                diagnostics = json.loads(stdout)
            except json.JSONDecodeError:
                raw_output = stdout

        return {
            "actionlint_exit_code": result.returncode,
            "actionlint_pass": result.returncode == 0,
            "actionlint_diagnostics": diagnostics,
            "actionlint_raw_output": raw_output,
            "actionlint_stderr": stderr
        }

    finally:
        os.remove(temp_path)


In [12]:
def run_yamllint_on_string(yaml_text: str):
    with tempfile.NamedTemporaryFile(
        mode="w",
        suffix=".yml",
        delete=False,
        encoding="utf-8"
    ) as f:
        f.write(yaml_text)
        temp_path = f.name

    try:
        result = subprocess.run(
            [
                "yamllint",
                "-f",
                "parsable",
                temp_path
            ],
            capture_output=True,
            text=True,
            shell=False
        )

        stdout = result.stdout.strip()
        stderr = result.stderr.strip()

        diagnostics = stdout.splitlines() if stdout else []

        return {
            "yamllint_exit_code": result.returncode,
            "yamllint_pass": result.returncode == 0,
            "yamllint_diagnostics": diagnostics,
            "yamllint_stderr": stderr
        }

    finally:
        os.remove(temp_path)


In [13]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv("workflows_sample_dataset.csv")

results = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    yaml_text = row.get("workflow_content_file", "")

    if not isinstance(yaml_text, str) or not yaml_text.strip():
        results.append({
            "index": idx,
            "actionlint_pass": False,
            "yamllint_pass": False,
            "actionlint_exit_code": None,
            "yamllint_exit_code": None,
            "actionlint_diagnostics": None,
            "yamllint_diagnostics": None,
            "error": "empty_or_missing_yaml"
        })
        continue

    actionlint_res = run_actionlint_on_string(yaml_text)
    yamllint_res = run_yamllint_on_string(yaml_text)

    results.append({
        "index": idx,
        **actionlint_res,
        **yamllint_res
    })

    # ðŸ’¡ periodic checkpoint (important for long runs)
    if idx % 100 == 0 and idx > 0:
        pd.DataFrame(results).to_csv(
            "gold_lint_partial.csv",
            index=False
        )

# final write
lint_df = pd.DataFrame(results)
lint_df.to_csv("gold_lint_full.csv", index=False)


 35%|â–ˆâ–ˆâ–ˆâ–Œ      | 1380/3888 [24:01<42:11,  1.01s/it] Exception in thread Thread-11169 (_readerthread):
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 337: character maps to <u

In [7]:
results

[{'index': 0,
  'actionlint_exit_code': 0,
  'actionlint_pass': True,
  'actionlint_diagnostics': [],
  'actionlint_raw_output': None,
  'actionlint_stderr': '',
  'yamllint_exit_code': 1,
  'yamllint_pass': False,
   'C:\\Users\\mekae\\AppData\\Local\\Temp\\tmp6t3rfno0.yml:1:26: [error] wrong new line character: expected \\n (new-lines)',
   'C:\\Users\\mekae\\AppData\\Local\\Temp\\tmp6t3rfno0.yml:29:31: [error] trailing spaces (trailing-spaces)'],
  'yamllint_stderr': ''},
 {'index': 1,
  'actionlint_exit_code': 0,
  'actionlint_pass': True,
  'actionlint_diagnostics': [],
  'actionlint_raw_output': None,
  'actionlint_stderr': '',
  'yamllint_exit_code': 1,
  'yamllint_pass': False,
   'C:\\Users\\mekae\\AppData\\Local\\Temp\\tmpglonptnd.yml:1:23: [error] wrong new line character: expected \\n (new-lines)',
   'C:\\Users\\mekae\\AppData\\Local\\Temp\\tmpglonptnd.yml:11:81: [error] line too long (82 > 80 characters) (line-length)',
   'C:\\Users\\mekae\\AppData\\Local\\Temp\\tmpglonp

In [17]:
lint_df

Unnamed: 0,index,actionlint_exit_code,actionlint_pass,actionlint_diagnostics,actionlint_raw_output,actionlint_stderr,yamllint_exit_code,yamllint_pass,yamllint_diagnostics,yamllint_stderr
0,0,0,True,[],,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmpkryovv09...,
1,1,0,True,[],,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmp4qryyw7g...,
2,2,0,True,[],,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmpt_benfu8...,
3,3,1,False,"[{'message': 'the runner of ""actions/checkout@...",,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmp55uxr5j1...,
4,4,1,False,"[{'message': 'the runner of ""softprops/action-...",,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmpik6uu4pj...,
...,...,...,...,...,...,...,...,...,...,...
3883,3883,1,False,"[{'message': 'the runner of ""actions/checkout@...",,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmpg7vsdbx9...,
3884,3884,0,True,[],,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmp3egy05_k...,
3885,3885,0,True,[],,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmp7xdugec2...,
3886,3886,0,True,[],,,1,False,[C:\Users\mekae\AppData\Local\Temp\tmp5b87_zjt...,


In [16]:
import pandas as pd

df = pd.read_csv("gold_lint_full.csv")

has_diag = df["actionlint_diagnostics"].apply(
    lambda x: x not in ("[]", "", None)
)

total = len(df)
with_diag = has_diag.sum()
without_diag = total - with_diag

total, with_diag, without_diag


(3888, 990, 2898)