In [3]:
def validate_conll_file(path):
    """Scan a CoNLL-style file and print a report of structural problems.

    Two kinds of problems are reported, each prefixed with
    ``<path>:<1-based line number>:``:

    * Malformed line: a non-blank line with fewer than two
      whitespace-separated columns.
    * Empty sentence block: a blank line that is not preceded by any
      token line since the previous blank line (or since the start of the
      file), e.g. two consecutive blank lines. The report includes up to
      three preceding lines as context.

    Args:
        path: Path of the file to check; it is read as UTF-8.

    Returns:
        None. All findings are written to standard output.
    """
    # Imported locally so the function is self-contained in a notebook cell.
    from collections import deque

    print(f"\n📂 Validating file: {path}\n" + "-"*60)

    # Raw text of (at most) the three lines before the current one; this is
    # all the look-back the context report needs, so the file can be
    # streamed instead of loaded in full.
    recent_lines = deque(maxlen=3)
    # True once the current sentence block has seen at least one token line.
    # Only emptiness matters, so the tokens themselves are not stored.
    block_has_tokens = False

    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()

            if stripped:
                # -------- Malformed line (should have at least 2 columns) --------
                if len(stripped.split()) < 2:
                    print(f"{path}:{line_no}: ❌ Malformed line: '{stripped}'")
                block_has_tokens = True
            else:
                # -------- Empty sentence block (multiple blank lines) --------
                if not block_has_tokens:
                    context = "".join(recent_lines).strip()
                    print(f"{path}:{line_no}: ⚠️ Empty sentence block detected. Context:\n---\n{context}\n---\n")
                # A blank line always closes the current block.
                block_has_tokens = False

            recent_lines.append(line)


# -------------- Script entry point: validate each dataset split ----------------
if __name__ == "__main__":
    # Swap in your actual filenames here if they differ.
    files_to_check = [
        "train.txt",
        "dev.txt",
        "test.txt",
    ]

    # Each file gets its own report, printed in the order listed above.
    for file_path in files_to_check:
        validate_conll_file(file_path)



📂 Validating file: train.txt
------------------------------------------------------------
train.txt:375: ❌ Malformed line: 'O'
train.txt:538: ❌ Malformed line: 'O'
train.txt:660: ⚠️ Empty sentence block detected. Context:
---
Vase O
. O
---

train.txt:768: ⚠️ Empty sentence block detected. Context:
---
Éntrense B-PER
. O
---

train.txt:1380: ⚠️ Empty sentence block detected. Context:
---
bastantes O
. O
---

train.txt:1548: ⚠️ Empty sentence block detected. Context:
---
Vanse O
. O
---

train.txt:2310: ❌ Malformed line: 'O'
train.txt:2348: ❌ Malformed line: '!'
train.txt:2419: ⚠️ Empty sentence block detected. Context:
---
Vase O
. O
---

train.txt:2557: ❌ Malformed line: 'O'
train.txt:2592: ⚠️ Empty sentence block detected. Context:
---
Dichas O
. O
---

train.txt:2837: ❌ Malformed line: '!'
train.txt:3164: ⚠️ Empty sentence block detected. Context:
---
y O
. O
---

train.txt:3276: ❌ Malformed line: 'O'
train.txt:4030: ⚠️ Empty sentence block detected. Context:
---
y O
. O
---

train