In [5]:
import re

def clean_line(line):
    """Return *line* with quote, bracket and multi-dot characters removed.

    Three kinds of text are deleted (replaced by nothing, not by a space):

    * ASCII single and double quotes (``'`` and ``"``),
    * parentheses and square brackets,
    * any run of two or more consecutive periods (``..``, ``...``, ...).

    A lone period and all whitespace, including a trailing newline, are
    left in place.
    """
    # Delete every unwanted single character in one pass.
    without_marks = line.translate(str.maketrans("", "", "'\"()[]"))

    # Runs of dots are removed after the characters above, so dots that
    # only become adjacent once a quote/bracket is gone are caught as well.
    return re.sub(r'\.{2,}', '', without_marks)

def validate_conll_file(path):
    """Print diagnostics for a whitespace-separated, blank-line-delimited file.

    The file at *path* is read as UTF-8. Every line is passed through
    ``clean_line`` and stripped before being checked, so a line that is
    only emptied by cleaning counts as blank. Two conditions are reported
    with ``print``, using 1-based line numbers:

    * a non-empty cleaned line with fewer than two whitespace-separated
      fields ("Malformed line after cleaning");
    * a blank line reached while no non-blank line has been seen since the
      previous blank line or the start of the file ("Empty sentence block"),
      shown together with up to three preceding raw lines as context.

    Nothing is returned; errors raised by ``open`` propagate to the caller.
    """
    header = f"\n📂 Validating file: {path}\n"
    print(header + "-" * 60)

    with open(path, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    # Cleaned lines collected since the last blank line; only its
    # emptiness is ever inspected.
    current_tokens = []
    for line_no, raw in enumerate(raw_lines, start=1):
        text = clean_line(raw).strip()

        if text:
            # A usable row needs at least two columns.
            if len(text.split()) < 2:
                print(f"{path}:{line_no}: ❌ Malformed line after cleaning: '{text}'")
            current_tokens.append(text)
            continue

        # Blank line: it closes the current sentence. If that sentence has
        # no rows, show the raw lines just before it to help locate it.
        if not current_tokens:
            here = line_no - 1
            preceding = raw_lines[max(0, here - 3):here]
            context = "".join(preceding).strip()
            print(f"{path}:{line_no}: ⚠️ Empty sentence block detected. Context:\n---\n{context}\n---\n")
        current_tokens = []


# -------------- Run this block to validate your dataset ----------------
# Script entry point: runs validate_conll_file on each listed file, in order.
# Diagnostics are printed to stdout; nothing is collected or returned here.
if __name__ == "__main__":
    # Relative filenames, so they resolve against the current working directory.
    files_to_check = ["train.txt", "dev.txt", "test.txt"]  # or your actual filenames

    for file_path in files_to_check:
        validate_conll_file(file_path)



📂 Validating file: train.txt
------------------------------------------------------------
train.txt:141: ❌ Malformed line after cleaning: 'O'
train.txt:180: ❌ Malformed line after cleaning: 'O'
train.txt:186: ❌ Malformed line after cleaning: 'O'
train.txt:306: ❌ Malformed line after cleaning: 'O'
train.txt:478: ❌ Malformed line after cleaning: 'O'
train.txt:480: ❌ Malformed line after cleaning: 'O'
train.txt:486: ❌ Malformed line after cleaning: 'O'
train.txt:490: ❌ Malformed line after cleaning: 'O'
train.txt:753: ❌ Malformed line after cleaning: 'O'
train.txt:755: ❌ Malformed line after cleaning: 'O'
train.txt:759: ❌ Malformed line after cleaning: 'O'
train.txt:821: ❌ Malformed line after cleaning: 'O'
train.txt:825: ❌ Malformed line after cleaning: 'O'
train.txt:839: ❌ Malformed line after cleaning: 'O'
train.txt:843: ❌ Malformed line after cleaning: 'O'
train.txt:847: ❌ Malformed line after cleaning: 'O'
train.txt:851: ❌ Malformed line after cleaning: 'O'
train.txt:923: ❌ Malforme