In [1]:
import random

def load_sentences(filename):
    """Read a UTF-8 text file and return its blank-line-separated sentences.

    A sentence is a run of consecutive non-blank lines. Any line that is empty
    or contains only whitespace acts as a separator, so a stray space or tab on
    a separator line no longer glues two sentences together.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of sentence strings. The lines of a sentence are joined with
        "\n" and the sentence is stripped of leading/trailing whitespace.
        Empty sentences are never returned.
    """
    sentences = []
    current = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                # Drop only the line terminator; inner-line spacing is kept.
                current.append(line.rstrip("\n"))
            elif current:
                # A blank (or whitespace-only) line closes the open sentence.
                sentences.append("\n".join(current).strip())
                current = []
    # The last sentence may not be followed by a blank line.
    if current:
        sentences.append("\n".join(current).strip())
    return sentences

def save_sentences(sentences, filename):
    """Write the sentences to a UTF-8 file, separated by one blank line.

    Args:
        sentences: Iterable of sentence strings.
        filename: Destination path; an existing file is overwritten.

    The output ends with a single trailing newline.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        payload = "\n\n".join(sentences)
        out.write(f"{payload}\n")

def contains_loc_or_per(sentence):
    """Return True if any LOC or PER tag string occurs in the sentence.

    This is a plain substring test against the whole sentence text; it does
    not parse columns or tokens.
    """
    for label in ("B-LOC", "I-LOC", "B-PER", "I-PER"):
        if label in sentence:
            return True
    return False

def main(
    files=("train.txt", "dev.txt", "test.txt"),
    target_total=7300,
    output_file="balanced_corpus.txt",
    seed=None,
):
    """Build a corpus that keeps LOC/PER sentences and pads with random others.

    All sentences from ``files`` are pooled and split into those that contain
    a LOC/PER tag and those that do not. If there are at least
    ``target_total`` tagged sentences, a random sample of exactly that size is
    taken from them. Otherwise every tagged sentence is kept and randomly
    chosen untagged sentences fill the remainder (as far as available). The
    result is shuffled and written to ``output_file``.

    Args:
        files: Paths of the input files to pool.
        target_total: Desired number of sentences in the output corpus.
        output_file: Path the resulting corpus is written to.
        seed: Optional seed for reproducible sampling and shuffling. When
            None, the module-level ``random`` generator is used.
    """
    # Use a private generator only when a seed is requested, so unseeded calls
    # keep drawing from the module-level generator.
    rng = random if seed is None else random.Random(seed)

    sentences = []
    for file in files:
        sentences.extend(load_sentences(file))

    # Split by presence of LOC or PER in one pass, so each sentence is scanned
    # for tags only once.
    tagged_sentences = []
    non_tagged_sentences = []
    for s in sentences:
        if contains_loc_or_per(s):
            tagged_sentences.append(s)
        else:
            non_tagged_sentences.append(s)

    print(f"Total de frases: {len(sentences)}")
    print(f"Con LOC o PER: {len(tagged_sentences)}")
    print(f"Sin LOC ni PER: {len(non_tagged_sentences)}")

    n_tagged = len(tagged_sentences)

    if n_tagged >= target_total:
        final_sentences = rng.sample(tagged_sentences, target_total)
    else:
        # Cap the request at what is available so sample() cannot raise.
        n_non_tagged_needed = min(
            target_total - n_tagged, len(non_tagged_sentences)
        )
        sampled_non_tagged = rng.sample(non_tagged_sentences, n_non_tagged_needed)
        final_sentences = tagged_sentences + sampled_non_tagged

    rng.shuffle(final_sentences)
    save_sentences(final_sentences, output_file)

    # Make a shortfall explicit instead of silently producing a smaller corpus.
    if len(final_sentences) < target_total:
        print(
            f"Aviso: solo hay {len(final_sentences)} frases disponibles "
            f"(objetivo: {target_total})"
        )

    print(f"Corpus final: {len(final_sentences)} frases guardadas en {output_file}")

# Run the corpus-building pipeline only when executed as a script (or notebook
# cell), not when this module is imported.
if __name__ == "__main__":
    main()



Total de frases: 17483
Con LOC o PER: 3878
Sin LOC ni PER: 13605
Corpus final: 7300 frases guardadas en balanced_corpus.txt
