# Sentence Extraction

This notebook extracts sentences from Markdown documents using spaCy's sentencizer. It cleans and validates each sentence, verifies its language with Lingua, and saves the sentences that pass to one JSONL file per language.

## Setup

### Imports

In [None]:
import yaml
import re
import json
import spacy
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from lingua import LanguageDetectorBuilder

### Configuration and Paths

In [None]:
# Resolve the project root first so that every path below, including the
# config file itself, is anchored to it. The notebook is expected to run from
# a direct subdirectory of the project, hence the parent of the working dir.
project_root = Path.cwd().parent

# Load configuration. The encoding is explicit so any non-ASCII values are
# decoded the same way on every platform (the platform default is not always
# UTF-8), matching the explicit UTF-8 used for all other file I/O here.
with open(project_root / "config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Set up project paths
SENTENCES_DIR = project_root / config["SENTENCES_DIR"]
EXTRACTED_DIR = project_root / config["EXTRACTED_DIR"]

# Progress bar format
PROGRESS_BAR_FORMAT = "{desc:<25}{percentage:3.0f}%|{bar:20}{r_bar}"

## Helper Functions

In [None]:
def detect_language(text: str, detector):
    """Return the lowercase ISO 639-1 code the detector assigns to ``text``.

    Args:
        text: The text whose language should be identified.
        detector: Object exposing ``detect_language_of(text)``; its result
            must provide ``iso_code_639_1.name``.

    Returns:
        The lowercased ISO 639-1 name of the detected language, or ``None``
        when the detector returns a falsy result.
    """
    language = detector.detect_language_of(text)
    if not language:
        return None
    iso_code = language.iso_code_639_1
    return iso_code.name.lower()


def clean_sentence(sentence: str):
    """Strip Markdown markup from a sentence and normalise its whitespace.

    The following are removed:
      * runs of ``#`` together with any whitespace that follows them,
      * runs of ``*`` (emphasis markers),
      * square and round brackets (the text between them is kept).

    Whitespace is collapsed *after* the markup is removed. Collapsing it
    first would let a removed token that sat between two spaces
    (e.g. ``"a * b"``) leave a doubled space in the result.

    Args:
        sentence: Raw sentence text, possibly spanning several lines.

    Returns:
        The cleaned sentence with single spaces between words and no leading
        or trailing whitespace.
    """
    sentence = re.sub(r"#+\s*", "", sentence)
    sentence = re.sub(r"\*+", "", sentence)
    sentence = re.sub(r"[\[\]()]", "", sentence)
    # Normalise whitespace last so gaps left by removed markup are merged too.
    return re.sub(r"\s+", " ", sentence).strip()


def is_valid_sentence(
    sentence: str,
    min_length: int = 10,
    max_length: int = 500,
    min_letters: int = 5,
):
    """Check if sentence meets validity criteria.

    A sentence is valid when its length lies within
    ``[min_length, max_length]`` and it contains enough "letter" characters.
    A character counts as a letter here if it is an ASCII letter or any code
    point in the range U+0080-U+FFFF, so this is a coarse filter rather than
    a strict alphabetic test.

    Args:
        sentence: The (already cleaned) sentence to check.
        min_length: Minimum number of characters, inclusive.
        max_length: Maximum number of characters, inclusive.
        min_letters: Minimum number of letter characters required. At least
            one is always required, even if a smaller value is passed.

    Returns:
        ``True`` if the sentence passes every check, otherwise ``False``.
    """
    if not min_length <= len(sentence) <= max_length:
        return False
    # One scan is enough: counting the matches also covers the
    # "contains at least one letter" requirement.
    letter_count = len(re.findall(r"[a-zA-Z\u0080-\uFFFF]", sentence))
    return letter_count >= max(min_letters, 1)

## Extract Sentences

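Extract and validate sentences from the Markdown documents of every language listed in the configuration. For each language, extraction stops once its `target_sentences` limit is reached, and the sentences are written to `<lang_code>_sentences.jsonl`.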

In [None]:
# Patterns that strip non-prose Markdown before sentence splitting. They are
# named here so the loop body below reads as a sequence of cleaning steps.
CODE_FENCE_RE = re.compile(r"```.*?```", flags=re.DOTALL)
TABLE_CELL_RE = re.compile(r"\|.*?\|")
URL_RE = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)

extraction_stats = []
nlp = spacy.load("xx_ent_wiki_sm")
# Sentence boundaries (doc.sents) come from this component.
nlp.add_pipe("sentencizer")
detector = LanguageDetectorBuilder.from_all_languages().build()

# The output directory may not exist yet (e.g. on a fresh checkout); without
# it, opening the per-language output file for writing would fail.
SENTENCES_DIR.mkdir(parents=True, exist_ok=True)

for lang_code, lang_config in config["LANGUAGES"].items():
    lang_sents_file = SENTENCES_DIR / f"{lang_code}_sentences.jsonl"
    lang_extracted_dir = EXTRACTED_DIR / lang_code

    # Languages without an extracted-documents directory are skipped entirely
    # and get no row in the statistics.
    if not lang_extracted_dir.exists():
        continue

    # Sorted so that the set of documents consumed before the sentence cap is
    # reached does not depend on the filesystem's listing order.
    markdown_files = sorted(lang_extracted_dir.glob("*.md"))
    target_sentences = lang_config["target_sentences"]
    total_sentences = 0

    with open(lang_sents_file, "w", encoding="utf-8") as out_file:
        for markdown_path in tqdm(
            markdown_files,
            desc=f"Extracting {lang_config['name']}",
            bar_format=PROGRESS_BAR_FORMAT,
        ):
            try:
                md_text = markdown_path.read_text(encoding="utf-8")

                # Clean markdown text: fenced code blocks, pipe-delimited
                # table cells, and URLs are dropped before splitting.
                md_text = CODE_FENCE_RE.sub("", md_text)
                md_text = TABLE_CELL_RE.sub("", md_text)
                md_text = URL_RE.sub("", md_text)

                # Extract sentences
                sentences = []
                for sent in nlp(md_text).sents:
                    cleaned = clean_sentence(sent.text)
                    # Run the cheap structural check first so the language
                    # detector is only invoked on plausible sentences.
                    if not is_valid_sentence(cleaned):
                        continue
                    if detect_language(cleaned, detector) == lang_code:
                        sentences.append(cleaned)

                # Save valid sentences; sent_id is the position among the
                # document's accepted sentences.
                for idx, sentence in enumerate(sentences):
                    data = {
                        "text": sentence,
                        "lang": lang_code,
                        "doc_id": markdown_path.stem,
                        "sent_id": idx,
                    }
                    out_file.write(json.dumps(data, ensure_ascii=False) + "\n")
                    total_sentences += 1

                    if total_sentences >= target_sentences:
                        break

            except Exception as e:
                # Best effort: a failing document is reported and skipped.
                # tqdm.write keeps the message from breaking the progress bar,
                # and the file name makes the failure traceable.
                tqdm.write(
                    f"Error when extracting sentences from {markdown_path.name}: {e}"
                )

            # Stop reading further documents once the language's cap is met.
            if total_sentences >= target_sentences:
                break

    extraction_stats.append(
        {
            "Language": lang_config["name"],
            "Code": lang_code,
            # Number of documents found, not the number actually processed.
            "Documents": len(markdown_files),
            "Sentences": total_sentences,
            "Avg per Doc": (
                f"{total_sentences / len(markdown_files):.1f}"
                if markdown_files
                else "0"
            ),
        }
    )


Extracting Tamil         100%|████████████████████| 84/84 [00:19<00:00,  4.36it/s]
Extracting Bengali       100%|████████████████████| 27/27 [00:11<00:00,  2.43it/s]
Extracting Thai          100%|████████████████████| 291/291 [02:49<00:00,  1.72it/s]
Extracting Swahili       100%|████████████████████| 310/310 [04:26<00:00,  1.16it/s]
Extracting Estonian      100%|████████████████████| 251/251 [04:28<00:00,  1.07s/it]


In [25]:
# Render the per-language extraction summary (one row per processed language)
display(pd.DataFrame.from_records(extraction_stats))

Unnamed: 0,Language,Code,Documents,Sentences,Avg per Doc
0,Tamil,ta,84,9533,113.5
1,Bengali,bn,27,6330,234.4
2,Thai,th,291,9036,31.1
3,Swahili,sw,310,15260,49.2
4,Estonian,et,251,15204,60.6
