A comprehensive toolkit for text denoising and quality evaluation, featuring two powerful pipelines and quality metrics.
- DeepSeek-R1-Denoising: Grammar, spelling, and whitespace error correction using DeepSeek API
- WAC-GEC: Two-stage correction pipeline (Whitespace + Grammar Error Correction) using local models
-
WAR (Whitespace Anomaly Rate): Measures whitespace issues including:
- Missing spaces (e.g., "Thisisasentence")
- Extra spaces (e.g., "Too  many  spaces")
- Zero-width characters
- Abnormal punctuation spacing
-
SED (Spelling Error Density): Measures spelling error percentage relative to total words
For development:
git clone https://github.com/LLLoUo/bd-toolkit.git
cd bd-toolkit
pip install -e .

For spaCy model:
python -m spacy download en_core_web_sm

For WAC-GEC pipeline:
pip install whitespace-correction

from bd_toolkit import DeepSeekDenoiser
# Initialize with API key
denoiser = DeepSeekDenoiser(api_key="your-deepseek-api-key")
# Correct single text
text = "This is anexample with speling erorrs."
corrected = denoiser.correct(text)
print(corrected)
# Output: "This is an example with spelling errors."
# Correct batch
texts = [
"Thisis anexample",
"Another sentense with extra spaces"
]
results = denoiser.correct_batch(texts)
# Process DataFrame
import pandas as pd
df = pd.DataFrame({"questions": texts})
df = denoiser.process_dataframe(df, column="questions")
print(df["questions_cleaned"])from bd_toolkit import WacGecPipeline
# Initialize pipeline
pipeline = WacGecPipeline(
gec_model_path="/path/to/gec/model",
wsc_download_dir="/path/to/wsc/models"
)
# Full pipeline (GEC + WSC)
text = "Thisis anexample withspacing errors ."
corrected = pipeline.correct(text)
print(corrected)
# GEC only
corrected_gec = pipeline.correct(text, gec_only=True)
# WSC only
corrected_wsc = pipeline.correct(text, wsc_only=True)
# Batch processing
texts = ["sentence 1", "sentence 2"]
results = pipeline.correct_batch(texts, show_progress=True)

from bd_toolkit import (
calculate_whitespace_anomaly_rate,
calculate_spelling_error_density
)
# Sample texts
original = [
"This is normal.",
"Thisisabnormal",
"Too many spaces",
"Spelling erorrs here"
]
corrected = [
"This is normal.",
"This is abnormal",
"Too many spaces",
"Spelling errors here"
]
# Calculate WAR
war_original = calculate_whitespace_anomaly_rate(original)
war_corrected = calculate_whitespace_anomaly_rate(corrected)
print(f"WAR (Original): {war_original:.2f}%")
print(f"WAR (Corrected): {war_corrected:.2f}%")
# Calculate SED
sed_original = calculate_spelling_error_density(original)
sed_corrected = calculate_spelling_error_density(corrected)
print(f"SED (Original): {sed_original:.2f}%")
print(f"SED (Corrected): {sed_corrected:.2f}%")import json
import pandas as pd
from pathlib import Path
from bd_toolkit import (
DeepSeekDenoiser,
calculate_whitespace_anomaly_rate,
calculate_spelling_error_density
)
# Load data
with open("data.json", "r") as f:
data = json.load(f)
original_texts = data["original"]
# Apply denoising
denoiser = DeepSeekDenoiser(api_key="your-api-key")
corrected_texts = denoiser.correct_batch(original_texts)
# Calculate metrics
results = {
"WAR_original": calculate_whitespace_anomaly_rate(original_texts),
"WAR_corrected": calculate_whitespace_anomaly_rate(corrected_texts),
"SED_original": calculate_spelling_error_density(original_texts),
"SED_corrected": calculate_spelling_error_density(corrected_texts)
}
# Save results
df = pd.DataFrame([results])
df.to_csv("quality_metrics.csv", index=False)
print("Quality Metrics:")
for key, value in results.items():
print(f"{key}: {value:.2f}%")DeepSeekDenoiser(
api_key: str,
base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
model: str = "deepseek-r1-distill-llama-8b",
temperature: float = 0.1,
max_retries: int = 5
)

Methods:
correct(text: str, verbose: bool = False) -> str
correct_batch(texts: List[str], show_progress: bool = True) -> List[str]
process_dataframe(df: pd.DataFrame, column: str, output_column: str = None) -> pd.DataFrame
WacGecPipeline(
gec_model_path: str,
wsc_model_name: str = "eo_larger_byte",
wsc_download_dir: Optional[str] = None,
device: str = "auto",
dtype = torch.bfloat16
)

Methods:
correct(text: str, gec_only: bool = False, wsc_only: bool = False) -> str
correct_batch(texts: List[str], show_progress: bool = True) -> List[str]
calculate_whitespace_anomaly_rate(sentences: List[str]) -> float
calculate_spelling_error_density(sentences: List[str]) -> float

from bd_toolkit import DeepSeekDenoiser
import pandas as pd
denoiser = DeepSeekDenoiser(api_key="your-key")
# Load large dataset
df = pd.read_parquet("large_dataset.parquet")
# Process in batches with progress
df = denoiser.process_dataframe(
df,
column="text",
output_column="text_cleaned",
show_progress=True
)
# Save results
df.to_parquet("cleaned_dataset.parquet")from bd_toolkit import DeepSeekDenoiser, WacGecPipeline
from bd_toolkit import calculate_whitespace_anomaly_rate, calculate_spelling_error_density
# Sample data
texts = ["Your test texts here"]
# Method 1: DeepSeek
denoiser = DeepSeekDenoiser(api_key="your-key")
deepseek_results = denoiser.correct_batch(texts)
# Method 2: WAC-GEC
pipeline = WacGecPipeline(gec_model_path="/path/to/model")
wacgec_results = pipeline.correct_batch(texts)
# Compare metrics
comparison = pd.DataFrame({
"Method": ["Original", "DeepSeek", "WAC-GEC"],
"WAR": [
calculate_whitespace_anomaly_rate(texts),
calculate_whitespace_anomaly_rate(deepseek_results),
calculate_whitespace_anomaly_rate(wacgec_results)
],
"SED": [
calculate_spelling_error_density(texts),
calculate_spelling_error_density(deepseek_results),
calculate_spelling_error_density(wacgec_results)
]
})
print(comparison)

bd-toolkit/
├── bd_toolkit/
│ ├── __init__.py
│ ├── pipelines/
│ │ ├── __init__.py
│ │ ├── deepseek_denoising.py
│ │ └── wac_gec.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── war.py
│ │ └── sed.py
│ └── utils/
│ ├── __init__.py
│ └── text_processing.py
├── tests/
├── examples/
├── setup.py
├── requirements.txt
└── README.md
Contributions are welcome! Please feel free to submit a Pull Request.
MIT License
If you use this toolkit in your research, please cite:
@software{bd_toolkit,
title={BD-toolkit: A Toolkit for Text Denoising and Quality Evaluation},
author={Junhui Liu},
year={2025},
url={https://github.com/LLLoUo/bd-toolkit}
}

- DeepSeek API for grammar correction
- spaCy for NLP processing
- PySpellChecker for spelling detection
For issues and questions, please open an issue on GitHub.