<a href="https://www.kaggle.com/code/nguynvnln22028281/btl-nlp-clean-medical-data?scriptVersionId=286612567" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install langid sentence-transformers


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-dataset/public_test.en.txt
/kaggle/input/nlp-dataset/train.en.txt
/kaggle/input/nlp-dataset/train.vi.txt
/kaggle/input/nlp-dataset/public_test.vi.txt


In [None]:
from torch.utils.data import Dataset
from tqdm import tqdm
import re
import unicodedata
import langid
import torch

# Semantic (LaBSE) filtering is switched OFF here; set the flag to True to enable it.
use_semantic_filter = False
# Pairs whose cosine similarity is below this value are rejected by the semantic filter.
similarity_threshold = 0.70

# The embedding model is only imported and loaded when the semantic filter is on.
if use_semantic_filter:
    from sentence_transformers import SentenceTransformer, util
    print("Loading LaBSE model...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    labse = SentenceTransformer("sentence-transformers/LaBSE").to(device)

# ---------------------------
# Enhanced Cleaning Functions
# ---------------------------

def normalize_text(s):
    """Normalize one raw sentence before it enters the filter pipeline.

    Steps, in order: repair a few known mojibake sequences, strip HTML/XML
    tags, apply Unicode NFC, remove zero-width characters, delete whitespace
    that precedes punctuation, and collapse runs of whitespace.

    Args:
        s: Raw text of a single line.

    Returns:
        The cleaned string (possibly empty).
    """
    # Longer sequences go first: replacing "â€" before "â€™" would consume the
    # prefix of the apostrophe sequence and leave a stray "™" behind.
    # "Ãª" (not a bare "Ã") is matched so that the genuine Vietnamese capital
    # letter "Ã" is not rewritten to "ê".
    mojibake_fixes = (
        ("â€™", "'"),
        ("â€", "—"),
        ("Â±", "±"),
        ("Âµ", "µ"),
        ("Ãª", "ê"),
    )
    for broken, fixed in mojibake_fixes:
        s = s.replace(broken, fixed)

    # Remove HTML/XML tags
    s = re.sub(r"<[^>]+>", "", s)

    # Normalize Unicode to the composed form
    s = unicodedata.normalize("NFC", s)

    # Remove zero-width space and byte-order mark characters
    s = s.replace("\u200b", "").replace("\ufeff", "")

    # Drop whitespace that sits directly before punctuation
    s = re.sub(r'\s+([.,;:!?])', r'\1', s)

    # Collapse whitespace runs and trim the ends
    s = re.sub(r"\s+", " ", s).strip()

    return s

def is_metadata_line(s):
    """Tell whether a line looks like boilerplate rather than sentence content.

    A line counts as metadata when its lowercased form matches, from the
    start, one of the header/footer patterns below, or when it has fewer than
    four whitespace-separated tokens.
    """
    boilerplate_patterns = (
        r'^page \d+$',
        r'^\d+\s*$',  # digits only
        r'^(abstract|introduction|conclusion|references?|methods?):\s*$',
        r'^\w+\s+\d{4}$',  # e.g. "December 2021"
        r'^volume \d+',
        r'^doi:',
        r'^issn',
        r'^copyright',
    )

    lowered = s.lower()
    if any(re.match(pattern, lowered) for pattern in boilerplate_patterns):
        return True

    # Anything under four tokens is treated as too short to be real content.
    token_count = len(s.split())
    return token_count < 4

def heuristic_bad_pair(en, vi):
    """Apply dataset-specific heuristics and report whether a pair should be dropped.

    Args:
        en: Normalized English sentence.
        vi: Normalized Vietnamese sentence.

    Returns:
        True if any heuristic marks the pair as bad, otherwise False.
    """
    en_low = en.lower()

    # Known mistranslation in this dataset: "vaginal" next to ear-related terms.
    # "ear" is matched as a whole word (plus a few compounds) so that ordinary
    # words such as "year", "early" or "heart" no longer trigger the rule.
    ear_related = re.search(r"\bear(?:s|drums?|aches?)?\b|otitis|tympanogram", en_low)
    if "vaginal" in en_low and ear_related:
        return True

    # Corrupted spellings
    if any(bad in en_low for bad in ("otittis", "rhinolaryngology", "imumnohistochemistry")):
        return True

    # Metadata on either side (one-sided or on both sides) disqualifies the pair.
    if is_metadata_line(en) or is_metadata_line(vi):
        return True

    # A URL present on only one side indicates misalignment.
    url_pattern = r'https?://|www\.'
    en_has_url = bool(re.search(url_pattern, en))
    vi_has_url = bool(re.search(url_pattern, vi))
    if en_has_url != vi_has_url:
        return True

    # Statistics may be rephrased, so small differences are tolerated; reject
    # only when one side has many numbers and the other has none at all.
    en_num_count = len(re.findall(r'\d+', en))
    vi_num_count = len(re.findall(r'\d+', vi))
    if en_num_count > 5 and vi_num_count == 0:
        return True
    if vi_num_count > 5 and en_num_count == 0:
        return True

    return False

def length_ratio_bad(en, vi):
    """Flag pairs whose whitespace-token counts are implausible.

    Returns True when either side has fewer than 3 or more than 150 tokens,
    or when the English/Vietnamese token ratio lies outside [0.5, 2].
    """
    en_tokens = len(en.split())
    vi_tokens = len(vi.split())

    # Absolute bounds: too short to be a sentence, or long enough to be a
    # concatenated paragraph.
    if min(en_tokens, vi_tokens) < 3 or max(en_tokens, vi_tokens) > 150:
        return True

    # Relative bound on the English-to-Vietnamese length ratio.
    en_to_vi = en_tokens / max(vi_tokens, 1)
    return not (0.5 <= en_to_vi <= 2)

def _normalized_langid():
    """Return a cached langid identifier whose scores are probabilities in [0, 1]."""
    identifier = getattr(_normalized_langid, "_identifier", None)
    if identifier is None:
        # The module-level langid.classify() returns an unnormalized
        # log-probability, which cannot be compared against 0.9.
        identifier = langid.langid.LanguageIdentifier.from_modelstring(
            langid.langid.model, norm_probs=True
        )
        _normalized_langid._identifier = identifier
    return identifier


def language_mismatch(en, vi):
    """Report whether either side of a pair appears to be in the wrong language.

    Two signals are used:
      * langid, as a soft signal: a side is rejected only when the detector is
        more than 90% confident that it is NOT the expected language.
      * a character heuristic: a Vietnamese side longer than 20 characters
        that contains no Vietnamese-specific letters is rejected.

    Args:
        en: Normalized English sentence.
        vi: Normalized Vietnamese sentence.

    Returns:
        True if the pair should be dropped, otherwise False.
    """
    try:
        identifier = _normalized_langid()
        lang_en, score_en = identifier.classify(en)
        lang_vi, score_vi = identifier.classify(vi)

        # Only reject on very confident misdetections
        if score_en > 0.9 and lang_en != "en":
            return True
        if score_vi > 0.9 and lang_vi != "vi":
            return True

    except Exception:
        # Deliberate best-effort: if language identification fails, fall
        # through to the character heuristic instead of rejecting the pair.
        pass

    # Additional heuristic: Vietnamese text should contain accented letters or "đ"
    vietnamese_chars = set("áàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđ")
    vi_lower = vi.lower()
    has_vietnamese_chars = any(c in vietnamese_chars for c in vi_lower)

    # Reject if the supposed Vietnamese side has none of them (likely English)
    if len(vi_lower) > 20 and not has_vietnamese_chars:
        return True

    return False

def semantic_bad(en, vi):
    """Return True when the pair's embedding cosine similarity is below the threshold.

    Each sentence is encoded on its own with the module-level ``labse`` model
    on ``device``, with gradient tracking disabled; the result is compared
    against ``similarity_threshold``.
    """
    with torch.no_grad():
        en_vec, vi_vec = (
            labse.encode(text, convert_to_tensor=True, device=device)
            for text in (en, vi)
        )
        similarity = util.cos_sim(en_vec, vi_vec).item()
    too_dissimilar = similarity < similarity_threshold
    return too_dissimilar


# ---------------------------
# Dataset Class
# ---------------------------
class ParallelTextDataset(Dataset):
    """In-memory dataset of cleaned, de-duplicated (en, vi) sentence pairs.

    The two files are read line by line in parallel. Each pair is normalized
    and passed through the filter pipeline (heuristics, length ratio, language
    check, optional semantic check, de-duplication). Surviving pairs are stored
    in ``self.data`` as ``(en, vi)`` tuples, and a cleaning report is printed.

    Args:
        src_file: Path to the English text file, one sentence per line.
        tgt_file: Path to the Vietnamese text file, line-aligned with ``src_file``.
    """

    def __init__(self, src_file, tgt_file):
        self.data = []
        seen = set()
        rejected_length_samples = []
        file_label = src_file.split('/')[-1]

        # Count lines on both sides: zip() below silently stops at the shorter
        # file, so a mismatch would otherwise go unnoticed.
        src_total = self._count_lines(src_file)
        tgt_total = self._count_lines(tgt_file)
        if src_total != tgt_total:
            print(
                f"WARNING: line count mismatch ({src_total} source vs {tgt_total} target); "
                f"only the first {min(src_total, tgt_total)} pairs are read and alignment may be off."
            )
        total_lines = min(src_total, tgt_total)

        stats = {
            'total': 0,
            'empty': 0,
            'heuristic': 0,
            'length': 0,
            'language': 0,
            'semantic': 0,
            'duplicate': 0,
            'kept': 0
        }

        with open(src_file, 'r', encoding='utf-8') as f1, \
             open(tgt_file, 'r', encoding='utf-8') as f2:

            for src_line, tgt_line in tqdm(
                zip(f1, f2),
                total=total_lines,
                desc=f"Cleaning {file_label}"
            ):
                stats['total'] += 1
                src = src_line.strip()
                tgt = tgt_line.strip()

                if not src or not tgt:
                    stats['empty'] += 1
                    continue

                # Normalize
                en = normalize_text(src)
                vi = normalize_text(tgt)

                # Normalization can empty a line (e.g. one that held only tags).
                if not en or not vi:
                    stats['empty'] += 1
                    continue

                # Filter pipeline
                if heuristic_bad_pair(en, vi):
                    stats['heuristic'] += 1
                    continue

                if length_ratio_bad(en, vi):
                    if len(rejected_length_samples) < 20:  # Collect 20 samples
                        len_en, len_vi = len(en.split()), len(vi.split())
                        rejected_length_samples.append({
                            'en': en,
                            'vi': vi,
                            'len_en': len_en,
                            'len_vi': len_vi,
                            'ratio': len_en / max(len_vi, 1)
                        })
                    stats['length'] += 1
                    continue

                if language_mismatch(en, vi):
                    stats['language'] += 1
                    continue

                if use_semantic_filter and semantic_bad(en, vi):
                    stats['semantic'] += 1
                    continue

                # Deduplicate; a tuple key cannot collide the way a
                # concatenated string can when the text contains the separator.
                key = (en, vi)
                if key in seen:
                    stats['duplicate'] += 1
                    continue

                seen.add(key)
                self.data.append(key)
                stats['kept'] += 1

        self._print_report(file_label, stats, rejected_length_samples)

    @staticmethod
    def _count_lines(path):
        """Return the number of lines in the UTF-8 text file at ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            return sum(1 for _ in f)

    @staticmethod
    def _print_report(file_label, stats, rejected_length_samples):
        """Print per-filter rejection counts and a sample of length-rejected pairs."""
        total = stats['total']

        def pct(count):
            # An empty input would otherwise raise ZeroDivisionError here.
            return count / total * 100 if total else 0.0

        print(f"\n{'='*60}")
        print(f"Dataset: {file_label}")
        print(f"{'='*60}")
        print(f"Total pairs processed:     {total:>6}")
        print(f"  - Empty/blank:           {stats['empty']:>6} ({pct(stats['empty']):>5.1f}%)")
        print(f"  - Heuristic filters:     {stats['heuristic']:>6} ({pct(stats['heuristic']):>5.1f}%)")
        print(f"  - Length ratio:          {stats['length']:>6} ({pct(stats['length']):>5.1f}%)")
        print(f"  - Language mismatch:     {stats['language']:>6} ({pct(stats['language']):>5.1f}%)")
        if use_semantic_filter:
            print(f"  - Semantic similarity:   {stats['semantic']:>6} ({pct(stats['semantic']):>5.1f}%)")
        print(f"  - Duplicates:            {stats['duplicate']:>6} ({pct(stats['duplicate']):>5.1f}%)")
        print(f"{'='*60}")
        print(f"CLEAN PAIRS KEPT:          {stats['kept']:>6} ({pct(stats['kept']):>5.1f}%)")
        print(f"{'='*60}\n")

        print("\n" + "="*60)
        print("SAMPLE REJECTED PAIRS (Length Ratio):")
        print("="*60)
        # Up to 20 samples are collected; only the first 10 are shown.
        for i, sample in enumerate(rejected_length_samples[:10], 1):
            print(f"\n--- Sample {i} ---")
            print(f"EN ({sample['len_en']} words): {sample['en'][:150]}...")
            print(f"VI ({sample['len_vi']} words): {sample['vi'][:150]}...")
            print(f"Ratio: {sample['ratio']:.2f}")

    def __len__(self):
        """Return the number of clean pairs kept."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(en, vi)`` tuple at position ``idx``."""
        return self.data[idx]

# ---------------------------
# Usage
# ---------------------------
# Build the cleaned training split; constructing the dataset also prints its cleaning report.
train_dataset = ParallelTextDataset(
    src_file="/kaggle/input/nlp-dataset/train.en.txt",
    tgt_file="/kaggle/input/nlp-dataset/train.vi.txt",
)

# Same pipeline for the public test split.
test_dataset = ParallelTextDataset(
    src_file="/kaggle/input/nlp-dataset/public_test.en.txt",
    tgt_file="/kaggle/input/nlp-dataset/public_test.vi.txt",
)

Cleaning train.en.txt:  36%|███▋      | 181680/500000 [06:32<11:07, 477.08it/s]

In [None]:
# Preview up to the first 10 cleaned pairs; the bound avoids an IndexError
# when fewer than 10 pairs survive cleaning.
for i in range(min(10, len(train_dataset))):
    src, tgt = train_dataset[i]
    print(f"Source: {src}")
    print(f"Target: {tgt}")

print(len(train_dataset))

In [None]:
import json

def save_to_jsonl(dataset, filename):
    """Write every (en, vi) pair in ``dataset.data`` to ``filename`` as JSON Lines.

    Each output line is a JSON object with the keys "en" and "vi". Non-ASCII
    characters are written as-is in UTF-8 rather than escaped.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps({"en": en_text, "vi": vi_text}, ensure_ascii=False) + '\n'
            for en_text, vi_text in dataset.data
        )

# Persist both cleaned splits to the current working directory as JSON Lines.
save_to_jsonl(dataset=train_dataset, filename="train_cleaned.jsonl")
save_to_jsonl(dataset=test_dataset, filename="test_cleaned.jsonl")

In [6]:
import os

# NOTE(review): the variable name says "en" but the path points to the Vietnamese
# file (train.vi.txt) — confirm which side is meant to be checked.
train_en_path = "/kaggle/input/nlp-dataset/train.vi.txt"

def check_duplicates(file_path):
    """Print a duplicate-line report for a UTF-8 text file.

    Lines are compared after stripping surrounding whitespace. When the file
    does not exist, an error message is printed and nothing else happens.
    The function returns nothing.
    """
    if not os.path.exists(file_path):
        print(f"Error: File not found at {file_path}")
        return

    unique_lines = set()
    total_lines = 0

    # Stream the file; enumerate leaves the running line count in total_lines.
    with open(file_path, 'r', encoding='utf-8') as handle:
        for total_lines, raw_line in enumerate(handle, start=1):
            unique_lines.add(raw_line.strip())

    # Every line that did not add a new entry to the set is a duplicate.
    duplicate_count = total_lines - len(unique_lines)
    dup_percent = (duplicate_count / total_lines) * 100 if total_lines > 0 else 0

    report_lines = (
        f"--- Report for {os.path.basename(file_path)} ---",
        f"Total Lines:      {total_lines}",
        f"Unique Lines:     {len(unique_lines)}",
        f"Duplicate Lines:  {duplicate_count}",
        f"Duplicate %:      {dup_percent:.2f}%",
    )
    for report_line in report_lines:
        print(report_line)

    verdict = (
        "\nResult: The file contains duplicates."
        if duplicate_count > 0
        else "\nResult: No duplicates found."
    )
    print(verdict)

# Produce the duplicate report for the configured file.
check_duplicates(file_path=train_en_path)

--- Report for train.vi.txt ---
Total Lines:      500000
Unique Lines:     345687
Duplicate Lines:  154313
Duplicate %:      30.86%

Result: The file contains duplicates.
