In [1]:
# Prerequisite (run once if langdetect is not installed): %pip install langdetect

In [8]:
import os
import re
import shutil
import numpy as np
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory

### 1. Organize File Paths

In [3]:
# Directory holding the .txt files that get screened for English content.
# The path is relative, so it resolves against the current working directory.
INPUT_TXT_DIR = 'privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT'
# Destination for the English-only copies: a subdirectory of the input directory.
OUTPUT_TXT_DIR = os.path.join(INPUT_TXT_DIR, 'ENG_FILES_TXT')   # keep it nested

In [4]:
# langdetect's algorithm is non-deterministic by default; fixing the factory
# seed before the first detect() call makes repeated runs return the same
# language for the same text.
DetectorFactory.seed = 0

def is_english(text: str) -> bool:
    """
    Report whether `text` is detected as English.

    Returns True only when `detect` labels the text 'en'. Any other
    language code, or any exception raised during detection, yields False.

    NOTE(review): the full `text` is handed to `detect`; nothing is
    truncated here. The earlier "sample first 10 000 chars" remark
    presumably refers to a cap inside langdetect itself -- confirm against
    the library before relying on it.
    """
    try:
        language_code = detect(text)
    except Exception:
        # Best-effort: a detection failure is treated as "not English".
        return False
    return language_code == 'en'

In [5]:
def load_if_english(path: str):
    """
    Read the file at `path` as UTF-8 and return its text if it is English.

    Returns the full file contents when `is_english` accepts them,
    otherwise None. Errors from opening or decoding the file are not
    caught here and propagate to the caller.
    """
    with open(path, encoding='utf-8') as handle:
        contents = handle.read()

    # Hand back the text only when it passes the language check.
    return contents if is_english(contents) else None

In [6]:
# Collect the full paths of every English-language .txt file in INPUT_TXT_DIR.
# Entries without a .txt extension (case-insensitive) are ignored, and the
# resulting list is sorted by path.
english_files = sorted(
    candidate_path
    for candidate_path in (
        os.path.join(INPUT_TXT_DIR, entry)
        for entry in os.listdir(INPUT_TXT_DIR)
        if entry.lower().endswith('.txt')
    )
    if load_if_english(candidate_path) is not None
)

print(f"Done. English files found: {len(english_files)}")

Done. English files found: 34


In [9]:
## Copy English files into the output directory

os.makedirs(OUTPUT_TXT_DIR, exist_ok=True)  # ensure destination exists

# Track what actually happens so the summary below is accurate on re-runs,
# when some or all files are already present in the destination.
copied_count = 0
skipped_count = 0

for src_path in english_files:
    fname = os.path.basename(src_path)   # just the filename
    dst_path = os.path.join(OUTPUT_TXT_DIR, fname)
    if os.path.exists(dst_path):
        # Never overwrite an existing copy; count it as skipped instead.
        skipped_count += 1
        continue
    # copy2 (rather than shutil.move) leaves the source file in place and
    # also attempts to preserve file metadata such as timestamps.
    shutil.copy2(src_path, dst_path)
    copied_count += 1

print(f"Copied {copied_count} files to {OUTPUT_TXT_DIR}")
if skipped_count:
    print(f"Skipped {skipped_count} files already present in {OUTPUT_TXT_DIR}")

Copied 34 files to privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT/ENG_FILES_TXT
