In [1]:
pip install requests beautifulsoup4 lxml pandas tqdm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
import csv
import glob
import os
import re
from glob import iglob

# A "word" is a run of ASCII letters and straight apostrophes. Digits, hyphens,
# non-ASCII letters and typographic apostrophes are not part of the class, so
# they act as separators between matches.
WORD_RE = re.compile(r"[A-Za-z']+")
# Sentence boundary: a run of whitespace immediately preceded by '.', '!' or '?'.
# The look-behind keeps the punctuation attached to the sentence on its left.
SENT_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
# An "http://" / "https://" scheme or a "www." prefix, in any letter case.
URL_RE = re.compile(r'https?://|www\.', re.IGNORECASE)
# Text shaped like an e-mail address: local part, '@', domain, '.', and a
# suffix of at least two ASCII letters.
EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
# Any single decimal digit.
DIGIT_RE = re.compile(r'\d')

def split_sentences(text: str):
    """Break ``text`` into sentences.

    The text is trimmed and every whitespace run is collapsed to one space,
    then it is split wherever SENT_SPLIT_RE matches (whitespace that follows
    '.', '!' or '?').

    Returns:
        A list of sentence strings; an empty list when nothing but whitespace
        was supplied.
    """
    flattened = re.sub(r'\s+', ' ', text.strip())
    return SENT_SPLIT_RE.split(flattened) if flattened else []

def tokenize(text: str):
    """Return every WORD_RE match found in the lower-cased ``text``, in order."""
    lowered = text.lower()
    return WORD_RE.findall(lowered)

def is_story_like(s: str) -> bool:
    """Heuristically decide whether a sentence looks like narrative text.

    A sentence is rejected when, after trimming, it is empty, contains a URL,
    an e-mail address or any digit, or has three or more whitespace-separated
    tokens that are longer than two characters and entirely upper-case.

    Returns:
        True when none of the rejection rules fire, False otherwise.
    """
    candidate = s.strip()
    if not candidate:
        return False

    # Any one of these patterns disqualifies the sentence.
    if any(rx.search(candidate) for rx in (URL_RE, EMAIL_RE, DIGIT_RE)):
        return False

    shouted = [tok for tok in candidate.split() if len(tok) > 2 and tok.isupper()]
    return len(shouted) < 3

def _process_file(path: str, filter_non_story: bool = True):
    """Extract the five-word sentences contained in one text file.

    Args:
        path: File to read. Undecodable bytes are dropped rather than raising.
        filter_non_story: When True, sentences rejected by ``is_story_like``
            are skipped before their words are counted.

    Returns:
        A list of dicts with the keys ``sentence_raw`` (the trimmed sentence),
        ``sentence_clean`` and ``tokens`` (both hold the same space-joined,
        lower-cased tokens). If the file cannot be read, a warning is printed
        and an empty list is returned.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 as well, but discards a byte-order mark
        # at the start of the file. With plain "utf-8" the mark survives as
        # U+FEFF, is not removed by str.strip(), and ends up inside the first
        # sentence's raw text.
        with open(path, 'r', encoding='utf-8-sig', errors='ignore') as f:
            raw = f.read()
    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch.
        print(f"[WARN] Skip {path}: {e}")
        return []

    out = []
    for s in split_sentences(raw):
        if filter_non_story and not is_story_like(s):
            continue
        toks = tokenize(s)
        # Keep only sentences made of exactly five tokens.
        if len(toks) == 5:
            joined = " ".join(toks)
            out.append({
                "sentence_raw": s.strip(),
                "sentence_clean": joined,
                "tokens": joined,
            })
    return out

def _iter_txt_files(p: str):
    if os.path.isfile(p):
        if p.lower().endswith(".txt"):
            yield p
        else:
            print(f"[WARN] Not a .txt: {p}")
        return
    for q in iglob(os.path.join(p, "**", "*.txt"), recursive=True):
        yield q

def _load_seen_keys(csv_path: str, key_col: str):
    """Muat set nilai kolom 'key_col' dari CSV (untuk de-dup)."""
    seen = set()
    if not os.path.isfile(csv_path):
        return seen
    try:
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            r = csv.DictReader(f)
            for row in r:
                k = row.get(key_col)
                if k:
                    seen.add(k)
    except Exception as e:
        print(f"[WARN] Gagal baca CSV untuk de-dup: {e}")
    return seen

def preprocess_5word_sentences(input_path: str,
                               output_csv: str = "five_word_sentences.csv",
                               filter_non_story: bool = True,
                               append: bool = True,
                               dedupe: bool = True,
                               dedupe_key: str = "sentence_clean"):
    """Extract five-word sentences from text file(s) and save them as CSV.

    Args:
        input_path: A ``.txt`` file or a directory searched for ``.txt`` files.
        output_csv: Destination CSV; its parent directory is created if needed.
        filter_non_story: Forwarded to ``_process_file``.
        append: When True and ``output_csv`` already exists, new rows are
            appended; otherwise the file is (re)written from scratch.
        dedupe: When True, a row is skipped if its ``dedupe_key`` value was
            already written in this run or, in append mode, is already in
            the existing file.
        dedupe_key: Column used for de-duplication; must be one of the CSV
            columns.

    Returns:
        Every extracted row (list of dicts), including rows that were not
        written because they were duplicates.

    Raises:
        ValueError: If ``dedupe`` is True and ``dedupe_key`` is not a CSV
            column.
    """
    fieldnames = ["sentence_raw", "sentence_clean", "tokens"]
    # With an unknown key every row's key would be None, so all rows after the
    # first would be dropped as "duplicates" without any hint of the mistake.
    if dedupe and dedupe_key not in fieldnames:
        raise ValueError(
            f"dedupe_key must be one of {fieldnames}, got {dedupe_key!r}"
        )

    rows, nfiles = [], 0
    for fp in _iter_txt_files(input_path):
        nfiles += 1
        rows.extend(_process_file(fp, filter_non_story=filter_non_story))

    os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)

    appending = append and os.path.isfile(output_csv)
    mode = "a" if appending else "w"
    # An existing but empty file still needs its header row.
    needs_header = (not appending) or os.path.getsize(output_csv) == 0

    # Read the previously stored keys BEFORE the file is opened for writing,
    # and only when appending: in "w" mode the old content is discarded, so it
    # must not suppress any of the new rows.
    seen = _load_seen_keys(output_csv, dedupe_key) if (dedupe and appending) else set()

    written = 0
    with open(output_csv, mode, newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            w.writeheader()

        for r in rows:
            if dedupe:
                k = r.get(dedupe_key)
                if k in seen:
                    continue
                seen.add(k)
            w.writerow(r)
            written += 1

    print(f"[DONE] files={nfiles}  extracted(5 kata)={len(rows)}  written={written}  saved={output_csv}")
    return rows


In [None]:
# All books sit in one folder and feed the same CSV, so the folder, the output
# file and the list of titles are each stated once instead of being repeated in
# four copy-pasted calls.
CORPUS_DIR = r"C:\Users\Neptune\Documents\tugas_kefas\Semester 5\NLP\ancog"
OUTPUT_CSV = CORPUS_DIR + "\\" + "five_word_sentences.csv"
BOOK_TITLES = (
    "The Hound of the Baskervilles",
    "A Study In Scarlet",
    "The Sign of the Four",
    "The Valley of Fear",
)

for title in BOOK_TITLES:
    # Paths are joined with a literal backslash so the strings passed on are
    # exactly the ones used before. `rows` is rebound on every pass, so after
    # the loop it holds the rows of the last book, as it did previously.
    rows = preprocess_5word_sentences(CORPUS_DIR + "\\" + title + ".txt", OUTPUT_CSV)




[DONE] files=1  extracted(5 kata)=86  written=86  saved=C:\Users\Neptune\Documents\tugas_kefas\Semester 5\NLP\ancog\five_word_sentences.csv
[DONE] files=1  extracted(5 kata)=55  written=55  saved=C:\Users\Neptune\Documents\tugas_kefas\Semester 5\NLP\ancog\five_word_sentences.csv
[DONE] files=1  extracted(5 kata)=75  written=75  saved=C:\Users\Neptune\Documents\tugas_kefas\Semester 5\NLP\ancog\five_word_sentences.csv
[DONE] files=1  extracted(5 kata)=115  written=115  saved=C:\Users\Neptune\Documents\tugas_kefas\Semester 5\NLP\ancog\five_word_sentences.csv
