In [1]:
# Spec 7 â€” Alternative Measures for Stopword Identification
# Based on: Sarica & Luo (2021) "Stopwords in technical language processing"
# Test IDF, TF-IDF, Entropy, Information Content, Information Gain, KL Divergence
# Compare each measure to NLTK stopword list (|L| words)

import os, math, nltk, numpy as np, pandas as pd
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords, webtext, gutenberg
from nltk import sent_tokenize, word_tokenize
from IPython.display import display

# ---------- Setup ----------
ART_DIR = os.path.abspath("../artifacts" if os.path.basename(os.getcwd())=="notebooks" else "./artifacts")
os.makedirs(ART_DIR, exist_ok=True)

nltk.download('punkt', quiet=True)
nltk.download('webtext', quiet=True)
nltk.download('gutenberg', quiet=True)
nltk.download('stopwords', quiet=True)

# ---------- Helper functions ----------
def clean_words(text):
    """Return lowercase alphabetic tokens."""
    toks = [w.lower() for w in word_tokenize(text) if w.isalpha()]
    return toks

def make_sentences(corpus_words):
    """Treat each sentence as a 'document'."""
    text = " ".join(corpus_words)
    sents = sent_tokenize(text)
    docs = [" ".join(clean_words(s)) for s in sents if len(s.split()) > 2]
    return docs

def entropy(prob_list):
    """Shannon entropy (base e)."""
    return -sum(p * math.log(p + 1e-12) for p in prob_list)

# ---------- Load corpus ----------
corpus = []
for fid in webtext.fileids()[:5]:
    corpus.extend(webtext.words(fid))
if len(corpus) < 20000:  # fallback
    for fid in gutenberg.fileids()[:2]:
        corpus.extend(gutenberg.words(fid))

docs = make_sentences(corpus)
print(f"Documents (sentences): {len(docs)}")

# ---------- Stopword list ----------
L = set(stopwords.words('english'))
TopN = len(L)

# ---------- Frequency (baseline) ----------
freq = Counter(w for d in docs for w in d.split())
top_freq = [w for w, _ in freq.most_common(TopN)]
overlap_freq = len([w for w in top_freq if w in L])

# ---------- IDF & TF-IDF ----------
tfidf_vec = TfidfVectorizer(token_pattern=r"\b[a-zA-Z]+\b", lowercase=True)
tfidf = tfidf_vec.fit_transform(docs)
idf_scores = dict(zip(tfidf_vec.get_feature_names_out(), tfidf_vec.idf_))
tfidf_scores = np.asarray(tfidf.mean(axis=0)).flatten()
tfidf_scores = dict(zip(tfidf_vec.get_feature_names_out(), tfidf_scores))

top_idf = sorted(idf_scores, key=idf_scores.get, reverse=True)[:TopN]
top_tfidf = sorted(tfidf_scores, key=tfidf_scores.get, reverse=True)[:TopN]
overlap_idf = len([w for w in top_idf if w in L])
overlap_tfidf = len([w for w in top_tfidf if w in L])

# ---------- Entropy ----------
word_doc_counts = defaultdict(lambda: np.zeros(len(docs)))
for i, d in enumerate(docs):
    for w in set(d.split()):
        word_doc_counts[w][i] = 1
entropy_scores = {w: entropy(v / v.sum()) if v.sum() > 0 else 0 for w, v in word_doc_counts.items()}
top_entropy = sorted(entropy_scores, key=entropy_scores.get, reverse=True)[:TopN]
overlap_entropy = len([w for w in top_entropy if w in L])

# ---------- Information Content ----------
total_tokens = sum(freq.values())
info_content = {w: -math.log(freq[w]/total_tokens + 1e-12) for w in freq}
top_ic = sorted(info_content, key=info_content.get)[:TopN]  # lowest = most common
overlap_ic = len([w for w in top_ic if w in L])

# ---------- Information Gain (approx) ----------
# simplified version: difference in entropy when removing a word
H_total = entropy(np.array(list(freq.values())) / total_tokens)
IG_scores = {}
for w in freq:
    p_w = freq[w] / total_tokens
    if p_w == 0: continue
    freq_wo = freq.copy()
    del freq_wo[w]
    total_wo = sum(freq_wo.values())
    H_wo = entropy(np.array(list(freq_wo.values())) / total_wo)
    IG_scores[w] = H_total - H_wo
top_ig = sorted(IG_scores, key=IG_scores.get, reverse=True)[:TopN]
overlap_ig = len([w for w in top_ig if w in L])

# ---------- Kullback-Leibler Divergence ----------
# measure uniformity of word distribution across docs
KL_scores = {}
for w, v in word_doc_counts.items():
    p = v / (v.sum() + 1e-12)
    uniform = np.ones(len(p)) / len(p)
    KL = np.sum(p * np.log((p + 1e-12) / (uniform + 1e-12)))
    KL_scores[w] = KL
top_kl = sorted(KL_scores, key=KL_scores.get)[:TopN]  # smaller KL = more uniform
overlap_kl = len([w for w in top_kl if w in L])

# ---------- Results summary ----------
results = pd.DataFrame([
    ("Frequency", overlap_freq, TopN, overlap_freq/TopN),
    ("IDF", overlap_idf, TopN, overlap_idf/TopN),
    ("TF-IDF", overlap_tfidf, TopN, overlap_tfidf/TopN),
    ("Entropy", overlap_entropy, TopN, overlap_entropy/TopN),
    ("Info_Content", overlap_ic, TopN, overlap_ic/TopN),
    ("Info_Gain", overlap_ig, TopN, overlap_ig/TopN),
    ("KL_Divergence", overlap_kl, TopN, overlap_kl/TopN)
], columns=["Measure", "Overlap", "TopN", "Ratio"])
results["Ratio %"] = (results["Ratio"]*100).round(2)

csv_path = os.path.join(ART_DIR, "spec7_alternative_measures.csv")
results.to_csv(csv_path, index=False)
display(results)

print("\nSaved (relative):")
print(f"- artifacts/{os.path.basename(csv_path)}")


Documents (sentences): 23600


Unnamed: 0,Measure,Overlap,TopN,Ratio,Ratio %
0,Frequency,91,198,0.459596,45.96
1,IDF,0,198,0.0,0.0
2,TF-IDF,92,198,0.464646,46.46
3,Entropy,91,198,0.459596,45.96
4,Info_Content,91,198,0.459596,45.96
5,Info_Gain,40,198,0.20202,20.2
6,KL_Divergence,91,198,0.459596,45.96



Saved (relative):
- artifacts/spec7_alternative_measures.csv
