In [None]:
!pip install spacy scikit-learn


In [8]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.7 MB/s[0m  [33m0:00:01[0mta [36m0:00:01[0mm
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import re
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.language import Language
from spacy.symbols import PROPN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#### **1. Tokenize, lemmatization, remove stopwords** 

In [None]:
def build_nlp():
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 4_000_000

    # 1) Mark protected terms before NER
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    patterns = [
        {"label": "PROTECTED_PROPN", "pattern": [{"LOWER": "elon"}, {"LOWER": "musk"}]},
        {"label": "PROTECTED_PROPN", "pattern": [{"LOWER": "tesla"}]},
        {"label": "PROTECTED_PROPN", "pattern": [{"LOWER": "spacex"}]},
    ]
    ruler.add_patterns(patterns)

    # 2) Register + add custom component
    @Language.component("merge_and_force_propn")
    def merge_and_force_propn(doc):
        spans = [ent for ent in doc.ents if ent.label_ == "PROTECTED_PROPN"]
        with doc.retokenize() as retok:
            for sp in spans:
                retok.merge(
                    sp,
                    attrs={"pos": PROPN, "tag": "NNP", "lemma": sp.text.lower(), "ent_type": sp.label}
                )
        for tok in doc:
            if tok.ent_type_ == "PROTECTED_PROPN":
                tok.pos = PROPN
                tok.tag_ = "NNP"
        return doc

    if "merge_and_force_propn" not in nlp.pipe_names:
        nlp.add_pipe("merge_and_force_propn", after="ner")

    return nlp

nlp = build_nlp()


In [7]:
# -----------------------------------------
# 2) Stopwords (expand aggressively)
# -----------------------------------------
BASE_STOPS = set(nlp.Defaults.stop_words)
MORE_STOPS = {
    # generic filler / vague words
    "people","think","use","thing","want","time","work","life","world","way","good","new","year",
    "day","come","look","talk","right","like","make","find","need","get","say","take","know", "we", 
    "i", "that", "lot", "m"
    # twitter/web artifacts
    "amp","rt"
}
STOPWORDS = BASE_STOPS | MORE_STOPS
# keep protected terms
for keep in ["elon", "musk", "elon musk", "tesla", "spacex"]:
    for part in keep.split():
        STOPWORDS.discard(part)
    STOPWORDS.discard(keep)

print(f"Number of stopwords: {len(STOPWORDS)}")

Number of stopwords: 351


In [None]:
# ------------------------------------------------
# 3) Clean -> tokenize -> lemmatize -> stopwords
# ------------------------------------------------
def normalize_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", (text or "")).strip()

def to_tokens(text: str):
    """
    Returns a list of final tokens:
      - punctuation & spaces removed
      - protected terms merged and kept as single tokens: elon_musk, tesla, spacex
      - lowercased
      - lemmatized (except we keep lemma for words; for PROPN we use lower form)
      - stopwords removed
    """
    if not isinstance(text, str) or not text.strip():
        return []
    text = normalize_whitespace(text)
    if not text:
        return []

    doc = nlp(text)

    tokens = []
    for tok in doc:
        # Skip spaces/punctuation
        if tok.is_space or tok.is_punct:
            continue

        # Build raw lower form
        lower = tok.text.lower()

        # Collapse protected multi-word into underscored token
        # (after merging, protected is one token with ent_type_ set)
        if tok.ent_type_ == "PROTECTED_PROPN":
            if lower == "elon musk":
                norm = "elon_musk"
            else:
                norm = lower  # "tesla" or "spacex"
            tokens.append(norm)
            continue

        # Lemma (use lemma_ for normal words; for propn keep surface lower)
        if tok.pos_ == "PROPN":
            norm = lower
        else:
            lemma = tok.lemma_.lower()
            # Some lemmas are "-PRON-" or "" in small models; fallback to lower
            norm = lemma if lemma and lemma != "-pron-" else lower

        # Remove tokens that are stopwords or purely numeric or leftover punctuation
        if norm in STOPWORDS:
            continue
        if norm.isnumeric():
            continue
        if not re.search(r"[a-z0-9_]", norm):
            continue

        tokens.append(norm)

    return tokens
# Example usage
text = "  Elon Musk's Tesla cars are amazing! Visit https://tesla.com  "
print(to_tokens(text))

['elon_musk', 'tesla', 'car', 'amazing', 'visit', 'https://tesla.com']


In [None]:
# ------------------
# 4) N-gram helpers
# ------------------
def make_ngrams(tokens, n=2):
    return ["_".join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def token_pipeline(text: str, add_bigrams=True, add_trigrams=True):
    toks = to_tokens(text)
    if add_bigrams:
        toks += make_ngrams(toks, 2)
    if add_trigrams:
        toks += make_ngrams(toks, 3)
    return toks
    
# Example usage
text = "  Elon Musk's Tesla cars are like shit. Don't buy it anymore! Visit https://tesla.com  "
print(token_pipeline(text, add_bigrams=True, add_trigrams=True))


['elon_musk', 'tesla', 'car', 'amazing', 'visit', 'https://tesla.com', 'elon_musk_tesla', 'tesla_car', 'car_amazing', 'amazing_visit', 'visit_https://tesla.com', 'elon_musk_tesla_car', 'tesla_car_amazing', 'car_amazing_visit', 'amazing_visit_https://tesla.com', 'visit_https://tesla.com_elon_musk_tesla', 'https://tesla.com_elon_musk_tesla_tesla_car', 'elon_musk_tesla_tesla_car_car_amazing', 'tesla_car_car_amazing_amazing_visit', 'car_amazing_amazing_visit_visit_https://tesla.com']


In [28]:
# ------------------------------------------
# 5) Apply to your DataFrame (example usage)
# ------------------------------------------
df = pd.read_csv("nbc_articles_with_content_official.csv")
df["tokens"] = df["content"].apply(lambda t: to_tokens(t))
df["tokens_with_ngrams"] = df["content"].apply(lambda t: token_pipeline(t))
df["clean_text"] = df["tokens"].apply(lambda ts: " ".join(ts))
df.to_csv("nbc_articles_cleaned.csv", index=False)


#### **2. Topic modelling - Article level** ####

In [8]:
# 1) POS-filtered tokens: keep only PROPN & NOUN (Tesla, SpaceX, Autopilot, battery, etc.)
def to_tokens_nouns(text: str):
    if not isinstance(text, str) or not text.strip():
        return []
    doc = nlp(text)
    out = []
    for tok in doc:
        if tok.is_space or tok.is_punct or tok.like_url or tok.like_email:
            continue
        if tok.ent_type_ == "PROTECTED_PROPN":
            out.append(tok.lemma_.lower().replace(" ", "_"))
            continue
        if tok.pos_ not in {"PROPN", "NOUN"}:
            continue
        if tok.pos_ == "PROPN":
            norm = tok.text.lower()
        else:
            lemma = tok.lemma_.lower()
            norm = lemma if lemma and lemma != "-pron-" else tok.text.lower()
        if norm in STOPWORDS or norm.isnumeric() or not re.search(r"[a-z0-9_]", norm):
            continue
        out.append(norm)
    return out

# 2) Build article-level text using only nouns/proper nouns
df = pd.read_csv("nbc_articles_cleaned.csv")
df["content"] = df["content"].fillna("").astype(str)  # use original content, not previously-cleaned string
df["tokens_nouns"] = df["content"].apply(to_tokens_nouns)
df["clean_text_nouns"] = df["tokens_nouns"].apply(lambda ts: " ".join(ts))
df = df[df["clean_text_nouns"].str.strip().ne("")].reset_index(drop=True)

# 3) Article-level LDA using only PROPN+NOUN
extra_stops = ["s","t","u","re","rt","amp","know","don"]
vect = CountVectorizer(
    stop_words=extra_stops,
    token_pattern=r"(?u)\b[a-z0-9_]+\b",
    lowercase=False,
    min_df=5,
    max_df=0.95
)
X = vect.fit_transform(df["clean_text_nouns"])

n_topics = 10
lda = LatentDirichletAllocation(
    n_components=n_topics, max_iter=20, learning_method="batch",
    random_state=42, n_jobs=-1
)
doc_topic = lda.fit_transform(X)
topic_term = lda.components_
terms = vect.get_feature_names_out()

def top_terms_for_topic(k, topn=10):
    idx = topic_term[k].argsort()[::-1][:topn]
    return ", ".join(terms[i] for i in idx)

topic_summary = pd.DataFrame({
    "topic": range(n_topics),
    "top_terms": [top_terms_for_topic(k) for k in range(n_topics)]
})

article_topic_df = pd.DataFrame(doc_topic, columns=[f"topic_{i}" for i in range(n_topics)])
for col in ["date","title","link"]:
    if col in df.columns: article_topic_df.insert(0, col, df[col].values)
article_topic_df["dominant_topic"] = article_topic_df.filter(like="topic_").idxmax(axis=1)
article_topic_df["dominant_score"] = article_topic_df.filter(like="topic_").max(axis=1)

topic_summary.to_csv("articles_topics_top_terms.csv", index=False)
article_topic_df.to_csv("article_topic_distribution.csv", index=False)
print("Saved article-level LDA using only PROPN+NOUN.")


Saved article-level LDA using only PROPN+NOUN.
