In [None]:
pip install spacy


In [8]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m10.7 MB/s[0m  [33m0:00:01[0mta [36m0:00:01[0mm
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
import re
import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.language import Language
from spacy.symbols import PROPN

In [None]:
import spacy


def build_nlp():
    nlp = spacy.load("en_core_web_sm")
    nlp.max_length = 4_000_000

    # 1) Mark protected terms before NER
    ruler = nlp.add_pipe("entity_ruler", before="ner")
    patterns = [
        {"label": "PROTECTED_PROPN", "pattern": [{"LOWER": "elon"}, {"LOWER": "musk"}]},
        {"label": "PROTECTED_PROPN", "pattern": [{"LOWER": "tesla"}]},
        {"label": "PROTECTED_PROPN", "pattern": [{"LOWER": "spacex"}]},
    ]
    ruler.add_patterns(patterns)

    # 2) Register + add custom component
    @Language.component("merge_and_force_propn")
    def merge_and_force_propn(doc):
        spans = [ent for ent in doc.ents if ent.label_ == "PROTECTED_PROPN"]
        with doc.retokenize() as retok:
            for sp in spans:
                retok.merge(
                    sp,
                    attrs={"pos": PROPN, "tag": "NNP", "lemma": sp.text.lower(), "ent_type": sp.label}
                )
        for tok in doc:
            if tok.ent_type_ == "PROTECTED_PROPN":
                tok.pos = PROPN
                tok.tag_ = "NNP"
        return doc

    if "merge_and_force_propn" not in nlp.pipe_names:
        nlp.add_pipe("merge_and_force_propn", after="ner")

    return nlp

nlp = build_nlp()


In [23]:
# -----------------------------------------
# 2) Stopwords (keep Elon Musk / Tesla etc.)
# -----------------------------------------
STOPWORDS = {w for w in nlp.Defaults.stop_words}
# Ensure protected terms are NOT stopwords
for keep in ["elon", "musk", "elon musk", "tesla", "spacex"]:
    STOPWORDS.discard(keep)
    # Also discard split forms just in case
    for part in keep.split():
        STOPWORDS.discard(part)
print(f"Number of stopwords: {len(STOPWORDS)}")

Number of stopwords: 326


In [None]:
# ------------------------------------------------
# 3) Clean -> tokenize -> lemmatize -> stopwords
# ------------------------------------------------
def normalize_whitespace(text: str) -> str:
    return re.sub(r"\s+", " ", (text or "")).strip()

def to_tokens(text: str):
    """
    Returns a list of final tokens:
      - punctuation & spaces removed
      - protected terms merged and kept as single tokens: elon_musk, tesla, spacex
      - lowercased
      - lemmatized (except we keep lemma for words; for PROPN we use lower form)
      - stopwords removed
    """
    if not isinstance(text, str) or not text.strip():
        return []
    text = normalize_whitespace(text)
    if not text:
        return []

    doc = nlp(text)

    tokens = []
    for tok in doc:
        # Skip spaces/punctuation
        if tok.is_space or tok.is_punct:
            continue

        # Build raw lower form
        lower = tok.text.lower()

        # Collapse protected multi-word into underscored token
        # (after merging, protected is one token with ent_type_ set)
        if tok.ent_type_ == "PROTECTED_PROPN":
            if lower == "elon musk":
                norm = "elon_musk"
            else:
                norm = lower  # "tesla" or "spacex"
            tokens.append(norm)
            continue

        # Lemma (use lemma_ for normal words; for propn keep surface lower)
        if tok.pos_ == "PROPN":
            norm = lower
        else:
            lemma = tok.lemma_.lower()
            # Some lemmas are "-PRON-" or "" in small models; fallback to lower
            norm = lemma if lemma and lemma != "-pron-" else lower

        # Remove tokens that are stopwords or purely numeric or leftover punctuation
        if norm in STOPWORDS:
            continue
        if norm.isnumeric():
            continue
        if not re.search(r"[a-z0-9_]", norm):
            continue

        tokens.append(norm)

    return tokens
# Example usage
text = "  Elon Musk's Tesla cars are amazing! Visit https://tesla.com  "
print(to_tokens(text))

['elon_musk', 'tesla', 'car', 'amazing', 'visit', 'https://tesla.com']


In [None]:
# ------------------
# 4) N-gram helpers
# ------------------
def make_ngrams(tokens, n=2):
    return ["_".join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def token_pipeline(text: str, add_bigrams=True, add_trigrams=True):
    toks = to_tokens(text)
    if add_bigrams:
        toks += make_ngrams(toks, 2)
    if add_trigrams:
        toks += make_ngrams(toks, 3)
    return toks
    
# Example usage
text = "  Elon Musk's Tesla cars are like shit. Don't buy it anymore! Visit https://tesla.com  "
print(token_pipeline(text, add_bigrams=True, add_trigrams=True))


['elon_musk', 'tesla', 'car', 'amazing', 'visit', 'https://tesla.com', 'elon_musk_tesla', 'tesla_car', 'car_amazing', 'amazing_visit', 'visit_https://tesla.com', 'elon_musk_tesla_car', 'tesla_car_amazing', 'car_amazing_visit', 'amazing_visit_https://tesla.com', 'visit_https://tesla.com_elon_musk_tesla', 'https://tesla.com_elon_musk_tesla_tesla_car', 'elon_musk_tesla_tesla_car_car_amazing', 'tesla_car_car_amazing_amazing_visit', 'car_amazing_amazing_visit_visit_https://tesla.com']


In [28]:
# ------------------------------------------
# 5) Apply to your DataFrame (example usage)
# ------------------------------------------
df = pd.read_csv("nbc_articles_with_content_official.csv")
df["tokens"] = df["content"].apply(lambda t: to_tokens(t))
df["tokens_with_ngrams"] = df["content"].apply(lambda t: token_pipeline(t))
df["clean_text"] = df["tokens"].apply(lambda ts: " ".join(ts))
df.to_csv("nbc_articles_cleaned.csv", index=False)


#### Concate on the same date + topic modelling ####

In [32]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl (8.6 MB)
Using cached scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl (20.9 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-learn]
[1A[2KSuccessfully installed scikit-learn-1.7.2 scipy-1.16.2 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
# 3e + 3f: Concat-by-date → Topic Modelling (LDA)

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# ---------- 3e. Concat all same-date docs ----------
df = pd.read_csv("nbc_articles_cleaned.csv")

# ensure date is YYYY-MM-DD
df["date"] = pd.to_datetime(df["date"], errors="coerce").dt.date
df = df.dropna(subset=["date", "clean_text"])

# concat all cleaned texts per date
daily = (df.groupby("date")["clean_text"]
           .apply(lambda s: " ".join(map(str, s.dropna())))
           .reset_index(name="doc"))
# drop empty docs (if any)
daily = daily[daily["doc"].str.strip().ne("")].reset_index(drop=True)

print(f"Unique days: {len(daily)}")

# ---------- Vectorize ----------
# keep underscores in your merged entities/ngrams: elon_musk, tesla_stock, etc.
vect = CountVectorizer(
    token_pattern=r"(?u)\b[a-z0-9_]+\b",  # allow underscores
    lowercase=False,                      # your text is already normalized
    min_df=2,                             # drop ultra-rare tokens (tune)
    max_df=0.95                           # drop very common tokens (tune)
)
X = vect.fit_transform(daily["doc"])

# ---------- 3f. Topic Modelling (LDA) ----------
# tune n_components (topics) to 8–20; start with 12
lda = LatentDirichletAllocation(
    n_components=12,
    max_iter=20,
    learning_method="batch",
    random_state=42,
    n_jobs=-1
)
doc_topic = lda.fit_transform(X)     # shape: [n_days, n_topics]
topic_term = lda.components_         # shape: [n_topics, n_terms]
terms = pd.Index(vect.get_feature_names_out())

# ---------- Inspect & Save ----------
def top_terms_for_topic(k, topn=15):
    idx = topic_term[k].argsort()[::-1][:topn]
    return list(zip(terms[idx], topic_term[k, idx]))

# table of top terms per topic
topn = 15
rows = []
for k in range(lda.n_components):
    tops = top_terms_for_topic(k, topn)
    rows.append({
        "topic": k,
        "top_terms": ", ".join([w for w, _ in tops])
    })
topic_summary = pd.DataFrame(rows)

# per-day topic distribution + dominant topic
daily_topic_df = pd.DataFrame(doc_topic, columns=[f"topic_{i}" for i in range(lda.n_components)])
daily_topic_df.insert(0, "date", daily["date"].values)
daily_topic_df["dominant_topic"] = daily_topic_df.filter(like="topic_").idxmax(axis=1)
daily_topic_df["dominant_score"] = daily_topic_df.filter(like="topic_").max(axis=1)

# save artifacts
topic_summary.to_csv("topics_top_terms.csv", index=False)
daily_topic_df.to_csv("daily_topic_distribution.csv", index=False)

print("Saved:")
print(" - topics_top_terms.csv  (top terms per topic)")
print(" - daily_topic_distribution.csv  (per-day topic mixture + dominant topic)")


Unique days: 3186
Saved:
 - topics_top_terms.csv  (top terms per topic)
 - daily_topic_distribution.csv  (per-day topic mixture + dominant topic)
