In [4]:
import re
from pathlib import Path
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def ensure_nltk():
    # tagger (new name first, then fallback)
    try:
        nltk.data.find("taggers/averaged_perceptron_tagger_eng")
    except LookupError:
        try:
            nltk.download("averaged_perceptron_tagger_eng", quiet=True)
        except Exception:
            nltk.download("averaged_perceptron_tagger", quiet=True)
    # tokenizers / corpora
    for pkg in ["punkt", "stopwords", "wordnet", "omw-1.4"]:
        try:
            nltk.data.find(f"corpora/{pkg}")
        except LookupError:
            nltk.download(pkg, quiet=True)

ensure_nltk()
from nltk import pos_tag, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

ALL_PATH = Path("all_musk_posts.csv")

df_raw = pd.read_csv(ALL_PATH, low_memory=False)
text_col = "fullText" if "fullText" in df_raw.columns else ("text" if "text" in df_raw.columns else None)
if not text_col:
    raise ValueError("Expected a text column named 'fullText' or 'text'.")
df = df_raw[[text_col, "createdAt"]].rename(columns={text_col: "text"}).dropna(subset=["text"]).copy()
df["text"] = df["text"].astype(str)

URL_RE   = re.compile(r"http\S+|www\.\S+")
USER_RE  = re.compile(r"@\w+")
EMOJI_RE = re.compile(
    "["  # broad emoji ranges
    "\U0001F1E0-\U0001F1FF"
    "\U0001F300-\U0001F5FF"
    "\U0001F600-\U0001F64F"
    "\U0001F680-\U0001F6FF"
    "\U0001F700-\U0001F77F"
    "\U0001F780-\U0001F7FF"
    "\U0001F800-\U0001F8FF"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U0001FA70-\U0001FAFF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)
PUNCT_RE = re.compile(r"[^\w\s$]")  # keep $ (cashtags: $TSLA)

base_stops = set(stopwords.words("english"))
extra = {
    # conversational fillers
    "rt","amp","im","ive","youre","weve","hes","shes","its","dont","cant","wont",
    "yeah","ok","okay","true","haha","ha","wow","cool","good","great","thanks","thank",
    "exactly","right","love","like","really","much","many","one","thing","things",
    "today","tomorrow","yesterday","time","year","years","people","guys"
}
stops = base_stops | extra
lemm = WordNetLemmatizer()

def basic_cleanup(t: str) -> str:
    t = t.lower()
    t = URL_RE.sub(" ", t)
    t = USER_RE.sub(" ", t)
    t = EMOJI_RE.sub(" ", t)
    t = PUNCT_RE.sub(" ", t)    
    t = re.sub(r"\s+", " ", t).strip()
    return t

#POS taggger
def clean_with_pos(text: str) -> str:
    t = basic_cleanup(text)
    toks = word_tokenize(t)
    if not toks:
        return ""
    tags = pos_tag(toks)
    out = []
    for w, p in tags:
        if w in stops or len(w) <= 2:
            continue
        if w.startswith("$") or re.fullmatch(r"\d+(m|b|%)?", w):
            out.append(w.upper()); continue
        if p.startswith("NN"):
            out.append(lemm.lemmatize(w))
    return " ".join(out)

#Without POS
def clean_simple(text: str) -> str:
    t = basic_cleanup(text)
    out = []
    for w in t.split():
        if w in stops or len(w) <= 2:
            continue
        if w.startswith("$") or re.fullmatch(r"\d+(m|b|%)?", w):
            out.append(w.upper()); continue
        out.append(lemm.lemmatize(w))
    return " ".join(out)

try:
    _ = pos_tag(["test"])
    use_pos = True
except Exception:
    use_pos = False

clean_fn = clean_with_pos if use_pos else clean_simple
print(f"Cleaning mode: {'POS-filtered nouns' if use_pos else 'simple (no POS)'}")

df["clean"] = df["text"].apply(clean_fn)

df = df[df["clean"].str.split().str.len() >= 3].copy()
print("Kept rows after cleaning:", len(df))

df[["text", "clean"]].to_csv("elon_musk_clean_tweets.csv", index=False, encoding="utf-8")

#tfidf
tfidf = TfidfVectorizer(
    ngram_range=(1,3),   
    min_df=10,          
    max_df=0.85,       
)
X = tfidf.fit_transform(df["clean"])
vocab = np.array(tfidf.get_feature_names_out())
print("TF-IDF shape:", X.shape)

# NMF topic
n_topics = 8   
nmf = NMF(n_components=n_topics, random_state=42, init="nndsvd", max_iter=400)
W = nmf.fit_transform(X)   
H = nmf.components_        

def topics_table(model, feat_names, topn=12) -> pd.DataFrame:
    rows = []
    for k, comp in enumerate(model.components_):
        idx = comp.argsort()[::-1][:topn]
        rows.extend([{"topic": k, "term": feat_names[i], "weight": comp[i]} for i in idx])
    out = pd.DataFrame(rows)
    for k in range(model.n_components):
        terms = ", ".join(out[out.topic==k].sort_values("weight", ascending=False)["term"].head(12))
        print(f"Topic {k:02d}: {terms}")
    return out

topics_df = topics_table(nmf, vocab, topn=12)

df["topic_id"] = W.argmax(axis=1)
df["topic_score"] = W.max(axis=1)

df.to_csv("elon_musk_clean_topics.csv", index=False, encoding="utf-8")
topics_df.to_csv("nmf_topics_terms.csv", index=False, encoding="utf-8")

print("Saved:")
print("- elon_musk_clean_topics.csv  (text, clean, topic_id, topic_score)")
print("- nmf_topics_terms.csv        (top terms per topic)")



Cleaning mode: POS-filtered nouns
Kept rows after cleaning: 19404
TF-IDF shape: (19404, 2078)
Topic 00: tesla, company, autopilot, tesla team, tesla model, software, owner, tesla owner, tesla autopilot, tesla car, supercharger, vehicle
Topic 01: medium, legacy, legacy medium, propaganda, lie, machine, platform, medium propaganda, truth, news, medium lie, link
Topic 02: launch, rocket, falcon, starship, flight, engine, starlink, space, spacex, dragon, earth, mar
Topic 03: car, model, production, company, tesla car, month, week, cost, part, car company, software, model car
Topic 04: twitter, government, world, account, money, company, speech, country, law, platform, america, party
Topic 05: day, point, hour, post, night, person, news, house, end, end day, system, 100
Topic 06: way, future, civilization, company, kid, money, platform, game, earth, level, need, tunnel
Topic 07: team, lot, work, spacex, week, tesla team, spacex team, congrats, game, engineering, congratulation, lot work
Sav

In [6]:
df

Unnamed: 0,text,createdAt,clean,topic_id,topic_score
0,RT @einarvollset: I read @paulg’s “How to Mak...,2023-05-07 10:36:27+00:00,wealth hacker painter mid twenty piece,4,0.002072
2,RT @BillyM2k: dude bookmarks are an awesome tw...,2023-02-09 20:03:00+00:00,dude bookmark twitter feature twitter,4,0.158176
3,Event Horizon Balance Beam,2023-05-12 05:52:26+00:00,event horizon balance beam,4,0.002617
5,RT @SpaceX: Watch Falcon 9 launch 54 Starlink ...,2022-12-28 09:43:36+00:00,watch falcon launch starlink satellite,2,0.107221
10,"To get Blue Verified for $7/month, sign up via...",2023-03-23 21:52:23+00:00,month sign web browser,3,0.007937
...,...,...,...,...,...
55053,Sterilizing minor children before the age of c...,2024-12-04 14:08:44+00:00,child age consent surgery,4,0.003809
55074,Extreme birth rate collapse is the biggest dan...,2024-12-03 17:14:13+00:00,birth rate collapse danger civilization,6,0.008933
55081,@ajtourville @Tesla Rep. Khanna is a sensible ...,2024-12-03 15:04:37+00:00,rep khanna moderate,0,0.000000
55090,The scale of spending on illegal immigration b...,2024-12-03 02:10:20+00:00,scale spending immigration mind,4,0.018651
