In [None]:
# One-time environment setup (Colab).
#
# This cell is intentionally disabled. It used to be "commented out" with an
# opening ''' that was never closed, which makes the cell a SyntaxError and
# stops Restart & Run All. Plain # comments keep it inert and valid.
#
# To (re)install the pinned wheels: uncomment the lines below, run this cell
# once, then restart the runtime before running the rest of the notebook.
#
# !pip -q install "numpy<2.0" "scipy<1.11" "gensim==4.3.2" \
#                 tqdm pandas matplotlib scikit-learn spacy==3.7.4
#
# # spaCy English model
# !python -m spacy download en_core_web_sm -qq
# print("✅  All wheels installed — now click  Runtime ▸ Restart  .")


In [None]:
# ---------- imports -----------------------------
import re, datetime as dt, itertools, numpy as np, matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

import spacy, gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

# ---------- user config -------------------------
# Input: the master transcript file that gets split into individual calls.
SOURCE_FILE = Path("/content/EQIX_transcripts.txt")

# Outputs: one text file per call, plus the two CSV tables written further down.
CALL_DIR = Path("/content/calls_split")
CALL_DIR.mkdir(exist_ok=True)
TOPIC_CSV = Path("/content/call_topics.csv")
DETAIL_CSV = Path("/content/call_topic_details.csv")

# Vocabulary trim, expressed as fractions of the number of calls:
# a token is kept when it appears in at least MIN_DF and at most MAX_DF of them.
MIN_DF = 0.01
MAX_DF = 0.25

# Topic-model settings.
K_RANGE = range(5, 21)   # candidate topic counts tried by the grid search
TOPN_WORDS = 12          # keywords printed for each topic of the best model

# Long-table settings.
TOP_K_PER_CALL = 5       # strongest topics recorded for each call
MIN_PROB_TO_KEEP = 0.02  # topics weaker than this are left out of the table

# ---------- split master transcript -------------
# Header line that opens each call inside the master transcript, e.g.
#   ===== 2021 Q3 (2021-11-03 17:30:00) =====
HDR = re.compile(r"^=+\s*(?P<year>\d{4})\s*Q(?P<qtr>[1-4])\s*\((?P<ts>[\d\- :]+)\)\s*=+", re.MULTILINE)


def split(path, out_dir=None):
    """Split a master transcript into one text body per call.

    Every match of ``HDR`` starts a call; its body is the text up to the next
    header (or the end of the file), stripped of surrounding whitespace. Each
    body is also written to ``<out_dir>/<call_id>.txt``.

    Parameters
    ----------
    path : str or pathlib.Path
        Master transcript file.
    out_dir : str or pathlib.Path, optional
        Directory receiving the per-call files. Defaults to the module-level
        ``CALL_DIR``. It is created if missing.

    Returns
    -------
    meta : list[dict]
        One ``{"call_id", "timestamp"}`` row per distinct call id, in the same
        order as ``calls``.
    calls : dict[str, str]
        Maps call id (``"<year>-Q<quarter>"``) to the call body.

    Raises
    ------
    ValueError
        If a header timestamp is not accepted by ``datetime.fromisoformat``.

    Notes
    -----
    When the same call id occurs more than once, the later occurrence replaces
    the earlier one in *both* return values (and on disk). Keeping ``meta`` and
    ``calls`` the same length and order matters because callers pair ``meta``
    positionally with data derived from ``calls.values()``.
    """
    if out_dir is None:
        out_dir = CALL_DIR
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 on both sides so reading and writing do not depend on the
    # platform's locale encoding; undecodable bytes are still dropped.
    txt = Path(path).read_text(encoding="utf-8", errors="ignore")

    hits = list(HDR.finditer(txt))
    meta_by_id, calls = {}, {}
    for m, nxt in zip(hits, hits[1:] + [None]):
        body_end = nxt.start() if nxt is not None else len(txt)
        body = txt[m.end():body_end].strip()
        cid = f"{m['year']}-Q{m['qtr']}"
        (out_dir / f"{cid}.txt").write_text(body, encoding="utf-8")
        calls[cid] = body
        # Keyed by call id (not appended) so a repeated header cannot leave
        # `meta` longer than `calls`.
        meta_by_id[cid] = {"call_id": cid,
                           "timestamp": dt.datetime.fromisoformat(m['ts'].strip())}
    return list(meta_by_id.values()), calls
# Run the split on the configured master file and report how many calls it found.
meta, calls = split(SOURCE_FILE)
print("Split →", len(meta), "calls")

# ---------- spaCy clean -------------------------
# Load the small English pipeline with the "ner" and "parser" components disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Flag a few extra words as stop words on top of spaCy's own list.
for extra_stop in ("quarter", "year", "q&a", "operator"):
    nlp.vocab[extra_stop].is_stop = True

def tokens(doc):
    """Return the noun/adjective lemmas of one transcript.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token contributes its lemma when it is purely alphabetic, is
    tagged ``NOUN`` or ``ADJ``, and neither the token nor its lemma is flagged
    as a stop word.

    Parameters
    ----------
    doc : str
        Raw text of one call.

    Returns
    -------
    list[str]
        Lemmas in document order; repeated words are kept.
    """
    vocab = nlp.vocab
    lemmas = []
    for tok in nlp(doc.lower()):
        if not tok.is_alpha or tok.is_stop or tok.pos_ not in {"NOUN", "ADJ"}:
            continue
        lemma = tok.lemma_
        # `tok.is_stop` looks at the surface form only. Stop flags set through
        # `nlp.vocab[word].is_stop = True` are therefore missed for inflected
        # forms ("quarters" -> lemma "quarter"), so test the lemma's own
        # vocabulary entry as well before emitting it.
        if lemma and vocab[lemma].is_stop:
            continue
        lemmas.append(lemma)
    return lemmas

# Tokenise every call body (tqdm shows progress over the calls).
token_lists = [tokens(text) for text in tqdm(calls.values())]

# ---------- min_df / max_df vocab trim ----------
# Document frequency: reducing each call to a set first means a token is
# counted once per call, however often it occurs inside that call.
n_docs = len(token_lists)
df = pd.Series(
    itertools.chain.from_iterable(set(doc) for doc in token_lists)
).value_counts()

# Keep tokens whose document frequency lies inside [MIN_DF, MAX_DF] (bounds
# inclusive), both given as fractions of the number of calls.
keep = df[df.between(MIN_DF * n_docs, MAX_DF * n_docs)].index
token_lists = [[w for w in doc if w in keep] for doc in token_lists]

# gensim vocabulary and bag-of-words corpus built from the trimmed tokens.
dictionary = Dictionary(token_lists)
corpus_bow = [dictionary.doc2bow(doc) for doc in token_lists]
print("Vocabulary size:", len(dictionary))

# ---------- grid-search LDA --------------------
# Train one LDA model per candidate topic count and remember the one with the
# highest UMass coherence; on a tie the earlier (smaller) k is kept.
best_k, best_coh, best_lda = None, -np.inf, None
for k in K_RANGE:
    lda = LdaModel(
        corpus_bow,
        id2word=dictionary,
        num_topics=k,
        passes=10,
        alpha="auto",
        random_state=42,
    )
    scorer = CoherenceModel(model=lda, corpus=corpus_bow, coherence="u_mass")
    coh = scorer.get_coherence()
    print(f"k={k:2d} → UMass {coh:.3f}")
    if not coh > best_coh:
        continue
    best_k, best_coh, best_lda = k, coh, lda

# From here on `lda` refers to the winning model, not the last one trained.
lda = best_lda
print(f"\n✅  Best model: k={best_k}  (UMass={best_coh:.3f})")

# show topic keywords
for t in range(lda.num_topics):
    top_words = [word for word, _ in lda.show_topic(t, topn=TOPN_WORDS)]
    print(f"[T{t:02}] ", " , ".join(top_words))

# ---------- wide matrix per call ---------------
# One row per call: its id and timestamp followed by one "Txx" column per
# topic returned by the model for that call.
rows = []
for meta_row, bow in zip(meta, corpus_bow):
    record = {"call_id": meta_row["call_id"],
              "timestamp": meta_row["timestamp"]}
    for tid, prob in lda.get_document_topics(bow, minimum_probability=0.0):
        record[f"T{tid:02}"] = prob
    rows.append(record)

topic_df = pd.DataFrame(rows).sort_values("timestamp")
topic_df.to_csv(TOPIC_CSV, index=False, date_format="%Y-%m-%d %H:%M:%S")
print("Saved", TOPIC_CSV)

# ---------- long table: top-K topics per call ---
# Keyword string (top 10 words) for every topic, computed once and reused.
kw_cache = {t: ", ".join(w for w, _ in lda.show_topic(t, topn=10))
            for t in range(lda.num_topics)}

detail = []
# Each entry of `meta` is a dict, so it has to be indexed by key. Unpacking it
# as `(cid, ts)` iterates the dict and yields its *keys*, which would store the
# literal strings "call_id" / "timestamp" in every row.
for meta_row, bow in zip(meta, corpus_bow):
    cid, ts = meta_row["call_id"], meta_row["timestamp"]
    # Topics of this call, strongest first, cut to the TOP_K_PER_CALL best.
    ranked = sorted(lda.get_document_topics(bow, 0.0),
                    key=lambda x: x[1], reverse=True)
    for tid, prob in ranked[:TOP_K_PER_CALL]:
        if prob < MIN_PROB_TO_KEEP:
            continue
        detail.append({"call_id": cid, "timestamp": ts,
                       "topic_id": tid, "weight": prob,
                       "keywords": kw_cache[tid]})

# Chronological order; within one call the heaviest topic comes first.
detail_df = pd.DataFrame(detail).sort_values(["timestamp", "weight"],
                                             ascending=[True, False])
detail_df.to_csv(DETAIL_CSV, index=False, date_format="%Y-%m-%d %H:%M:%S")
pd.options.display.max_colwidth = 80
display(detail_df.head(15))

# ---------- quick plot --------------------------
# One line per topic of the selected model, plotted against call timestamp.
fig, ax = plt.subplots(figsize=(12, 4))
for tid in range(best_k):
    column = f"T{tid:02}"
    ax.plot(topic_df["timestamp"], topic_df[column], label=column)
ax.legend(ncol=4, fontsize=8)
ax.grid(alpha=.3)
ax.set_title("Topic weights over time")
fig.tight_layout()
plt.show()


In [None]:
import re, pandas as pd

# Keyword string (top 10 words) for every topic, computed once and reused.
kw_cache = {
    topic: ", ".join(word for word, _ in lda.show_topic(topic, topn=10))
    for topic in range(lda.num_topics)
}

# Build the long table: up to TOP_K_PER_CALL rows per call, strongest topic
# first, leaving out topics below MIN_PROB_TO_KEEP.
records = []
for meta_row, bow in zip(meta, corpus_bow):
    cid, ts = meta_row["call_id"], meta_row["timestamp"]

    # Recover the numeric year and quarter from the call id.
    m = re.match(r"(?P<year>\d{4})-Q(?P<qtr>[1-4])", cid)
    year = int(m["year"])
    qtr = int(m["qtr"])

    ranked = sorted(lda.get_document_topics(bow, 0.0),
                    key=lambda pair: pair[1], reverse=True)
    records.extend(
        {
            "call_id": cid,
            "year": year,
            "quarter": qtr,
            "timestamp": ts,
            "topic_id": tid,
            "weight": prob,
            "keywords": kw_cache[tid],
        }
        for tid, prob in ranked[:TOP_K_PER_CALL]
        if not prob < MIN_PROB_TO_KEEP
    )

# Chronological order; within one call the heaviest topic comes first.
detail_df = pd.DataFrame(records).sort_values(
    ["timestamp", "weight"], ascending=[True, False]
)

# Overwrites the detail CSV with the extended column set.
detail_df.to_csv(DETAIL_CSV, index=False, date_format="%Y-%m-%d %H:%M:%S")

pd.options.display.max_colwidth = 100
display(detail_df.tail(20))
print(f"✓ {DETAIL_CSV} rewritten with call_id, year, quarter, timestamp.")
