# **🚀 Notebook to extract text from pdf (if needed), build an LDA model, save it, set up a pyLDAvis visual, and then write a Streamlit file.**


# 🛠 Install and Configure Environment

This section sets up the Python environment by installing specific package versions needed for topic modelling.

**Important:**  
- Some Colab default packages conflict with `pyLDAvis` and `gensim`.
- We first uninstall any conflicting packages, then install the correct versions.

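✅ **The runtime will report that it has crashed after this step. This is expected: the cell deliberately kills the Python process to force a restart, so that the newly installed package versions are the ones that get loaded. Just continue to the next block and run it.**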


In [None]:
# Install required libraries for topic modeling and visualisation:
# - gensim: for topic modelling using LDA (Latent Dirichlet Allocation) to discover hidden topics
# - nltk: for text cleaning and tokenization
# - pyLDAvis: for interactive topic visualisation
# - tqdm: for progress bars during data loading
# - pandas: for building and saving the CSV outputs
# - pymupdf: for extracting text from PDF files (imported as `fitz`)

# Setup environment for Gensim + pyLDAvis compatibility in Colab (encountered issues with this previously)
# The pinned NumPy/SciPy/scikit-learn versions are installed first, before Gensim and pyLDAvis

import sys
import importlib

# ✅ Reset environment, uninstall problematic packages first
!pip uninstall -y numpy scipy scikit-learn gensim pyldavis tensorflow thinc blosc2 imbalanced-learn

# 🧹 Reinstall compatible versions (lock to known working versions)
!pip install numpy==1.25.2 scipy==1.10.1 scikit-learn==1.3.1 --quiet
!pip install gensim==4.3.1 pyLDAvis==3.4.1 tqdm nltk pandas pymupdf --quiet

# 🔁 Restart the runtime automatically: killing the current process (signal 9) forces a fresh
# interpreter, so the versions installed above are the ones imported by later cells.
# The notebook front end reports this as a crash; that is expected.
import os
print("✅ All packages reinstalled successfully. Now restarting the runtime...")
os.kill(os.getpid(), 9)





# 📚 Import Libraries and Configure Logging

We now import all the essential libraries needed for:
- Text preprocessing
- Topic modelling
- Visualisation
- File handling

Logging is also configured to help monitor progress and spot errors.


In [None]:
# Import libraries and set up logging for review

# Libraries:
import os
import logging
from pathlib import Path
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Logging (time that the log was made, log type, and the log message).
# - level=logging.INFO: the root logger defaults to WARNING, which would silently drop
#   every logging.info(...) progress message used throughout this notebook.
# - force=True: basicConfig does nothing if the root logger already has handlers attached
#   (common in hosted notebook runtimes); forcing replaces them so this format takes effect.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

# 🔗 Mount Google Drive

Mount your Google Drive to Colab to access:
- Your input documents (PDFs or extracted `.txt` files)
- Folders where outputs (models, CSVs, visualisations) will be saved, etc.

**Action:**  
- Click "Connect to Google Drive" when prompted.
- Replace "/YOUR/GDRIVE/PATH/HERE" with the GDrive path you want your outputs to be saved at.

In [None]:
# Mount Google Drive so the notebook can read the input text files and save its output files
from google.colab import drive

drive.mount("/content/drive")

# Drive folder holding the .txt documents that later cells load.
# It is created (together with any missing parent folders) if it does not exist yet.
output_folder = Path("/YOUR/GDRIVE/PATH/HERE")
output_folder.mkdir(exist_ok=True, parents=True)

# 📄 (Optional) Extract Text from PDF Files

If you start with PDFs (not `.txt` files), the following cells in this section will:
- Define file paths.
- Extract text from each PDF file.
- Save them as `.txt` files into your output folder.

**Note:**  
- Only run this if you have raw PDF files.
- Otherwise, skip ahead if you already have `.txt` documents.
- Replace "/YOUR/GDRIVE/PATH/HERE" with the GDrive paths leading to where you have your pdf dataset saved, as well as where you want the extracted files to be saved.

In [None]:
# File paths for the optional PDF extraction step.
# Skip this cell if output_folder is already defined AND you are not extracting new text.
# When extracting new text, point pdf_input_folder at the folder that holds your dataset.

# Input: folder containing the source PDFs
# (test first with a small batch <100; larger batches take longer)
pdf_input_folder = Path("/YOUR/GDRIVE/PATH/HERE")

# Output: folder that receives one extracted .txt file per PDF
# (created with any missing parents if it does not exist)
output_folder = Path("/YOUR/GDRIVE/PATH/HERE")
output_folder.mkdir(exist_ok=True, parents=True)

# Record where the extracted texts will be written
logging.info(f"Saving extracted texts to {output_folder}")

In [None]:
# Optional: Extract .txt files from PDF files. I'm doing this step again because I want a larger dataset
# than the 100 I originally extracted.
# NOTE: This code only needs to run once per dataset, to prevent saving duplicate .txt files by accident.

import fitz #for PyMuPDF
from tqdm import tqdm #for progress bar

# Extract all text from a PDF using PyMuPDF.
def extract_text_from_pdf(file_path):
  """
  Return the text of every page of the PDF at `file_path`, joined with newlines.

  Args:
  - file_path (str or Path): location of the PDF to read

  Returns:
  - str: the concatenated page text, or an empty string if the PDF could not be
    opened or read (the error is logged rather than raised, so one bad file does
    not abort a batch)
  """
  try:
    # The context manager closes the document even if a page fails part-way through,
    # so file handles are not leaked when processing hundreds of PDFs.
    with fitz.open(file_path) as doc:
      return "\n".join(page.get_text() for page in doc)
  except Exception as e:
    logging.error(f"Error reading {file_path}: {e}")
    return ""

# Process a capped batch of PDFs (sorted by path so the selection is reproducible).
# The cap keeps test runs fast; raise it to cover a larger dataset.
MAX_PDFS = 500
pdf_files = sorted(pdf_input_folder.glob("*.pdf"))[:MAX_PDFS]

# Save each extracted document as a .txt file (same stem as the PDF) in the output folder
for pdf_file in tqdm(pdf_files, desc="Extracting text"):
  text = extract_text_from_pdf(pdf_file)

  # extract_text_from_pdf returns "" on failure; skip such results (and whitespace-only
  # extractions) so that empty documents do not end up in the corpus later.
  if not text.strip():
    logging.warning(f"No text extracted from {pdf_file.name}; skipping.")
    continue

  output_file = output_folder / f"{pdf_file.stem}.txt"
  output_file.write_text(text, encoding="utf-8")
  logging.info(f"✅ Saved extracted text: {output_file.name}")

# 🧹 Download NLTK Resources

Download NLTK tokenizers and stopword lists.

**Only needs to be done once** per runtime session.


In [None]:
# Fetch the NLTK data packages used by the preprocessing step:
# - punkt / punkt_tab: tokeniser data
# - stopwords: common-word lists filtered out before topic modelling
#
# NOTE: Only needs to run once per runtime session.

for resource in ("punkt", "stopwords", "punkt_tab"):
  nltk.download(resource)

# 📂 Load Text Files into Memory

Load all extracted `.txt` files into Python memory for preprocessing and topic modelling.

Each document becomes a text string ready for cleaning and analysis.


In [None]:
# Load every .txt file in output_folder into memory for analysis.
# `documents` holds the raw text and `filenames` the matching file names, in the same
# (sorted) order, so index i of one corresponds to index i of the other.

documents, filenames = [], []

text_paths = sorted(output_folder.glob("*.txt"))
for path in tqdm(text_paths, desc="Loading text files"):
  documents.append(path.read_text(encoding="utf-8"))
  filenames.append(path.name)

logging.info(f"Loaded {len(documents)} documents for topic modelling.")

# 🧹 Preprocess Text Data for LDA

This section:
- Tokenizes each document (splits into individual words)
- Removes:
  - Stopwords
  - Non-alphabetic tokens
  - Rare or meaningless words (custom stopword list, very necessary)

The result: clean token lists for each document, ready for modelling.


In [None]:
# Process the text (basic preprocessing):
# - Convert to lowercase
# - Tokenise into individual words
# - Remove stopwords and non-alphabetic tokens
# - Remove rare occurrences etc.

# Gensim is used for training; it expects:
# - A dictionary mapping word IDs to words
# - A corpus as a list of Bag-of-Words vectors

import nltk

base_stopwords = set(stopwords.words("english"))

# Custom stopwords to avoid unhelpful topic/KW inclusion.
# All entries must be lowercase: tokens are lowercased before they are compared
# against this set, so an uppercase entry could never match.
custom_stopwords = {
    # Structural / procedural terms
    "section", "article", "clause", "paragraph", "subsection", "chapter", "appendix", "exhibit",

    # Legalese connectors
    "herein", "hereby", "hereof", "hereinafter", "hereunder", "herewith", "hereto",
    "thereof", "therein", "thereby", "thereunder", "therewith", "thereon", "thereafter", "therefrom",
    "whereas", "whereof", "witnesseth", "aforementioned", "aforesaid", "forthwith",

    # Temporal and procedural glue
    "forth", "following", "preceding", "prior", "subsequent", "pursuant", "notwithstanding",

    # Reference/summary phrases
    "included", "includes", "including", "refer", "referred", "reference", "relating", "contained", "contain",
    "provides", "provided", "consist", "comprise", "accordingly", "respectively", "additionally", "meanwhile", "similarly",

    # Auxiliary / modal verbs
    "must", "shall", "may", "might", "should", "could", "can", "cannot", "will", "would",

    # Single-character filler & incomplete (etc) words
    "b", "x", "e", "i", "ii", "iii", "iv", "v", "de", "des", "le", "les", "et", "en", "fi", "d", "l", "la",

    # Misc empty-content terms & pronouns ("i" is already listed above)
    "total", "date", "number", "digits", "digit", "page", "note", "notes", "notation", "noted", "they", "we", "she", "he",
}

# Merge both stopword sets into the master stopword set
all_stopwords = base_stopwords.union(custom_stopwords)

def preprocess(text):
  """
  Turn raw document text into a list of cleaned tokens.

  The text is lowercased and tokenised; a token is kept only if it is purely
  alphabetic and not present in `all_stopwords`.
  """
  kept = []
  for token in word_tokenize(text.lower()):
    if token.isalpha() and token not in all_stopwords:
      kept.append(token)
  return kept

# Apply preprocessing to every loaded document (with a progress bar)
tokenized_docs = list(map(preprocess, tqdm(documents, desc="Tokenising documents")))

# Build an initial dictionary (token <-> ID) and Bag-of-Words corpus from the tokens.
# Note: build_lda_model() creates its own filtered dictionary and corpus, and the
# training cell rebinds `dictionary` and `corpus` to those.
dictionary = corpora.Dictionary(tokenized_docs)
corpus = list(map(dictionary.doc2bow, tokenized_docs))


# 🧠 Train LDA Topic Model

The below sections build and then run a function to train an LDA model to discover hidden topics across the documents.

Key parameters you can adjust:
- `num_topics`: Number of topics to discover
- `passes`: How many times the model scans the corpus
- `no_below` and `no_above`: Controls filtering of rare/frequent words

Training outputs:
- The LDA model
- Dictionary of terms
- Bag-of-Words corpus


In [None]:
# Reusable helper that trains the LDA model

def build_lda_model(tokenized_docs, num_topics=5, no_below=5, no_above=0.5, passes=10, random_state=42):
  """
  Train an LDA topic model on pre-tokenised documents.

  Args:
  - tokenized_docs: list of token lists, one per document
  - num_topics: number of topics to fit
  - no_below, no_above: forwarded to Dictionary.filter_extremes to drop tokens
    that are too rare / too common
  - passes: number of training passes over the corpus
  - random_state: seed forwarded to the model

  Returns (in this order):
  - lda_model: the trained LDA model
  - corpus: the documents in Bag-of-Words format
  - dictionary: Gensim dictionary mapping tokens to IDs (after filtering)
  """
  from gensim import corpora, models

  # Vocabulary from all tokens, then pruned of extreme-frequency tokens
  vocab = corpora.Dictionary(tokenized_docs)
  vocab.filter_extremes(no_below=no_below, no_above=no_above)

  # Bag-of-Words vectors built against the pruned vocabulary
  bow_corpus = [vocab.doc2bow(tokens) for tokens in tokenized_docs]

  # Fit the model
  trained_model = models.LdaModel(
      corpus=bow_corpus,
      id2word=vocab,
      num_topics=num_topics,
      random_state=random_state,
      passes=passes,
  )

  return trained_model, bow_corpus, vocab


In [None]:
# Train the model; this rebinds `corpus` and `dictionary` to the filtered versions
# returned by build_lda_model()
lda_model, corpus, dictionary = build_lda_model(tokenized_docs, num_topics=5)

logging.info("LDA model trained successfully.")

# Show each discovered topic with its top keywords
print(" Discovered Topics & Top KWs: \n")
for topic_id, keyword_summary in lda_model.print_topics():
  print(f"-> Topic {topic_id}: {keyword_summary}")

# 🏷️ Assign Topics to Documents

Each document is now:
- Assigned its most probable topic
- Saved in a CSV (`topic_assignments.csv`) for easy use in dashboards later.

This is the backbone of the Streamlit dashboard search and filter system.

Before running, remember to:

- Replace "/YOUR/GDRIVE/PATH/HERE" with the path at which you want to save your csv.

In [None]:
# Export topic assignments for each document
# Assign each doc its most dominant topic to group similar documents together, etc.

import pandas as pd

doc_topics = []
for filename, bow in zip(filenames, corpus):
  # minimum_probability=0.0 stops gensim from thresholding away low-probability
  # topics, so the list is never empty even when many topics are configured.
  topic_probs = lda_model.get_document_topics(bow, minimum_probability=0.0)

  # Dominant topic = the (topic_id, probability) pair with the highest probability
  dominant_topic, _ = max(topic_probs, key=lambda pair: pair[1])
  doc_topics.append({
      "filename": filename,
      "topic": dominant_topic,
  })

# Build the DataFrame and write the CSV once, AFTER the loop has collected every
# document (doing this inside the loop rewrote the file once per document).
# Explicit columns keep the schema stable even if no documents were loaded.
topic_df = pd.DataFrame(doc_topics, columns=["filename", "topic"])
topic_csv_path = "/YOUR/GDRIVE/PATH/HERE"
topic_df.to_csv(topic_csv_path, index=False)

logging.info(f"Document topic assignments saved to {topic_csv_path}")

# 🖋️ Map Topics to Human-Readable Titles

Instead of displaying vague topic IDs (Topic 0, Topic 1, etc.), the cells in this step map each topic to a **meaningful descriptive label** based on its top keywords, before saving them to an updated CSV.

Outputs:
- `topic_label_map.csv`
- Updated `topic_assignments.csv` with human-friendly names

Note: You will need to run this twice if you are using your own dataset, so you can fetch the top 5 KWs the first time, and then add human-friendly readable topic titles the second time.

Before running, remember to:

- Replace "/YOUR/GDRIVE/PATH/HERE" with the path at which you want to save your topic label map.

In [None]:
# Build a descriptive keyword label for a topic ID

def format_topic_label(topic_id, lda_model, top_n=5):
  """
  Return a label of the form "Topic <id>: kw1, kw2, ..." for one topic.

  Args:
  - topic_id: ID of the topic to describe
  - lda_model: model exposing show_topic(topic_id, topn=...), which yields
    (word, weight) pairs
  - top_n: how many top keywords to include
  """
  top_terms = lda_model.show_topic(topic_id, topn=top_n)
  joined_keywords = ", ".join(term for term, _weight in top_terms)
  return f"Topic {topic_id}: " + joined_keywords

# Build a label map dict: topic ID -> "Topic N: kw1, kw2, ..."

topic_labels = {i: format_topic_label(i, lda_model) for i in range(lda_model.num_topics)}

# Human friendly topic titles, keyed by the exact keyword label string.
# These keys must match the labels generated above; if the model is retrained or the
# data changes, the keywords (and therefore the keys) will need updating before re-running.
topic_title_map = {
    "Topic 0: water, system, program, control, systems": "🌊 Environmental Legislation & Infrastructure",
    "Topic 1: merchant, data, wholesale, goods, humanitarian": "🛒 Merchandise Data & Humanitarian Commerce",
    "Topic 2: school, données, state, students, district": "🏫 School District Analysis & Policy Metrics", #Note: données means data in French
    "Topic 3: state, act, transportation, secretary, states": "📜 State-Level Transport Policy & Administration",
    "Topic 4: amendment, bank, student, new, virus": "🏦 Financial Policy, Education, & Pandemic Response"
}

# Warn about keyword labels that have no human-friendly title yet, so a stale
# topic_title_map is noticed instead of silently producing blank labels.
unmatched_labels = [label for label in topic_labels.values() if label not in topic_title_map]
if unmatched_labels:
  logging.warning(f"No readable title defined for: {unmatched_labels}. Falling back to keyword labels.")

# Label every document: use the human-friendly title where one exists, otherwise fall
# back to the keyword label. Without the fallback, unmatched topics become NaN and are
# dropped from the dashboard's topic selector.
keyword_labels = topic_df["topic"].map(topic_labels)
topic_df["topic_label"] = keyword_labels.map(topic_title_map).fillna(keyword_labels)

# For debugging:

print(topic_df.columns.tolist())

print("Discovered Topics & Top KWs: \n")
for label in topic_labels.values():
  print(f" -> {label}")

# Overwrite the topic assignments CSV so it now includes the topic_label column
topic_df.to_csv(topic_csv_path, index=False)

logging.info(f"New document topic mapping saved to {topic_csv_path}")

In [None]:
# Function to export a topic-to-label CSV for more readable data headings in html

def export_topic_label_map(topic_labels, topic_title_map, save_path):

  """
  Export a CSV mapping of topic IDs, KW labels, and human-readable topic titles.

  The CSV has the columns topic_id, keyword_label and readable_title, one row per
  entry of `topic_labels`. It is written exactly once, after all rows have been
  collected, and the header is written even when `topic_labels` is empty.

  Args:
  - topic_labels (dict): topic_id -> kw string
  - topic_title_map (dict): kw string -> readable title
    (a missing entry yields an empty readable_title)
  - save_path (str or Path): path to save the CSV
  """

  import pandas as pd

  rows = [
      {
          "topic_id": topic_id,
          "keyword_label": keyword_label,
          "readable_title": topic_title_map.get(keyword_label, ""),
      }
      for topic_id, keyword_label in topic_labels.items()
  ]

  # Explicit columns keep the CSV schema stable even when there are no rows
  df = pd.DataFrame(rows, columns=["topic_id", "keyword_label", "readable_title"])
  df.to_csv(save_path, index=False)
  print(f"Topic label mapping exported to {save_path}")


In [None]:
# Write the topic label map (topic ID, keyword label, readable title) to a CSV file

export_topic_label_map(topic_labels, topic_title_map, "/YOUR/GDRIVE/PATH/HERE")

In [None]:
# List every discovered topic as "<readable title> (<keyword label>)".
# Topics without an entry in topic_title_map are shown as "Unnamed Topic",
# which helps when interpreting what each topic represents.

print("Topics and topic titles:\n")
for topic_id, _ in lda_model.print_topics():
    keyword_label = topic_labels[topic_id]
    print(f"-> {topic_title_map.get(keyword_label, 'Unnamed Topic')} ({keyword_label})")

# 📊 Save pyLDAvis Interactive Visualization

Save a fully interactive pyLDAvis HTML visualization showing:
- Each topic's position
- Top keywords
- Topic relevance and relationships

This HTML will later be accessible inside the Streamlit app.

Before running, remember to:

- Replace "/YOUR/GDRIVE/PATH/HERE" with the path at which you want to save your pyLDAvis.html

In [None]:
# Interactive topic visualisation:
# - Topic circles representing clusters
# - Right pane showing the top words per topic

# Build the pyLDAvis data object from the trained model, BoW corpus and dictionary
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

# Save a standalone HTML copy
pyLDAvis.save_html(vis_data, "/YOUR/GDRIVE/PATH/HERE")
logging.info("pyLDAvis HTML is saved.")

# Render the visualisation inline as the cell output
pyLDAvis.display(vis_data)


# 🖥️ Build the Streamlit Dashboard

This final section automatically generates a ready-to-run Streamlit app:
- Topic filtering
- Full-text keyword search
- pyLDAvis visualization
- Human-readable topic names

✅ You can now deploy your LDA Topic Modelling project as an interactive web dashboard!

Before running, remember to:

- Replace "/YOUR/GDRIVE/PATH/HERE" with the path at which you want to save your streamlit .py file.

In [None]:
# Code to create a Streamlit dashboard for topic modelling
# First, define Streamlit dashboard layout and logic.
# The script is kept in a raw string and written to disk unchanged at the end of this cell.
# The generated app expects these files/folders next to the app file:
# topic_assignments.csv, topic_label_map.csv, extracted_texts/ and lda_topic_dashboard.html

streamlit_script = r"""

import pandas as pd
import streamlit as st
import os
import re
import webbrowser

# Set up Streamlit
st.set_page_config(page_title="Topic Modelling Dashboard", page_icon="📊", layout="wide")

# Define base path relative to location of the app folder
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
csv_path = os.path.join(BASE_DIR, "topic_assignments.csv")
label_map_path = os.path.join(BASE_DIR, "topic_label_map.csv")
text_folder = os.path.join(BASE_DIR, "extracted_texts")
pyldavis_path = os.path.join(BASE_DIR, "lda_topic_dashboard.html")

# Data load
df = pd.read_csv(csv_path, encoding="utf-8")  # Force correct UTF-8 decoding for emoji characters

# Load labels
if os.path.exists(label_map_path):
    label_map = pd.read_csv(label_map_path)
else:
    label_map = None
    st.warning(f"Topic label map file not found: {label_map_path}")

# UI heading
st.title("📊 Topic Modelling Dashboard")
st.markdown("Explore topics discovered in synthetic documents.")

# Topic selection
selected_theme = st.selectbox("Select a topic to explore:", sorted(df['topic_label'].dropna().unique()))

# Theme filter
filtered_df = df[df['topic_label'] == selected_theme]

st.subheader(f"Topic: {selected_theme}")

if not filtered_df.empty:
    st.write(f"Showing {len(filtered_df)} documents related to this theme.")
    st.dataframe(filtered_df[['filename', 'topic_label']])
else:
    st.warning("🛑 No documents found for this theme.")

# KW search (an adapted vers of user Kaushik000raj's search code) ---
search_term = st.text_input("🔍 Search for a keyword or phrase inside all document contents (regex supported):")

# Compile the pattern once, up front, so an invalid regex is reported a single time
# instead of raising (and being reported) once per file.
pattern = None
if search_term:
    try:
        pattern = re.compile(search_term, re.IGNORECASE)
    except re.error as e:
        st.error(f"Invalid regular expression: {e}")

if pattern is not None:
    st.markdown(f"Searching for **'{search_term}'**...")
    matches = []

    for root, dirs, files in os.walk(text_folder):
        for fname in files:
            if fname.endswith(".txt"):
                full_path = os.path.join(root, fname)

                try:
                    with open(full_path, "r", encoding="utf-8") as f:
                        content = f.read()
                except (OSError, UnicodeDecodeError) as e:
                    st.error(f"Error reading {fname}: {e}")
                    continue

                match = pattern.search(content)
                if not match:
                    continue

                snippet_start = max(0, match.start() - 100)
                snippet = content[snippet_start:match.end() + 100]
                # A function replacement keeps each hit's own text and avoids
                # backslashes in the matched text being read as escape sequences.
                highlighted = pattern.sub(lambda m: f"**{m.group()}**", snippet)

                match_row = {
                    "filename": fname,  # match using .txt filename
                    "snippet": highlighted
                }

                topic_row = df[df['filename'] == fname]
                if not topic_row.empty:
                    match_row["topic_label"] = topic_row.iloc[0]['topic_label']

                matches.append(match_row)

    # Search result display
    if matches:
        st.success(f"✅ Found {len(matches)} documents containing the term.")
        for row in matches:
            st.markdown(f"📄 **{row['filename']}** — _{row.get('topic_label', 'No label')}_")
            # Snippets are raw document text: render as plain markdown, not as HTML
            st.markdown(row["snippet"])
    else:
        st.warning("🛑 No matches found.")

# Sidebar ref for themes
if label_map is not None:
    st.sidebar.title("📘 Topic Theme Reference")
    for _, row in label_map.iterrows():
        st.sidebar.markdown(f"**Topic {row['topic_id']}**: {row['readable_title']}")

# Button to open pyLDAvis map
if st.button("Open pyLDAvis topic map in browser"):
    if os.path.exists(pyldavis_path):
        webbrowser.open_new_tab(pyldavis_path)
    else:
        st.error("Topic map file not found.")


"""

# Save the Streamlit .py script to GDrive

output_path = "/YOUR/GDRIVE/PATH/HERE"

# Explicit UTF-8: the script contains emoji, which a non-UTF-8 default encoding cannot write
with open(output_path, "w", encoding="utf-8") as f:
  f.write(streamlit_script)

print(f"Streamlit app saved to {output_path}")