In [None]:
pip install bertopic sentence-transformers
pip install umap-learn hdbscan
pip install PyMuPDF  # For loading PDFs

In [None]:
import os
import fitz  # PyMuPDF
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# === 1. Load papers ===
def load_documents_from_folder(folder_path, file_ext=".pdf"):
    """Load the text of every file in a folder that has the given extension.

    The folder is scanned non-recursively. PDF files are read page by page
    with PyMuPDF (``fitz``); text files are read as UTF-8.

    Args:
        folder_path: Directory containing the papers.
        file_ext: Extension to match, either ".pdf" or ".txt". Matching is
            case-insensitive, so "Paper.PDF" is picked up for ".pdf".

    Returns:
        A ``(documents, filenames)`` tuple of two parallel lists: the
        extracted text of each file and the corresponding file name,
        ordered by file name. A file with no extractable text contributes
        an empty string so the two lists stay aligned.

    Raises:
        ValueError: If ``file_ext`` is not one of the supported extensions.
    """
    ext = file_ext.lower()
    if ext not in (".pdf", ".txt"):
        # Previously an unsupported extension silently produced one empty
        # document per matching file; fail loudly instead.
        raise ValueError(
            f"Unsupported file extension {file_ext!r}; expected '.pdf' or '.txt'"
        )

    documents = []
    filenames = []

    # os.listdir returns entries in arbitrary order; sort so the
    # document/filename order is reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(ext):
            continue
        full_path = os.path.join(folder_path, filename)
        # Skip directories (or other non-files) that happen to match the name.
        if not os.path.isfile(full_path):
            continue

        if ext == ".pdf":
            with fitz.open(full_path) as doc:
                text = "".join(page.get_text() for page in doc)
        else:
            with open(full_path, "r", encoding="utf-8") as f:
                text = f.read()

        documents.append(text)
        filenames.append(filename)
    return documents, filenames

# Placeholder: replace with the directory that holds the papers.
# The loader is called with its default extension, so only ".pdf" files are read.
folder_path = "path_to_your_30_papers"
documents, filenames = load_documents_from_folder(folder_path)

# Optional: preprocess the texts here (e.g. remove stopwords); this is not
# strictly needed when embedding with SBERT.
# NOTE(review): each entry in `documents` is the full text of one paper;
# sentence-embedding models usually truncate long inputs, so splitting papers
# into paragraphs before fitting may be worthwhile - confirm against the model card.

# === 2. Initialize BERTopic ===
# The embedding model is passed explicitly; swap the model name below to use
# a different sentence-transformers model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# verbose=True asks BERTopic to report progress while fitting.
topic_model = BERTopic(embedding_model=embedding_model, verbose=True)

# === 3. Fit the model ===
# `topics` and `probs` are presumably aligned with `documents` (and therefore
# with `filenames`) - verify against the BERTopic documentation.
# NOTE(review): with only a few dozen documents the default clustering
# settings may produce very few topics - confirm and tune if needed.
topics, probs = topic_model.fit_transform(documents)

# === 4. View topics ===
# Summary table of the discovered topics.
topic_info = topic_model.get_topic_info()
print(topic_info)

# === 5. Visualize ===
# NOTE(review): this plot may fail when very few topics were found - confirm.
topic_model.visualize_topics().show()
