<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install the third-party libraries required by this notebook.
#    The leading "!" makes the notebook run the line as a shell command
#    rather than as Python.
#    NOTE(review): the cells visible here import only sentence-transformers,
#    umap-learn, hdbscan, pandas, numpy and scikit-learn; gensim, matplotlib
#    and pyLDAvis may be unused -- confirm before trimming the list.
!pip install gensim sentence-transformers umap-learn hdbscan pandas numpy matplotlib scikit-learn pyLDAvis

In [None]:
from google.colab import files
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import hdbscan
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score, davies_bouldin_score

class AdvancedTopicModel:
    """Cluster texts into topics with sentence embeddings, UMAP and HDBSCAN.

    Pipeline (see :meth:`run`): upload a table of texts, normalise them,
    embed them with a sentence-transformer model, grid-search UMAP/HDBSCAN
    parameters, keep the best-scoring clustering, describe every cluster by
    its most frequent words and export the text/cluster assignment to Excel.

    After :meth:`extract_topics` the following attributes hold the result
    (they are empty / ``None`` when no clustering was accepted):

    * ``topics``          -- ``{cluster_id: {'words': [...], 'sample_texts': [...]}}``
    * ``texts``           -- the texts that ended up in a kept cluster
    * ``clusters``        -- cluster label of every entry in ``texts``
    * ``umap_embeddings`` -- reduced embeddings of every entry in ``texts``
    """

    # Zero-based position of the column that holds the texts in the upload.
    TEXT_COLUMN_INDEX = 1
    # A candidate clustering is ignored when fewer texts than this are
    # assigned to any cluster (HDBSCAN labels noise as -1).
    MIN_CLUSTERED_TEXTS = 10
    # Clusters with fewer members than this are dropped before scoring.
    MIN_CLUSTER_MEMBERS = 5
    TOP_WORDS_PER_TOPIC = 10
    SAMPLE_TEXTS_PER_TOPIC = 3
    # Parameter grid explored by extract_topics.
    N_NEIGHBORS_GRID = (10, 15)
    MIN_CLUSTER_SIZE_GRID = (3, 5, 8)

    def __init__(self):
        """Load the embedding model and create the bag-of-words vectorizer."""
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
        self.vectorizer = CountVectorizer(stop_words=None, max_features=10000)
        self._reset_results()

    def _reset_results(self):
        """Put the result attributes into their empty state."""
        self.clusters = []
        self.texts = []
        self.topics = {}
        self.umap_embeddings = None

    def upload_file(self):
        """Ask the user to upload a table and return its usable texts.

        Only the first uploaded file is read. The texts are taken from the
        column at ``TEXT_COLUMN_INDEX`` and converted to ``str``; entries
        with fewer than two whitespace-separated words are discarded.

        Returns:
            list[str]: the texts that passed the word-count filter.

        Raises:
            ValueError: if nothing was uploaded, the file is neither
                ``.xlsx`` nor ``.csv``, or the table has no text column.
        """
        print("📤 Please upload an Excel or CSV file containing texts:")
        uploaded = files.upload()
        if not uploaded:
            # next(iter({})) would otherwise surface as a bare StopIteration.
            raise ValueError("No file was uploaded")
        file_name = next(iter(uploaded))

        # Compare case-insensitively so that e.g. "DATA.CSV" is accepted too.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.xlsx'):
            df = pd.read_excel(file_name)
        elif lowered_name.endswith('.csv'):
            df = pd.read_csv(file_name)
        else:
            raise ValueError("File format must be .xlsx or .csv")

        if df.shape[1] <= self.TEXT_COLUMN_INDEX:
            raise ValueError(
                f"Expected the texts in column {self.TEXT_COLUMN_INDEX + 1}, "
                f"but the file has only {df.shape[1]} column(s)"
            )

        texts = df.iloc[:, self.TEXT_COLUMN_INDEX].astype(str).tolist()
        return [text for text in texts if len(text.split()) > 1]

    def preprocess(self, texts):
        """Normalise comma-separated texts.

        Line breaks are removed, each text is split on commas, empty items
        are dropped, duplicates are removed (first occurrence wins, order is
        kept) and the remaining items are joined with single spaces.

        Args:
            texts: iterable of ``str``.

        Returns:
            list[str]: one cleaned string per input text.
        """
        cleaned = []
        for text in texts:
            flat = text.replace('\n', ' ').replace('\r', '').strip()
            items = (item.strip() for item in flat.split(','))
            # dict.fromkeys de-duplicates while preserving insertion order.
            unique_items = dict.fromkeys(item for item in items if item)
            cleaned.append(" ".join(unique_items))
        return cleaned

    def _select_candidate(self, texts, umap_embeddings, labels):
        """Filter one clustering and score it.

        Noise points (label -1) and clusters smaller than
        ``MIN_CLUSTER_MEMBERS`` are removed. Returns ``None`` when too few
        texts are clustered or fewer than two clusters survive; otherwise a
        dict with the kept texts/labels/embeddings and the quality metrics.
        """
        labels = np.asarray(labels)
        clustered = labels != -1
        if int(clustered.sum()) < self.MIN_CLUSTERED_TEXTS:
            return None

        cluster_ids, sizes = np.unique(labels[clustered], return_counts=True)
        kept_ids = cluster_ids[sizes >= self.MIN_CLUSTER_MEMBERS]
        # Both metrics need at least two clusters.
        if len(kept_ids) <= 1:
            return None

        keep = np.isin(labels, kept_ids)
        final_clusters = labels[keep]
        final_embeddings = umap_embeddings[keep]
        final_texts = [texts[i] for i in np.flatnonzero(keep)]

        silhouette = silhouette_score(final_embeddings, final_clusters)
        db_index = davies_bouldin_score(final_embeddings, final_clusters)
        return {
            # Higher silhouette is better, lower Davies-Bouldin is better.
            "score": silhouette - db_index,
            "clusters": final_clusters,
            "texts": final_texts,
            "embeddings": final_embeddings,
            "silhouette": silhouette,
            "db_index": db_index,
            "n_clusters": len(kept_ids),
        }

    def _build_topics(self, final_texts, final_clusters):
        """Describe every cluster by its most frequent words and sample texts."""
        word_counts = self.vectorizer.fit_transform(final_texts)
        vocab = self.vectorizer.get_feature_names_out()

        topics = {}
        # np.unique yields the ids in ascending order, so the topic order is
        # deterministic.
        for cluster_id in np.unique(final_clusters):
            member_idx = np.flatnonzero(final_clusters == cluster_id)
            word_freq = np.asarray(word_counts[member_idx].sum(axis=0)).ravel()
            ranked = word_freq.argsort()[::-1][:self.TOP_WORDS_PER_TOPIC]
            topics[int(cluster_id)] = {
                # Skip words that never occur in this cluster; without the
                # check a small cluster would be padded with unrelated words.
                'words': [vocab[i] for i in ranked if word_freq[i] > 0],
                'sample_texts': [
                    final_texts[i]
                    for i in member_idx[:self.SAMPLE_TEXTS_PER_TOPIC]
                ],
            }
        return topics

    def extract_topics(self, texts):
        """Embed, cluster and describe ``texts``; keep the best clustering.

        Every combination of ``N_NEIGHBORS_GRID`` x ``MIN_CLUSTER_SIZE_GRID``
        is tried. Candidates are ranked by ``silhouette - davies_bouldin``
        computed on the UMAP-reduced embeddings. The winner is stored on the
        instance (``topics``, ``texts``, ``clusters``, ``umap_embeddings``);
        those attributes are reset to their empty state at the start of every
        call.

        Args:
            texts: list of ``str`` to cluster.

        Returns:
            dict: ``{cluster_id: {'words': [...], 'sample_texts': [...]}}``,
            or ``{}`` when no candidate clustering was accepted.
        """
        self._reset_results()

        # Every candidate needs at least MIN_CLUSTERED_TEXTS clustered texts,
        # so smaller inputs can never succeed; skip the expensive work.
        if len(texts) < self.MIN_CLUSTERED_TEXTS:
            print("❌ No high-quality clusters were found.")
            return {}

        embeddings = self.model.encode(texts, show_progress_bar=True)

        best = None
        # The score is unbounded below (Davies-Bouldin has no upper limit),
        # so the sentinel must be -inf rather than -1.
        best_score = float("-inf")

        print("\n🔍 Searching for best clustering parameters...")

        for n_neighbors in self.N_NEIGHBORS_GRID:
            # The projection depends only on n_neighbors (the seed is fixed),
            # so it is computed once per outer iteration.
            umap_embeddings = umap.UMAP(
                n_neighbors=n_neighbors,
                n_components=5,
                metric='cosine',
                random_state=42
            ).fit_transform(embeddings)

            for min_cluster_size in self.MIN_CLUSTER_SIZE_GRID:
                clusterer = hdbscan.HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    min_samples=2,
                    metric='euclidean',
                    cluster_selection_method='eom'
                )
                labels = clusterer.fit_predict(umap_embeddings)

                candidate = self._select_candidate(texts, umap_embeddings, labels)
                if candidate is None:
                    continue

                if candidate["score"] > best_score:
                    best_score = candidate["score"]
                    candidate["params"] = (n_neighbors, min_cluster_size)
                    best = candidate

        if best is None:
            print("❌ No high-quality clusters were found.")
            return {}

        # Topic words are only needed for the winning clustering.
        topics = self._build_topics(best["texts"], best["clusters"])

        print("\n✅ Best result found with parameters:")
        print("🔹 n_neighbors =", best["params"][0])
        print("🔹 min_cluster_size =", best["params"][1])
        print("📈 Silhouette Score:", round(best["silhouette"], 3))
        print("📉 Davies-Bouldin Index:", round(best["db_index"], 3))
        print("🧠 Final number of clusters:", best["n_clusters"])

        self.clusters = best["clusters"]
        self.texts = best["texts"]
        self.topics = topics
        self.umap_embeddings = best["embeddings"]

        return topics

    def save_to_excel(self):
        """Write the text/cluster assignment to Excel and offer it for download.

        The sheet has the columns ``Index`` (1-based), ``Text`` and
        ``Cluster``. Nothing is written when there are no clustered texts.
        """
        if len(self.texts) == 0:
            print("⚠️ No clustered texts to save; skipping Excel export.")
            return

        df_out = pd.DataFrame({
            'Index': range(1, len(self.texts) + 1),
            'Text': list(self.texts),
            'Cluster': list(self.clusters),
        })
        file_name = "Optimal_Topic_Clusters.xlsx"
        df_out.to_excel(file_name, index=False)
        print(f"\n📁 Output file saved as '{file_name}'.")
        files.download(file_name)

    def run(self):
        """Run the whole pipeline: upload, preprocess, cluster, export."""
        texts = self.upload_file()
        texts = self.preprocess(texts)

        print("\n🔮 Analyzing topics...")
        topics = self.extract_topics(texts)

        # Without an accepted clustering there is nothing to export.
        if not topics:
            return

        self.save_to_excel()

# Run the full pipeline when this code is executed as the main program.
# The instance is bound to the module-level name `model`; the cells below
# read `model.topics`, so that name must stay as it is.
if __name__ == "__main__":
    model = AdvancedTopicModel()
    model.run()


In [None]:
import pandas as pd

# Export the keyword list of every cluster.
# `model.topics` is expected to map cluster_id -> {'words': [word1, word2, ...]}.

file_name = 'Cluster_Keywords_Output.xlsx'

# One comma-separated keyword string per cluster, in the order of model.topics.
keywords_by_cluster = {
    topic_id: ", ".join(topic_info['words'])
    for topic_id, topic_info in model.topics.items()
}

# Echo each cluster to the console while collecting the rows for the sheet.
data = []
for cluster_id, words in keywords_by_cluster.items():
    print(f"Cluster {cluster_id}: {words}")
    data.append(dict(Cluster=cluster_id, Keywords=words))

# Write the collected rows to the Excel file.
df = pd.DataFrame(data)
df.to_excel(file_name, index=False)

print(f"\n✅ Output file named '{file_name}' was saved successfully.")


In [None]:
# Print the keywords of each cluster, one line per cluster, in the form
# "Cluster <id>: word1, word2, ...". Reads `model.topics`, which maps each
# cluster id to a dict whose 'words' entry is a list of strings.
for cluster_id, topic in model.topics.items():
    words = ", ".join(topic['words'])
    print(f"Cluster {cluster_id}: {words}")
