<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Friedman_Nemenyi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 📦 Installing packages (for Colab)
# The leading `!` hands the line to the shell (IPython/Colab only); `-q` keeps pip quiet.
# sentence-transformers supplies SentenceTransformer (topic embeddings) and
# scikit-posthocs supplies the Nemenyi post-hoc test used after the Friedman test.
!pip install -q sentence-transformers scikit-posthocs

In [None]:
#  📥 Importing libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# 📦 Load the multilingual MiniLM sentence-embedding model.
# The checkpoint id is kept in one named place so it is easy to swap.
EMBEDDING_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# 🧠 Topics produced by each topic-modeling method.
# Every list holds one entry per topic, and each topic is itself a list of
# word strings (embed_topics joins them with spaces into one sentence).
# Paste the topics of each method between the brackets before running.

tf_idf_topics = [
    # ['word_1', 'word_2', ...],
]

bert_topics = [
    # ['word_1', 'word_2', ...],
]

lda_topics = [
    # ['word_1', 'word_2', ...],
]

lda2vec_topics = [
    # ['word_1', 'word_2', ...],
]

n_gram_topics = [
    # ['word_1', 'word_2', ...],
]

word2vec_topics = [
    # ['word_1', 'word_2', ...],
]

# 📌 Display name -> topic list. The two tuples are paired by position, and
# the resulting insertion order is the order in which models are iterated.
models = dict(zip(
    ('TF-IDF', 'BERT', 'LDA', 'LDA2Vec', 'n-Gram', 'Word2Vec'),
    (tf_idf_topics, bert_topics, lda_topics,
     lda2vec_topics, n_gram_topics, word2vec_topics),
))

# ✨ Function to convert each topic to a sentence and extract embeddings
def embed_topics(topics):
    """Turn each topic into one sentence and encode the sentences.

    Args:
        topics: Iterable of topics. A topic is normally a sequence of word
            strings, which are joined with single spaces. A topic that is
            already a single string is used unchanged.

    Returns:
        The result of ``model.encode`` for the list of topic sentences.
        Callers treat entry ``i`` as the embedding of topic ``i``.
    """
    # A bare string is itself iterable, so ' '.join() would silently turn
    # "market" into "m a r k e t"; treat it as an already-built sentence.
    topic_sentences = [
        topic if isinstance(topic, str) else ' '.join(topic)
        for topic in topics
    ]
    return model.encode(topic_sentences)

# ✅ Create embeddings for all models
# A model without topics yields no similarities to take a maximum over, so it
# cannot take part in the comparison; leave it out and say so.
skipped_models = [name for name, topics in models.items() if not topics]
if skipped_models:
    print(f"⚠️ Skipping models with no topics: {', '.join(skipped_models)}")
embeddings = {
    name: embed_topics(topics) for name, topics in models.items() if topics
}

# 🧮 Calculate maximum semantic similarity between models
# One row per topic, one column per model: each cell is the highest cosine
# similarity between the row's topic and any topic of the column's model.
# NOTE(review): the cell for a topic's own model is fixed at 1.0, and these
# constant cells also enter the Friedman ranking below — confirm this is the
# intended design, especially when models have different numbers of topics.
raw_scores = []
labels = []

for name_a, emb_a in embeddings.items():
    # Compare all topics of model A with all topics of model B in a single
    # call (instead of one call per topic) and keep the best match per row.
    best_match = {
        name_b: (
            np.ones(len(emb_a))
            if name_b == name_a
            else cosine_similarity(emb_a, emb_b).max(axis=1)
        )
        for name_b, emb_b in embeddings.items()
    }
    # Transpose the per-model columns back into one dict per topic.
    for topic_no, row_values in enumerate(zip(*best_match.values()), start=1):
        labels.append(f"{name_a}_T{topic_no}")
        raw_scores.append(dict(zip(best_match, map(float, row_values))))

# 📊 Convert results to a DataFrame
scores_df = pd.DataFrame(raw_scores, index=labels)
print("📊 Semantic similarity scores between topics (across different models):")
print(scores_df)

# 🧪 Friedman test (if number of models ≥ 3)
# Columns (models) are the compared treatments; rows (topics) are the blocks.
# NOTE(review): SciPy documents the p-value as reliable only for more than
# 10 rows and more than 6 columns — read small tables with caution.
ALPHA = 0.05  # significance level applied to the Friedman p-value

if scores_df.shape[1] >= 3:
    stat, p = friedmanchisquare(*(scores_df[col] for col in scores_df.columns))
    # .3g keeps very small p-values visible instead of rounding them to 0.000.
    print(f"\n🧪 Friedman Test → χ² = {stat:.3f}, P-value = {p:.3g}")

    if np.isnan(p):
        # NaN compares False against any threshold, which would otherwise be
        # reported as "no significant difference"; call it out explicitly.
        print("⚠️ Friedman test returned NaN; the result is inconclusive.")
    elif p < ALPHA:
        print("✅ There is a significant difference between the models. Running Nemenyi test...")
        # The post-hoc test receives the bare score matrix, so the pairwise
        # p-value table is relabelled with the model names on both axes.
        nemenyi = sp.posthoc_nemenyi_friedman(scores_df.to_numpy())
        nemenyi.columns = scores_df.columns
        nemenyi.index = scores_df.columns
        print("\n📈 Nemenyi Test Results:")
        print(nemenyi)
    else:
        print("❌ No significant difference found between the models.")
else:
    print("⚠️ At least 3 models are required to run the Friedman test.")