In [None]:
# 필요한 모듈 import
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import gensim
from gensim import models, corpora, matutils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from gensim.models.coherencemodel import CoherenceModel 
import seaborn as sns
import matplotlib.pyplot as plt
import openai
import os
import re
import json
import time
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from scipy.stats import pearsonr, spearmanr, f_oneway, kruskal
from tenacity import retry, stop_after_attempt, wait_random_exponential
from bertopic import BERTopic
from transformers import BertTokenizer, BertModel 
from math import log
from itertools import combinations


In [None]:
# 데이터셋 선정 함수

def load_data(file_path, sample_size=80):
    """Load documents from a CSV file and return them as a list of strings.

    The file is read with ``header=None`` and a single column name ``text``;
    every value of that column is converted with ``str``.

    Args:
        file_path: Path of the CSV file to read.
        sample_size: Maximum number of documents to keep.  When the file holds
            more rows, a random subset of exactly this size is drawn with the
            fixed seed 42.

    Returns:
        list[str]: The (possibly down-sampled) documents.
    """
    frame = pd.read_csv(file_path, header=None, names=['text'])
    documents = frame['text'].astype(str)

    # Only sample when the file is larger than the requested size.
    needs_sampling = len(documents) > sample_size
    if needs_sampling:
        documents = documents.sample(n=sample_size, random_state=42)

    print(f"Loaded {len(documents)} texts from {file_path}")
    return documents.tolist()


# Load every dataset once at import time.
# Layout: datasets[domain][dataset_name] -> list of document strings
# (each list is produced by load_data with its default sample_size).
datasets = {
    'academy': {
        'business': load_data('data/academy/business.csv'),
        'ACL': load_data('data/academy/ACL.csv'),
        'covid': load_data('data/academy/covid.csv')
    },
    'media': {
        'clothing_review': load_data('data/media/clothing_review.csv'),
        'vaccine_tweets': load_data('data/media/vaccine_tweets.csv'),
        'reddit_comments': load_data('data/media/reddit_comments.csv')
    },
    'news': {
        'newsgroups': load_data('data/news/20newsgroups.csv'),
        'agnews': load_data('data/news/agnews.csv'),
        'Huffpost': load_data('data/news/Huffpost.csv')
    }
}


# VAE 모델 정의
class VAE(nn.Module):
    """Variational auto-encoder with one hidden layer on each side.

    The encoder maps ``input_dim -> hidden_dim`` and then produces two
    ``latent_dim``-sized heads (mean and log-variance).  The decoder maps
    ``latent_dim -> hidden_dim -> input_dim`` and ends with a sigmoid, so the
    reconstruction lies in ``[0, 1]``.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()

        # Encoder layers: shared hidden layer followed by the mu / logvar heads.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc21 = nn.Linear(hidden_dim, latent_dim)
        self.fc22 = nn.Linear(hidden_dim, latent_dim)

        # Decoder layers.
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` for the input batch ``x``."""
        hidden = torch.relu(self.fc1(x))
        mu = self.fc21(hidden)
        logvar = self.fc22(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z):
        """Map latent vectors ``z`` back to input space (sigmoid output)."""
        hidden = torch.relu(self.fc3(z))
        logits = self.fc4(hidden)
        return torch.sigmoid(logits)

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input batch ``x``."""
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar


def vae_loss(recon_x, x, mu, logvar):
    """Return the summed VAE objective: reconstruction BCE plus KL divergence.

    Args:
        recon_x: Reconstruction produced by the decoder (values in [0, 1]).
        x: Target batch, same shape as ``recon_x``.
        mu: Latent means.
        logvar: Latent log-variances.

    Returns:
        Scalar tensor ``BCE + KLD``; both terms are summed (not averaged)
        over all elements.
    """
    reconstruction = F.binary_cross_entropy(recon_x, x, reduction='sum')
    # Closed-form KL term computed from mu and logvar.
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * torch.sum(kl_terms)
    return reconstruction + kl_divergence


def _vae_input_batch(doc_term_matrix, start, batch_size, device):
    """Return rows ``start:start+batch_size`` as a dense float tensor scaled by the batch maximum."""
    rows = doc_term_matrix[start:start + batch_size].toarray()
    batch = torch.FloatTensor(rows).to(device)
    peak = batch.max()
    # A batch whose documents contain no vocabulary term is all zeros;
    # dividing by its maximum would fill the batch with NaN.
    if peak > 0:
        batch = batch / peak
    return batch


def perform_topic_modeling(data, num_topics, model_type):
    """Fit a topic model of the requested type on ``data``.

    Args:
        data: Iterable of documents.  Entries that are neither strings nor
            non-null values are dropped; the rest are converted with ``str``.
        num_topics: Requested number of topics.  Reduced to the number of
            documents when it exceeds it.
        model_type: One of ``'LDA'``, ``'BERTopic'`` or ``'VAE'``.

    Returns:
        * ``'LDA'``: ``(lda_model, vectorizer)``
        * ``'BERTopic'``: ``(bertopic_model, None)``
        * ``'VAE'``: ``(vae_model, vectorizer, topics)`` where ``topics`` is a
          list with one entry per K-Means cluster of the latent vectors, each
          holding up to 10 of the most frequent whitespace-separated tokens of
          the documents assigned to that cluster.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values
            (previously the function silently returned ``None``).
    """
    if model_type not in ('LDA', 'BERTopic', 'VAE'):
        raise ValueError(f"Invalid model type: {model_type!r}")

    # Ensure all data is of type string
    data = [str(doc) for doc in data if isinstance(doc, str) or pd.notna(doc)]

    # A model cannot have more topics than there are documents.
    if num_topics > len(data):
        print(f"Adjusting num_topics from {num_topics} to {len(data)}")
        num_topics = len(data)

    # Bag-of-words representation shared by the LDA and VAE branches.
    vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(data)

    if model_type == 'LDA':
        # gensim expects a corpus object plus an id -> word mapping.
        corpus = matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)
        id2word = {index: term for term, index in vectorizer.vocabulary_.items()}

        lda_model = models.LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            workers=2,  # Number of cores to use
            passes=10,
            random_state=42
        )
        return lda_model, vectorizer

    if model_type == 'BERTopic':
        bertopic_model = BERTopic(nr_topics=num_topics)
        bertopic_model.fit_transform(pd.Series(data))
        return bertopic_model, None  # BERTopic uses its own vectorizer

    # --- VAE ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_dim = doc_term_matrix.shape[1]
    hidden_dim = 256
    latent_dim = num_topics

    vae_model = VAE(input_dim, hidden_dim, latent_dim).to(device)
    optimizer = torch.optim.Adam(vae_model.parameters(), lr=1e-3)

    num_epochs = 2  # 10
    batch_size = 128
    num_docs = doc_term_matrix.shape[0]

    for _ in range(num_epochs):
        for start in range(0, num_docs, batch_size):
            batch = _vae_input_batch(doc_term_matrix, start, batch_size, device)

            optimizer.zero_grad()
            recon_batch, mu, logvar = vae_model(batch)
            loss = vae_loss(recon_batch, batch, mu, logvar)
            loss.backward()
            optimizer.step()

    # Encode every document.  The inputs go through the same scaling as during
    # training, and the posterior mean is used (rather than a random sample)
    # so the cluster assignment does not depend on sampling noise.
    latent_vectors = []
    with torch.no_grad():
        for start in range(0, num_docs, batch_size):
            batch = _vae_input_batch(doc_term_matrix, start, batch_size, device)
            mu, _ = vae_model.encode(batch)
            latent_vectors.append(mu.cpu().numpy())

    latent_vectors = np.vstack(latent_vectors)
    kmeans = KMeans(n_clusters=num_topics, random_state=42).fit(latent_vectors)

    # Pool the raw tokens of all documents that share a cluster.
    topics = [[] for _ in range(num_topics)]
    for doc, label in zip(data, kmeans.labels_):
        topics[label].extend(doc.split())

    # Keep the 10 most frequent tokens of each cluster as its topic words.
    topics = [list(pd.Series(words).value_counts().index[:10]) for words in topics]

    return vae_model, vectorizer, topics


# 평가 지표 계산 함수들 
def calculate_npmi(topic_words_with_weights, texts, top_n=10):
    """Return the mean pairwise NPMI of a topic's top words over ``texts``.

    Probabilities are document-level: ``p(w)`` is the fraction of documents
    containing ``w`` and ``p(w1, w2)`` the fraction containing both.  (The
    earlier version mixed total term counts for the marginals with document
    counts for the joint, which could push a "probability" above 1.)

    Args:
        topic_words_with_weights: Sequence of ``(word, weight)`` pairs; only
            the first ``top_n`` pairs whose word is a string are used.
        texts: Collection of document strings.
        top_n: Number of leading topic words to consider.

    Returns:
        Mean NPMI over all word pairs that co-occur in at least one document,
        or ``0`` when there are no topic words or no co-occurring pair.
    """
    # Keep string words only and drop duplicates (order preserved);
    # CountVectorizer rejects a vocabulary with repeated terms.
    topic_words = list(dict.fromkeys(
        word for word, _ in topic_words_with_weights[:top_n] if isinstance(word, str)
    ))

    if not topic_words:
        return 0  # No topic words

    # binary=True records presence/absence per document instead of counts.
    vectorizer = CountVectorizer(vocabulary=topic_words, binary=True)
    presence = vectorizer.fit_transform(texts)

    doc_count = len(texts)
    doc_freq = np.asarray(presence.sum(axis=0)).ravel()
    # Entry (i, j) is the number of documents containing both word i and word j.
    co_counts = (presence.T @ presence).toarray()

    npmi_scores = []
    for i, j in combinations(range(len(topic_words)), 2):
        co_doc_count = co_counts[i, j]
        if co_doc_count == 0:
            continue  # Pairs that never co-occur are skipped.

        if co_doc_count == doc_count:
            # Both words occur in every document: -log p(w1, w2) is 0, so the
            # ratio is undefined; use the NPMI value for full co-occurrence.
            npmi_scores.append(1.0)
            continue

        p_w1 = doc_freq[i] / doc_count
        p_w2 = doc_freq[j] / doc_count
        p_w1_w2 = co_doc_count / doc_count

        pmi = log(p_w1_w2 / (p_w1 * p_w2) + 1e-12)
        npmi_scores.append(pmi / (-log(p_w1_w2 + 1e-12)))

    if npmi_scores:
        return np.mean(npmi_scores)
    return 0  # No pair could be scored

def calculate_cv(topic_words_with_weights, texts, top_n=10):
    """Return a coherence score built from pairwise NPMI of the topic words.

    The vocabulary is learned from ``texts`` with a default ``CountVectorizer``;
    topic words that are not in that vocabulary are ignored.  Probabilities are
    document-level (fraction of documents containing the word / both words).

    NOTE(review): as implemented this is the mean pairwise NPMI, not the
    sliding-window C_V measure the name suggests — confirm that is intended.

    Args:
        topic_words_with_weights: Sequence of ``(word, weight)`` pairs; only
            the first ``top_n`` pairs whose word is a string are used.
        texts: Collection of document strings.
        top_n: Number of leading topic words to consider.

    Returns:
        Mean NPMI over all in-vocabulary word pairs that co-occur in at least
        one document, or ``0`` when no pair can be scored.
    """
    topic_words = list(dict.fromkeys(
        word for word, _ in topic_words_with_weights[:top_n] if isinstance(word, str)
    ))

    if not topic_words:
        return 0  # No topic words

    # Presence/absence matrix over the whole corpus vocabulary (kept sparse).
    vectorizer = CountVectorizer(binary=True)
    presence = vectorizer.fit_transform(texts)
    word2id = vectorizer.vocabulary_

    # Restrict to the columns of topic words known to the vocabulary.
    column_ids = [word2id[word] for word in topic_words if word in word2id]
    if len(column_ids) < 2:
        return 0

    total_documents = len(texts)
    topic_presence = presence[:, column_ids]
    doc_freq = np.asarray(topic_presence.sum(axis=0)).ravel()
    # Entry (i, j) is the number of documents containing both words.
    co_counts = (topic_presence.T @ topic_presence).toarray()

    cv_scores = []
    for i, j in combinations(range(len(column_ids)), 2):
        co_occur_count = co_counts[i, j]
        if co_occur_count == 0:
            continue  # Pairs that never co-occur are skipped.

        if co_occur_count == total_documents:
            # Both words occur in every document: the normaliser is 0, so use
            # the NPMI value for full co-occurrence directly.
            cv_scores.append(1.0)
            continue

        p_w1 = doc_freq[i] / total_documents
        p_w2 = doc_freq[j] / total_documents
        p_w1_w2 = co_occur_count / total_documents

        pmi = log(p_w1_w2 / (p_w1 * p_w2) + 1e-12)
        cv_scores.append(pmi / (-log(p_w1_w2 + 1e-12)))

    if cv_scores:
        return np.mean(cv_scores)
    return 0  # No pair could be scored

def calculate_coherence(model, data, tokenizer, bert_model, model_type, vectorizer=None, num_topics=10, topics=None):
    """Return the mean embedding-based coherence over all topics of a model.

    For each topic, every topic word is embedded as the first-token vector of
    the sentence ``"The topic is about <word>"``; the topic's coherence is the
    mean cosine similarity over all word pairs, and the result is the mean over
    topics.

    Args:
        model: Fitted topic model (used for ``'LDA'`` and ``'BERTopic'``).
        data: Unused; kept for interface compatibility.
        tokenizer: Callable returning a batch object with a ``.to(device)``
            method that can be unpacked into ``bert_model``.
        bert_model: Callable returning an object with ``last_hidden_state``.
        model_type: ``'LDA'``, ``'BERTopic'`` or ``'VAE'``.
        vectorizer: Unused; kept for interface compatibility.
        num_topics: Number of topics to read from the model.
        topics: Pre-extracted topic word lists; required for ``'VAE'``.

    Returns:
        Mean coherence over topics, or ``0`` when there are no topics.
        Topics with fewer than two words contribute ``0``.

    Raises:
        ValueError: For an unknown ``model_type`` or a ``'VAE'`` call without
            ``topics``.
    """
    coherence_scores = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if model_type == 'LDA':
        # The fitted model may hold fewer topics than requested (the topic
        # count is reduced when it exceeds the number of documents).
        available = min(num_topics, getattr(model, 'num_topics', num_topics))
        topics = [
            [word for word, _ in model.show_topic(i, topn=10)]
            for i in range(available)
        ]

    elif model_type == 'BERTopic':
        topics = []
        for i in range(num_topics):
            topic_info = model.get_topic(i)
            if topic_info:  # skip topic ids the model does not have
                topics.append([word for word, _ in topic_info])

    elif model_type == 'VAE':
        # VAE topics are extracted at fit time and must be passed in.
        if topics is None:
            raise ValueError("topics must be provided for model type 'VAE'")
    else:
        raise ValueError("Invalid model type")

    for topic_words in topics:
        # Unordered index pairs of the topic words.
        pairs = list(combinations(range(len(topic_words)), 2))

        # Check before tokenising: a topic with no words would otherwise hand
        # an empty batch to the tokenizer.
        if not pairs:
            coherence_scores.append(0)
            continue

        # Wrap each word in a short sentence so the encoder sees some context.
        sentences = ["The topic is about " + word for word in topic_words]
        inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            outputs = bert_model(**inputs)

        # First-token vector of each sentence: [batch_size, hidden_size]
        sentence_embeddings = outputs.last_hidden_state[:, 0, :]

        embeddings1 = sentence_embeddings[[i for i, _ in pairs]]
        embeddings2 = sentence_embeddings[[j for _, j in pairs]]
        cosine_similarities = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

        coherence_scores.append(cosine_similarities.mean().item())

    if coherence_scores:
        return np.mean(coherence_scores)
    return 0

def calculate_evaluation_metrics(model, data, model_type, vectorizer, num_topics, topics=None):
    """Compute ``(coherence, npmi, cv)`` for a fitted topic model.

    Coherence is computed over all topics via ``calculate_coherence``; NPMI and
    C_V are computed for the first topic only (topic 0).

    Note: a fresh ``bert-base-uncased`` tokenizer and model are loaded on every
    call.

    Args:
        model: Fitted topic model.
        data: Collection of document strings.
        model_type: ``'LDA'``, ``'BERTopic'`` or ``'VAE'``.
        vectorizer: Forwarded to ``calculate_coherence``.
        num_topics: Number of topics of the model.
        topics: Pre-extracted topic word lists (used for ``'VAE'``).

    Returns:
        Tuple ``(coherence, npmi, cv)``.
    """
    # Load the BERT encoder used for the embedding-based coherence.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_model = bert_model.to(device)

    coherence = calculate_coherence(model, data, tokenizer, bert_model, model_type, vectorizer, num_topics, topics)

    # Words (with weights) of the first topic, used for NPMI and C_V.
    if model_type == 'LDA':
        topic_words_with_weights = model.show_topic(0, topn=10)
    elif model_type == 'BERTopic':
        # get_topic may return a falsy value when topic 0 does not exist;
        # fall back to an empty list so the metrics return 0 instead of
        # failing on slicing.
        topic_words_with_weights = model.get_topic(0) or []
    elif model_type == 'VAE':
        # VAE topic words carry no weights; use a constant placeholder.
        first_topic = topics[0] if topics else []
        topic_words_with_weights = [(word, 1.0) for word in first_topic]
    else:
        topic_words_with_weights = []

    npmi = calculate_npmi(topic_words_with_weights, data)
    cv = calculate_cv(topic_words_with_weights, data)

    return coherence, npmi, cv

# Load the BERT tokenizer/model once and move the model to the GPU if available
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)

# Accumulates one dict of metric values per (domain, dataset, model, num_topics) run
metrics_list = []

# Maps a run key to the wall-clock time of each step
computation_times = {}

# Model types to evaluate
model_types = ['LDA', 'BERTopic', 'VAE']

# Topic counts to evaluate (fixed to 2 and 4)
num_topics_list = [2, 4]

# NOTE(review): duplicate of the initialisation above; harmless because
# nothing has been stored yet.
computation_times = {}

def get_top_words(model, vectorizer, num_words=5, num_topics=2):
    """Return the top words of the first ``num_topics`` topics of ``model``.

    Args:
        model: A fitted ``LdaMulticore``, ``BERTopic`` or ``VAE`` instance.
        vectorizer: Vectorizer whose feature names label the VAE's input
            dimensions; only used for ``VAE``.
        num_words: Number of words to return per topic.
        num_topics: Number of topics to describe (default 2, the value that
            was previously hard-coded).

    Returns:
        list[dict]: One ``{word: score}`` dict per topic, best word first.
        For a VAE there is one "topic" per latent dimension and the score is
        the decoder output for that dimension's unit vector.

    Raises:
        TypeError: If ``model`` is none of the supported types.
    """
    if isinstance(model, models.LdaMulticore):
        count = min(num_topics, model.num_topics)
        return [dict(model.show_topic(topic_id, topn=num_words)) for topic_id in range(count)]

    if isinstance(model, BERTopic):
        # get_topic may return a falsy value for a topic id the model does
        # not have; treat that as a topic without words.
        return [dict((model.get_topic(i) or [])[:num_words]) for i in range(num_topics)]

    if isinstance(model, VAE):
        feature_names = vectorizer.get_feature_names_out()
        latent_dim = model.fc3.in_features
        count = min(num_topics, latent_dim)

        # fc4.weight has shape (vocabulary, hidden), so its rows are words,
        # not topics.  To score words per latent dimension, decode the unit
        # vector of each dimension: row i then holds one score per word.
        with torch.no_grad():
            unit_vectors = torch.eye(latent_dim, device=model.fc3.weight.device)
            word_scores = model.decode(unit_vectors).cpu().numpy()

        top_words = []
        for i in range(count):
            top_indices = word_scores[i].argsort()[-num_words:][::-1]
            top_words.append({feature_names[j]: word_scores[i][j] for j in top_indices})
        return top_words

    raise TypeError(f"Unsupported model type: {type(model).__name__}")

def process_datasets(datasets):
    """Fit every model type on every dataset and tabulate the top topic words.

    Args:
        datasets: Mapping ``{category: {dataset_name: documents}}``.

    Returns:
        pandas.DataFrame with one row per (category, dataset, model, topic)
        and the columns ``Category``, ``Dataset``, ``Model``, ``Topic`` and
        ``Words`` (comma-separated top words).  Combinations that raise are
        reported on stdout and left out of the table.
    """
    results = []
    for category, category_datasets in datasets.items():
        for dataset_name, data in category_datasets.items():
            print(f"Processing {category} - {dataset_name}")
            for model_type in ['LDA', 'BERTopic', 'VAE']:
                try:
                    # perform_topic_modeling returns a third element (the
                    # topic word lists) for 'VAE'; unpacking into exactly two
                    # names made every VAE run fail, so absorb any extras.
                    model, vectorizer, *_ = perform_topic_modeling(data, num_topics=2, model_type=model_type)
                    top_words = get_top_words(model, vectorizer)
                    for topic_id, words in enumerate(top_words):
                        results.append({
                            'Category': category,
                            'Dataset': dataset_name,
                            'Model': model_type,
                            'Topic': f'Topic {topic_id + 1}',
                            'Words': ', '.join(list(words.keys()))
                        })
                except Exception as e:
                    # Best effort: one failing combination must not abort the run.
                    print(f"Error processing {category} - {dataset_name} - {model_type}: {str(e)}")
    return pd.DataFrame(results)

if __name__ == '__main__':
    # Fit every (dataset, model type, topic count) combination, score it with
    # the three metrics and record how long each step takes.
    for domain, domain_datasets in datasets.items():
        for dataset_name, data in domain_datasets.items():
            print(f"\nProcessing {domain} - {dataset_name}")
            for model_type in model_types:
                for num_topics in num_topics_list:
                    print(f"\nModel: {model_type}, Num Topics: {num_topics}")
                    try:
                        start_time = time.time()
                        if model_type == 'VAE':
                            model, vectorizer, topics = perform_topic_modeling(data, num_topics, model_type)
                        else:
                            model, vectorizer = perform_topic_modeling(data, num_topics, model_type)
                            topics = None  # only the VAE branch returns topic words
                        topic_modeling_time = time.time() - start_time

                        start_time = time.time()
                        coherence = calculate_coherence(model, data, tokenizer, bert_model, model_type, vectorizer, num_topics, topics)
                        coherence_time = time.time() - start_time

                        # Words (with weights) of the first topic, used for NPMI and C_V
                        if model_type == 'LDA':
                            topic_words_with_weights = model.show_topic(0, topn=10)
                        elif model_type == 'BERTopic':
                            topic_words_with_weights = model.get_topic(0)
                        elif model_type == 'VAE':
                            # VAE topic words carry no weights; use a constant placeholder
                            topic_words_with_weights = [(word, 1.0) for word in topics[0]]  # first topic only
                        else:
                            topic_words_with_weights = []

                        start_time = time.time()
                        npmi = calculate_npmi(topic_words_with_weights, data)
                        npmi_time = time.time() - start_time

                        start_time = time.time()
                        cv = calculate_cv(topic_words_with_weights, data)
                        cv_time = time.time() - start_time

                        # Store the metric values of this run
                        metrics_list.append({
                            'Domain': domain,
                            'Dataset': dataset_name,
                            'Model': model_type,
                            'Num_Topics': num_topics,
                            'Coherence': coherence,
                            'NPMI': npmi,
                            'C_V': cv
                        })

                        # Key format: <domain>_<dataset>_<model>_<num_topics>
                        computation_times[f"{domain}_{dataset_name}_{model_type}_{num_topics}"] = {
                            'Topic Modeling': topic_modeling_time,
                            'Coherence': coherence_time,
                            'NPMI': npmi_time,
                            'C_V': cv_time
                        }

                        print(f"Coherence: {coherence:.4f}")
                        print(f"NPMI: {npmi:.4f}")
                        print(f"C_V: {cv:.4f}")

                    except Exception as e:
                        # Best effort: report the failing combination and move on
                        print(f"Error processing {domain} - {dataset_name} - {model_type} - {num_topics}: {str(e)}")
                        continue

    # Build metrics_df from the collected runs
    metrics_df = pd.DataFrame(metrics_list)

    print("\n--- Existing metrics calculation completed ---\n")

    # Second pass: refit the models and collect the top words per topic
    print("\n--- Starting new topic modeling for top words ---\n")
    results_df = process_datasets(datasets)

    # Print the top words grouped by category and dataset
    for category in results_df['Category'].unique():
        print(f"\n{category.upper()}")
        for dataset in results_df[results_df['Category'] == category]['Dataset'].unique():
            print(f"\n{dataset}")
            subset = results_df[(results_df['Category'] == category) & (results_df['Dataset'] == dataset)]
            print(subset[['Model', 'Topic', 'Words']].to_string(index=False))

    # Save the top-word table as a CSV file (optional)
    results_df.to_csv('topic_modeling_top_words.csv', index=False)

    print("\n--- New topic modeling completed ---")

In [None]:
from collections import Counter
import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus that the next line reads.
nltk.download('stopwords')
# English stop words; analyze_llm_feedback uses this set to filter feedback tokens.
stop_words = set(stopwords.words('english'))

# 상관관계 분석 함수
def correlation_analysis(metrics_df):
    """Print pairwise metric correlations for each (model, topic count) group.

    For every combination of ``Model`` and ``Num_Topics`` that has at least two
    rows, the Pearson and Spearman coefficients are printed for each pair of
    the ``Coherence``, ``NPMI`` and ``C_V`` columns.

    Args:
        metrics_df: DataFrame with the columns ``Model``, ``Num_Topics``,
            ``Coherence``, ``NPMI`` and ``C_V``.
    """
    from scipy.stats import pearsonr, spearmanr

    metric_names = ['Coherence', 'NPMI', 'C_V']

    for model in metrics_df['Model'].unique():
        for num_topics in metrics_df['Num_Topics'].unique():
            in_group = (metrics_df['Model'] == model) & (metrics_df['Num_Topics'] == num_topics)
            subset = metrics_df[in_group]
            # A correlation needs at least two observations.
            if len(subset) < 2:
                continue
            print(f"\n상관관계 분석 - 모델: {model}, 토픽 수: {num_topics}")
            for position, metric1 in enumerate(metric_names):
                for metric2 in metric_names[position + 1:]:
                    first, second = subset[metric1], subset[metric2]
                    pearson_corr = pearsonr(first, second)[0]
                    spearman_corr = spearmanr(first, second)[0]
                    print(f"{metric1} vs {metric2} - Pearson: {pearson_corr:.4f}, Spearman: {spearman_corr:.4f}")

# 일관성 분석 함수
def consistency_analysis(metrics_df):
    """Print the coefficient of variation (std / mean) of each metric.

    The coefficient is printed first over the whole table and then for every
    (``Model``, ``Num_Topics``) group that has at least two rows.

    Args:
        metrics_df: DataFrame with the columns ``Model``, ``Num_Topics``,
            ``Coherence``, ``NPMI`` and ``C_V``.
    """
    metric_names = ['Coherence', 'NPMI', 'C_V']

    # Overall coefficient of variation per metric.
    for metric in metric_names:
        column = metrics_df[metric]
        cv = column.std() / column.mean()
        print(f"{metric}의 변동 계수 (전체): {cv:.4f}")

    # Coefficient of variation per (model, topic count) group.
    for model in metrics_df['Model'].unique():
        for num_topics in metrics_df['Num_Topics'].unique():
            in_group = (metrics_df['Model'] == model) & (metrics_df['Num_Topics'] == num_topics)
            subset = metrics_df[in_group]
            if len(subset) < 2:
                continue
            print(f"\n일관성 분석 - 모델: {model}, 토픽 수: {num_topics}")
            for metric in metric_names:
                column = subset[metric]
                cv = column.std() / column.mean()
                print(f"{metric}의 변동 계수: {cv:.4f}")

# 계산 시간 비교 함수
def computation_time_analysis(comp_times_df):
    """Print and plot the mean step times per (model, topic count) group.

    Args:
        comp_times_df: DataFrame with the columns ``Model``, ``Num_Topics``,
            ``Topic_Modeling_Time``, ``Coherence_Time``, ``NPMI_Time`` and
            ``C_V_Time``.
    """
    group_keys = ['Model', 'Num_Topics']
    # Only the timing columns are averaged.
    time_columns = ['Topic_Modeling_Time', 'Coherence_Time', 'NPMI_Time', 'C_V_Time']

    grouped = comp_times_df.groupby(group_keys)
    avg_times = grouped[time_columns].mean().reset_index()

    print("\n계산 시간 비교:")
    print(avg_times[group_keys + time_columns])

    # Bar chart of the topic-modeling time (example visualisation).
    sns.barplot(x='Num_Topics', y='Topic_Modeling_Time', hue='Model', data=avg_times)
    plt.title('모델별 토픽 수에 따른 토픽 모델링 시간')
    plt.show()

# Build the computation-time DataFrame from the computation_times dict.
# Keys have the form <domain>_<dataset>_<model>_<num_topics>; dataset names may
# themselves contain underscores, so the model and topic count are taken from
# the END of the split key and the domain from the start.
comp_times_list = []
for key, times in computation_times.items():
    parts = key.split('_')
    domain = parts[0]
    model = parts[-2]
    num_topics = parts[-1]
    comp_times_list.append({
        'Domain': domain,
        'Model': model,
        'Num_Topics': int(num_topics),
        'Topic_Modeling_Time': times['Topic Modeling'],
        'Coherence_Time': times['Coherence'],
        'NPMI_Time': times['NPMI'],
        'C_V_Time': times['C_V']
    })

comp_times_df = pd.DataFrame(comp_times_list)

# 도메인 간 지표 성능 비교 함수
def domain_performance_analysis(metrics_df):
    """Print ANOVA and Kruskal-Wallis results comparing each metric across domains.

    Args:
        metrics_df: DataFrame with the columns ``Domain``, ``Coherence``,
            ``NPMI`` and ``C_V``.
    """
    from scipy.stats import f_oneway, kruskal

    for metric in ('Coherence', 'NPMI', 'C_V'):
        print(f"\n도메인 간 {metric} 성능 비교")

        # One sample of metric values per domain.
        data_per_domain = []
        for domain in metrics_df['Domain'].unique():
            domain_rows = metrics_df[metrics_df['Domain'] == domain]
            data_per_domain.append(domain_rows[metric])

        # One-way ANOVA
        f_stat, p_value = f_oneway(*data_per_domain)
        print(f"ANOVA 결과 - F-statistic: {f_stat:.4f}, p-value: {p_value:.4f}")

        # Kruskal-Wallis test
        h_stat, p_value = kruskal(*data_per_domain)
        print(f"Kruskal-Wallis 결과 - H-statistic: {h_stat:.4f}, p-value: {p_value:.4f}")

# 토픽 단어 추출 함수
def extract_topics(model_type, model, num_topics, topics=None):
    """Return the topic word lists of a fitted model.

    Args:
        model_type: ``'LDA'``, ``'BERTopic'`` or ``'VAE'``.
        model: Fitted model (unused for ``'VAE'``).
        num_topics: Number of topic ids (``0 .. num_topics - 1``) to read.
        topics: Pre-extracted topic word lists, returned as-is for ``'VAE'``.

    Returns:
        A list of word lists.  For ``'LDA'`` each list holds the top 10 words;
        for ``'BERTopic'`` topic ids with a falsy ``get_topic`` result are left
        out; for ``'VAE'`` the ``topics`` argument itself is returned; any
        other model type yields an empty list.
    """
    if model_type == 'VAE':
        # VAE topic words were already extracted when the model was fitted.
        return topics

    if model_type == 'LDA':
        return [
            [word for word, _ in model.show_topic(topic_id, topn=10)]
            for topic_id in range(num_topics)
        ]

    if model_type == 'BERTopic':
        topic_infos = (model.get_topic(topic_id) for topic_id in range(num_topics))
        return [[word for word, _ in info] for info in topic_infos if info]

    return []

# LLM 평가 수행 함수
from tenacity import retry, stop_after_attempt, wait_random_exponential

def call_openai_api(prompt: str, max_tokens: int = 3000) -> str:
    """Send ``prompt`` to the chat completion API and return the full reply text.

    When a reply is cut off by the token limit (``finish_reason == "length"``),
    the partial answer is appended to the conversation and the model is asked
    to continue, so the original prompt stays in context.  At most
    ``max_rounds`` requests are made to keep the loop bounded.

    Args:
        prompt: User prompt to send.
        max_tokens: Token limit per request.

    Returns:
        The concatenated reply text.

    Raises:
        openai.RateLimitError, openai.AuthenticationError: Re-raised after a
            short message is printed.
        Exception: Any other error is printed and re-raised unchanged.
    """
    client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    messages = [
        {
            "role": "system",
            "content": "You are an expert in topic modeling and text analysis. Your task is to evaluate the coherence of topics based on provided documents."
        },
        {
            "role": "user",
            "content": prompt
        }
    ]
    reply_parts = []
    max_rounds = 10  # upper bound on continuation requests

    for _ in range(max_rounds):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0.1,
                presence_penalty=0.1,
            )
        # The client-style API used above exposes its exception classes at
        # package level; there is no ``openai.error`` module in that version.
        except openai.RateLimitError:
            print("Rate limit exceeded. Retrying...")
            raise
        except openai.AuthenticationError:
            print("Authentication error. Check your API key.")
            raise
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

        choice = response.choices[0]
        chunk = choice.message.content or ""  # content may be None
        reply_parts.append(chunk)

        if choice.finish_reason != "length":
            break

        # Truncated reply: keep it in the conversation and ask for the rest.
        messages.append({"role": "assistant", "content": chunk})
        messages.append({"role": "user", "content": "Continue the previous response:"})

    return "".join(reply_parts)

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def llm_evaluation(topics, documents, model="gpt-4o-mini"):
    """Ask the LLM to rate the coherence of each topic.

    For every topic a prompt with the topic words and the first three
    documents is sent via ``call_openai_api``; the first integer found in the
    reply is taken as the score.

    Args:
        topics: Iterable of word lists, one per topic.
        documents: Documents the topics were derived from.
        model: Unused by the request itself (``call_openai_api`` fixes the
            model); kept for interface compatibility.

    Returns:
        ``(scores, feedbacks)``: two lists aligned with ``topics``.  A score
        is ``None`` when no number could be found in the reply, so that later
        entries still line up with their topic.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set.
        Exception: API errors are printed and re-raised; the ``retry``
            decorator then re-runs the whole function (up to 3 attempts).
    """
    api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise ValueError("OpenAI API key not found in environment variables")

    openai.api_key = api_key

    scores = []
    feedbacks = []

    if not isinstance(documents, list):
        documents = list(documents)

    for topic_words in topics:
        topic = ', '.join(topic_words)
        docs_sample = documents[:3]
        prompt = f"""
주어진 토픽과 관련 문서를 평가해주세요. 다음 기준에 따라 1-10 척도로 점수를 매겨주세요:
일관성: 토픽 내 단어들이 의미적으로 얼마나 연관되어 있는가?

토픽: {topic}
관련 문서 샘플:
{docs_sample}

일관성에 대해 1-10 점수를 매기고, 간단한 설명을 덧붙여주세요.
"""
        try:
            evaluation = call_openai_api(prompt, max_tokens=500)
        # The client-style API exposes its exception classes at package
        # level; there is no ``openai.error`` module in that version.
        except openai.RateLimitError:
            print("Rate limit exceeded. Retrying...")
            raise
        except openai.AuthenticationError:
            print("Authentication error. Check your API key.")
            raise
        except Exception as e:
            print(f"Unexpected error: {e}")
            raise

        match = re.search(r'(\d+)', evaluation)
        if match:
            scores.append(int(match.group(1)))
        else:
            print(f"점수를 추출할 수 없습니다: {evaluation}")
            # Keep a placeholder so scores stay aligned with topics.
            scores.append(None)
        feedbacks.append(evaluation)

    return scores, feedbacks


def run_llm_evaluation(metrics_df, datasets, sample_size=100):
    """Refit sampled configurations and collect LLM coherence ratings for them.

    Up to ``sample_size`` rows of ``metrics_df`` are sampled (seed 42).  For
    each, the model is refitted, its topics are extracted and rated by
    ``llm_evaluation``.  Every result is also appended as one JSON line to
    ``llm_evaluation_results.json``.

    Args:
        metrics_df: DataFrame with the columns ``Domain``, ``Dataset``,
            ``Model`` and ``Num_Topics``.
        datasets: Mapping ``{domain: {dataset_name: documents}}``.
        sample_size: Maximum number of rows to evaluate.

    Returns:
        pandas.DataFrame with the columns ``Domain``, ``Dataset``, ``Model``,
        ``Num_Topics``, ``LLM_Scores`` and ``LLM_Feedbacks`` (empty when there
        is nothing to evaluate).
    """
    import json

    llm_results = []

    # Sampling more rows than exist raises, so cap at the table size.
    actual_sample_size = min(sample_size, len(metrics_df))
    if actual_sample_size <= 0:
        return pd.DataFrame(llm_results)

    for index, row in metrics_df.sample(n=actual_sample_size, random_state=42).iterrows():
        domain = row['Domain']
        dataset_name = row['Dataset']
        model_type = row['Model']
        # Plain int: usable as a range bound and JSON-serialisable.
        num_topics = int(row['Num_Topics'])
        print(f"\nLLM 평가 진행 중 - 도메인: {domain}, 데이터셋: {dataset_name}, 모델: {model_type}, 토픽 수: {num_topics}")

        data = datasets[domain][dataset_name]

        if model_type == 'VAE':
            model, _, topics = perform_topic_modeling(data, num_topics, model_type)
        else:
            model, _ = perform_topic_modeling(data, num_topics, model_type)
            topics = None

        topics = extract_topics(model_type, model, num_topics, topics)

        scores, feedbacks = llm_evaluation(topics, data)

        result = {
            'Domain': domain,
            'Dataset': dataset_name,
            'Model': model_type,
            'Num_Topics': num_topics,
            'LLM_Scores': scores,
            'LLM_Feedbacks': feedbacks
        }
        llm_results.append(result)

        # Persist intermediate results so a later failure does not lose them.
        with open('llm_evaluation_results.json', 'a') as f:
            json.dump(result, f)
            f.write('\n')

    llm_df = pd.DataFrame(llm_results)
    return llm_df


def analyze_llm_results(llm_df):
    """Add mean / std / median score columns to ``llm_df`` (in place) and print them.

    ``None`` entries in ``LLM_Scores`` are ignored when computing the
    statistics.

    Args:
        llm_df: DataFrame with the columns ``Domain``, ``Model``,
            ``Num_Topics`` and ``LLM_Scores`` (a list of scores per row).
    """
    def _valid(scores):
        return [s for s in scores if s is not None]

    summaries = (
        ('LLM_Avg_Score', np.mean),
        ('LLM_Std_Score', np.std),
        ('LLM_Median_Score', np.median),
    )
    for column, reducer in summaries:
        # Bind the reducer as a default so each lambda keeps its own function.
        llm_df[column] = llm_df['LLM_Scores'].apply(lambda scores, fn=reducer: fn(_valid(scores)))

    print("\nLLM 평가 결과:")
    print(llm_df[['Domain', 'Model', 'Num_Topics', 'LLM_Avg_Score', 'LLM_Std_Score', 'LLM_Median_Score']])

def llm_auto_metric_correlation(metrics_df, llm_df):
    """Print the correlation of each automatic metric with the LLM average score.

    The two tables are joined on ``Domain``, ``Dataset``, ``Model`` and
    ``Num_Topics``; rows without an ``LLM_Avg_Score`` are left out.

    Args:
        metrics_df: DataFrame with the join keys plus ``Coherence``, ``NPMI``
            and ``C_V``.
        llm_df: DataFrame with the join keys plus ``LLM_Avg_Score``.
    """
    join_keys = ['Domain', 'Dataset', 'Model', 'Num_Topics']
    merged_df = metrics_df.merge(llm_df, on=join_keys)

    # Rows that actually have an LLM score.
    scored_rows = merged_df[merged_df['LLM_Avg_Score'].notnull()]
    llm_scores = scored_rows['LLM_Avg_Score']

    for metric in ('Coherence', 'NPMI', 'C_V'):
        metric_values = scored_rows[metric]
        pearson_corr, p_value_pearson = pearsonr(metric_values, llm_scores)
        spearman_corr, p_value_spearman = spearmanr(metric_values, llm_scores)
        print(f"\nLLM 평가 점수와 {metric}의 상관관계:")
        print(f"Pearson: 상관계수 = {pearson_corr:.4f}, p-value = {p_value_pearson:.4f}")
        print(f"Spearman: 상관계수 = {spearman_corr:.4f}, p-value = {p_value_spearman:.4f}")


def verify_llm_consistency(topics, documents, n_repeats=5):
    """Repeat the LLM evaluation and print how much the scores vary.

    ``llm_evaluation`` is called ``n_repeats`` times; the standard deviation
    and the coefficient of variation (std / mean) are computed across the
    repeats and averaged.

    Args:
        topics: Topic word lists passed to ``llm_evaluation``.
        documents: Documents passed to ``llm_evaluation``.
        n_repeats: Number of repeated evaluations.
    """
    # One list of scores per repeat.
    repeated_scores = [llm_evaluation(topics, documents)[0] for _ in range(n_repeats)]
    score_matrix = np.array(repeated_scores)

    std_scores = np.std(score_matrix, axis=0)
    mean_scores = np.mean(score_matrix, axis=0)

    avg_std = np.mean(std_scores)
    avg_cv = np.mean(std_scores / mean_scores)

    print(f"\nLLM 평가의 평균 표준편차: {avg_std:.4f}")
    print(f"LLM 평가의 평균 변동계수(CV): {avg_cv:.4f}")

def analyze_llm_feedback(llm_df):
    """Print the 10 most frequent non-stop-word tokens in the LLM feedback texts.

    Each feedback text is lower-cased and split on whitespace; tokens found in
    the module-level ``stop_words`` set are dropped.

    Args:
        llm_df: DataFrame whose ``LLM_Feedbacks`` column holds a list of
            feedback strings per row.
    """
    # Count keywords across all feedback texts.
    word_freq = Counter(
        token
        for feedbacks in llm_df['LLM_Feedbacks']
        for feedback in feedbacks
        for token in feedback.lower().split()
        if token not in stop_words
    )

    print("\n피드백에서 가장 자주 등장하는 키워드:")
    for word, count in word_freq.most_common(10):
        print(f"{word}: {count}")

def visualize_llm_results(llm_df):
    """Show two plots of the LLM average scores.

    The first is a box plot of ``LLM_Avg_Score`` per ``Model``; the second a
    scatter plot of ``LLM_Avg_Score`` against ``Num_Topics`` coloured by
    ``Model``.

    Args:
        llm_df: DataFrame with the columns ``Model``, ``Num_Topics`` and
            ``LLM_Avg_Score``.
    """
    figure_size = (12, 6)

    # Score distribution per model.
    plt.figure(figsize=figure_size)
    sns.boxplot(data=llm_df, x='Model', y='LLM_Avg_Score')
    plt.title('모델별 LLM 평가 점수 분포')
    plt.show()

    # Score against the number of topics.
    plt.figure(figsize=figure_size)
    sns.scatterplot(data=llm_df, x='Num_Topics', y='LLM_Avg_Score', hue='Model')
    plt.title('토픽 수에 따른 LLM 평가 점수')
    plt.show()


# def visualize_results(metrics_df):
#     # 모델별 Coherence 비교
#     plt.figure(figsize=(12, 6))
#     sns.boxplot(x='Model', y='Coherence', data=metrics_df)
#     plt.title('Model Coherence Comparison')
#     plt.show()

#     # 모델별 NPMI 비교
#     plt.figure(figsize=(12, 6))
#     sns.boxplot(x='Model', y='NPMI', data=metrics_df)
#     plt.title('Model NPMI Comparison')
#     plt.show()

#     # 모델별 C_V 비교
#     plt.figure(figsize=(12, 6))
#     sns.boxplot(x='Model', y='C_V', data=metrics_df)
#     plt.title('Model C_V Comparison')
#     plt.show()

#     # 토픽 수에 따른 Coherence 변화
#     plt.figure(figsize=(12, 6))
#     sns.lineplot(x='Num_Topics', y='Coherence', hue='Model', data=metrics_df)
#     plt.title('Coherence vs Number of Topics')
#     plt.show()

# if __name__ == '__main__':
#     # metrics_df 생성
#     metrics_df = pd.DataFrame(metrics_list)

#     # 상관관계 분석 수행
#     correlation_analysis(metrics_df)

#     # 일관성 분석 수행
#     consistency_analysis(metrics_df)

#     # # 계산 시간 비교 수행
#     # computation_time_analysis(comp_times_df)

#     # 도메인 간 성능 비교 수행
#     domain_performance_analysis(metrics_df)

#     # LLM 평가 수행
#     llm_df = run_llm_evaluation(metrics_df, datasets)

#     # LLM 평가 결과 분석
#     analyze_llm_results(llm_df)

#     # LLM 평가와 새 평가지표 간 상관관계 분석
#     llm_auto_metric_correlation(metrics_df, llm_df)

#     # LLM 평가의 일관성 검증 (예시로 첫 번째 토픽 사용)
    # topics = extract_topics('LDA', lda_model, 5)
    # documents = datasets['academy']['business']
    # verify_llm_consistency(topics, documents)

    # LLM 피드백 분석
    # analyze_llm_feedback(llm_df)

    # 결과 시각화
    # visualize_results(metrics_df)
    # visualize_llm_results(llm_df)

In [None]:
def visualize_results(metrics_df):
    """Display comparison plots for the automatic evaluation metrics.

    For each of ``Coherence``, ``NPMI`` and ``C_V`` this shows a box plot per
    ``Model`` followed by a line plot against ``Num_Topics`` (colored by
    ``Model``). It then shows one box plot per metric grouped by ``Domain``,
    and finally an annotated heatmap of the correlation between the three
    metrics.

    Args:
        metrics_df: DataFrame providing the ``Model``, ``Domain``,
            ``Num_Topics``, ``Coherence``, ``NPMI`` and ``C_V`` columns.
    """
    metric_names = ['Coherence', 'NPMI', 'C_V']

    def _render(draw, title, size=(12, 6)):
        # Open a fresh figure, let the caller draw into it, then display it.
        plt.figure(figsize=size)
        draw()
        plt.title(title)
        plt.show()

    # Per-model comparison and trend over the number of topics.
    for name in metric_names:
        _render(
            lambda: sns.boxplot(x='Model', y=name, data=metrics_df),
            f'Model {name} Comparison',
        )
        _render(
            lambda: sns.lineplot(
                x='Num_Topics', y=name, hue='Model', data=metrics_df
            ),
            f'{name} vs Number of Topics',
        )

    # Per-domain comparison; a separate pass so these figures come after all
    # of the per-model figures.
    for name in metric_names:
        _render(
            lambda: sns.boxplot(x='Domain', y=name, data=metrics_df),
            f'{name} Comparison by Domain',
        )

    # Pairwise correlation between the metrics.
    metric_corr = metrics_df[metric_names].corr()
    _render(
        lambda: sns.heatmap(metric_corr, annot=True, cmap='coolwarm'),
        'Correlation between Evaluation Metrics',
        size=(10, 8),
    )


def summarize_results(metrics_df, llm_df):
    """Print summary tables for the evaluation results and plot correlations.

    Prints ``describe()`` output for the automatic metrics (and for the LLM
    scores when given), then the per-model and per-domain means of the
    numeric columns. When ``llm_df`` is provided, both frames are merged and
    a heatmap of the correlations between their numeric columns is shown.

    Args:
        metrics_df: DataFrame of automatic metrics. Must contain the
            ``Model`` and ``Domain`` columns (used for grouping) and, when
            ``llm_df`` is given, ``Dataset`` and ``Num_Topics`` as well
            (used as merge keys).
        llm_df: DataFrame of LLM evaluation results sharing the ``Domain``,
            ``Dataset``, ``Model`` and ``Num_Topics`` columns, or ``None``
            to skip every LLM-related step.

    Returns:
        None. Output is printed and, optionally, displayed as a figure.
    """
    # Summary of the automatic evaluation metrics
    print("\n자동 평가 지표 요약:")
    print(metrics_df.describe())

    # Summary of the LLM evaluation metrics
    if llm_df is not None:
        print("\nLLM 평가 지표 요약:")
        print(llm_df.describe())

    # Mean score per model. numeric_only=True keeps the string columns
    # (e.g. Domain/Dataset) out of the aggregation; without it pandas >= 2.0
    # raises TypeError when it tries to average text.
    model_metrics = metrics_df.groupby('Model').mean(numeric_only=True)
    print("\n모델별 평균 점수:")
    print(model_metrics)

    # Mean score per domain (same numeric-only restriction as above)
    domain_metrics = metrics_df.groupby('Domain').mean(numeric_only=True)
    print("\n도메인별 평균 점수:")
    print(domain_metrics)

    # Correlation between the LLM scores and the automatic metrics
    if llm_df is not None:
        combined_df = pd.merge(
            metrics_df, llm_df, on=['Domain', 'Dataset', 'Model', 'Num_Topics']
        )
        # The merge keys include text columns; correlate numeric columns only
        # so corr() does not fail on strings.
        correlation_matrix = combined_df.select_dtypes(include='number').corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
        plt.title('Correlation between LLM and Automatic Evaluation Metrics')
        plt.show()

# 메인 실행 코드
if __name__ == '__main__':
    # One dict per successful (domain, dataset, model, topic count) run;
    # turned into metrics_df after the sweep.
    metrics_list = []

    # Wall-clock timings keyed by "<domain>_<dataset>_<model>_<num_topics>"
    computation_times = {}

    # Model types and topic counts to sweep over
    model_types = ['LDA', 'BERTopic', 'VAE']
    num_topics_list = [2, 4, 6, 8, 10]

    # Inputs for the LLM consistency check that runs after the sweep. They
    # are tracked explicitly instead of relying on whatever `topics` the last
    # loop iteration happened to leave behind (None for non-VAE models, or
    # unbound if the first run fails), and so that the documents passed along
    # are the ones the topics were actually derived from.
    consistency_topics = None
    consistency_documents = None

    # Topic modeling and metric computation
    for domain, domain_datasets in datasets.items():
        for dataset_name, data in domain_datasets.items():
            print(f"\nProcessing {domain} - {dataset_name}")
            for model_type in model_types:
                for num_topics in num_topics_list:
                    print(f"\nModel: {model_type}, Num Topics: {num_topics}")
                    try:
                        start_time = time.time()
                        if model_type == 'VAE':
                            # The VAE path is unpacked as (model, vectorizer, topics)
                            model, vectorizer, topics = perform_topic_modeling(data, num_topics, model_type)
                        else:
                            model, vectorizer = perform_topic_modeling(data, num_topics, model_type)
                            topics = None  # no precomputed topics outside the VAE path
                        topic_modeling_time = time.time() - start_time

                        start_time = time.time()
                        coherence, npmi, c_v = calculate_evaluation_metrics(model, data, model_type, vectorizer, num_topics, topics)
                        evaluation_time = time.time() - start_time

                        # Store the results
                        metrics_list.append({
                            'Domain': domain,
                            'Dataset': dataset_name,
                            'Model': model_type,
                            'Num_Topics': num_topics,
                            'Coherence': coherence,
                            'NPMI': npmi,
                            'C_V': c_v
                        })

                        # Remember the most recent run that produced topics,
                        # together with the documents it was fitted on.
                        if topics is not None:
                            consistency_topics = topics
                            consistency_documents = data

                        computation_times[f"{domain}_{dataset_name}_{model_type}_{num_topics}"] = {
                            'Topic Modeling': topic_modeling_time,
                            'Evaluation': evaluation_time
                        }

                        print(f"Coherence: {coherence:.4f}")
                        print(f"NPMI: {npmi:.4f}")
                        print(f"C_V: {c_v:.4f}")

                    except Exception as e:
                        # Best-effort sweep: report the failing configuration
                        # and carry on with the next one.
                        print(f"Error processing {domain} - {dataset_name} - {model_type} - {num_topics}: {str(e)}")

    # Build metrics_df from the collected rows
    metrics_df = pd.DataFrame(metrics_list)

    if metrics_df.empty:
        # Every run failed: the frame has no rows or columns, so none of the
        # analyses below could do anything useful with it.
        print("No topic-modeling run succeeded; skipping analysis.")
    else:
        # Correlation analysis of the automatic metrics
        correlation_analysis(metrics_df)

        # LLM evaluation (optional)
        llm_df = llm_evaluation(metrics_df, datasets)

        # Analysis of the LLM evaluation results
        analyze_llm_results(llm_df)

        # Correlation between LLM scores and the automatic metrics
        llm_auto_metric_correlation(metrics_df, llm_df)

        # Consistency check of the LLM evaluation
        if consistency_topics is not None:
            verify_llm_consistency(consistency_topics, consistency_documents)
        else:
            print("No topics available for the LLM consistency check; skipping it.")

        # Overall summary and interpretation
        summarize_results(metrics_df, llm_df)

        # Result analysis and visualization
        visualize_results(metrics_df)

    print("실험 완료")