# In [None]:  (Jupyter cell marker left over from the notebook export)
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary


# Define the custom scoring function
def nmf_coherence_scorer(estimator, X):
    """Return the c_v topic coherence of a fitted TF-IDF + NMF pipeline.

    Args:
        estimator: A fitted pipeline exposing ``named_steps['tfidf']`` (a
            fitted text vectorizer) and ``named_steps['nmf']`` (a fitted NMF
            model).
        X: Iterable of raw document strings used as the reference corpus for
            the coherence estimate.

    Returns:
        The coherence value reported by gensim's ``CoherenceModel`` for the
        top 10 terms of every NMF topic.
    """
    vectorizer = estimator.named_steps['tfidf']
    nmf_model = estimator.named_steps['nmf']
    # Topic-term matrix: one row per topic, one column per TF-IDF feature.
    H = nmf_model.components_

    # Columns of H are indexed by the vectorizer's vocabulary, NOT by the
    # gensim dictionary ids, so translate column index -> term through the
    # vectorizer's own vocabulary mapping.
    vocabulary = vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)

    # Tokenise the documents exactly like the vectorizer does (lowercasing,
    # stop-word removal, n-grams) so every vocabulary term, including
    # bigrams, can actually be found in the reference texts.
    analyzer = vectorizer.build_analyzer()
    texts = [analyzer(doc) for doc in X]
    dictionary = Dictionary(texts)

    # Top 10 terms per topic, highest weight first. Terms that never occur
    # in X (possible when X is not the training data) are dropped because
    # the coherence model can only score tokens known to the dictionary.
    known_tokens = dictionary.token2id
    topics = []
    for topic in H:
        top_terms = [feature_names[idx] for idx in topic.argsort()[:-11:-1]]
        topics.append([term for term in top_terms if term in known_tokens])

    # Compute topic coherence.
    coherence_model = CoherenceModel(
        topics=topics,
        texts=texts, dictionary=dictionary, coherence='c_v', processes=1)
    return coherence_model.get_coherence()

if __name__ == '__main__':
    # Load the tweets; replace missing text with empty strings and make sure
    # every entry of the text column is a str.
    df = pd.read_csv('../data/processed/tweets_with_topics.csv')
    df['cleaned_text'] = df['cleaned_text'].fillna('').astype(str)

    # Build the TF-IDF -> NMF pipeline.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nmf', NMF(random_state=1, max_iter=500)),
    ])

    # Apply the hyper-parameters chosen for the final model.
    pipeline.set_params(
        tfidf__max_features=6000,
        tfidf__ngram_range=(1, 2),
        tfidf__max_df=0.9,
        tfidf__min_df=10,
        # 0.36615630438929847 -- presumably the coherence score recorded for
        # this number of topics in an earlier run; verify before relying on it.
        nmf__n_components=10,
    )

    # Train the model on the cleaned tweets.
    pipeline.fit(df['cleaned_text'])

    # Report the topic coherence of the fitted pipeline.
    coherence = nmf_coherence_scorer(pipeline, df['cleaned_text'])
    print(f"Coherence: {coherence}")

    # Document-topic weights: vectorize the tweets, then project them onto
    # the learned topics.
    fitted_tfidf = pipeline.named_steps['tfidf']
    fitted_nmf = pipeline.named_steps['nmf']
    tfidf_features = fitted_tfidf.transform(df['cleaned_text'])
    W = fitted_nmf.transform(tfidf_features)

    # Store each tweet's weight for every topic as its own column.
    for topic_idx, topic_weights in enumerate(W.T):
        df[f'topic_{topic_idx}'] = topic_weights

    # Persist the fitted pipeline.
    joblib.dump(pipeline, 'final_nmf_model.pkl')

    # Persist the tweets together with their topic weights.
    df.to_csv('../data/processed/tweets_with_final_nmf_topics_final.csv', index=False)
