In [7]:
%matplotlib inline
from preamble import *

In [70]:
from sklearn.datasets import load_files
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import LatentDirichletAllocation
import spacy
import nltk
import re
import os
from joblib import dump, load
from scipy import sparse

In [12]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# 忽略 ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [10]:
def clean_text(doc):
    doc = doc.replace(b"<br />", b" ")  # 替换换行符
    doc = re.sub(b"<.*?>", b" ", doc)  # 移除 HTML 标签
    doc = re.sub(b"[^\w\s]", b" ", doc)  # 移除标点符号
    doc = re.sub(b"\d+", b" ", doc)  # 移除数字
    doc = re.sub(b"\s+", b" ", doc).strip()  # 移除多余空格
    return doc

In [5]:
reviews_train = load_files("data/aclImdb/train/")
text_train, y_train = reviews_train.data, reviews_train.target
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

In [6]:
reviews_test = load_files("data/aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

In [14]:
vect = CountVectorizer(max_features=10000, max_df=.15)
X = vect.fit_transform(text_train)

In [33]:
n_topics = 15  # 主题数量
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0
)
# We build the model and transform the data in one step
# Computing transform takes some time,
# and we can save time by doing both at once

In [34]:
# 训练模型
lda.fit(X)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=15, random_state=0)

In [61]:
def print_top_words(model, feature_names, n_top_words=10):
    # 获取主题数量
    n_topics = len(model.components_)
    
    # 每行显示的主题数
    topics_per_row = 5
    
    # 遍历每组主题
    for group in range(0, n_topics, topics_per_row):
        # 当前组的主题数
        current_topics = min(topics_per_row, n_topics - group)
        
        # 打印表头
        topics = [f"topic {i}" for i in range(group, group + current_topics)]
        header = "     ".join(topics)
        print("\n" + header)
        print("--------    " * current_topics)
        
        # 获取当前组主题的关键词
        top_words = []
        for topic_idx in range(group, group + current_topics):
            topic = model.components_[topic_idx]
            top_indices = topic.argsort()[:-n_top_words-1:-1]
            top_words.append([feature_names[i] for i in top_indices])
        
        # 按行打印词
        for i in range(n_top_words):
            row = []
            for topic_words in top_words:
                row.append(f"{topic_words[i]:<12}")
            print("".join(row))


In [62]:
# 显示每个主题的前15个关键词
n_top_words = 15
print_top_words(lda, vect.get_feature_names_out(), n_top_words)


topic 0     topic 1     topic 2     topic 3     topic 4
--------    --------    --------    --------    --------    
didn        music       comedy      role        sex         
thought     musical     funny       john        women       
thing       songs       fun         performance tom         
lot         song        zombie      cast        american    
though      dance       horror      play        black       
doesn       dancing     house       actor       woman       
want        singing     dr          played      around      
going       kelly       comedies    mr          another     
10          number      humor       james       christmas   
things      numbers     doctor      british     white       
re          allen       afraid      plays       house       
real        band        hilarious   new         our         
actually    stage       laughs      young       doesn       
few         voice       grant       jack        three       
every       tarzan      were

In [37]:
# 获取文档-主题分布
doc_topics = lda.transform(X)

In [44]:
doc_topics.shape

(25000, 15)

In [40]:
# 打印每个文档最主要的主题
def print_document_topics(doc_topics, n_docs=15):
    print(f"\n前{n_docs}个文档的主题分布:")
    for i in range(min(n_docs, len(doc_topics))):
        topic_distribution = doc_topics[i]
        dominant_topic = topic_distribution.argmax()
        print(f"文档 #{i + 1}: 主要属于主题 {dominant_topic + 1} (概率: {topic_distribution[dominant_topic]:.2f})")

In [41]:
print_document_topics(doc_topics)


前15个文档的主题分布:
文档 #1: 主要属于主题 15 (概率: 0.80)
文档 #2: 主要属于主题 1 (概率: 0.77)
文档 #3: 主要属于主题 1 (概率: 0.70)
文档 #4: 主要属于主题 1 (概率: 0.41)
文档 #5: 主要属于主题 1 (概率: 0.44)
文档 #6: 主要属于主题 13 (概率: 0.54)
文档 #7: 主要属于主题 1 (概率: 0.98)
文档 #8: 主要属于主题 1 (概率: 0.58)
文档 #9: 主要属于主题 15 (概率: 0.58)
文档 #10: 主要属于主题 4 (概率: 0.46)
文档 #11: 主要属于主题 10 (概率: 0.62)
文档 #12: 主要属于主题 7 (概率: 0.76)
文档 #13: 主要属于主题 10 (概率: 0.48)
文档 #14: 主要属于主题 4 (概率: 0.40)
文档 #15: 主要属于主题 3 (概率: 0.78)


In [71]:
topic_mapping = {
    0: "观后感",
    1: "音乐舞蹈",
    2: "喜剧",
    3: "表演",
    4: "社会议题",
    5: "恐怖惊悚",
    6: "负面评价",
    7: "经典电影",
    8: "政治历史",
    9: "动作警匪",
    10: "人物",
    11: "电视",
    12: "科幻",
    13: "视觉",
    14: "家庭"
}

In [72]:
def predict_topic(text, lda_model, vectorizer):

    # 清理和预处理文本
    cleaned_text = clean_text(text.encode())
    
    # 转换文本为向量
    text_vector = vectorizer.transform([cleaned_text])
    
    # 预测主题分布
    topic_dist = lda_model.transform(text_vector)[0]
    
    # 获取主要主题
    main_topic = topic_dist.argmax()
    
    print(f"\n最可能的主题是: {topic_mapping[main_topic]} (Topic {main_topic}, 概率: {topic_dist[main_topic]:.3f})")

In [73]:
# 使用示例
test_text = "This movie was really funny and made me laugh a lot. The comedy was great and the jokes were hilarious."
predict_topic(test_text, lda, vect)


最可能的主题是: 喜剧 (Topic 2, 概率: 0.867)


In [74]:
# 保存到models文件夹
dump(lda, 'models3/lda_model.joblib')
dump(vect, 'models3/vectorizer.joblib')
dump(topic_mapping, 'models3/topic_mapping.joblib')


# 加载示例：
'''
lda = load('models/lda_model.joblib')
vect = load('models/vectorizer.joblib')
topic_mapping = load('models/topic_mapping.joblib')
'''

"\nlda = load('models/lda_model.joblib')\nvect = load('models/vectorizer.joblib')\ntopic_mapping = load('models/topic_mapping.joblib')\n"