任务目标：\
从中均匀抽取1000个段落作为数据集（每个段落可以有 K 个 token, K 可以取20，100，500, 1000, 3000），每个段落的标签就是对应段落所属的小说。利用LDA模型在给定的语料库上进行文本建模，主题数量为 T，并把每个段落表示为主题分布后进行分类（分类器自由选择），分类结果使用 10 次交叉验证（i.e. 900 做训练，剩余100 做测试循环十次）。实现和讨论如下的方面：（1）在设定不同的主题个数T的情况下，分类性能是否有变化？
（2）以"词"和以"字"为基本单元下分类结果有什么差异？
（3）不同的取值的K的短文本和长文本，主题模型性能上是否有差异？ 要求三个方面用代码测试结果


In [109]:
# read class data
with open("../data/inf.txt", "r", encoding="GB2312") as f:
    classes = f.readline().split(",")
classes


['白马啸西风',
 '碧血剑',
 '飞狐外传',
 '连城诀',
 '鹿鼎记',
 '三十三剑客图',
 '射雕英雄传',
 '神雕侠侣',
 '书剑恩仇录',
 '天龙八部',
 '侠客行',
 '笑傲江湖',
 '雪山飞狐',
 '倚天屠龙记',
 '鸳鸯刀',
 '越女剑']

In [110]:
# read document, split by word or word group
import jieba
doc_dic = {}
split_by_word = False
para_len = 3000
total_words = 0
n_topics = 100

if split_by_word:
        data_path = 'para_word.csv'
else:
    data_path = 'para_words.csv'

for cla in classes:
    with open(f"../data/{cla}.txt", "r", encoding="GBK", errors='ignore') as f:
        data = f.read()
        data = data.replace(
            '本书来自www.cr173.com免费txt小说下载站\n更多更新免费电子书请关注www.cr173.com', '')
        f.close()
    with open("../cn_stopwords.txt", "r", encoding='utf-8') as fp:
        stop_words = fp.read().split('\n')
        fp.close()
    split_words = []
    if split_by_word:
        for word in data:
            if word not in stop_words and not word.isspace():
                split_words.append(word)
    else:
        for words in jieba.cut(data):
            if words not in stop_words and not words.isspace():
                split_words.append(words)
    total_words += len(split_words)
    doc_dic[f"{cla}"] = split_words
            
doc_dic

{'白马啸西风': ['白马',
  '啸',
  '西风',
  '…',
  '…',
  '…',
  '…',
  '…',
  '…',
  '…',
  '…',
  '黄沙',
  '莽莽',
  '回疆',
  '大漠',
  '之上',
  '尘沙',
  '飞',
  '两丈',
  '高',
  '两',
  '骑马',
  '一前',
  '後',
  '急',
  '驰而来',
  '前面',
  '匹',
  '高腿',
  '长身',
  '白马',
  '马上',
  '骑著个',
  '少妇',
  '怀中',
  '搂',
  '著个',
  '七八岁',
  '小姑娘',
  '後',
  '面',
  '匹',
  '枣红马',
  '马背上',
  '伏著',
  '高瘦',
  '汉子',
  '汉子',
  '左边',
  '背心',
  '插',
  '著',
  '一枝',
  '长箭',
  '鲜血',
  '背心',
  '流',
  '马背上',
  '流到',
  '地下',
  '滴入',
  '黄沙',
  '之中',
  '不敢',
  '伸手',
  '拔箭',
  '这枝',
  '箭',
  '拔下来',
  '会',
  '支持',
  '不住',
  '立时',
  '倒毙',
  '死',
  '没什',
  '麽',
  '照料',
  '前面',
  '娇妻',
  '幼女',
  '身',
  '後',
  '凶悍',
  '毒辣',
  '敌人',
  '正在',
  '紧紧',
  '追踪',
  '跨',
  '枣红马',
  '奔驰',
  '数十里',
  '早已',
  '筋疲力尽',
  '主人',
  '没命',
  '价',
  '鞭打',
  '催',
  '踢',
  '之下',
  '逼得气',
  '喘',
  '过来',
  '嘴边',
  '已全',
  '白沫',
  '猛地',
  '里',
  '前',
  '腿',
  '一软',
  '跪倒',
  '汉子',
  '用力',
  '一提',
  '缰绳',
  '那红马',
  '一声',
  '哀嘶',
  '抽搐',
  '几下',
  '便',
  '脱力',
  '死',
  '少妇',

In [111]:
doc_dic

{'白马啸西风': ['白马',
  '啸',
  '西风',
  '…',
  '…',
  '…',
  '…',
  '…',
  '…',
  '…',
  '…',
  '黄沙',
  '莽莽',
  '回疆',
  '大漠',
  '之上',
  '尘沙',
  '飞',
  '两丈',
  '高',
  '两',
  '骑马',
  '一前',
  '後',
  '急',
  '驰而来',
  '前面',
  '匹',
  '高腿',
  '长身',
  '白马',
  '马上',
  '骑著个',
  '少妇',
  '怀中',
  '搂',
  '著个',
  '七八岁',
  '小姑娘',
  '後',
  '面',
  '匹',
  '枣红马',
  '马背上',
  '伏著',
  '高瘦',
  '汉子',
  '汉子',
  '左边',
  '背心',
  '插',
  '著',
  '一枝',
  '长箭',
  '鲜血',
  '背心',
  '流',
  '马背上',
  '流到',
  '地下',
  '滴入',
  '黄沙',
  '之中',
  '不敢',
  '伸手',
  '拔箭',
  '这枝',
  '箭',
  '拔下来',
  '会',
  '支持',
  '不住',
  '立时',
  '倒毙',
  '死',
  '没什',
  '麽',
  '照料',
  '前面',
  '娇妻',
  '幼女',
  '身',
  '後',
  '凶悍',
  '毒辣',
  '敌人',
  '正在',
  '紧紧',
  '追踪',
  '跨',
  '枣红马',
  '奔驰',
  '数十里',
  '早已',
  '筋疲力尽',
  '主人',
  '没命',
  '价',
  '鞭打',
  '催',
  '踢',
  '之下',
  '逼得气',
  '喘',
  '过来',
  '嘴边',
  '已全',
  '白沫',
  '猛地',
  '里',
  '前',
  '腿',
  '一软',
  '跪倒',
  '汉子',
  '用力',
  '一提',
  '缰绳',
  '那红马',
  '一声',
  '哀嘶',
  '抽搐',
  '几下',
  '便',
  '脱力',
  '死',
  '少妇',

In [112]:
import csv
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import random

def one_hot(index, length):
    # 创建一个全为0的数组
    one_hot_vector = np.zeros(length)
    # 根据索引设置对应位置为1
    one_hot_vector[index] = 1
    # print(index)
    return one_hot_vector

def read_paraphs():
    # 
    for pos, cla in enumerate(classes):
        particial = len(doc_dic[f"{cla}"]) / total_words           
        para_samples = int(particial * 1000) + 1
        st_pos = int(len(doc_dic[f"{cla}"]) // para_samples)
        data_list = []
        for index in range(para_samples):
            para = doc_dic[f"{cla}"][index * st_pos : index * st_pos + para_len]
            # 可以全取出来然后shuffer
            item = {
                "content": para,
                "label": one_hot(pos, len(classes))
            }
            data_list.append(item)
        # random.shuffle(data_list)
        # data_list = data_list[:1000]
        #  modify
        with open(data_path, 'a', newline='', encoding='utf-8') as f:
            csv_header = ['content', 'label']
            csv_writer = csv.DictWriter(f, csv_header)
            if f.tell() == 0:
                csv_writer.writeheader()
            csv_writer.writerows(data_list)  # 写入数据
            f.close()
with open(data_path, 'w', newline='', encoding='utf-8') as f:
    f.write("")
    f.close()
read_paraphs()


In [113]:
# data pre-process
pattern = u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!\t"@#$%^&*\\-_=+a-zA-Z，。\n《》、？：；“”‘’｛｝【】（）…￥！—┄－]+'


In [114]:
# 两种不同的词频统计方式
# from sklearn.feature_extraction.text import CountVectorizer
# import pandas as pd
# with open("../cn_stopwords.txt", "r") as f:
#     chinese_stop_words = f.readlines()
#     for index in range(len(chinese_stop_words)):
#         chinese_stop_words[index] = chinese_stop_words[index].rstrip()
# data = pd.read_csv(f"{data_path}")
# contents = data["content"].apply(lambda x: " ".join(eval(x))).tolist()
# # use analyzer='word' when split by words
# tf_vecorizer = CountVectorizer(stop_words=chinese_stop_words, analyzer='char')
# tf = tf_vecorizer.fit_transform(contents)
# print(tf.toarray())
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
with open("../cn_stopwords.txt", "r") as f:
    chinese_stop_words = f.readlines()
    for index in range(len(chinese_stop_words)):
        chinese_stop_words[index] = chinese_stop_words[index].rstrip()
data = pd.read_csv(f"{data_path}")
contents = data["content"].apply(lambda x: " ".join(eval(x))).tolist()
labels = data["label"]
# modify needed when splitby words  
tf_idf_vectorizer = TfidfVectorizer(analyzer='word')
tf_idf = tf_idf_vectorizer.fit_transform(contents)
tf_idf.toarray()


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [115]:
# test LDA's performance
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

X_train, X_test= train_test_split(tf_idf, test_size=0.2, random_state=42)

lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=50,
                                learning_method='batch')
lda.fit(X_train) #tf即为Document_word Sparse Matrix                  

# 评估LDA的性能
X = tf_idf.toarray()

perplexity = lda.perplexity(X_test)
print("Perplexity:", perplexity)

Perplexity: 4.0856346125250107e+30


In [121]:
# train LDA Model with full dataset
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=n_topics, 
                                max_iter=50,
                                learning_method='batch')
lda.fit(tf_idf) #tf即为Document_word Sparse Matrix        

In [117]:
import numpy as np
# 主题词分布情况
def top_words_data_frame(model: LatentDirichletAllocation,
                         tf_idf_vectorizer: TfidfVectorizer,
                         n_top_words: int) -> pd.DataFrame:
    rows = []
    feature_names = tf_idf_vectorizer.get_feature_names_out()
    for topic in model.components_:
        top_words = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        rows.append(top_words)
    columns = [f'topic {i+1}' for i in range(n_top_words)]
    df = pd.DataFrame(rows, columns=columns)

    return df

# 主题分布情况
def predict_to_data_frame(model: LatentDirichletAllocation, X: np.ndarray) -> pd.DataFrame:
    matrix = model.transform(X)
    columns = [f'P(topic {i+1})' for i in range(len(model.components_))]
    df = pd.DataFrame(matrix, columns=columns)
    return df

X = tf_idf.toarray()
predict_df = predict_to_data_frame(lda, X)
predict_df



Unnamed: 0,P(topic 1),P(topic 2),P(topic 3),P(topic 4),P(topic 5),P(topic 6),P(topic 7),P(topic 8),P(topic 9),P(topic 10),...,P(topic 91),P(topic 92),P(topic 93),P(topic 94),P(topic 95),P(topic 96),P(topic 97),P(topic 98),P(topic 99),P(topic 100)
0,0.000801,0.000801,0.000801,0.000801,0.000801,0.005215,0.000801,0.000801,0.007914,0.000801,...,0.045205,0.000801,0.000801,0.017220,0.000801,0.000801,0.000801,0.006175,0.000801,0.080375
1,0.001095,0.001095,0.001095,0.001095,0.001095,0.004325,0.001095,0.001095,0.008044,0.001095,...,0.043872,0.001095,0.001095,0.007829,0.001095,0.001095,0.001095,0.007490,0.001095,0.076814
2,0.001002,0.001002,0.001002,0.001002,0.001002,0.119635,0.001002,0.001002,0.006379,0.001002,...,0.041067,0.001002,0.001002,0.012431,0.001002,0.001002,0.001002,0.008802,0.001002,0.067462
3,0.001218,0.001218,0.001218,0.001218,0.001218,0.001218,0.001218,0.001218,0.006732,0.001218,...,0.045368,0.001218,0.001218,0.017820,0.001218,0.001218,0.001218,0.001218,0.001218,0.063176
4,0.000992,0.000992,0.000992,0.000992,0.000992,0.037642,0.000992,0.000992,0.002275,0.000992,...,0.037988,0.000992,0.000992,0.027756,0.000992,0.000992,0.000992,0.008390,0.000992,0.094418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,0.000980,0.000980,0.000980,0.000980,0.000980,0.000980,0.000980,0.000980,0.008470,0.000980,...,0.035036,0.000980,0.000980,0.025664,0.155774,0.000980,0.000980,0.009477,0.000980,0.058558
1004,0.001100,0.001100,0.001100,0.001100,0.001100,0.001100,0.001100,0.001100,0.001100,0.001100,...,0.037182,0.001100,0.001100,0.007261,0.292713,0.001100,0.001100,0.001100,0.001100,0.043313
1005,0.000948,0.000948,0.000948,0.000948,0.000948,0.000948,0.000948,0.000948,0.000948,0.000948,...,0.050620,0.000948,0.000948,0.013518,0.132709,0.000948,0.000948,0.006434,0.000948,0.064734
1006,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,0.001314,...,0.033017,0.001314,0.001314,0.001314,0.069488,0.001314,0.001314,0.029292,0.001314,0.038309


In [118]:
n_top_words = 20
top_words_df = top_words_data_frame(lda, tf_idf_vectorizer, n_top_words)
top_words_df

Unnamed: 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9,topic 10,topic 11,topic 12,topic 13,topic 14,topic 15,topic 16,topic 17,topic 18,topic 19,topic 20
0,包三,苏克萨,黄真,佛经,崔希敏,黄真道,佛堂,翻看,玉簪,一千六百,发米,白米,千手观音,袁师弟,遗言,黄真笑,亲政,老宅,库房,向阳
1,单铁生,小蛇,童子,水阁,冰蟾,乞婆,赵小姐,公差,库银,剃刀,独眼,红衣,头晕,老臣,铁尺,小个子,齐云,走水,别府,涌泉穴
2,陆立鼎,陆二娘,武修文,德布,独臂,何沅君,陆氏,兄嫂,苏辙,拙夫,男孩,陆爷,藏僧,当当,汉武帝,令兄,大鹰,犯规,天山童姥,范祖禹
3,葛尔丹,桑结,陈圆圆,骆冰笑,姥姥,韦春芳,崔百泉,过彦之,石双英,周绮道,蒙哥,祝寿,四爷,天镜,人厨子,旭烈,黄前帮,大宴,周大奶奶,百草仙
4,华筝,狗王,书记,参合,红菱,介末,白师伯,面幕,大舰,阿碧笑,老婆婆,金算盘,小舟,闷香,老公公,崔过,野战,石兄,摩诃,我报
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,樊一翁,玉玑子,穆人清,木桑,楞伽,觉远,穆易,尹克西,五绝,剪刀,棋子,猩猩,那小王爷,木桑道人,千变,承志,四卷,长胡子,南僧,北侠
96,游迅,钟镇,麻雀,米香主,五霸,高克新,阿三,令狐公子,齐元凯,冈上,玉如意,方东白,五毒,断骨,阿二,尹章垓,范中恩,邓八公,黄伯流,补药
97,陈家洛,徐天宏,乾隆,福康安,无忌,卫士,无尘,王维扬,空智,朱子柳,胡涂,李可秀,矮子,八名,部属,福大帅,武官,下令,小说,尊师
98,竹签,费要多罗,俄罗斯,赵良栋,本参,尼布楚,华伯斯基,天津,大胡子,牟尼堂,李师伯,本观,此经,俄国,蓝风凰,蓝风凰道,神剑经,油灯,中年人,祖师爷


In [119]:
# virtualization
# import pyLDAvis.sklearn
# html_path = "local.html"
# vis_data = pyLDAvis.sklearn.prepare(lda, tf_idf, tf_idf_vectorizer)
# pyLDAvis.save_html(data, html_path)
# # pyLDAvis.show(vis_data, open_browser=False)

In [123]:
# feature: 段落的主题分布 label: 小说种类
from  catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score



clf = CatBoostClassifier(task_type="GPU", iterations=1000)
# clf = RandomForestClassifier()
# clf = KNeighborsClassifier()

# 使用 10 次交叉验证评估分类器的性能
scores = cross_val_score(clf, lda.transform(tf_idf), labels.to_numpy(), cv=10)

# 输出交叉验证得分的平均值和标准差
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Learning rate set to 0.063626
0:	learn: 2.5965796	total: 12.1ms	remaining: 12.1s
1:	learn: 2.4572191	total: 21ms	remaining: 10.5s
2:	learn: 2.3411745	total: 30.4ms	remaining: 10.1s
3:	learn: 2.2520613	total: 41ms	remaining: 10.2s
4:	learn: 2.1653555	total: 50.2ms	remaining: 9.99s
5:	learn: 2.0865437	total: 59.7ms	remaining: 9.89s
6:	learn: 2.0157704	total: 69.2ms	remaining: 9.82s
7:	learn: 1.9539917	total: 79.2ms	remaining: 9.82s
8:	learn: 1.8937924	total: 89.4ms	remaining: 9.84s
9:	learn: 1.8364364	total: 98.6ms	remaining: 9.76s
10:	learn: 1.7869605	total: 108ms	remaining: 9.67s
11:	learn: 1.7370311	total: 116ms	remaining: 9.57s
12:	learn: 1.6948616	total: 125ms	remaining: 9.49s
13:	learn: 1.6498781	total: 135ms	remaining: 9.49s
14:	learn: 1.6063115	total: 144ms	remaining: 9.46s
15:	learn: 1.5711564	total: 153ms	remaining: 9.43s
16:	learn: 1.5304581	total: 163ms	remaining: 9.41s
17:	learn: 1.4958515	total: 172ms	remaining: 9.37s
18:	learn: 1.4621866	total: 181ms	remaining: 9.34s
19:	l