In [1]:
! pip install transformers
! pip install datasets
! pip install sentence-transformers

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Col

# 调用“知识图谱”网站诗词库API

In [2]:
import requests
import pandas as pd


url = "https://open.cnkgraph.com/api/Writing/Find"
pageNo = 0
poetry_data = []
key = '鷓鴣'
path = '/content/drive/MyDrive/诗歌意象分析/鷓鴣.xlsx'

next_page = True
while next_page:
    payload = {
        "key": key,
        "exactlyMatch": False,
        "showMatchedClauseOnly": True,
        "pageNo": pageNo
    }

    response = requests.post(url, json=payload)

    if response.status_code == 200:
        data = response.json()
        data = data['WritingSentences']
        poetry_data += data
        pageNo += 1
    elif response.status_code == 500:
        print("已获取完所有页面。")
        df = pd.DataFrame(poetry_data)
        del df['WritingId']
        df = df[~(df['Author'].str.contains('民國') |df['Author'].str.contains('近現代') | df['Author'].str.contains('當代'))] # 删除近当代的诗词
        df.to_excel(path, index=False)
        next_page = False
    else:
        print(f"请求失败，状态码: {response.status_code}。")
        break

已获取完所有页面。


In [3]:
df.head()

Unnamed: 0,Author,Title,Sentence
0,東漢·楊孚,鷓鴣,鳥象雌雞，自名鷓鴣。
1,初唐·宋之問,在荆州重赴嶺南,還將鵷鷺羽，重入鷓鴣羣。
2,初唐·沈佺期,從驩州廨宅移住山間水亭贈蘇使君,願陪鸚鵡樂，希並鷓鴣留。
3,盛唐·孟浩然,送吳悅遊韶陽,五色憐鳳雛，南飛適鷓鴣。
4,盛唐·李白,醉題王漢陽廳,我似鷓鴣鳥，南遷嬾北飛。


# 句子层面：句子语义聚类

In [4]:
import torch
from datasets import Dataset
from sentence_transformers import SentenceTransformer as SBert
from torch.utils.data import DataLoader
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SBert('SIKU-BERT/sikubert').to(device)

column = 'Sentence'
dataset = Dataset.from_pandas(df)

# 创建数据加载器
batch_size = 5000

def collate_fn(batch):
    sentences = [item[column] for item in batch]
    return {column: sentences}

dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=80, shuffle=False, collate_fn=collate_fn)

# 获取句子嵌入并显示进度条
sentences_embeddings = []
with torch.no_grad(), tqdm(total=len(dataloader), desc='Processing') as pbar:
    for batch in dataloader:
        sentences = batch[column]
        encoded_sentences = model.encode(sentences, convert_to_tensor=True, device=device)
        sentences_embeddings.append(encoded_sentences)

        pbar.update(1)

sentences_embeddings = torch.cat(sentences_embeddings, dim=0)

# 转换为NumPy数组
sentences_np = sentences_embeddings.cpu().numpy()

Downloading (…)2226a/.gitattributes:   0%|          | 0.00/828 [00:00<?, ?B/s]

Downloading (…)843f42226a/README.md:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

Downloading (…)843f42226a/REDAME.md:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

Downloading (…)26a/bert_config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading (…)3f42226a/config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)843f42226a/vocab.txt:   0%|          | 0.00/144k [00:00<?, ?B/s]

Processing: 100%|██████████| 1/1 [00:10<00:00, 10.25s/it]


In [5]:
from sklearn.cluster import DBSCAN

# 进行密度聚类
dbscan = DBSCAN(eps=6.5, min_samples=5)
labels = dbscan.fit_predict(sentences_np)

df["cluster"] = labels
df = df.sort_values(by='cluster')

df.head()

Unnamed: 0,Author,Title,Sentence,cluster
0,東漢·楊孚,鷓鴣,鳥象雌雞，自名鷓鴣。,-1
1204,明末清初·施閏章,袁州讀鄭谷詩,都官當日真名士，笑殺人呼鄭鷓鴣。,-1
1203,明末清初·施閏章,從制府江行 其四,昨宵愁不寐，恰有鷓鴣啼。,-1
1202,明末清初·施閏章,奉贈梁大司農棠村 其二,玉節坐銷豺虎亂，塗歌時和鷓鴣聲。,-1
1201,明末清初·施閏章,將次袁州聞袁勝之進士客遊悵然有作,向夕鷓鴣城裏宿，高臺明月共誰論。,-1


In [6]:
df.to_excel(path, index=False)

# 词汇层面：Apriori算法挖掘关联规则

Apriori算法的实现代码参考《python数据挖掘入门与实践（第二版）》

In [7]:
# 分词
import torch
from transformers import pipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline("token-classification", model="ckiplab/bert-base-chinese-ws", device=device)

results = [''.join([' ' + result['word']
                    if result['entity'] == 'B'
                    else result['word']
                    for result in pipe(text)]) for text in df['Sentence']]
df['WordSegment'] = results

removes = [' 。', ' ，', ' ？', ' ！', ' ；', ' 鷓鴣', ' 鹧鸪', ' 鷓', ' 鹧', ' 鴣', ' 鸪'] # 由于意象关键词一定会出现在诗句中，需删除它以避免查找多余的关联规则。注意：由于分词结果用空格隔开，删除的内容前一定要添加空格。
for remove in removes:
    df['WordSegment'] = df['WordSegment'].str.replace(remove, '')
df['word_list'] =df['WordSegment'].str.split()
del df['WordSegment']

Downloading (…)lve/main/config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/407M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [8]:
# 创建词频表
from collections import Counter
word_counter = Counter()

for word_list in df['word_list']:
    word_counter.update(word_list)

words_df = pd.DataFrame(list(word_counter.items()), columns=['word', 'count'])
words_df = words_df.sort_values(by='count', ascending=False)

In [9]:
frequent_itemsets = {}
min_support = 20 # 设置最小支持度（频次）
min_confidence = 0.7 # 设置最小置信度

In [10]:
# 创建长度为1的频繁项集
from collections import Counter

df['word_set'] = df['word_list'].apply(frozenset)
frequent_itemsets[1] = dict((frozenset((raw['word'], )), raw['count'])
                            for i, raw in words_df.iterrows() if raw['count'] >= min_support)

In [11]:
from collections import defaultdict

def find_frequent_itemsets(df, k_1_itemsets, min_support):
    '''
    df:有word_set列，其中数据格式是frozenset；
    k_1_itemsets：k-1个项集，也就是当前需要找出的项集的前面项集；
    min_support：最小支持度，用频次衡量。
    '''
    counts = defaultdict(int)
    for index, raw in df.iterrows():
        for itemset in k_1_itemsets:
            if itemset.issubset(raw['word_set']):
                for other_wordset in raw['word_set'] - itemset:
                    current_superset = itemset | frozenset((other_wordset, ))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in
                    counts.items() if frequency >= min_support])

for k in range(2, 20):
    # 从长度为k-1的频繁项集生成长度为k的候选项集，且只保留频繁项集
    cur_frequent_itemsets = find_frequent_itemsets(
        df, frequent_itemsets[k-1], min_support)
    if len(cur_frequent_itemsets) == 0:
        print(f"找不到长度为{k}的频繁项集。")
        break
    else:
        print(f"找到{len(cur_frequent_itemsets)}个长度为{k}的频繁项集。")
        frequent_itemsets[k] = cur_frequent_itemsets

找到87个长度为2的频繁项集。
找到4个长度为3的频繁项集。
找不到长度为4的频繁项集。


In [12]:
# 创建关联规则
print('正在创建关联规则...')
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    if itemset_length > 1:
        for itemset in itemset_counts.keys():
            for conclusion in itemset:
                premise = itemset - set((conclusion, )) # 前提
                candidate_rules.append((premise, conclusion))
# 检验关联规则是否有效
print('正在检验关联规则...')
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for index, raw in df.iterrows():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(raw['word_set']):
            if conclusion in raw['word_set']:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# 筛选符合条件的项
print('正在筛选符合条件的项...')
rule_confidence_filtered = {}
for candidate_rule in candidate_rules:
    confidence = correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
    if confidence >= min_confidence:
        rule_confidence_filtered[candidate_rule] = confidence

正在创建关联规则...
正在检验关联规则...
正在筛选符合条件的项...


In [13]:
premises = []
conclusions = []
confidences = []
for k, v in rule_confidence_filtered.items():
    premise, conclusion = k
    premise = list(premise)
    confidence = v
    premises.append(premise)
    conclusions.append(conclusion)
    confidences.append(confidence)

rule_df = pd.DataFrame({'Premise':premises, 'Conclusion':conclusions, 'Confidence':confidences})
rule_df.head()

Unnamed: 0,Premise,Conclusion,Confidence
0,[處],啼,0.792208
1,[憶],啼,0.714286
2,"[啼, 苦]",竹,0.733333
3,"[深, 山]",啼,0.8
4,"[時, 處]",啼,0.769231


In [14]:
rule_df.to_excel('/content/drive/MyDrive/诗歌意象分析/rule.xlsx', index=False)