In [2]:
import json

keyword_message = json.load(open("dataset/keyword_message.json"))
eval_initial = json.load(open("dataset/eval_initial.json"))
finetuning_initial = json.load(open("dataset/finetuning_initial.json"))

data_all = keyword_message + eval_initial + finetuning_initial
print(len(data_all))

194981


In [None]:
import jieba.posseg as pseg  # 使用带词性标注的分词器
import re
from collections import Counter

risk_data = [d for d in data_all if d["风险类别"] != "无风险"]
print(len(risk_data))

# 加载停用词表
stop_words = set()
with open("cn_stopwords.txt", "r", encoding="utf-8") as f:
    stop_words = set([line.strip() for line in f])

# 正则表达式配置
url_pattern = r"https?://\S+|www\.\S+"  # 匹配URL
punctuation = r"[\s+\!\/_,$%^*(+\"\')]+|[+——！，。？、~@#￥%……&*（）《》“”；：]+"

# 词性白名单
ALLOWED_POS = {'n', 'v', 'vn', 'nt'}  # 名词/动词/动名词/机构名

word_counter = Counter()

for item in risk_data:
    # 文本清洗
    text = re.sub(url_pattern, "", item["文本"])  # 去除链接
    text = re.sub(punctuation, "", text)         # 去除标点
    text = re.sub(r"\d+", "", text)               # 去除纯数字
    
    # 带词性标注的分词 (修改点)
    words = pseg.lcut(text)
    
    # 过滤条件 (核心修改)
    filtered_words = [
        word for word, flag in words
        if (flag in ALLOWED_POS) and              # 词性过滤
        (word not in stop_words) and
        (len(word) > 1) and
        (not word.isdigit()) and
        (not word.isspace())
    ]
    
    word_counter.update(filtered_words)

sorted_dict = dict(word_counter.most_common())
print(list(sorted_dict.items())[:20])

134977
[('链接', 64321), ('账户', 51139), ('点击', 48666), ('下载', 38287), ('投资', 37659), ('进行', 32352), ('完成', 29588), ('加入', 29469), ('转账', 29087), ('获得', 27955), ('提供', 27804), ('操作', 27622), ('机会', 26461), ('参与', 26128), ('服务', 24633), ('注意', 23678), ('客服', 20872), ('需要', 19635), ('尊敬', 18943), ('任务', 18919)]


In [9]:
if "详情请" in word_counter:
    detail_count = word_counter.pop("详情请")
    word_counter["详情"] += detail_count

word_counter.pop("可能", None)

sorted_dict = dict(word_counter.most_common())

In [13]:
print(word_counter["详情"])
from pyecharts.charts import WordCloud

words = [(k, v) for k, v in sorted_dict.items()][:50]
WordCloud().add("", words).render("wordcloud.html")

9408


'c:\\Users\\ZeroVector\\Desktop\\资料\\资料\\Mindspore开源仓库\\wordcloud.html'

In [14]:
with open("fraud_keywords.json", "w", encoding="utf-8") as f:
    json.dump(words, f, ensure_ascii=False, indent=4)