In [2]:
import json

def _load_json(path):
    """Return the decoded content of the JSON file at ``path``.

    The file is read inside a ``with`` block so the handle is closed as soon
    as parsing finishes, instead of being left to the garbage collector.
    """
    # NOTE(review): no encoding is passed, so the platform default applies,
    # exactly as before — confirm the dataset files' encoding before pinning one.
    with open(path) as f:
        return json.load(f)


keyword_message = _load_json("dataset/keyword_message.json")
eval_initial = _load_json("dataset/eval_initial.json")
finetuning_initial = _load_json("dataset/finetuning_initial.json")

# The three datasets are joined with "+", one after another, into a single sequence.
data_all = keyword_message + eval_initial + finetuning_initial
print(len(data_all))

194981


In [3]:
import jieba.posseg as pseg  # 使用带词性标注的分词器
import re
from collections import Counter

# Keep only the records whose "风险类别" field is something other than "无风险".
risk_data = [d for d in data_all if d["风险类别"] != "无风险"]
print(len(risk_data))

# Load the stop-word list: one entry per line, surrounding whitespace stripped.
with open("cn_stopwords.txt", "r", encoding="utf-8") as f:
    stop_words = {line.strip() for line in f}

# Regular-expression configuration
url_pattern = r"https?://\S+|www\.\S+"  # matches URLs
punctuation = r"[\s+\!\/_,$%^*(+\"\')]+|[+——！，。？、~@#￥%……&*（）《》“”；：]+"

# Compile the patterns once, outside the per-record loop.
URL_RE = re.compile(url_pattern)
PUNCT_RE = re.compile(punctuation)
DIGIT_RE = re.compile(r"\d+")

# Part-of-speech whitelist
ALLOWED_POS = {'n', 'v', 'vn', 'nt'}  # noun / verb / verbal noun / organisation name


def clean_text(text):
    """Return ``text`` with URLs, punctuation/whitespace and digit runs removed.

    The three substitutions run in that order; each match is replaced by the
    empty string, so the characters on either side of a match become adjacent.
    """
    text = URL_RE.sub("", text)
    text = PUNCT_RE.sub("", text)
    return DIGIT_RE.sub("", text)


def is_keyword(word, flag):
    """Return True when a (word, POS flag) pair should be counted.

    A word is kept when its flag is in ``ALLOWED_POS``, it is not a stop
    word, it is longer than one character, and it is neither all digits nor
    all whitespace.
    """
    return (
        flag in ALLOWED_POS
        and word not in stop_words
        and len(word) > 1
        and not word.isdigit()
        and not word.isspace()
    )


word_counter = Counter()

for item in risk_data:
    # POS-tagged segmentation of the cleaned text. The result is deliberately
    # not bound to the name `words`, which later cells use for the keyword list.
    tagged_tokens = pseg.lcut(clean_text(item["文本"]))
    word_counter.update(
        word for word, flag in tagged_tokens if is_keyword(word, flag)
    )

# Rank once; reuse the ranked list for both the dict and the preview.
ranked = word_counter.most_common()
sorted_dict = dict(ranked)
print(ranked[:20])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\niang\AppData\Local\Temp\jieba.cache


134977


Loading model cost 0.500 seconds.
Prefix dict has been built successfully.


KeyboardInterrupt: 

In [9]:
# Move the count stored under "详情请" (if any) onto "详情".
try:
    detail_count = word_counter.pop("详情请")
except KeyError:
    # Nothing recorded under "详情请"; leave "详情" as it is.
    pass
else:
    word_counter["详情"] += detail_count

# Drop "可能" from the counts when it is present.
if "可能" in word_counter:
    del word_counter["可能"]

# Rebuild the frequency-ordered mapping after the adjustments above.
sorted_dict = {word: count for word, count in word_counter.most_common()}

In [None]:
# Show the count currently stored for "详情".
print(word_counter["详情"])
from pyecharts.charts import WordCloud

# Take the first 50 (word, count) pairs of sorted_dict, in its existing order.
words = list(sorted_dict.items())[:50]


9408


'c:\\Users\\ZeroVector\\Desktop\\资料\\资料\\Mindspore开源仓库\\wordcloud.html'

In [None]:
# Build the chart from the (word, count) pairs under an empty series name,
# then write it to wordcloud.html; the render() result is the cell's output.
cloud = WordCloud().add("", words)
cloud.render("wordcloud.html")

In [None]:
# Serialise before opening the file: opening with "w" truncates immediately,
# so if `words` is missing or not serialisable the previous
# fraud_keywords.json would otherwise be wiped before the error surfaces.
serialized_words = json.dumps(words, ensure_ascii=False, indent=4)

with open("fraud_keywords.json", "w", encoding="utf-8") as f:
    f.write(serialized_words)


NameError: name 'words' is not defined

In [4]:

# Read the saved keyword list back from fraud_keywords.json.
with open("fraud_keywords.json", "r", encoding="utf-8") as f:
    raw_json = f.read()
    words = json.loads(raw_json)

In [8]:
import plotly.express as px
import pandas as pd

# Turn the word/frequency pairs into a two-column DataFrame.
word_freq_df = pd.DataFrame(data=words, columns=["Word", "Frequency"])

# Bar chart options: one bar per word, labelled and coloured by its frequency.
bar_options = {
    "x": "Word",
    "y": "Frequency",
    "title": "Word Frequency Histogram",
    "text": "Frequency",
    "color": "Frequency",
    "color_continuous_scale": "Viridis",
    "labels": {"Word": "关键词", "Frequency": "频率"},
}
fig = px.bar(word_freq_df, **bar_options)

# Trace styling: value labels placed outside the bars, plus a bar outline.
trace_style = {
    "texttemplate": '%{text:.2s}',
    "textposition": 'outside',
    "marker_line_color": 'rgb(8,48,107)',
    "marker_line_width": 1.5,
}
fig.update_traces(**trace_style)

# Optional layout tweaks, currently disabled:
# fig.update_layout(
#     title_font_size=24,
#     xaxis_tickangle=-45,
#     xaxis_title_font_size=18,
#     yaxis_title_font_size=18,
#     margin=dict(l=40, r=40, t=60, b=40)
# )

fig.show()