In [1]:
import pandas as pd
import ast
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Location of the per-category keyword-frequency CSV. The loop below reads its
# "Category" and "top_keywords" columns.
CSV_PATH = "/Users/sky/Library/CloudStorage/GoogleDrive-skyjin0127@gmail.com/其他计算机/我的笔记本电脑/jzt/BU课程/Spring_2025/CS506/project/cs506-project/noteboooks/plain_resume_keyword_freq.csv"

# Matches NumPy scalar reprs embedded in the serialized keyword lists, e.g.
# ``np.int64(12)`` or ``np.float64(0.5)``, and captures the bare numeric text
# so the remainder can be parsed by ast.literal_eval. Compiled once instead of
# once per row.
_NP_SCALAR_RE = re.compile(r'np\.(?:u?int|float)\d+\(([^()]+)\)')


def _parse_keywords(raw):
    """Turn the serialized ``top_keywords`` cell text into a Python object.

    NumPy scalar wrappers are replaced by their plain numeric text, then the
    string is evaluated as a Python literal.

    Raises whatever ``ast.literal_eval`` raises for malformed input
    (ValueError, TypeError, SyntaxError, MemoryError, RecursionError).
    """
    return ast.literal_eval(_NP_SCALAR_RE.sub(r'\1', raw))


def _to_freq_dict(pairs):
    """Build a ``{keyword: score}`` dict from parsed ``(keyword, score)`` pairs.

    Entries that are not 2-item lists/tuples, whose keyword is not a string,
    or whose score is not a number are skipped rather than aborting the run.
    When a keyword repeats, the last score wins. A value that is not a
    list/tuple at all yields an empty dict.
    """
    freqs = {}
    if not isinstance(pairs, (list, tuple)):
        return freqs
    for entry in pairs:
        if not isinstance(entry, (list, tuple)) or len(entry) != 2:
            continue
        kw, score = entry
        if isinstance(kw, str) and isinstance(score, (int, float)):
            freqs[kw] = score
    return freqs


# Read the CSV file.
df = pd.read_csv(CSV_PATH)

# Walk every category and render one word-cloud image per row.
for category, top_keywords_str in zip(df["Category"], df["top_keywords"]):
    # Missing cells come back from pandas as non-string values; report and skip.
    if not isinstance(top_keywords_str, str):
        print(f"非字符串内容：{top_keywords_str}")
        continue

    try:
        keywords_list = _parse_keywords(top_keywords_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        print(f"转换错误：{e}")
        continue

    # Keyword -> frequency mapping (only well-formed string/number pairs).
    freq_dict = _to_freq_dict(keywords_list)

    if not freq_dict:
        print(f"类别 {category} 没有有效关键词")
        continue

    # Generate the word cloud with a few tuned parameters.
    # NOTE(review): per the wordcloud docs the contour is only drawn when a
    # `mask` is supplied; none is given here, so contour_* presumably has no
    # visible effect. Kept to preserve the original configuration.
    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap="viridis",       # other colormaps to try: "plasma", "inferno", "magma", "cividis"
        max_words=100,            # show at most 100 words
        max_font_size=80,         # largest font size
        contour_width=2,          # contour width
        contour_color="steelblue" # contour color
    ).generate_from_frequencies(freq_dict)

    # Collapse runs of non-word characters so the category is safe in a file
    # name; str() guards against a missing/non-string category value.
    safe_category = re.sub(r'\W+', '_', str(category))
    filename = f"wordcloud_{safe_category}.png"

    # Draw and save the figure; always close it so a failed save does not
    # leave the figure open for the rest of the loop.
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        plt.title(f"Category: {category}")
        plt.savefig(filename, bbox_inches="tight")
    finally:
        plt.close(fig)