In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

# 加载Iconclass名称数据
def load_iconclass_names(file_path='iconclass_names.csv'):
    """
    Load the Iconclass ID -> description lookup table from a CSV file.

    The CSV is expected to provide the columns ``iconclass_id`` and
    ``description``. IDs are always read as strings and stripped of
    surrounding whitespace, because the rest of this file looks codes up
    using the string fragments produced by ``str.split(',')``; letting
    pandas infer an integer column for purely numeric IDs would make every
    such lookup miss.

    Parameters:
    file_path: path to the Iconclass names CSV file

    Returns:
    dict mapping each Iconclass ID (str) to its description. An empty dict
    is returned (after printing the error) when the file cannot be loaded,
    so that callers can continue without names.
    """
    try:
        # Force string IDs; see the docstring for why inference is harmful here.
        names_df = pd.read_csv(file_path, dtype={'iconclass_id': str})
        # Rows without an ID cannot be looked up, so they are skipped.
        names_df = names_df.dropna(subset=['iconclass_id'])
        ids = names_df['iconclass_id'].str.strip()
        return dict(zip(ids, names_df['description']))
    except Exception as e:
        # Deliberate best-effort: a missing or malformed names file must not
        # abort the analysis, it only degrades labels to "Unknown".
        print(f"加载iconclass名称文件时出错: {str(e)}")
        return {}

# 1. 修改Level 0热力图函数，在图表下方添加Iconclass名称标注
def create_iconclass_heatmap(df, iconclass_names, title_suffix="", save_path=None, save_svg=False, vmax=600):
    """
    Draw a heatmap of Iconclass Level 0 frequencies per room function.

    The input DataFrame is not modified: the comma-separated codes are split
    and exploded on a separate working frame.

    Parameters:
    df: DataFrame with the columns 'room_functions' and
        'iconclassses_level_0' (comma-separated codes stored as strings)
    iconclass_names: dict mapping Iconclass ID to its name
    title_suffix: text appended to the plot title; also used (lower-cased,
        spaces replaced by underscores, parentheses removed) in the file name
    save_path: output path without extension; nothing is saved when falsy
    save_svg: also write an SVG next to the PNG
    vmax: upper bound of the colour scale (default 600)

    Returns:
    The frequency matrix (rows: room functions, columns: Level 0 codes,
    always including '1'..'9', sorted by column label).
    """
    # Split and explode on a working frame so the caller's df stays untouched.
    level0_codes = df['iconclassses_level_0'].str.split(',')
    exploded_df = (
        df[['room_functions']]
        .assign(iconclass_numbers=level0_codes)
        .explode('iconclass_numbers')
    )

    # Normalise codes: "1, 2" must count towards '2', not a separate ' 2'
    # column, and empty fragments (e.g. from a trailing comma) are dropped.
    exploded_df['iconclass_numbers'] = exploded_df['iconclass_numbers'].str.strip()
    exploded_df = exploded_df.dropna(subset=['iconclass_numbers', 'room_functions'])
    exploded_df = exploded_df[exploded_df['iconclass_numbers'] != '']

    # Frequency matrix: room functions x Level 0 codes.
    freq_matrix = pd.crosstab(exploded_df['room_functions'],
                              exploded_df['iconclass_numbers'])

    # Make sure every digit 1-9 has a column, even when it never occurs.
    for digit in map(str, range(1, 10)):
        if digit not in freq_matrix.columns:
            freq_matrix[digit] = 0

    # Order the columns by label.
    freq_matrix = freq_matrix.reindex(sorted(freq_matrix.columns), axis=1)

    # Tall figure: the lower part holds the list of category names.
    fig, ax = plt.subplots(figsize=(15, 16))
    try:
        sns.heatmap(freq_matrix,
                    cmap='YlGnBu',
                    annot=True,
                    fmt='d',
                    cbar_kws={'label': 'Frequency'},
                    vmax=vmax,
                    ax=ax)

        ax.set_title(f'Frequency of Iconclass Level 0 by Room Functions {title_suffix}', fontsize=14)
        ax.set_xlabel('Iconclass Level 0', fontsize=12)
        ax.set_ylabel('Room Functions', fontsize=12)

        # Legend of category names below the plot.
        # NOTE: the anchor is read before the layout adjustments further down,
        # so the text is positioned relative to the pre-adjustment axes box.
        pos = ax.get_position()
        fig.text(0.5, pos.y0 - 0.02, "Iconclass Level 0 names:", ha='center', fontweight='bold')

        y_position = pos.y0 - 0.07  # first legend line
        spacing = 0.02              # vertical distance between legend lines

        for code in freq_matrix.columns:
            # Codes missing from the mapping are still listed, with a placeholder.
            label = iconclass_names.get(code, "Unknown Category")
            fig.text(0.1, y_position, f"{code}: {label}", ha='left')
            y_position -= spacing

        # Reserve room at the bottom so the legend is not clipped.
        fig.tight_layout()
        fig.subplots_adjust(bottom=0.30)

        if save_path:
            # The suffix distinguishes the plots of different datasets.
            full_path = f'{save_path}_{title_suffix.lower().replace(" ", "_").replace("(", "").replace(")", "")}'
            fig.savefig(f'{full_path}.png', dpi=300, bbox_inches='tight')
            if save_svg:
                fig.savefig(f'{full_path}.svg', format='svg', bbox_inches='tight')
    finally:
        # Always release this specific figure, even if plotting or saving failed.
        plt.close(fig)

    return freq_matrix

# 2. 添加Iconclass Level 1与房间功能相关性分析函数
def analyze_iconclass_level1(df, iconclass_names, save_path=None):
    """
    Relate Iconclass Level 1 codes to room functions.

    The input DataFrame is not modified. Codes are split on ',', stripped of
    surrounding whitespace, and empty or missing entries are discarded
    before counting.

    Parameters:
    df: DataFrame with the columns 'iconclassses_level_1' (comma-separated
        codes stored as strings) and 'room_functions'
    iconclass_names: dict mapping Iconclass ID to its name
    save_path: file-name prefix for the CSV exports; nothing is written
        when falsy

    Returns:
    (freq_matrix, top_iconclass, top_rooms), where
      freq_matrix   - crosstab of room functions (rows) x Level 1 codes
      top_iconclass - dict: room function -> Series of its 5 largest codes
      top_rooms     - dict: Level 1 code -> Series of its 5 largest rooms
    or (None, None, None) when 'iconclassses_level_1' is missing.
    """
    if 'iconclassses_level_1' not in df.columns:
        print("Error: 'iconclassses_level_1'列不存在。请检查数据结构。")
        return None, None, None

    # Split and explode on a working frame so the caller's df stays untouched.
    level1_codes = df['iconclassses_level_1'].str.split(',')
    exploded_df = (
        df[['room_functions']]
        .assign(iconclass_level1=level1_codes)
        .explode('iconclass_level1')
    )

    # Normalise codes ("11, 25" -> '11', '25'), then drop missing/empty ones.
    exploded_df['iconclass_level1'] = exploded_df['iconclass_level1'].str.strip()
    exploded_df = exploded_df.dropna(subset=['iconclass_level1', 'room_functions'])
    exploded_df = exploded_df[exploded_df['iconclass_level1'] != '']

    # Frequency matrix: room functions x Level 1 codes.
    freq_matrix = pd.crosstab(exploded_df['room_functions'],
                              exploded_df['iconclass_level1'])

    # Five largest codes per room function.
    top_iconclass = {
        room: freq_matrix.loc[room].nlargest(5) for room in freq_matrix.index
    }

    # Five largest room functions per code.
    top_rooms = {
        code: freq_matrix[code].nlargest(5) for code in freq_matrix.columns
    }

    if save_path:
        # Full matrix.
        freq_matrix.to_csv(f'{save_path}_level1_full_matrix.csv')

        # Long table: top codes for every room function.
        by_room_rows = [
            {
                'room_function': room,
                'iconclass_level1': icon,
                'iconclass_name': iconclass_names.get(icon, "Unknown"),
                'count': count,
            }
            for room, top_icons in top_iconclass.items()
            for icon, count in top_icons.items()
        ]
        # Explicit columns keep the header even when there are no rows.
        top_iconclass_df = pd.DataFrame(
            by_room_rows,
            columns=['room_function', 'iconclass_level1', 'iconclass_name', 'count'],
        )
        top_iconclass_df.to_csv(f'{save_path}_top_iconclass_by_room.csv', index=False)

        # Long table: top room functions for every code.
        by_icon_rows = [
            {
                'iconclass_level1': icon,
                'iconclass_name': iconclass_names.get(icon, "Unknown"),
                'room_function': room,
                'count': count,
            }
            for icon, top_rooms_for_icon in top_rooms.items()
            for room, count in top_rooms_for_icon.items()
        ]
        top_rooms_df = pd.DataFrame(
            by_icon_rows,
            columns=['iconclass_level1', 'iconclass_name', 'room_function', 'count'],
        )
        top_rooms_df.to_csv(f'{save_path}_top_rooms_by_iconclass.csv', index=False)

    return freq_matrix, top_iconclass, top_rooms

# 3. 创建Iconclass Level 1热力图函数，显示前10个最常见的类别
def create_iconclass_level1_heatmap(df, iconclass_names, title_suffix="", save_path=None, save_svg=False):
    """
    Draw a heatmap of the 10 most frequent Iconclass Level 1 codes per room
    function.

    The input DataFrame is not modified. Codes are split on ',', stripped of
    surrounding whitespace, and empty or missing entries are discarded
    before counting.

    Parameters:
    df: DataFrame with the columns 'room_functions' and
        'iconclassses_level_1' (comma-separated codes stored as strings)
    iconclass_names: dict mapping Iconclass ID to its name
    title_suffix: text appended to the plot title; also used (lower-cased,
        spaces replaced by underscores, parentheses removed) in the file name
    save_path: output path without extension; nothing is saved when falsy
    save_svg: also write an SVG next to the PNG

    Returns:
    The frequency matrix restricted to the 10 codes with the largest column
    totals (rows: room functions).
    """
    # Split and explode on a working frame so the caller's df stays untouched.
    level1_codes = df['iconclassses_level_1'].str.split(',')
    exploded_df = (
        df[['room_functions']]
        .assign(iconclass_level1=level1_codes)
        .explode('iconclass_level1')
    )

    # Normalise codes ("11, 25" -> '11', '25'), then drop missing/empty ones.
    exploded_df['iconclass_level1'] = exploded_df['iconclass_level1'].str.strip()
    exploded_df = exploded_df.dropna(subset=['iconclass_level1', 'room_functions'])
    exploded_df = exploded_df[exploded_df['iconclass_level1'] != '']

    # Frequency matrix: room functions x Level 1 codes.
    freq_matrix = pd.crosstab(exploded_df['room_functions'],
                              exploded_df['iconclass_level1'])

    # Keep only the 10 codes with the highest overall counts.
    top10_iconclasses = freq_matrix.sum().nlargest(10).index.tolist()
    top10_matrix = freq_matrix[top10_iconclasses]

    # "code: name" legend lines; unknown codes get a placeholder.
    top10_names = [
        f"{code}: {iconclass_names.get(code, 'Unknown Category')}"
        for code in top10_iconclasses
    ]

    # Tall figure: the lower part holds the list of category names.
    fig, ax = plt.subplots(figsize=(15, 18))
    try:
        sns.heatmap(top10_matrix,
                    cmap='YlGnBu',
                    annot=True,
                    fmt='d',
                    cbar_kws={'label': 'Frequency'},
                    ax=ax)

        ax.set_title(f'Top 10 Iconclass Level 1 by Room Functions {title_suffix}', fontsize=14)
        ax.set_xlabel('Iconclass Level 1', fontsize=12)
        ax.set_ylabel('Room Functions', fontsize=12)

        # Legend of category names below the plot.
        # NOTE: the anchor is read before the layout adjustments further down,
        # so the text is positioned relative to the pre-adjustment axes box.
        pos = ax.get_position()
        fig.text(0.5, pos.y0 - 0.02, "Top 10 Iconclass Level 1 names:", ha='center', fontweight='bold')

        y_position = pos.y0 - 0.07  # first legend line
        spacing = 0.02              # vertical distance between legend lines

        for legend_line in top10_names:
            fig.text(0.1, y_position, legend_line, ha='left')
            y_position -= spacing

        # Reserve room at the bottom for the ten legend lines.
        fig.tight_layout()
        fig.subplots_adjust(bottom=0.3)

        if save_path:
            # The suffix distinguishes the plots of different datasets.
            full_path = f'{save_path}_level1_top10_{title_suffix.lower().replace(" ", "_").replace("(", "").replace(")", "")}'
            fig.savefig(f'{full_path}.png', dpi=300, bbox_inches='tight')
            if save_svg:
                fig.savefig(f'{full_path}.svg', format='svg', bbox_inches='tight')
    finally:
        # Always release this specific figure, even if plotting or saving failed.
        plt.close(fig)

    return top10_matrix

# 主函数：处理多个数据集并生成热力图
def process_multiple_datasets(dataframes, names, iconclass_names, base_save_path=None, save_svg=True):
    """
    Run the Level 0 and Level 1 analyses for several datasets and print a
    summary of the resulting matrices.

    Parameters:
    dataframes: list of DataFrames
    names: list with one display name per DataFrame (paired positionally)
    iconclass_names: dict mapping Iconclass ID to its name
    base_save_path: common prefix for all output files; when falsy no plot
        or CSV file is written
    save_svg: also save plots as SVG

    Returns:
    (results_level0, results_level1): dicts keyed by dataset name holding
    the Level 0 frequency matrix and the Level 1 top-10 matrix. A dataset is
    absent from results_level1 when its Level 1 processing was skipped or
    failed.
    """
    results_level0 = {}
    results_level1 = {}

    for df, name in zip(dataframes, names):
        print(f"\n处理数据集: {name}")

        # 1. Level 0 heatmap.
        results_level0[name] = create_iconclass_heatmap(
            df,
            iconclass_names,
            title_suffix=f"({name})",
            save_path=base_save_path,
            save_svg=save_svg
        )

        # 2. Level 1 / room-function analysis.
        print(f"分析 {name} 的Level 1数据...")

        # Only build a CSV prefix when saving was requested: formatting None
        # into the f-string would give the truthy prefix "None_<name>" and
        # make the analysis write files nobody asked for.
        level1_save_path = None
        if base_save_path:
            level1_save_path = f"{base_save_path}_{name.lower().replace('-', '_')}"

        try:
            level1_matrix, _top_iconclass, _top_rooms = analyze_iconclass_level1(
                df,
                iconclass_names,
                save_path=level1_save_path
            )
            if level1_matrix is None:
                # The analysis already reported why it could not run; do not
                # claim success or attempt the heatmap.
                continue

            if level1_save_path:
                print("Level 1分析完成，结果已保存")
            else:
                print("Level 1分析完成")

            # 3. Level 1 top-10 heatmap.
            results_level1[name] = create_iconclass_level1_heatmap(
                df,
                iconclass_names,
                title_suffix=f"({name})",
                save_path=base_save_path,
                save_svg=save_svg
            )
        except Exception as e:
            # Best-effort per dataset: a Level 1 failure must not stop the
            # remaining datasets from being processed.
            print(f"处理Level 1数据时出错: {str(e)}")

    # Summary of the generated matrices.
    print("\n==== 热力图统计信息 ====")
    print("\n-- Level 0 热力图信息 --")
    for name, matrix in results_level0.items():
        print(f"\n{name}:")
        print(f"Number of rows (room functions): {len(matrix.index)}")
        print(f"Number of columns (iconclass categories): {len(matrix.columns)}")
        print("\nRoom functions:")
        for room in matrix.index:
            print(f"- {room}")

    print("\n-- Level 1 热力图信息 --")
    for name, matrix in results_level1.items():
        if matrix is not None:
            print(f"\n{name}:")
            print(f"Number of rows (room functions): {len(matrix.index)}")
            print(f"Number of columns (top 10 iconclass categories): {len(matrix.columns)}")
            print("\nTop 10 Iconclass Level 1 categories:")
            for category in matrix.columns:
                category_name = iconclass_names.get(category, "Unknown")
                print(f"- {category}: {category_name}")

    return results_level0, results_level1

# 主程序
if __name__ == "__main__":
    # Lookup table used to label Iconclass codes in plots and CSV exports.
    iconclass_names = load_iconclass_names('iconclass_names.csv')

    # Read the three room datasets in the order in which they are processed.
    # The names df1/df2/df3 stay at module level, as before.
    df1, df2, df3 = (
        pd.read_csv(csv_file)
        for csv_file in (
            'rooms_1550_1800.csv',
            'rooms_1600_1800.csv',
            'rooms_cleaned.csv',
        )
    )
    dataset_labels = ['1550-1800', '1600-1800', 'full']

    # Build every heatmap / CSV export and collect the resulting matrices.
    results_level0, results_level1 = process_multiple_datasets(
        [df1, df2, df3],
        dataset_labels,
        iconclass_names,
        base_save_path='iconclass_heatmap',
        save_svg=True,
    )

    print("\n所有处理完成。")


处理数据集: 1550-1800
分析 1550-1800 的Level 1数据...
Level 1分析完成，结果已保存

处理数据集: 1600-1800
分析 1600-1800 的Level 1数据...
Level 1分析完成，结果已保存

处理数据集: full
分析 full 的Level 1数据...
Level 1分析完成，结果已保存

==== 热力图统计信息 ====

-- Level 0 热力图信息 --

1550-1800:
Number of rows (room functions): 35
Number of columns (iconclass categories): 9

Room functions:
- ANTEROOMS
- ANTE_HALLS
- BANQUET_HALLS
- BATHROOMS
- BEDROOMS
- BOARDROOMS
- CABINETS_ROOMS
- CHAPELS_ROOMS
- CORRIDORS
- COUNCIL_CHAMBERS
- COUNTING_ROOMS
- DINING_ROOMS
- GALLERIES_DISPLAY_SPACES
- GUARD_HALLS
- GUEST_ROOMS
- HALLS
- IMPERIAL_HALLS
- LIBRARIES_ROOMS
- LOGGIAS
- LONG_GALLERIES
- MAIN_HALLS
- MANSION_DINING_ROOMS
- OFFICES_WORK_SPACES
- PATRONAGE_LODGES
- PRESENCE_CHAMBERS
- PRIVATE_CHAPELS
- ROYAL_CHAPELS_ROOMS
- SALA_TERRENA
- SALONS_ROOMS_FOR_ENTERTAINING
- SITTING_ROOMS
- STAIRWELLS
- THEATERS_ROOMS
- TOILET_ROOMS
- VESTIBULES
- WARDROBES_ROOMS

1600-1800:
Number of rows (room functions): 34
Number of columns (iconclass categories): 9

Room 