In [None]:
import pandas as pd
import os
import glob
import math

def _plan_topic_quotas(df, target_total):
    """Compute per-topic quotas and choose the rows needed to reach them.

    Args:
        df: Table with a ``topic`` column and a boolean ``in_groundtruth``
            column.
        target_total: Desired overall size of the final dataset.

    Returns:
        A ``(stats_rows, indices_to_add)`` tuple: one summary dict per topic
        and the ``df`` index labels of the rows sampled to close the gaps.
    """
    stats_rows = []
    indices_to_add = []

    # Share of each topic in the full table (rows with a missing topic are
    # not counted by value_counts and therefore get no quota).
    topic_dist = df['topic'].value_counts(normalize=True)

    for topic, ratio in topic_dist.items():
        # Proportional quota. round() sends exact .5 ties to the even
        # neighbour; every topic gets at least 1 so small topics are not lost.
        # Because of both rules the quotas may sum to more than target_total.
        target_count = int(round(target_total * ratio))
        if target_count == 0:
            target_count = 1

        topic_rows = df[df['topic'] == topic]
        already_present = topic_rows['in_groundtruth']
        current_count = int(already_present.sum())

        # Positive gap: rows must be added. Negative gap: groundtruth already
        # holds more rows of this topic than the quota asks for.
        needed = target_count - current_count
        added_count = 0
        status = 'Over Budget' if needed < 0 else 'Filled'

        if needed > 0:
            candidates = topic_rows[~already_present]

            if len(candidates) >= needed:
                # Fixed random_state keeps the selection reproducible.
                sampled = candidates.sample(n=needed, random_state=42)
                indices_to_add.extend(sampled.index.tolist())
                added_count = needed
            else:
                # Not enough unused rows: take them all and flag the shortfall
                # instead of reporting the quota as filled.
                indices_to_add.extend(candidates.index.tolist())
                added_count = len(candidates)
                status = 'Insufficient'
                print(f"Note: Insufficient data for Topic '{topic}', unable to fully meet target quota.")

        stats_rows.append({
            'Topic': topic,
            'Original_Ratio': f"{ratio:.2%}",
            'Target_Count_Total': target_count,
            'Existing_In_Groundtruth': current_count,
            'To_Add': added_count,
            'Final_Total': current_count + added_count,
            'Status': status,
        })

    return stats_rows, indices_to_add


def generate_stratified_dataset(
    csv_path=r'sampledata_2.csv',
    groundtruth_folder=r'groundtruth',
    output_file='dataset_selection_result.xlsx',
    target_total=3
):
    """Plan a topic-stratified dataset and write the plan to an Excel workbook.

    Rows of the CSV whose base file name matches a ``*.zip`` file in
    ``groundtruth_folder`` count as already collected. For every topic a quota
    proportional to its share of the CSV is computed, and rows not yet in
    groundtruth are randomly sampled to fill the gap.

    Args:
        csv_path: CSV file with ``file_name``, ``topic`` and ``category``
            columns.
        groundtruth_folder: Directory holding the existing ``{name}.zip`` files.
        output_file: Path of the ``.xlsx`` workbook to create (four sheets:
            existing rows, rows to add, per-topic summary, combined list).
        target_total: Desired overall dataset size used to derive the quotas.

    Returns:
        None. Progress and problems are printed; the function returns early
        (without writing the workbook) when the CSV or the folder is missing
        or the CSV lacks a required column.
    """
    # --- 1. Read and preprocess the CSV ---
    print("Reading the original data table...")
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: File not found {csv_path}")
        return

    # Fail with a readable message instead of a KeyError further down.
    missing_columns = [
        column for column in ('file_name', 'topic', 'category')
        if column not in df.columns
    ]
    if missing_columns:
        print(f"Error: Missing required columns in {csv_path}: {missing_columns}")
        return

    # file_name is expected to look like "{name}.html"; splitext strips only
    # the final extension, so other dots inside the name are preserved.
    df['clean_name'] = df['file_name'].apply(lambda x: os.path.splitext(x)[0])

    if df['clean_name'].duplicated().any():
        print("Warning: There are duplicate base filenames in sampledata.csv, which may affect matching accuracy.")

    # --- 2. Scan the groundtruth folder ---
    print("Scanning the Groundtruth folder...")
    # isdir (not exists) so that a regular file at this path is rejected too.
    if not os.path.isdir(groundtruth_folder):
        print(f"Error: Folder not found {groundtruth_folder}")
        return

    existing_files = glob.glob(os.path.join(groundtruth_folder, '*.zip'))
    existing_basenames = [os.path.splitext(os.path.basename(f))[0] for f in existing_files]

    print(f"There are {len(existing_basenames)} files in Groundtruth.")

    # --- 3. Mark rows that already exist in groundtruth ---
    df['in_groundtruth'] = df['clean_name'].isin(existing_basenames)

    # Compare names, not counts: duplicate base names in the CSV would inflate
    # a row count and hide groundtruth files that have no CSV row.
    unmatched = set(existing_basenames) - set(df['clean_name'])
    if unmatched:
        print(f"Warning: {len(unmatched)} files in Groundtruth were not found in CSV.")
        print(f"Unmatched examples: {sorted(unmatched)[:5]}")

    # --- 4. Calculate distribution and fill the quotas ---
    print("Calculating quotas for each Topic and filling data...")
    stats_list, files_to_add_indices = _plan_topic_quotas(df, target_total)

    # --- 5. Build the result tables ---
    full_columns = ['file_name', 'topic', 'category', 'clean_name']

    # Sheet 1: rows already present in groundtruth.
    df_existing = df[df['in_groundtruth']][full_columns]

    # Sheet 2: rows suggested for addition.
    rows_to_add = df.loc[files_to_add_indices]
    df_to_add = rows_to_add[['file_name', 'topic', 'category']]

    # Sheet 3: per-topic summary. Passing columns= fixes the column order and
    # keeps the frame valid even when there are no topics at all.
    df_stats = pd.DataFrame(
        stats_list,
        columns=['Topic', 'Original_Ratio', 'Target_Count_Total',
                 'Existing_In_Groundtruth', 'To_Add', 'Final_Total', 'Status'],
    )

    # Sheet 4: existing + added rows, with the same columns for both parts so
    # clean_name is not left blank for the added rows.
    df_final_list = pd.concat([df_existing, rows_to_add[full_columns]])

    # --- 6. Write to Excel ---
    print(f"Writing results to {output_file}...")
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df_existing.to_excel(writer, sheet_name='Existing_Distribution', index=False)
        df_to_add.to_excel(writer, sheet_name='Files_To_Add', index=False)
        df_stats.to_excel(writer, sheet_name='Distribution_Summary', index=False)
        df_final_list.to_excel(writer, sheet_name='Final_Full_List', index=False)

    print("Task completed!")
    print(f"Total existing data: {len(df_existing)}")
    print(f"Suggested data to add: {len(df_to_add)}")
    print(f"Expected final total: {len(df_final_list)}")

# --- Execute function ---
# Script entry point: run the selection with the default arguments. The default
# csv_path and groundtruth_folder are relative paths, so sampledata_2.csv and the
# groundtruth folder must be in the current working directory.
if __name__ == "__main__":
    generate_stratified_dataset()

In [None]:
import pandas as pd
import os
import zipfile  # 引入 zipfile 库
import math

def _list_zip_basenames(zip_path):
    """Return the extension-less base names of the file entries in a zip.

    Directory entries and anything under ``__MACOSX`` are skipped. The archive
    is only listed, never extracted.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a readable zip archive.
    """
    basenames = []
    with zipfile.ZipFile(zip_path, 'r') as archive:
        for entry in archive.namelist():
            # Skip directory entries (trailing '/') and macOS resource forks.
            if entry.endswith('/') or '__MACOSX' in entry:
                continue

            # Drop the path prefix, e.g. "groundtruth/abc.zip" -> "abc.zip".
            filename = os.path.basename(entry)
            if not filename:
                continue

            # Drop the extension, e.g. "abc.zip" -> "abc"; any extension is
            # accepted as long as the base name matches the CSV.
            basenames.append(os.path.splitext(filename)[0])
    return basenames


def _allocate_topic_quotas(df, target_total):
    """Compute per-topic quotas and choose the rows needed to reach them.

    Args:
        df: Table with a ``topic`` column and a boolean ``in_groundtruth``
            column.
        target_total: Desired overall size of the final dataset.

    Returns:
        A ``(stats_rows, indices_to_add)`` tuple: one summary dict per topic
        and the ``df`` index labels of the rows sampled to close the gaps.
    """
    stats_rows = []
    indices_to_add = []

    # Share of each topic in the full table.
    topic_dist = df['topic'].value_counts(normalize=True)

    for topic, ratio in topic_dist.items():
        # Proportional quota. round() sends exact .5 ties to the even
        # neighbour; every topic gets at least 1, so the quotas may sum to
        # more than target_total.
        target_count = int(round(target_total * ratio))
        if target_count == 0:
            target_count = 1

        topic_rows = df[df['topic'] == topic]
        already_present = topic_rows['in_groundtruth']
        current_count = int(already_present.sum())

        # Gap between the quota and what groundtruth already holds.
        needed = target_count - current_count
        added_count = 0
        status = 'Over Budget' if needed < 0 else 'Filled'

        if needed > 0:
            candidates = topic_rows[~already_present]

            if len(candidates) >= needed:
                # Fixed random_state keeps the selection reproducible.
                sampled = candidates.sample(n=needed, random_state=42)
                indices_to_add.extend(sampled.index.tolist())
                added_count = needed
            else:
                # Not enough unused rows: take them all and flag the shortfall
                # instead of reporting the quota as filled.
                indices_to_add.extend(candidates.index.tolist())
                added_count = len(candidates)
                status = 'Insufficient'
                print(f"Note: Topic '{topic}' 数据不足，无法完全满足目标配额。")

        stats_rows.append({
            'Topic': topic,
            'Original_Ratio': f"{ratio:.2%}",
            'Target_Count_Total': target_count,
            'Existing_In_Groundtruth': current_count,
            'To_Add': added_count,
            'Final_Total': current_count + added_count,
            'Status': status,
        })

    return stats_rows, indices_to_add


def generate_stratified_dataset(
    csv_path=r'C:\Users\MaXin\Desktop\HSBC\GroundTruth_Dataset\sampledata_2.csv',
    # Path of the groundtruth zip archive itself (it is read without extraction).
    groundtruth_zip_path=r'C:\Users\MaXin\Desktop\HSBC\GroundTruth_Dataset\groundtruth.zip',
    output_file='dataset_selection_result.xlsx',
    target_total=3  # NOTE: 3 is a trial value; a real run may need e.g. 400.
):
    """Plan a topic-stratified dataset and write the plan to an Excel workbook.

    Rows of the CSV whose base file name matches a file entry inside the
    groundtruth zip count as already collected. For every topic a quota
    proportional to its share of the CSV is computed, and rows not yet in
    groundtruth are randomly sampled to fill the gap.

    Args:
        csv_path: CSV file with ``file_name``, ``topic`` and ``category``
            columns.
        groundtruth_zip_path: Zip archive whose entries name the existing
            groundtruth items.
        output_file: Path of the ``.xlsx`` workbook to create (four sheets:
            existing rows, rows to add, per-topic summary, combined list).
        target_total: Desired overall dataset size used to derive the quotas.

    Returns:
        None. Progress and problems are printed; the function returns early
        (without writing the workbook) when the CSV or the zip is missing,
        the zip is corrupt, or the CSV lacks a required column.
    """
    # --- 1. Read and preprocess the CSV ---
    print("正在读取原始数据表...")
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: 找不到文件 {csv_path}")
        return

    # Fail with a readable message instead of a KeyError further down.
    missing_columns = [
        column for column in ('file_name', 'topic', 'category')
        if column not in df.columns
    ]
    if missing_columns:
        print(f"Error: CSV 文件缺少必要的列: {missing_columns}")
        return

    # file_name is expected to look like "{name}.html"; keep only "{name}".
    df['clean_name'] = df['file_name'].apply(lambda x: os.path.splitext(x)[0])

    if df['clean_name'].duplicated().any():
        print("Warning: sampledata.csv 中存在重复的基础文件名，可能会影响匹配准确性。")

    # --- 2. List the groundtruth zip (no extraction needed) ---
    print("正在扫描 Groundtruth Zip 包...")
    # isfile (not exists) so that a directory at this path is rejected here
    # rather than failing inside zipfile with an uncaught OSError.
    if not os.path.isfile(groundtruth_zip_path):
        print(f"Error: 找不到压缩文件 {groundtruth_zip_path}")
        return

    try:
        existing_basenames = _list_zip_basenames(groundtruth_zip_path)
    except zipfile.BadZipFile:
        print("Error: 无法读取 zip 文件，文件可能已损坏。")
        return

    print(f"Groundtruth Zip 包中包含 {len(existing_basenames)} 个有效文件。")

    # --- 3. Mark rows that already exist in groundtruth ---
    df['in_groundtruth'] = df['clean_name'].isin(existing_basenames)

    # Only the matched row count is reported: the zip may contain non-data
    # files, so unmatched entries are not treated as a problem here.
    matched_count = df['in_groundtruth'].sum()
    print(f"CSV 中已匹配到 {matched_count} 条 Groundtruth 数据。")

    # --- 4. Calculate distribution and fill the quotas ---
    print("正在计算各 Topic 配额并填充数据...")
    stats_list, files_to_add_indices = _allocate_topic_quotas(df, target_total)

    # --- 5. Build the result tables ---
    full_columns = ['file_name', 'topic', 'category', 'clean_name']

    # Sheet 1: rows already present in groundtruth.
    df_existing = df[df['in_groundtruth']][full_columns]

    # Sheet 2: rows suggested for addition.
    rows_to_add = df.loc[files_to_add_indices]
    df_to_add = rows_to_add[['file_name', 'topic', 'category']]

    # Sheet 3: per-topic summary. Passing columns= fixes the column order and
    # keeps the frame valid even when there are no topics at all.
    df_stats = pd.DataFrame(
        stats_list,
        columns=['Topic', 'Original_Ratio', 'Target_Count_Total',
                 'Existing_In_Groundtruth', 'To_Add', 'Final_Total', 'Status'],
    )

    # Sheet 4: existing + added rows, with the same columns for both parts so
    # clean_name is not left blank for the added rows.
    df_final_list = pd.concat([df_existing, rows_to_add[full_columns]])

    # --- 6. Write to Excel ---
    print(f"正在将结果写入 {output_file}...")
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df_existing.to_excel(writer, sheet_name='Existing_Distribution', index=False)
        df_to_add.to_excel(writer, sheet_name='Files_To_Add', index=False)
        df_stats.to_excel(writer, sheet_name='Distribution_Summary', index=False)
        df_final_list.to_excel(writer, sheet_name='Final_Full_List', index=False)

    print("任务完成！")
    print(f"现有数据: {len(df_existing)}")
    print(f"建议新增: {len(df_to_add)}")
    print(f"预计最终总数: {len(df_final_list)}")

# Script entry point: run the selection with the default arguments declared in
# the function signature.
if __name__ == "__main__":
    generate_stratified_dataset()