In [None]:
import pandas as pd
import os
import glob
import math

def generate_stratified_dataset(
    csv_path=r'sampledata_2.csv', 
    groundtruth_folder=r'groundtruth',
    output_file='dataset_selection_result.xlsx',
    target_total=3
):
    """Plan a topic-stratified selection on top of the files already in groundtruth.

    Reads the sample CSV, flags every row whose base file name (extension
    stripped) matches a ``*.zip`` file in ``groundtruth_folder``, derives a
    per-topic quota proportional to each topic's share of the CSV rows, and
    randomly samples the rows still needed to reach each quota. The outcome is
    written to an Excel workbook with four sheets: rows already in groundtruth,
    rows to add, the per-topic summary, and the combined list.

    Args:
        csv_path: CSV file containing at least the columns ``file_name``,
            ``topic`` and ``category``.
        groundtruth_folder: Folder scanned (non-recursively) for ``*.zip`` files.
        output_file: Path of the Excel workbook to write (openpyxl engine).
        target_total: Overall sample size used to scale the per-topic quotas.
            Every topic gets at least one slot, so the quotas may add up to
            more than this value.

    Returns:
        None. When the CSV, a required column, or the groundtruth folder is
        missing, an error is printed and the function returns before writing
        anything.
    """
    required_columns = ('file_name', 'topic', 'category')
    export_columns = ['file_name', 'topic', 'category', 'clean_name']
    stats_columns = [
        'Topic', 'Original_Ratio', 'Target_Count_Total',
        'Existing_In_Groundtruth', 'To_Add', 'Final_Total', 'Status',
    ]

    # --- 1. Read and preprocess the CSV ---
    print("Reading the original data table...")
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: File not found {csv_path}")
        return

    # Fail with a readable message instead of a KeyError deep inside pandas.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing required columns in {csv_path}: {missing_columns}")
        return

    # file_name is expected to look like {name}.html; splitext only strips the
    # last extension, so names containing other dots stay intact.
    df['clean_name'] = df['file_name'].apply(lambda x: os.path.splitext(x)[0])

    # Duplicate base names make the groundtruth match ambiguous.
    if df['clean_name'].duplicated().any():
        print("Warning: There are duplicate base filenames in sampledata.csv, which may affect matching accuracy.")

    # --- 2. Scan the groundtruth folder ---
    print("Scanning the Groundtruth folder...")
    if not os.path.isdir(groundtruth_folder):
        print(f"Error: Folder not found {groundtruth_folder}")
        return

    # A set gives O(1) membership tests and makes the set arithmetic below direct.
    existing_basenames = {
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(os.path.join(groundtruth_folder, '*.zip'))
    }
    print(f"There are {len(existing_basenames)} files in Groundtruth.")

    # --- 3. Mark rows that already exist in groundtruth ---
    df['in_groundtruth'] = df['clean_name'].isin(existing_basenames)

    # Compare names, not row counts: duplicated CSV rows would otherwise
    # inflate the matched count and hide groundtruth files absent from the CSV.
    unmatched = sorted(existing_basenames - set(df['clean_name']))
    if unmatched:
        print(f"Warning: {len(unmatched)} files in Groundtruth were not found in CSV.")
        print(f"Unmatched examples: {unmatched[:5]}")

    # --- 4. Compute per-topic quotas and fill the gaps ---
    topic_dist = df['topic'].value_counts(normalize=True)  # share of rows per topic

    stats_list = []
    files_to_add_indices = []

    print("Calculating quotas for each Topic and filling data...")

    for topic, ratio in topic_dist.items():
        # Proportional quota, rounded to nearest; small topics keep one slot
        # so they are not dropped from the sample entirely.
        target_count = int(round(target_total * ratio))
        if target_count == 0:
            target_count = 1

        topic_rows = df[df['topic'] == topic]
        current_count = int(topic_rows['in_groundtruth'].sum())

        needed = target_count - current_count
        added_count = 0

        if needed > 0:
            candidates = topic_rows[~topic_rows['in_groundtruth']]

            if len(candidates) >= needed:
                # Fixed seed keeps the selection reproducible between runs.
                sampled = candidates.sample(n=needed, random_state=42)
                files_to_add_indices.extend(sampled.index.tolist())
                added_count = needed
            else:
                # Not enough unused rows in this topic: take everything left.
                files_to_add_indices.extend(candidates.index.tolist())
                added_count = len(candidates)
                print(f"Note: Insufficient data for Topic '{topic}', unable to fully meet target quota.")

        if needed < 0:
            status = 'Over Budget'
        elif added_count < needed:
            # The quota could not be met, so do not report it as filled.
            status = 'Insufficient'
        else:
            status = 'Filled'

        stats_list.append({
            'Topic': topic,
            'Original_Ratio': f"{ratio:.2%}",
            'Target_Count_Total': target_count,
            'Existing_In_Groundtruth': current_count,
            'To_Add': added_count,
            'Final_Total': current_count + added_count,
            'Status': status,
        })

    # --- 5. Build the result frames ---
    # Sheet 1: rows already present in groundtruth
    df_existing = df.loc[df['in_groundtruth'], export_columns]

    # Sheet 2: rows to add
    df_to_add = df.loc[files_to_add_indices, ['file_name', 'topic', 'category']]

    # Sheet 3: per-topic summary; passing columns keeps the layout stable and
    # avoids a KeyError when the CSV has no topic rows.
    df_stats = pd.DataFrame(stats_list, columns=stats_columns)

    # Sheet 4: combined list, built with the same columns on both sides so the
    # added rows do not end up with an empty clean_name.
    df_final_list = pd.concat([df_existing, df.loc[files_to_add_indices, export_columns]])

    # --- 6. Write to Excel ---
    print(f"Writing results to {output_file}...")
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df_existing.to_excel(writer, sheet_name='Existing_Distribution', index=False)
        df_to_add.to_excel(writer, sheet_name='Files_To_Add', index=False)
        df_stats.to_excel(writer, sheet_name='Distribution_Summary', index=False)
        df_final_list.to_excel(writer, sheet_name='Final_Full_List', index=False)

    print("Task completed!")
    print(f"Total existing data: {len(df_existing)}")
    print(f"Suggested data to add: {len(df_to_add)}")
    print(f"Expected final total: {len(df_final_list)}")

# --- Execute function ---
# Runs with the default arguments, so sampledata_2.csv and the groundtruth
# folder must be in the current working directory.
if __name__ == "__main__":
    generate_stratified_dataset()

In [None]:
import pandas as pd
import os
import zipfile
import math

def generate_stratified_dataset(
    csv_path=r'C:\Users\MaXin\Desktop\HSBC\GroundTruth_Dataset\sampledata_2.csv', 
    groundtruth_zip_path=r'C:\Users\MaXin\Desktop\HSBC\GroundTruth_Dataset\groundtruth.zip',
    output_file='dataset_selection_result_400_overlap.xlsx',
    target_total_unique=400  # goal: at least this many unique files
):
    """Select unique files so that every topic quota is covered, reusing overlaps.

    A file (identified by its base name without extension) may appear under
    several topics. Round one walks the topics in descending frequency and, for
    each, counts the already-selected files that also carry that topic before
    sampling new ones; files whose base name is found inside the groundtruth
    zip are preferred. Round two tops the selection up to
    ``target_total_unique`` unique files when round one fell short. The result
    is written to an Excel workbook with four sheets.

    Args:
        csv_path: CSV file with ``file_name`` and ``topic`` columns.
        groundtruth_zip_path: Zip archive whose member base names mark files
            that already exist. A missing archive is tolerated (warning only).
        output_file: Path of the Excel workbook to write (openpyxl engine).
        target_total_unique: Number of unique files aimed for; also the base
            used to scale the per-topic quotas.

    Returns:
        None. Returns early, after printing an error, when the CSV is missing
        or the zip archive is corrupt.
    """

    def _draw_names(pool, how_many):
        """Sample up to ``how_many`` rows of ``pool``, zip members first; return their clean names."""
        names = []
        in_zip = pool[pool['in_groundtruth']]
        n_zip = min(len(in_zip), how_many)
        if n_zip > 0:
            names.extend(in_zip.sample(n=n_zip, random_state=42)['clean_name'].tolist())
        still_missing = how_many - n_zip
        if still_missing > 0:
            outside_zip = pool[~pool['in_groundtruth']]
            n_outside = min(len(outside_zip), still_missing)
            if n_outside > 0:
                names.extend(outside_zip.sample(n=n_outside, random_state=42)['clean_name'].tolist())
        return names

    # --- 1. Load the CSV ---
    print("Step 1: 读取原始数据...")
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: 找不到文件 {csv_path}")
        return

    # Base name without extension is the identity of a file.
    df['clean_name'] = df['file_name'].map(lambda name: os.path.splitext(name)[0])

    # --- 2. Collect the base names stored in the zip archive ---
    print("\nStep 2: 扫描 Groundtruth Zip 包...")
    zip_stems = set()
    if not os.path.exists(groundtruth_zip_path):
        print("Warning: Zip文件不存在，视为全量新增。")
    else:
        try:
            with zipfile.ZipFile(groundtruth_zip_path, 'r') as archive:
                for member in archive.namelist():
                    # Skip directory entries and macOS resource-fork folders.
                    if member.endswith('/') or '__MACOSX' in member:
                        continue
                    leaf = os.path.basename(member)
                    if leaf:
                        zip_stems.add(os.path.splitext(leaf)[0])
        except zipfile.BadZipFile:
            print("Error: Zip 文件损坏")
            return

    print(f"Groundtruth Zip 中包含 {len(zip_stems)} 个唯一文件。")

    # --- 3. Preprocess ---
    print("\nStep 3: 计算分布 (允许同名不同Topic)...")

    # Drop rows repeating the same (file, topic) pair; the same file under a
    # different topic is kept on purpose.
    df_clean = df.drop_duplicates(subset=['clean_name', 'topic']).copy()
    df_clean['in_groundtruth'] = df_clean['clean_name'].isin(zip_stems)

    # Topic weights are computed over (file, topic) rows, so quotas count
    # topic slots rather than unique files; round two handles any shortfall.
    topic_share = df_clean['topic'].value_counts(normalize=True)

    chosen_names = set()  # unique clean names selected so far
    stats_list = []

    # --- 4. Round one: satisfy each topic quota, reusing earlier picks ---
    print("\nStep 4: 第一轮抽样 - 满足各 Topic 配额 (允许复用)...")

    for topic, ratio in topic_share.items():
        # Minimum number of slots for this topic, never less than one.
        quota = int(round(target_total_unique * ratio)) or 1

        topic_rows = df_clean[df_clean['topic'] == topic]

        # Files picked while handling other topics also count towards this one.
        already_chosen = topic_rows['clean_name'].isin(chosen_names)
        count_covered = int(already_chosen.sum())

        added_this_round = 0
        shortage = quota - count_covered
        if shortage > 0:
            fresh = _draw_names(topic_rows[~already_chosen], shortage)
            chosen_names.update(fresh)
            added_this_round = len(fresh)

        stats_list.append({
            'Topic': topic,
            'Ratio': f"{ratio:.2%}",
            'Quota_Slots': quota,
            'Filled_By_Overlap': count_covered,  # covered by picks made for other topics
            'Filled_By_New': added_this_round,   # newly picked for this topic
            'Total_Filled': count_covered + added_this_round
        })

    # --- 5. Round two: top up to the unique-file target ---
    current_unique_count = len(chosen_names)
    print(f"\n第一轮结束，当前选中唯一文件数: {current_unique_count}")

    gap = target_total_unique - current_unique_count
    if gap > 0:
        print(f"Step 5: 数量未达标 (需 {target_total_unique}, 差 {gap})，执行随机补齐...")

        # One row per file, restricted to the files not selected yet.
        one_row_per_file = df_clean.drop_duplicates(subset=['clean_name'])
        remaining_pool = one_row_per_file[~one_row_per_file['clean_name'].isin(chosen_names)]

        if len(remaining_pool) < gap:
            print(f"  -> Warning: 剩余可用文件不足，全部选入。最终数量: {current_unique_count + len(remaining_pool)}")
            chosen_names.update(remaining_pool['clean_name'].tolist())
        else:
            # Same preference as round one: zip members first, then the rest.
            chosen_names.update(_draw_names(remaining_pool, gap))
            print("  -> 补齐完成。")

    # --- 6. Build the export frames ---
    print("\nStep 6: 生成最终文件列表...")

    # One row per selected file; the topic shown is simply the first one
    # encountered for that file, not the full list of its topics.
    is_selected = df_clean['clean_name'].isin(chosen_names)
    df_export_unique = df_clean[is_selected].drop_duplicates(subset=['clean_name'], keep='first')

    df_existing_export = df_export_unique[df_export_unique['in_groundtruth']]
    df_new_export = df_export_unique[~df_export_unique['in_groundtruth']]
    df_stats = pd.DataFrame(stats_list)

    sheets = (
        (df_existing_export, '1_Existing_In_Zip'),
        (df_new_export, '2_Files_To_Add'),
        (df_stats, '3_Distribution_Logic'),
        (df_export_unique, '4_Final_List_400'),  # complete final list
    )

    print(f"写入结果到 {output_file}...")
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for frame, sheet_name in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    separator = "-" * 30
    print(separator)
    print("最终结果统计:")
    print(f"唯一文件总数: {len(df_export_unique)} (目标: {target_total_unique})")
    print(f"  - Zip内 (无需处理): {len(df_existing_export)}")
    print(f"  - 新增 (需复制/下载): {len(df_new_export)}")
    print(separator)

# Script entry point: run the selection with the default arguments declared
# in the function signature.
if __name__ == "__main__":
    generate_stratified_dataset()