In [1]:
import os
import heapq
import math

# Folder whose .txt files are listed and processed by the driver loop below.
INPUT_DIR = r"C:\Users\znancy\Desktop\tnic_all_data"
# Folder that receives one "top10pct_<name>" result file per input file.
OUTPUT_DIR = r"C:\Users\znancy\Desktop\output_top10pct"
# Create the output folder up front; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def _parse_row(line):
    """Return ``(score, gvkey1)`` for a usable data row, or ``None`` to skip it.

    A row is usable when it has at least five whitespace-separated fields
    and its first field parses as a non-NaN float. The second field is
    used as the grouping key (gvkey1).
    """
    parts = line.split()
    if len(parts) < 5:
        return None
    try:
        score = float(parts[0])
    except ValueError:
        return None
    # NaN compares false against everything and would break heap ordering.
    if math.isnan(score):
        return None
    return score, parts[1]


def _with_newline(line):
    """Return *line* terminated by a newline (empty strings stay empty)."""
    if line and not line.endswith("\n"):
        return line + "\n"
    return line


def process_file(file_path, output_path, top_pct=0.1):
    """Write the highest-scoring rows of each gvkey1 group to *output_path*.

    The input is read as whitespace-separated text whose first line is a
    header. For every data row, field 0 is the score and field 1 is the
    group key (gvkey1). For each group, the ``ceil(n * top_pct)`` rows
    with the largest score are kept (at least one), where ``n`` is the
    number of usable rows in that group.

    The file is scanned twice so that only the retained rows are ever held
    in memory: once to size each group, once to fill bounded min-heaps.

    Args:
        file_path: Path of the input text file.
        output_path: Path of the file to create (overwritten if present).
        top_pct: Fraction of each group's rows to keep, e.g. 0.1 for 10%.

    Output layout: the header line, then the groups in order of the first
    usable row seen for each, rows within a group sorted by descending
    score. Rows with fewer than five fields or an unparseable/NaN score
    are ignored in both passes. An empty input file yields an empty output
    file.
    """
    print(f"\n=== Processing {file_path} ===")

    # ---------- Pass 1: count the usable rows of every gvkey1 ----------
    counts = {}
    with open(file_path, "r") as f:
        header = next(f, "")  # "" instead of StopIteration on an empty file
        for line in f:
            row = _parse_row(line)
            if row is None:
                continue
            gvkey1 = row[1]
            counts[gvkey1] = counts.get(gvkey1, 0) + 1

    # ---------- Pass 2: keep the top K rows of every gvkey1 ----------
    top_k = {g: max(1, math.ceil(n * top_pct)) for g, n in counts.items()}
    heaps = {g: [] for g in counts}

    with open(file_path, "r") as f:
        next(f, None)  # skip the header
        for line in f:
            row = _parse_row(line)
            if row is None:
                continue
            score, gvkey1 = row

            # Min-heap of size K: the root is the weakest row still kept.
            h = heaps[gvkey1]
            if len(h) < top_k[gvkey1]:
                heapq.heappush(h, (score, line))
            elif score > h[0][0]:
                heapq.heapreplace(h, (score, line))

    # ---------- Output ----------
    with open(output_path, "w") as out:
        out.write(_with_newline(header))
        for h in heaps.values():
            ranked = sorted(h, key=lambda item: item[0], reverse=True)
            for _, line in ranked:
                # The last input line may lack "\n"; never glue two rows.
                out.write(_with_newline(line))

    print(f"Done → {output_path}")


# ========== Main program: run process_file on every .txt file in INPUT_DIR ==========
for filename in os.listdir(INPUT_DIR):
    # Anything that is not a .txt file (case-insensitive) is ignored.
    if not filename.lower().endswith(".txt"):
        continue

    source_path = os.path.join(INPUT_DIR, filename)
    target_path = os.path.join(OUTPUT_DIR, f"top10pct_{filename}")
    process_file(source_path, target_path)


=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1988.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top10pct_tnicall1988.txt

=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1989.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top10pct_tnicall1989.txt

=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1990.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top10pct_tnicall1990.txt

=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1991.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top10pct_tnicall1991.txt

=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1992.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top10pct_tnicall1992.txt

=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1993.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top10pct_tnicall1993.txt

=== Processing C:\Users\znancy\Desktop\tnic_all_data\tnicall1994.txt ===
Done → C:\Users\znancy\Desktop\output_top10pct\top