## 竞赛论文基本信息的提取

In [None]:
import json
import re
import os
import pandas as pd


def analyze_document(data):
    """Collect basic statistics about one parsed competition paper.

    Args:
        data: List of content items (dicts). Only the keys ``type``
            (compared with "text", "image", "table", "equation"), ``text``,
            ``text_level`` (1 is treated as a heading) and ``page_idx`` are
            read, always through ``dict.get``. ``page_idx`` is treated as a
            0-based page number.

    Returns:
        A dict with the keys ``paper_number``, ``paper_title``,
        ``total_pages``, ``total_words``, ``abstract``,
        ``table_of_contents``, ``main_text``, ``paragraphs``, ``references``
        and ``appendix``; the last six are nested dicts of counters.

    All section boundaries are heuristics based on level-1 headings, so the
    per-section numbers are estimates.
    """
    # Result skeleton; every counter starts at zero / empty.
    results = {
        "paper_number": "",
        "paper_title": "",
        "total_pages": 0,
        "total_words": 0,
        "abstract": {"pages": 0, "words": 0},
        "table_of_contents": {"pages": 0},
        "main_text": {"pages": 0, "words": 0, "images": 0, "image_proportion": 0, "tables": 0, "equations": 0},
        "paragraphs": {"count": 0, "avg_sentences": 0, "avg_words": 0},
        "references": {"count": 0},
        "appendix": {"pages": 0, "code_lines": 0}
    }

    # Entry number: the first "202190000" + 3 digits run that is not part of
    # a longer digit sequence.
    for item in data:
        if item.get("type") == "text":
            matches = re.findall(r'(?<!\d)202190000\d{3}(?!\d)', item.get("text", ""))
            if matches:
                results['paper_number'] = matches[0]
                break

    # Paper title: the first level-1 heading among the first 10 items that
    # contains none of the boilerplate markers below.
    keywords = ["摘要", "摘  要", "摘 要", "摘   要", "目录", "录", "前言", "前 言", "第十届", "泰迪杯", "杯", "引言", "A", "关键词", "Keywords", "一", "（一）", "二", "三", "四", "五", "六", "七", "1", "2", "3", "题 目", "问题背景", "研究背景", "章", "报告", "简介", "绪论", "正文", "参赛论文"]
    for item in data[:10]:
        if item.get("type") == "text" and item.get("text_level") == 1:
            text = item.get("text", "")
            if not any(keyword in text for keyword in keywords):
                results["paper_title"] = text
                break

    # Total pages: largest page index + 1 (page_idx is 0-based).
    if data:
        max_page_idx = max(item.get("page_idx", 0) for item in data)
        results["total_pages"] = max_page_idx + 1

    # Image / table / equation counts are taken over the whole document.
    results["main_text"]["images"] = sum(1 for item in data if item.get("type") == "image")
    results["main_text"]["tables"] = sum(1 for item in data if item.get("type") == "table")
    results["main_text"]["equations"] = sum(1 for item in data if item.get("type") == "equation")

    # Patterns used to recognise a line of source code.
    code_patterns = [
        # Python
        r'^import\s+[\w\.]+',  # import statement
        r'^from\s+[\w\.]+\s+import',  # from ... import statement
        r'^def\s+\w+\s*\(',  # function definition
        r'^class\s+\w+',  # class definition
        r'^\s*#.*',  # comment starting with '#'
        r'^\s*for\s+.*:',  # for loop
        r'^\s*if\s+.*:',  # if condition
        r'^\s*while\s+.*:',  # while loop
        r'^\s*try\s*:',  # try block
        r'^\s*except',  # except block
        r'^\s*with\s+.*:',  # with statement
        r'^\s*return\s+',  # return statement
        r'^\s*print\s*\(',  # print call
        r'^\s*def\s+',  # function definition
        r'^!python',  # Jupyter notebook command

        # Matlab
        r'^import\s+[\w\.]+\.\w+',  # import statement (e.g. import package.Class)
        r'^function\s+\w+\s*\(',  # function definition (e.g. function myfunc(...))
        r'^classdef\s+\w+',  # class definition (e.g. classdef MyClass)
        r'^\s*%.*',  # comment starting with '%'
        r'^\s*for\s+\w+\s*=\s*.*',  # for loop (e.g. for i=1:10)
        r'^\s*if\s+.+',  # if condition
        r'^\s*while\s+.+',  # while loop
        r'^\s*(try|catch)\s*',  # try / catch block
        r'^\s*return',  # return statement
        r'^\s*disp\s*\(',  # disp call
        r'^\s*!.*',  # Jupyter command (e.g. !command)

        # R
        r'^\s*(library|require)\s*\(',  # package import
        r'^\s*\w+\s*<-\s*function\s*\(',  # function definition
        r'^\s*setClass\s*\(',  # class definition
        r'^\s*#.*',  # comment starting with '#'
        r'^\s*for\s*\(',  # for loop
        r'^\s*if\s*\(',  # if condition
        r'^\s*while\s*\(',  # while loop
        r'^\s*try\s*\(',  # try block
        r'^\s*with\s*\(',  # with statement
        r'^\s*return\s*\(',  # return statement
        r'^\s*print\s*\(',  # print call
        r'^\s*!.*',  # Jupyter command (e.g. !command)
    ]
    # Compile once; the patterns are tried against every appendix line.
    compiled_code_patterns = [re.compile(pattern) for pattern in code_patterns]

    def count_words(text):
        """Count CJK characters, ASCII-letter words and digit runs in *text*."""
        chinese_chars = len(re.findall(r'[\u4e00-\u9fa5]', text))
        english_words = len(re.findall(r'\b[a-zA-Z]+\b', text))
        numbers = len(re.findall(r'\b[0-9]+\b', text))
        return chinese_chars + english_words + numbers

    # Paragraphs: non-empty, non-heading text items that neither mention the
    # table of contents nor start like a "[n]" reference entry.
    paragraphs = []
    for item in data:
        if (item.get("type") == "text" and
            item.get("text_level") != 1 and
            len(item.get("text", "").strip()) > 0 and
            "目录" not in item.get("text", "") and
            not re.match(r'\[\d+\]', item.get("text", "").strip())):
            paragraphs.append(item)

    results["paragraphs"]["count"] = len(paragraphs)

    # Total word count over every text item.
    total_words = 0
    for item in data:
        if item.get("type") == "text":
            total_words += count_words(item.get("text", ""))

    results["total_words"] = total_words

    # Page sets per section; main-text pages are derived later as "all pages
    # minus these".
    title_pages = set()
    abstract_pages = set()
    toc_pages = set()
    ref_pages = set()
    appendix_pages = set()

    abstract_section = False
    abstract_words = 0

    # Section the scan is currently in.
    current_section = "title"

    for item in data:
        page_idx = item.get("page_idx", 0)

        if page_idx == 0:  # the first page is always counted as the title page
            title_pages.add(page_idx)

        if item.get("type") == "text":
            text = item.get("text", "")
            text_level = item.get("text_level")

            # Level-1 headings drive the section changes.
            if text_level == 1:
                if "摘要" in text or "Abstract" in text:
                    current_section = "abstract"
                    abstract_section = True
                elif "目录" in text:
                    current_section = "toc"
                    abstract_section = False
                elif "参考文献" in text:
                    current_section = "references"
                    abstract_section = False
                elif "附录" in text:
                    current_section = "appendix"
                    abstract_section = False
                # Any other heading starts the main text, except while inside
                # the abstract or the table of contents, which are only left
                # through one of the explicit headings above.
                # NOTE(review): presumably because table-of-contents entries
                # can themselves be parsed as level-1 headings - confirm.
                elif current_section not in ["abstract", "toc"]:
                    current_section = "main"
                    abstract_section = False

            # Attribute the item to the current section.
            if current_section == "abstract" and abstract_section:
                if text_level != 1:  # skip the heading itself
                    abstract_pages.add(page_idx)
                    abstract_words += count_words(text)

                    # The keyword line closes the abstract. Both spellings
                    # ("关键字" and "关键词") are accepted; the title filter
                    # above already expects "关键词".
                    if "关键字" in text or "关键词" in text or "Keywords" in text:
                        abstract_section = False

            elif current_section == "toc":
                toc_pages.add(page_idx)

            elif current_section == "references":
                ref_pages.add(page_idx)

            elif current_section == "appendix":
                appendix_pages.add(page_idx)

    # Count code lines on appendix pages.
    appendix_code_lines = 0
    for item in data:
        if item.get("type") == "text" and item.get("page_idx") in appendix_pages:
            text = item.get("text", "")

            code_line_count = 0

            for line in text.split('\n'):
                stripped = line.strip()
                if (any(pattern.match(line) for pattern in compiled_code_patterns) or
                        # an assignment without Chinese sentence punctuation
                        ('=' in line and not re.search(r'[，。！？]', line)) or
                        stripped.startswith('#') or
                        stripped.startswith('!') or
                        ('{' in line and '}' in line) or
                        ('[' in line and ']' in line and '=' in line)):
                    code_line_count += 1

            # Lines inside ``` fences are added on top of the per-line matches.
            if "```" in text:
                code_blocks = text.split("```")[1::2]  # contents between fences
                for block in code_blocks:
                    code_line_count += block.count("\n") + 1

            if code_line_count > 0:
                appendix_code_lines += code_line_count

    results["appendix"]["code_lines"] = appendix_code_lines

    # Reference count: the largest citation number seen inside the
    # reference section.
    ref_count = 0
    in_ref_section = False
    reference_patterns = [r'\[(\d+(?:[,-]\d+)*)\]', r'【(\d+(?:[,-]\d+)*)】']

    for item in data:
        if item.get("type") == "text":
            text = item.get("text", "")

            # Entering the reference section.
            if "参考文献" in text and item.get("text_level") == 1:
                in_ref_section = True
                continue

            if in_ref_section:
                # Any other level-1 heading ends the section.
                if item.get("text_level") == 1 and "参考文献" not in text:
                    in_ref_section = False
                    continue
                if text.strip():
                    for line in text.strip().split("\n"):
                        for reference_pattern in reference_patterns:
                            match = re.search(reference_pattern, line)
                            if match:
                                # The group may hold several numbers ("1,2"
                                # or "1-3"); int() on the whole group would
                                # raise ValueError, so use the largest one.
                                numbers = [int(num) for num in re.findall(r'\d+', match.group(1))]
                                ref_count = max(ref_count, max(numbers))

    # Per-section page counts.
    results["abstract"]["pages"] = len(abstract_pages)
    results["abstract"]["words"] = abstract_words
    results["table_of_contents"]["pages"] = len(toc_pages)
    results["references"]["count"] = ref_count
    results["appendix"]["pages"] = len(appendix_pages)

    # Main-text pages are whatever is left after removing every other section.
    all_pages = set(range(results["total_pages"]))
    non_main_pages = title_pages.union(abstract_pages).union(toc_pages).union(ref_pages).union(appendix_pages)
    main_pages = all_pages - non_main_pages
    results["main_text"]["pages"] = len(main_pages)

    # Main-text words: text items located on main-text pages.
    main_text_words = 0
    for item in data:
        if item.get("type") == "text" and item.get("page_idx") in main_pages:
            main_text_words += count_words(item.get("text", ""))

    results["main_text"]["words"] = main_text_words

    # Average sentences / words per paragraph, rounded to an int.
    if paragraphs:
        total_sentences = 0
        total_para_words = 0

        for para in paragraphs:
            text = para.get("text", "")
            # Split on Chinese and ASCII sentence terminators.
            sentences = [s for s in re.split(r'[。！？.!?]+', text) if s.strip()]
            total_sentences += len(sentences)
            total_para_words += count_words(text)

        results["paragraphs"]["avg_sentences"] = int(round(total_sentences / len(paragraphs)))
        results["paragraphs"]["avg_words"] = int(round(total_para_words / len(paragraphs)))

    # Images per main-text page (0 when there are no main-text pages).
    if results["main_text"]["pages"] > 0:
        results["main_text"]["image_proportion"] = results["main_text"]["images"] / results["main_text"]["pages"]
    else:
        results["main_text"]["image_proportion"] = 0

    return results

def results_to_dataframe_row(results):
    """Flatten the nested result dict into a flat list of 19 values.

    The first four values come from top-level keys; the remaining ones are
    read from the nested section dicts, in the fixed order listed below.
    """
    top_level_keys = ('paper_number', 'paper_title', 'total_pages', 'total_words')
    nested_keys = (
        ('abstract', 'pages'),
        ('abstract', 'words'),
        ('table_of_contents', 'pages'),
        ('main_text', 'pages'),
        ('main_text', 'words'),
        ('main_text', 'images'),
        ('main_text', 'image_proportion'),
        ('main_text', 'tables'),
        ('main_text', 'equations'),
        ('paragraphs', 'count'),
        ('paragraphs', 'avg_sentences'),
        ('paragraphs', 'avg_words'),
        ('references', 'count'),
        ('appendix', 'pages'),
        ('appendix', 'code_lines'),
    )

    flat = [results[key] for key in top_level_keys]
    flat.extend(results[section][field] for section, field in nested_keys)
    return flat

def append_results_to_csv(results, csv_file_path):
    """Append one result as a new row of the CSV file and return that row.

    The header line is written only when ``csv_file_path`` does not exist
    yet; otherwise the row is appended without a header. The return value is
    the one-row DataFrame that was written.
    """
    header_names = [
        "paper_number", "paper_title", "total_pages", "total_words",
        "abstract_pages", "abstract_words",
        "table_of_contents_pages",
        "main_text_pages", "main_text_words", "main_text_images",
        "main_text_image_proportion", "main_text_tables", "main_text_equations",
        "paragraphs_count", "paragraphs_avg_sentences", "paragraphs_avg_words",
        "references_count",
        "appendix_pages", "appendix_code_lines"
    ]
    record = pd.DataFrame([results_to_dataframe_row(results)], columns=header_names)

    if not os.path.isfile(csv_file_path):
        # First row ever written: create the file together with its header.
        record.to_csv(csv_file_path, index=False, header=True, encoding='utf-8-sig')
        return record

    record.to_csv(csv_file_path, mode='a', header=False, index=False, encoding='utf-8-sig')
    return record

def main():
    """Analyse every paper folder under the transform directory.

    For each entry of ``./data/transform/`` the matching
    ``<name>_content_list.json`` is loaded, analysed and appended to
    ``./data/result1.csv``. Entries without that JSON file are skipped, and
    an error while processing one entry is printed without stopping the loop.
    """
    result_path = './data/result1.csv'
    base_path = './data/transform/'

    for path_fold_name in os.listdir(base_path):
        text_path = f"{base_path}/{path_fold_name}/md/{path_fold_name}_content_list.json"

        if os.path.exists(text_path):
            try:
                with open(text_path, 'r', encoding='utf-8') as handle:
                    content_items = json.load(handle)
                append_results_to_csv(analyze_document(content_items), result_path)
                print(f"已添加 {path_fold_name} 的结果到 {result_path}")
            except Exception as e:
                # Report the failure and carry on with the next folder.
                print(f"处理 {path_fold_name} 时出错: {str(e)}")
        else:
            print(f"跳过 {path_fold_name}: 找不到JSON文件")


if __name__ == "__main__":
    main()

已添加 B1000 的结果到 ./data/result1.csv
已添加 B1135 的结果到 ./data/result1.csv
已添加 B1206 的结果到 ./data/result1.csv
已添加 B1242 的结果到 ./data/result1.csv
已添加 B1302 的结果到 ./data/result1.csv
已添加 B1357 的结果到 ./data/result1.csv
已添加 B1415 的结果到 ./data/result1.csv
已添加 B1472 的结果到 ./data/result1.csv
已添加 B1503 的结果到 ./data/result1.csv
已添加 B1585 的结果到 ./data/result1.csv
已添加 B1655 的结果到 ./data/result1.csv
已添加 B1729 的结果到 ./data/result1.csv
已添加 B1782 的结果到 ./data/result1.csv
已添加 B1810 的结果到 ./data/result1.csv
已添加 B1824 的结果到 ./data/result1.csv
已添加 B1858 的结果到 ./data/result1.csv
已添加 B1974 的结果到 ./data/result1.csv
已添加 B2000 的结果到 ./data/result1.csv
已添加 B2018 的结果到 ./data/result1.csv
已添加 B2078 的结果到 ./data/result1.csv
已添加 B2142 的结果到 ./data/result1.csv
已添加 B2148 的结果到 ./data/result1.csv
已添加 B2156 的结果到 ./data/result1.csv
已添加 B2184 的结果到 ./data/result1.csv
已添加 B2202 的结果到 ./data/result1.csv
已添加 B2477 的结果到 ./data/result1.csv
已添加 B2494 的结果到 ./data/result1.csv
已添加 B2528 的结果到 ./data/result1.csv
已添加 B2549 的结果到 ./data/result1.csv
已添加 B2551 的结果到 ./data/result1.csv

## 数据挖掘

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Use SimHei as the default sans-serif font and keep the minus sign
# displayable with it.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# The CSV is written with encoding='utf-8-sig' (see append_results_to_csv),
# so it is read back with the same codec instead of an empty encoding string.
data = pd.read_csv('./data/result1.csv', encoding='utf-8-sig')
df = pd.DataFrame(data)

# Map the CSV column names to the Chinese labels used in the plots.
column_mapping = {
    'total_pages': '论文总页数',
    'total_words': '论文总字数',
    'abstract_pages': '摘要页数',
    'abstract_words': '摘要字数',
    'table_of_contents_pages': '目录页数',
    'main_text_pages': '正文页数',
    'main_text_words': '正文字数',
    'main_text_images': '正文图片数',
    'main_text_image_proportion': '正文图片比例',
    'main_text_tables': '正文表格数',
    'main_text_equations': '正文独立公式数',
    'paragraphs_count': '正文段落数量',
    'paragraphs_avg_sentences': '正文段落平均句数',
    'paragraphs_avg_words': '正文段落平均字数',
    'references_count': '参考文献数量'
}
df.rename(columns=column_mapping, inplace=True)

# Feature columns used for clustering, PCA and the plots.
features = ["论文总页数", "论文总字数", "摘要页数", "摘要字数", "目录页数", "正文页数",
            "正文字数", "正文图片数", "正文图片比例", "正文表格数", "正文独立公式数",
            "正文段落数量", "正文段落平均句数", "正文段落平均字数", "参考文献数量"]
X = df[features]

# Standardise the features before clustering and PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-means with 3 clusters. n_init is set explicitly because scikit-learn's
# default for it differs between releases; pinning it keeps the clustering
# the same across versions.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# PCA down to two components, used only for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df['pca_component_1'] = X_pca[:, 0]
df['pca_component_2'] = X_pca[:, 1]


# Scatter plot of the clusters in PCA space with a density underlay.
plt.figure(figsize=(16, 12))
MARKERS = ['o', '^', 's']
COLORS = ['#1f77b4', '#ff7f0e', '#2ca02c']
ALPHA = 0.15
MARKER_SIZE = 120
EDGE_WIDTH = 1.2

cluster_mapping = {0: 'A类论文', 1: 'B类论文', 2: 'C类论文'}
df['cluster_label'] = df['cluster'].map(cluster_mapping)

# Visit the clusters in ascending id order so the legend order does not
# depend on the row order of the CSV; plain ints are used because the ids
# index the MARKERS / COLORS lists.
cluster_ids = sorted(int(cid) for cid in df['cluster'].unique())

# Density contours, drawn below the points (zorder=0).
for cluster_id in cluster_ids:
    cluster_data = df[df['cluster'] == cluster_id]
    sns.kdeplot(
        x=cluster_data['pca_component_1'],
        y=cluster_data['pca_component_2'],
        fill=True,
        alpha=ALPHA,
        levels=5,
        thresh=0.08,
        color=COLORS[cluster_id],
        zorder=0
    )

# Scatter points, one marker shape and colour per cluster.
for cluster_id in cluster_ids:
    cluster_data = df[df['cluster'] == cluster_id]
    plt.scatter(
        x=cluster_data['pca_component_1'],
        y=cluster_data['pca_component_2'],
        c=[COLORS[cluster_id]],
        marker=MARKERS[cluster_id],
        s=MARKER_SIZE,
        edgecolor='w',
        linewidth=EDGE_WIDTH,
        label=cluster_mapping[cluster_id],
        zorder=2,
        alpha=0.85
    )
plt.title('论文特征聚类分析（多维度可视化）',
         fontsize=18, pad=25, fontweight='bold')
plt.xlabel('主成分 1', fontsize=14, labelpad=12)
plt.ylabel('主成分 2', fontsize=14, labelpad=12)
legend = plt.legend(
    title='聚类分布',
    title_fontsize=13,
    fontsize=12,
    loc='upper right',
    frameon=True,
    framealpha=0.95,
    edgecolor='#333333'
)
plt.grid(True,
        linestyle='--',
        linewidth=0.8,
        alpha=0.6,
        color='#999999')
# Leave a 0.8 margin around the projected points on both axes.
plt.xlim(X_pca[:, 0].min()-0.8, X_pca[:, 0].max()+0.8)
plt.ylim(X_pca[:, 1].min()-0.8, X_pca[:, 1].max()+0.8)
plt.gca().set_facecolor('#f5f5f5')
# Footnote in the lower-right corner, positioned in axes coordinates.
plt.text(0.98, 0.02,
        '注：不同形状代表不同聚类，阴影区域表示密度分布',
        transform=plt.gca().transAxes,
        ha='right',
        fontsize=10,
        color='#666666')
plt.tight_layout()
plt.show()

# Feature correlation heatmap.
plt.figure(figsize=(12, 10))
correlations = df[features].corr()
# Boolean mask that hides the upper triangle (diagonal included), so each
# feature pair is shown only once.
mask = np.triu(np.ones(correlations.shape, dtype=bool))
sns.heatmap(
    correlations,
    mask=mask,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title('论文特征相关性矩阵')
plt.xticks(rotation=45)
plt.show()

# Violin plots: distribution of every feature within each cluster.
# The number of grid rows is derived from the feature count (3 columns), so
# the grid still fits if the feature list changes; 15 features give 5 rows.
N_COLS = 3
n_rows = -(-len(features) // N_COLS)  # ceiling division

plt.figure(figsize=(18, 12))
for i, feature in enumerate(features, 1):
    plt.subplot(n_rows, N_COLS, i)
    sns.violinplot(x='cluster', y=feature, data=df, palette='viridis')
    plt.title(f'{feature} 在不同聚类中的分布')
    plt.xlabel('聚类')
    plt.ylabel(feature)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()