## 基于resnet50的图像识别模型

In [None]:
import os
import json
import pandas as pd
import numpy as np
from PIL import Image
import random
from tqdm import tqdm
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from scipy.spatial.distance import cosine

# 加载预训练的CNN模型
def load_cnn_model():
    """Build a ResNet50 feature extractor from a local weight file.

    The weights are read from ``./model/resnet50-0676ba61.pth``. The last
    child module of the network (the fully connected classifier) is dropped,
    so the returned model outputs pooled features instead of class scores.

    Returns:
        A ``(model, device)`` tuple: the truncated network in eval mode,
        already moved to ``device``, and the ``torch.device`` that was chosen
        (CUDA when available, otherwise CPU).
    """
    model = models.resnet50(pretrained=False)
    model_path = './model/resnet50-0676ba61.pth'
    # Always deserialize onto the CPU first: without map_location a checkpoint
    # that was saved from a GPU cannot be loaded on a CPU-only machine.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)

    # Drop the final fully connected layer, keeping only feature extraction.
    model = torch.nn.Sequential(*list(model.children())[:-1])
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return model, device

# 定义图像预处理流程
def get_transform():
    """Return the preprocessing pipeline applied to every image.

    The pipeline resizes to 256, center-crops to 224, converts to a tensor and
    normalizes each channel with fixed mean / std values.
    """
    # Per-channel statistics (presumably the ImageNet values the pretrained
    # weights expect -- confirm against the weight source).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)

# 计算单个图片的CNN特征
def compute_image_features(image_path, model, device, transform):
    """Compute the CNN feature vector of a single image file.

    Args:
        image_path: Path of the image to read.
        model: Feature extractor called on the preprocessed image batch.
        device: Device the input tensor is moved to before the forward pass.
        transform: Callable turning a PIL image into a tensor.

    Returns:
        A NumPy array with the squeezed model output. If anything goes wrong
        (unreadable file, model error, ...) the error is printed and an
        all-zero vector of length 2048 is returned instead, so one bad image
        does not abort the whole run.
    """
    try:
        # Use a context manager so the underlying file handle is released
        # right away; convert() returns an independent, fully loaded image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Add the batch dimension expected by the model.
        image_tensor = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():  # inference only, no gradients needed
            features = model(image_tensor)

        # Collapse the singleton dimensions into a flat feature array.
        features = features.squeeze().cpu().numpy()
        return features
    except Exception as e:
        print(f"计算图片特征时出错 {image_path}: {e}")
        return np.zeros(2048)  # fallback feature vector (dimension 2048)

# 计算两个特征向量之间的相似度
def compute_feature_similarity(features1, features2):
    """Return the cosine similarity (1 - cosine distance) of two vectors.

    Args:
        features1: First feature vector (array-like of numbers).
        features2: Second feature vector, same length as ``features1``.

    Returns:
        The cosine similarity. ``0`` is returned when either vector is
        all-zero (cosine similarity is undefined there and would otherwise
        come out as NaN with a runtime warning) or when the computation
        raises, e.g. for mismatched lengths.
    """
    try:
        vec1 = np.asarray(features1, dtype=float)
        vec2 = np.asarray(features2, dtype=float)

        # An all-zero vector is what feature extraction returns on failure;
        # treat it as "not similar to anything" instead of producing NaN.
        if not vec1.any() or not vec2.any():
            return 0

        similarity = 1 - cosine(vec1, vec2)
        return similarity
    except Exception as e:
        print(f"计算特征相似度时出错: {e}")
        return 0

# 查找相似图片
def _resolve_image_path(raw_path, base_dir, paper_dir):
    """Return ``(path, exists)`` for an image reference found in a paper.

    The path is first used as given (normalized when relative). Only if that
    location does not exist are fallbacks tried: relative to the paper's own
    directory below ``base_dir``, then relative to ``base_dir`` itself. When
    nothing exists the first candidate is returned so it can be reported.
    """
    if os.path.isabs(raw_path):
        return raw_path, os.path.exists(raw_path)

    primary = os.path.normpath(raw_path)
    root = base_dir or ""
    candidates = [
        primary,
        os.path.normpath(os.path.join(root, paper_dir, raw_path)),
        os.path.normpath(os.path.join(root, raw_path)),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate, True
    return primary, False


def _similar_image_record(img, team_image_counts):
    """Build one output row describing an image flagged as duplicated."""
    return {
        '参赛队号': img['team_id'],
        '雷同图片所在页码': img['page_idx'],
        '雷同图片的页内序号': img['in_page_idx'],
        '论文中的图片数量': team_image_counts[img['team_id']]
    }


def find_similar_images_sampled(all_papers_content, paper_names, team_ids, team_image_counts, base_dir="./data/transform/", sample_size=2000): 
    """Find near-identical images across different papers.

    Features are extracted for every image item of every paper, a random
    sample of at most ``sample_size`` images is drawn, and every pair of
    sampled images coming from different papers is compared by cosine
    similarity. Pairs above a threshold of 0.95 are reported.

    Args:
        all_papers_content: One list of item dicts per paper. Image items
            have ``type == 'image'`` plus ``img_path`` and ``page_idx`` keys.
        paper_names: Path of each paper file relative to ``base_dir``; its
            directory is used as a fallback root for relative image paths.
        team_ids: Team id of each paper, aligned with ``all_papers_content``.
        team_image_counts: Mapping from team id to that paper's image count.
        base_dir: Data root used as a fallback when resolving image paths.
        sample_size: Maximum number of images taking part in the comparison.

    Returns:
        A list of dicts, one per flagged image, with the keys '参赛队号',
        '雷同图片所在页码', '雷同图片的页内序号' and '论文中的图片数量'. Both
        images of a matching pair are reported; the same image may appear
        several times if it matches several others.
    """
    from itertools import combinations

    # Load the CNN model once for all images.
    print("加载预训练的CNN模型...")
    model, device = load_cnn_model()
    transform = get_transform()

    # Collect metadata and features for every image that can be located.
    image_info = []
    total_papers = len(all_papers_content)

    for paper_idx, paper_content in enumerate(tqdm(all_papers_content, desc="处理论文")):
        paper_desc = f"处理第 {paper_idx+1}/{total_papers} 篇论文"

        paper_dir = ""
        if paper_names and paper_idx < len(paper_names):
            paper_dir = os.path.dirname(paper_names[paper_idx])

        # Running count of image items per page. This yields the 0-based
        # position of an image among the images of its page without
        # rescanning the paper for every image. Missing files still count,
        # so positions refer to the paper as written.
        images_seen_on_page = {}

        for item_idx, item in enumerate(paper_content):
            if item['type'] != 'image':
                continue

            page_idx = item['page_idx']
            in_page_idx = images_seen_on_page.get(page_idx, 0)
            images_seen_on_page[page_idx] = in_page_idx + 1

            img_path, found = _resolve_image_path(item['img_path'], base_dir, paper_dir)
            if not found:
                print(f"\r{paper_desc} - 图片不存在: {img_path}")
                continue

            image_info.append({
                'paper_idx': paper_idx,
                'team_id': team_ids[paper_idx],
                'item_idx': item_idx,
                'page_idx': page_idx,
                'img_path': img_path,
                'in_page_idx': in_page_idx,
                'features': compute_image_features(img_path, model, device, transform)
            })

    total_images = len(image_info)
    print(f"共收集和处理了 {total_images} 张图片")

    if total_images == 0:
        print("没有找到图片，跳过比较")
        return []

    # Random sampling bounds the quadratic number of comparisons.
    actual_sample_size = min(sample_size, total_images)
    sampled_images = random.sample(image_info, actual_sample_size)

    similar_images = []
    similarity_threshold = 0.95  # CNN features call for a high threshold

    total_comparisons = (actual_sample_size * (actual_sample_size - 1)) // 2

    # The context manager closes the progress bar even if a comparison raises.
    with tqdm(total=total_comparisons, desc="比较图片相似度") as progress_bar:
        for img1, img2 in combinations(sampled_images, 2):
            progress_bar.update(1)

            # Images of the same paper are never compared with each other.
            if img1['paper_idx'] == img2['paper_idx']:
                continue

            similarity = compute_feature_similarity(img1['features'], img2['features'])
            if similarity > similarity_threshold:
                print(f"\n发现相似图片!")
                # Report both sides: the sample order is random, so reporting
                # only the first image would flag an arbitrary one of the two
                # papers involved.
                similar_images.append(_similar_image_record(img1, team_image_counts))
                similar_images.append(_similar_image_record(img2, team_image_counts))

    return similar_images

# 对结果进行去重操作
def deduplicate_similar(df):
    """Drop repeated reports of the same image.

    Rows are considered identical when team id, page number and in-page
    index all match; the first occurrence is kept. An empty frame is handed
    back unchanged.
    """
    identity_columns = ['参赛队号', '雷同图片所在页码', '雷同图片的页内序号']

    if not df.empty:
        return df.drop_duplicates(subset=identity_columns, keep='first')
    return df

# 统计每篇论文中的图片数量
def count_images_per_paper(all_papers_content, team_ids):
    """Map each team id to the number of image items in its paper.

    ``team_ids`` is indexed in step with ``all_papers_content``. When a team
    id occurs more than once, the count of the last such paper wins.
    """
    return {
        team_ids[position]: len([entry for entry in content if entry['type'] == 'image'])
        for position, content in enumerate(all_papers_content)
    }

def main():
    """Detect duplicated images across all papers and write them to Excel.

    Steps: read the encryption-code -> team-id mapping from
    ``./data/附件1.xlsx``, load every ``*_content_list.json`` file found
    below ``./data/transform/``, count the images per paper, search for
    similar images and write the de-duplicated result to the '雷同图片'
    sheet of ``./data/result4.xlsx``.
    """
    result_path = './data/result4.xlsx'
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    print("读取附件1的参赛队号和加密号映射...")
    attachment1 = pd.read_excel('./data/附件1.xlsx')
    # NOTE(review): codes parsed from file names below are strings; if the
    # '加密号' column is read as numbers every lookup falls back to '未知'
    # -- confirm the column's dtype.
    team_id_mapping = dict(zip(attachment1['加密号'], attachment1['参赛队号']))
    print(f"成功加载 {len(team_id_mapping)} 个队伍信息")

    data_dir = "./data/transform/"
    print(f"开始在 {data_dir} 中查找论文文件...")

    paper_files = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith("_content_list.json"):
                paper_files.append(os.path.join(root, file))

    print(f"找到 {len(paper_files)} 个论文文件")

    # Parallel lists: entry i of each list describes the same paper.
    all_papers_content = []
    paper_names = []
    team_ids = []

    for i, paper_file in enumerate(paper_files):
        print(f"加载第 {i+1}/{len(paper_files)} 篇论文: {paper_file}")

        # The encryption code is the file-name part before the first '_'.
        file_name = os.path.basename(paper_file)
        encryption_code = file_name.split('_')[0]

        # Look up the team id; unknown codes are labelled '未知'.
        team_id = team_id_mapping.get(encryption_code, '未知')

        with open(paper_file, 'r', encoding='utf-8') as f:
            try:
                paper_content = json.load(f)
            except json.JSONDecodeError as e:
                print(f"解析JSON文件失败: {paper_file}")
                print(f"错误信息: {str(e)}")
                # Skip the paper entirely. Nothing has been appended yet, so
                # the three parallel lists stay aligned.
                continue

        all_papers_content.append(paper_content)
        paper_names.append(os.path.relpath(paper_file, data_dir))
        team_ids.append(team_id)
        print(f"成功加载论文，参赛队号: {team_id}")

    # Count the images of every paper.
    team_image_counts = count_images_per_paper(all_papers_content, team_ids)
    print("已完成各参赛队论文图片总数统计")

    similar_images = find_similar_images_sampled(all_papers_content, paper_names, team_ids, team_image_counts, data_dir, sample_size=2000)
    similar_images = pd.DataFrame(similar_images)
    df_images = deduplicate_similar(similar_images)

    try:
        with pd.ExcelWriter(result_path) as writer:
            print("写入雷同图片工作表...")
            if not df_images.empty:
                df_images.to_excel(writer, sheet_name='雷同图片', index=False)
            else:
                # Still write the header row so the sheet exists for readers.
                empty_images = pd.DataFrame(columns=['参赛队号', '雷同图片所在页码', '雷同图片的页内序号', '论文中的图片数量'])
                empty_images.to_excel(writer, sheet_name='雷同图片', index=False)
        print("结果已成功写入!")
    except Exception as e:
        print(f"写入Excel文件时出错: {str(e)}")


if __name__ == "__main__":
    main()

## 计算最终论文重复率

In [5]:
import os
import pandas as pd

def _compute_final_rate(duplication_sheet, image_sheet, formula_sheet):
    """Merge the per-team statistics and add the weighted '重复率' column."""
    # Per team: total image count of the paper and number of flagged images.
    image_groups = image_sheet.groupby('参赛队号')
    image_data = image_groups['论文中的图片数量'].first().to_frame()
    image_data['雷同图片数量'] = image_groups.size()
    image_data = image_data.reset_index()

    formula_counts = formula_sheet.groupby('参赛队号').size().reset_index(name='雷同公式数量')

    combined_df = duplication_sheet.merge(image_data, on='参赛队号', how='left').merge(
        formula_counts, on='参赛队号', how='left')

    # Teams without any flagged image / formula get zero counts.
    for column in ('雷同图片数量', '雷同公式数量', '论文中的图片数量'):
        combined_df[column] = combined_df[column].fillna(0).astype(int)

    # Papers with more than 25 images give the image term a larger weight.
    many_images = combined_df['论文中的图片数量'] > 25
    text_weight = many_images.map({True: 0.6, False: 0.8}).astype(float)
    image_weight = many_images.map({True: 0.3, False: 0.1}).astype(float)

    # NOTE(review): the image and formula terms are raw counts, while
    # '文本重复率' is presumably a ratio -- confirm the intended scale.
    combined_df['重复率'] = (
        text_weight * combined_df['文本重复率'] +
        image_weight * combined_df['雷同图片数量'] +
        0.1 * combined_df['雷同公式数量']
    )
    return combined_df


def combine_and_calculate_final_rate(excel_path):
    """Combine the text, image and formula results into a final rate sheet.

    Reads the sheets '文本重复率', '雷同图片' and '雷同公式' from
    ``excel_path``, computes a weighted duplication rate per team and writes
    the columns '参赛队号' and '重复率' to a sheet named '重复率' in the same
    workbook. An existing '重复率' sheet is replaced, so the function can be
    re-run on the same file.

    Args:
        excel_path: Path of the workbook to read from and append to.

    Returns:
        None. Errors are reported on stdout instead of being raised.
    """
    try:
        duplication_sheet = pd.read_excel(excel_path, sheet_name='文本重复率')
        image_sheet = pd.read_excel(excel_path, sheet_name='雷同图片')
        formula_sheet = pd.read_excel(excel_path, sheet_name='雷同公式')

        combined_df = _compute_final_rate(duplication_sheet, image_sheet, formula_sheet)

        # Append to the existing workbook. 'replace' avoids the failure that
        # the default if_sheet_exists='error' causes on a second run.
        with pd.ExcelWriter(excel_path, mode='a', engine='openpyxl',
                            if_sheet_exists='replace') as writer:
            result_df = combined_df[['参赛队号', '重复率']]
            result_df.to_excel(writer, sheet_name='重复率', index=False)
    except FileNotFoundError:
        print(f"错误：文件 '{excel_path}' 未找到")
    except Exception as e:
        print(f"处理文件时出错: {str(e)}")

def main():
    """Compute the final duplication rate for ``./data/result4.xlsx``.

    Prints an error and does nothing else when the workbook is missing.
    """
    excel_path = './data/result4.xlsx'

    if os.path.exists(excel_path):
        # Merge the individual result sheets and compute the final rate.
        combine_and_calculate_final_rate(excel_path)
    else:
        print(f"错误：文件 '{excel_path}' 不存在")


if __name__ == "__main__":
    main()