In [11]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
图片相似性分析工具 - 定制版
根据指定路径自动分析图片相似性
"""

import os
import argparse
from PIL import Image
import imagehash
import numpy as np
import cv2
from collections import defaultdict
import json
from datetime import datetime
import shutil

class ImageSimilarityAnalyzer:
    """Compare target images against reference images using perceptual hashes.

    Every image is hashed with several ``imagehash`` algorithms. Each target
    image is scored against every reference image, the per-reference scores
    are averaged, and the target is placed in a similarity bucket. Targets
    scoring below a caller-supplied threshold are copied to an output folder
    and a JSON + text report is written next to them.
    """

    # Display labels for the similarity buckets; shared by the console summary
    # and the text report so the two cannot drift apart.
    _GROUP_LABELS = {
        'very_similar': '高度相似 (≥95%)',
        'similar': '相似 (85%-95%)',
        'somewhat_similar': '有些相似 (70%-85%)',
        'dissimilar': '差异较大 (<70%)'
    }

    # (inclusive lower bound, bucket) pairs, checked from highest to lowest.
    # Anything below the last bound falls into the 'dissimilar' bucket.
    _GROUP_BOUNDS = (
        (0.95, 'very_similar'),
        (0.85, 'similar'),
        (0.70, 'somewhat_similar'),
    )

    def __init__(self, hash_size=8, highfreq_factor=4, image_scale=64):
        """
        Store the hashing parameters.

        Args:
            hash_size: passed as ``hash_size`` to the average/p/d/w hash calls.
            highfreq_factor: passed as ``highfreq_factor`` to ``imagehash.phash``.
            image_scale: passed as ``image_scale`` to ``imagehash.whash``.
        """
        self.hash_size = hash_size
        self.highfreq_factor = highfreq_factor
        self.image_scale = image_scale
        self.supported_formats = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    def is_image_file(self, file_path):
        """Return True when the path ends with a supported extension (case-insensitive)."""
        return file_path.lower().endswith(self.supported_formats)

    def get_image_paths(self, folder_path):
        """Recursively collect the paths of all supported image files under a folder."""
        image_paths = []
        for root, _dirs, files in os.walk(folder_path):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                if self.is_image_file(file_path):
                    image_paths.append(file_path)
        return image_paths

    def calculate_image_hashes(self, image_path):
        """
        Compute several perceptual hashes plus basic metadata for one image.

        Images with an alpha channel ('RGBA' or 'LA') are flattened onto a
        white RGB background; every other mode is converted to RGB.

        Returns:
            dict: ``{'hashes': {...}, 'info': {...}, 'success': True}`` on
            success, or ``{'success': False, 'error': <message>}`` when the
            image could not be opened or hashed.
        """
        try:
            with Image.open(image_path) as img:
                if img.mode in ('RGBA', 'LA'):
                    # Always composite in RGBA -> RGB. Building the background
                    # from the source mode (e.g. 'L' for 'LA') with a 3-tuple
                    # colour is rejected by Pillow and would not yield RGB.
                    rgba = img.convert('RGBA')
                    rgb = Image.new('RGB', rgba.size, (255, 255, 255))
                    rgb.paste(rgba, mask=rgba.split()[-1])
                else:
                    rgb = img.convert('RGB')

                hashes = {
                    'ahash': str(imagehash.average_hash(rgb, hash_size=self.hash_size)),
                    'phash': str(imagehash.phash(rgb, hash_size=self.hash_size,
                                                 highfreq_factor=self.highfreq_factor)),
                    'dhash': str(imagehash.dhash(rgb, hash_size=self.hash_size)),
                    'whash': str(imagehash.whash(rgb, hash_size=self.hash_size,
                                                 image_scale=self.image_scale)),
                    'colorhash': str(imagehash.colorhash(rgb))
                }

                # Metadata describes the converted image, plus the on-disk size.
                info = {
                    'width': rgb.width,
                    'height': rgb.height,
                    'mode': rgb.mode,
                    'file_size': os.path.getsize(image_path)
                }

                return {
                    'hashes': hashes,
                    'info': info,
                    'success': True
                }

        except Exception as e:
            # Deliberately broad: one unreadable image must not abort the run.
            print(f"计算图片哈希值失败 {image_path}: {str(e)}")
            return {'success': False, 'error': str(e)}

    def hamming_distance(self, hash1, hash2):
        """
        Count the positions at which two hash strings differ.

        NOTE: the comparison is per character. For the hash strings produced
        by ``calculate_image_hashes`` that means per string character, not per
        bit of the underlying hash.

        Returns:
            The number of differing positions, or ``float('inf')`` when the
            two strings have different lengths.
        """
        if len(hash1) != len(hash2):
            return float('inf')
        return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))

    def calculate_similarity_score(self, hash1, hash2):
        """
        Return a similarity score in [0, 1], where 1 means identical strings.

        Args:
            hash1, hash2: the two hash strings to compare.

        Returns:
            ``1 - distance / len(hash1)``. Hashes of different length score
            0.0 (they are not comparable), and two empty hashes score 1.0.
        """
        distance = self.hamming_distance(hash1, hash2)
        if distance == float('inf'):
            # Length mismatch: without this guard the score would be -inf,
            # which corrupts averages and the JSON report.
            return 0.0
        max_distance = len(hash1)
        if max_distance == 0:
            # Both strings are empty and therefore equal; avoid dividing by zero.
            return 1.0
        return 1 - (distance / max_distance)

    def _hash_images(self, image_paths, label):
        """Hash each path while printing progress; return {path: result} for successes."""
        hashed = {}
        total = len(image_paths)
        for index, img_path in enumerate(image_paths, start=1):
            print(f"处理{label} {index}/{total}: {os.path.basename(img_path)}")
            result = self.calculate_image_hashes(img_path)
            if result['success']:
                hashed[img_path] = result
        return hashed

    def _score_against_references(self, img_hashes, reference_data):
        """Score one image's hashes against every reference; return {ref_path: scores}."""
        by_reference = {}
        for ref_img, ref_data in reference_data.items():
            details = {
                hash_type: self.calculate_similarity_score(ref_hash, img_hashes[hash_type])
                for hash_type, ref_hash in ref_data['hashes'].items()
                if hash_type in img_hashes
            }
            if details:
                # The per-reference score is the mean over all hash algorithms.
                by_reference[ref_img] = {
                    'overall': sum(details.values()) / len(details),
                    'details': details
                }
        return by_reference

    def _classify(self, score):
        """Map an overall score to its similarity bucket name."""
        for lower_bound, group in self._GROUP_BOUNDS:
            if score >= lower_bound:
                return group
        return 'dissimilar'

    @staticmethod
    def _unique_name(file_name, used_names):
        """Return file_name, suffixed with _1, _2, ... if already used in this run."""
        stem, ext = os.path.splitext(file_name)
        candidate = file_name
        counter = 1
        # normcase makes the check case-insensitive where the platform is.
        while os.path.normcase(candidate) in used_names:
            candidate = f"{stem}_{counter}{ext}"
            counter += 1
        used_names.add(os.path.normcase(candidate))
        return candidate

    def analyze_similarity(self, target_folder, reference_folder, output_folder, threshold=0.7):
        """
        Score every target image against the reference set and export the outliers.

        Args:
            target_folder: folder (searched recursively) with the images to check.
            reference_folder: folder (searched recursively) with reference images.
            output_folder: destination for copied images and reports; created
                if it does not exist.
            threshold: images whose overall score is below this are copied.

        Returns:
            The results dict (scores, buckets, statistics), or ``None`` when a
            folder is missing or no image could be processed.
        """
        print("="*60)
        print("图片相似性分析工具 - 开始分析")
        print("="*60)

        # Validate paths: inputs must exist, the output folder is created on demand.
        paths = {
            '目标文件夹': target_folder,
            '参考文件夹': reference_folder,
            '输出文件夹': output_folder
        }

        for name, path in paths.items():
            if not os.path.exists(path):
                if name == '输出文件夹':
                    os.makedirs(path, exist_ok=True)
                    print(f"创建 {name}: {path}")
                else:
                    print(f"错误: {name} 不存在 - {path}")
                    return

        print(f"\n获取参考图片 ({reference_folder})...")
        reference_paths = self.get_image_paths(reference_folder)
        if not reference_paths:
            print("错误: 参考文件夹中未找到任何图片")
            return
        print(f"找到 {len(reference_paths)} 张参考图片")

        print(f"\n获取目标图片 ({target_folder})...")
        target_paths = self.get_image_paths(target_folder)
        if not target_paths:
            print("错误: 目标文件夹中未找到任何图片")
            return
        print(f"找到 {len(target_paths)} 张目标图片")

        print("\n计算参考图片哈希值...")
        reference_data = self._hash_images(reference_paths, '参考图片')
        if not reference_data:
            print("错误: 没有成功处理任何参考图片")
            return

        print("\n计算目标图片哈希值...")
        target_data = self._hash_images(target_paths, '目标图片')
        if not target_data:
            print("错误: 没有成功处理任何目标图片")
            return

        print("\n分析图片相似性...")
        results = {
            'reference_images': list(reference_data.keys()),
            'similarity_scores': {},
            'dissimilar_images': [],
            'similar_groups': defaultdict(list),
            'analysis_stats': {}
        }

        for img_path, img_data in target_data.items():
            hash_scores = self._score_against_references(img_data['hashes'], reference_data)
            if not hash_scores:
                continue

            # The final score is the mean of the per-reference scores.
            overall_scores = [hs['overall'] for hs in hash_scores.values()]
            final_score = sum(overall_scores) / len(overall_scores)

            results['similarity_scores'][img_path] = {
                'overall': final_score,
                'by_reference': hash_scores
            }

            group = self._classify(final_score)
            results['similar_groups'][group].append(img_path)
            if group == 'dissimilar':
                results['dissimilar_images'].append(img_path)

        results['analysis_stats'] = {
            'total_target_images': len(target_data),
            'reference_count': len(reference_data),
            'dissimilar_count': len(results['dissimilar_images']),
            'similarity_distribution': {
                group: len(images) for group, images in results['similar_groups'].items()
            },
            'analysis_time': datetime.now().isoformat()
        }

        self.display_results(results)
        self.copy_dissimilar_images(results, output_folder, threshold)
        self.save_report(results, output_folder)

        print("\n" + "="*60)
        print("分析完成！")
        print(f"差异图片已保存到: {output_folder}")
        print("="*60)

        return results

    def display_results(self, results):
        """Print the statistics, bucket distribution and up to ten dissimilar images."""
        print("\n📊 分析结果摘要")
        print("-" * 40)

        stats = results['analysis_stats']
        print(f"总目标图片数量: {stats['total_target_images']}")
        print(f"参考图片数量: {stats['reference_count']}")
        print(f"差异较大的图片数量: {stats['dissimilar_count']}")
        print()

        print("🔍 相似度分布:")
        for group, count in stats['similarity_distribution'].items():
            if count > 0:
                group_name = self._GROUP_LABELS.get(group, group)
                print(f"  {group_name}: {count} 张")

        dissimilar = results['dissimilar_images']
        if dissimilar:
            print()
            print("❌ 差异较大的图片:")
            # Only the first ten are listed to keep the console output short.
            for i, img_path in enumerate(dissimilar[:10]):
                score = results['similarity_scores'][img_path]['overall']
                print(f"  {i+1}. {os.path.basename(img_path)} (相似度: {score:.3f})")
            if len(dissimilar) > 10:
                print(f"  ... 还有 {len(dissimilar)-10} 张")

    def copy_dissimilar_images(self, results, output_folder, threshold=0.7):
        """
        Copy every scored image whose overall score is below ``threshold``.

        All entries of ``results['similarity_scores']`` are considered, so a
        threshold above the fixed 0.70 bucket boundary is honoured as well.
        Files sharing a basename within one run get a numeric suffix instead
        of overwriting each other.

        Returns:
            int: the number of files copied.
        """
        print(f"\n📁 正在复制差异图片到: {output_folder}")

        copied_count = 0
        used_names = set()
        for img_path, entry in results['similarity_scores'].items():
            try:
                score = entry['overall']
                if score < threshold:
                    base_name = os.path.basename(img_path)
                    dest_name = self._unique_name(base_name, used_names)
                    dest_path = os.path.join(output_folder, dest_name)
                    shutil.copy2(img_path, dest_path)
                    copied_count += 1
                    print(f"复制: {base_name} (相似度: {score:.3f})")
            except Exception as e:
                # Best effort: a single failed copy must not stop the others.
                print(f"复制文件失败 {img_path}: {str(e)}")

        print(f"成功复制 {copied_count} 张差异较大的图片")
        return copied_count

    def save_report(self, results, output_folder):
        """Write the full results as JSON and a readable text summary into output_folder."""
        # Detailed machine-readable report.
        json_path = os.path.join(output_folder, 'similarity_analysis_report.json')
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Short text summary.
        txt_path = os.path.join(output_folder, 'similarity_analysis_summary.txt')
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write("图片相似性分析报告\n")
            f.write("="*60 + "\n")
            f.write(f"分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

            stats = results['analysis_stats']
            f.write("📊 分析摘要\n")
            f.write(f"总目标图片数量: {stats['total_target_images']}\n")
            f.write(f"参考图片数量: {stats['reference_count']}\n")
            f.write(f"差异较大的图片数量: {stats['dissimilar_count']}\n\n")

            f.write("🔍 相似度分布:\n")
            for group, count in stats['similarity_distribution'].items():
                if count > 0:
                    group_name = self._GROUP_LABELS.get(group, group)
                    f.write(f"  {group_name}: {count} 张\n")

            if results['dissimilar_images']:
                f.write("\n❌ 差异较大的图片列表:\n")
                for img_path in results['dissimilar_images']:
                    score = results['similarity_scores'][img_path]['overall']
                    f.write(f"- {os.path.basename(img_path)} (相似度: {score:.3f})\n")

def main():
    """Run the similarity analysis with the hard-coded folders and threshold below."""
    # Folder locations, keyed by the analyze_similarity parameter they feed.
    folders = {
        'target_folder': r"C:\Users\Administrator\Downloads\骑手筛选\1-2q27",      # images to check
        'reference_folder': r"C:\Users\Administrator\Downloads\骑手筛选\ref",      # reference images
        'output_folder': r"C:\Users\Administrator\Downloads\骑手筛选\difference",  # copies + reports
    }

    # Images whose overall score (0-1) is below this value are copied out.
    similarity_threshold = 0.15

    try:
        analyzer = ImageSimilarityAnalyzer(hash_size=8, highfreq_factor=4, image_scale=64)
        analyzer.analyze_similarity(threshold=similarity_threshold, **folders)
    except KeyboardInterrupt:
        print("\n程序被用户中断")
    except Exception as exc:
        # Top-level boundary: report the failure with a traceback instead of crashing.
        print(f"程序执行出错: {str(exc)}")
        import traceback
        traceback.print_exc()

# Run the analysis only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()

图片相似性分析工具 - 开始分析

获取参考图片 (C:\Users\Administrator\Downloads\骑手筛选\ref)...
找到 6 张参考图片

获取目标图片 (C:\Users\Administrator\Downloads\骑手筛选\1-2q27)...
找到 253 张目标图片

计算参考图片哈希值...
处理参考图片 1/6: 11_q27_初心_在活动开始之际_请上传您的核实骑手身份的截图_52D7D40A_A0C8_45EE_BFEC_D1FDC1.jpeg
处理参考图片 2/6: 19_q27_晚风_在活动开始之际_请上传您的核实骑手身份的截图_Screenshot_20250917_110238_com.png
处理参考图片 3/6: 1_q27_123木头人_在活动开始之际_请上传您的核实骑手身份的截图_IMG_20250917_102702.jpg
处理参考图片 4/6: 4_q27_千与千寻_在活动开始之际_请上传您的核实骑手身份的截图_Screenshot_20250917_102709.jpg
处理参考图片 5/6: 56_q27_乌唧唧_在活动开始之际_请上传您的核实骑手身份的截图_IMG_7549.jpeg
处理参考图片 6/6: 9_q27_芝士_在活动开始之际_请上传您的核实骑手身份的截图_IMG_7877.jpeg

计算目标图片哈希值...
处理目标图片 1/253: 100_q27_筱筱骏骏_在活动开始之际_请上传您的核实骑手身份的截图_mmexport1758074298013.jpg
处理目标图片 2/253: 101_q27_夏虫不可语冰_在活动开始之际_请上传您的核实骑手身份的截图_Screenshot_20250917_110943_com.jpg
处理目标图片 3/253: 102_q27_青雉_在活动开始之际_请上传您的核实骑手身份的截图_IMG_3330.jpeg
处理目标图片 4/253: 103_q27___在活动开始之际_请上传您的核实骑手身份的截图_BD29744B_C024_47A1_92B4_6F01AA.jpeg
处理目标图片 5/253: 104_q27_momo_在活动开始之际_请上传您的核实骑手身份的截图_wx_camera_1758005595277.jpg
处理目标