In [None]:
import requests
import pandas as pd
import time
import math
import re
import json
import os
from typing import List, Dict, Any, Optional, Tuple
import logging

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SegmentedPubChemFetcher:
    """PubChem分段批量数据获取器 - 支持断点续传和避重复"""
    
    def __init__(self, batch_size: int = 50, delay: float = 0.3, max_retries: int = 3):
        """
        初始化分段获取器
        
        Args:
            batch_size: 每批处理的CID数量
            delay: 请求间延迟时间(秒)
            max_retries: 最大重试次数
        """
        self.batch_size = batch_size
        self.delay = delay
        self.max_retries = max_retries
        self.property_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/property/{}/JSON"
        
        # 分段配置
        self.segment_size = 10000  # 每段CID数量
        self.progress_file = "pubchem_progress.json"
        self.data_dir = "pubchem_segments"
        
        # 创建数据目录
        os.makedirs(self.data_dir, exist_ok=True)
    
    def fetch_cid_segments(self, 
                          start_cid: int, 
                          end_cid: int, 
                          segment_size: Optional[int] = None,
                          include_melting_boiling: bool = False,
                          resume: bool = True) -> str:
        """
        Fetch every compound in an inclusive CID range, one segment at a time.

        Each segment is fetched with ``fetch_molecular_properties``, written to
        its own CSV under ``self.data_dir`` and recorded in the progress file,
        so a later call can skip it. After the loop, all completed segments are
        merged into a single CSV by ``_merge_segments``.

        Args:
            start_cid: First CID of the range (inclusive).
            end_cid: Last CID of the range (inclusive).
            segment_size: CIDs per segment; falsy keeps ``self.segment_size``.
            include_melting_boiling: Also fetch melting/boiling points.
            resume: When True, continue from the saved progress of this task.
                When False, only this task's saved progress is discarded;
                records of other tasks in the progress file are kept.

        Returns:
            Path of the merged CSV, or "" when no segment data could be merged.
        """
        if segment_size:
            self.segment_size = segment_size

        task_id = f"cid_{start_cid}_{end_cid}"

        # Always start from the saved file: _save_progress rewrites the whole
        # file, so beginning from {} would erase every other task's record.
        progress = self._load_progress()
        if not resume:
            progress.pop(task_id, None)

        # Completed segments are stored as indices, which only mean something
        # for the segment size they were produced with. Reuse the stored size
        # so resumed runs skip exactly the ranges that were already fetched.
        saved_size = progress.get(task_id, {}).get('segment_size')
        if saved_size and saved_size != self.segment_size:
            logger.warning(
                f"进度文件中的分段大小为 {saved_size:,}，与请求的 {self.segment_size:,} 不一致；"
                f"为保持段编号一致，沿用已保存的分段大小"
            )
            self.segment_size = saved_size

        total_cids = end_cid - start_cid + 1
        total_segments = math.ceil(total_cids / self.segment_size)

        logger.info("=== 开始分段获取CID数据 ===")
        logger.info(f"CID范围: {start_cid} - {end_cid} ({total_cids:,} 个)")
        logger.info(f"分段大小: {self.segment_size:,}")
        logger.info(f"总段数: {total_segments}")
        logger.info(f"包含熔点沸点: {include_melting_boiling}")
        logger.info(f"断点续传: {resume}")

        if task_id not in progress:
            progress[task_id] = {
                'start_cid': start_cid,
                'end_cid': end_cid,
                'segment_size': self.segment_size,
                'include_melting_boiling': include_melting_boiling,
                'completed_segments': [],
                'failed_segments': [],
                'total_segments': total_segments,
                'start_time': time.time()
            }

        task_progress = progress[task_id]
        completed = set(task_progress['completed_segments'])
        failed_segments = task_progress.setdefault('failed_segments', [])

        logger.info(f"已完成段数: {len(completed)}/{total_segments}")
        if failed_segments:
            logger.warning(f"之前失败的段: {len(failed_segments)} 个")

        # The ETA is based on this run only; the stored start_time may include
        # arbitrary downtime between an interrupted run and this one.
        run_started = time.time()
        done_this_run = 0

        for segment_idx in range(total_segments):
            if segment_idx in completed:
                logger.info(f"段 {segment_idx + 1}/{total_segments} 已完成，跳过")
                continue

            # CID range covered by this segment (the last one may be shorter).
            segment_start = start_cid + segment_idx * self.segment_size
            segment_end = min(segment_start + self.segment_size - 1, end_cid)
            segment_cids = list(range(segment_start, segment_end + 1))

            logger.info(f"处理段 {segment_idx + 1}/{total_segments}: CID {segment_start}-{segment_end} ({len(segment_cids)} 个)")

            succeeded = False
            try:
                df_segment = self.fetch_molecular_properties(segment_cids, include_melting_boiling)

                if len(df_segment) > 0:
                    segment_file = os.path.join(self.data_dir, f"segment_{segment_idx:04d}_{segment_start}_{segment_end}.csv")
                    df_segment.to_csv(segment_file, index=False, encoding='utf-8')
                    succeeded = True
                    logger.info(f"段 {segment_idx + 1} 完成: 获得 {len(df_segment)} 个有效分子，已保存")
                else:
                    logger.warning(f"段 {segment_idx + 1} 无有效数据")

            except Exception as e:
                logger.error(f"段 {segment_idx + 1} 失败: {e}")

            if succeeded:
                task_progress['completed_segments'].append(segment_idx)
                done_this_run += 1
                if segment_idx in failed_segments:
                    failed_segments.remove(segment_idx)
            elif segment_idx not in failed_segments:
                # Record each failed/empty segment once, however often it is retried.
                failed_segments.append(segment_idx)

            # Persist after every segment so an interruption loses at most one.
            self._save_progress(progress)

            completed_count = len(task_progress['completed_segments'])
            if completed_count > 0:
                logger.info(f"总进度: {completed_count}/{total_segments} ({completed_count/total_segments*100:.1f}%)")
            if done_this_run > 0:
                avg_time_per_segment = (time.time() - run_started) / done_this_run
                estimated_remaining_time = avg_time_per_segment * (total_segments - completed_count)
                logger.info(f"预计剩余时间: {estimated_remaining_time/60:.1f} 分钟")

        logger.info("=== 开始合并段数据 ===")
        final_file = self._merge_segments(task_id, task_progress)

        task_progress['completed'] = True
        task_progress['final_file'] = final_file
        task_progress['end_time'] = time.time()
        self._save_progress(progress)

        total_time = task_progress['end_time'] - task_progress['start_time']
        logger.info("=== 分段获取完成 ===")
        logger.info(f"总耗时: {total_time/60:.1f} 分钟")
        logger.info(f"最终文件: {final_file}")

        return final_file
    
    def fetch_cid_list_segments(self, 
                               cid_list: List[int],
                               segment_size: Optional[int] = None,
                               include_melting_boiling: bool = False,
                               task_name: str = "custom") -> str:
        """
        Fetch an arbitrary list of CIDs in segments, with resumable progress.

        The list is de-duplicated and sorted, saved as JSON under
        ``self.data_dir``, then processed in slices of ``self.segment_size``.
        Each slice is fetched with ``fetch_molecular_properties`` and written
        to its own CSV; finally all completed slices are merged.

        Args:
            cid_list: CIDs to fetch (duplicates are removed).
            segment_size: CIDs per segment; falsy keeps ``self.segment_size``.
            include_melting_boiling: Also fetch melting/boiling points.
            task_name: Label used in file names and in the progress key.

        Returns:
            Path of the merged CSV, or "" when no segment data could be merged.
        """
        if segment_size:
            self.segment_size = segment_size

        # De-duplicate and sort so the segment boundaries are reproducible.
        cid_list = sorted(set(cid_list))
        total_cids = len(cid_list)
        total_segments = math.ceil(total_cids / self.segment_size)

        logger.info("=== 开始分段获取自定义CID列表 ===")
        logger.info(f"总CID数: {total_cids:,}")
        logger.info(f"分段大小: {self.segment_size:,}")
        logger.info(f"总段数: {total_segments}")

        # Keep a copy of the CID list next to the segment files.
        cid_list_file = os.path.join(self.data_dir, f"cid_list_{task_name}.json")
        with open(cid_list_file, 'w') as f:
            json.dump(cid_list, f)

        progress = self._load_progress()
        task_id = f"list_{task_name}_{len(cid_list)}"

        if task_id not in progress:
            progress[task_id] = {
                'task_name': task_name,
                'total_cids': total_cids,
                'segment_size': self.segment_size,
                'include_melting_boiling': include_melting_boiling,
                'completed_segments': [],
                'failed_segments': [],
                'total_segments': total_segments,
                'start_time': time.time(),
                'cid_list_file': cid_list_file
            }

        task_progress = progress[task_id]
        completed = set(task_progress['completed_segments'])
        failed_segments = task_progress.setdefault('failed_segments', [])

        for segment_idx in range(total_segments):
            if segment_idx in completed:
                logger.info(f"段 {segment_idx + 1}/{total_segments} 已完成，跳过")
                continue

            start_idx = segment_idx * self.segment_size
            end_idx = min(start_idx + self.segment_size, total_cids)
            segment_cids = cid_list[start_idx:end_idx]

            logger.info(f"处理段 {segment_idx + 1}/{total_segments}: {len(segment_cids)} 个CID")

            succeeded = False
            try:
                df_segment = self.fetch_molecular_properties(segment_cids, include_melting_boiling)

                if len(df_segment) > 0:
                    segment_file = os.path.join(self.data_dir, f"list_segment_{task_name}_{segment_idx:04d}.csv")
                    df_segment.to_csv(segment_file, index=False, encoding='utf-8')
                    succeeded = True
                    logger.info(f"段 {segment_idx + 1} 完成: {len(df_segment)} 个分子")
                else:
                    # Same handling as fetch_cid_segments: an empty segment is
                    # reported and tracked instead of vanishing silently.
                    logger.warning(f"段 {segment_idx + 1} 无有效数据")

            except Exception as e:
                logger.error(f"段 {segment_idx + 1} 失败: {e}")

            if succeeded:
                task_progress['completed_segments'].append(segment_idx)
                # A segment that failed earlier and now succeeded is no longer failed.
                if segment_idx in failed_segments:
                    failed_segments.remove(segment_idx)
            elif segment_idx not in failed_segments:
                failed_segments.append(segment_idx)

            # Persist after every segment so an interruption loses at most one.
            self._save_progress(progress)

        final_file = self._merge_segments(task_id, task_progress, is_list=True)

        task_progress['completed'] = True
        task_progress['final_file'] = final_file
        task_progress['end_time'] = time.time()
        self._save_progress(progress)

        logger.info(f"自定义CID列表获取完成: {final_file}")
        return final_file
    
    def _merge_segments(self, task_id: str, task_progress: Dict[str, Any], is_list: bool = False) -> str:
        """
        Concatenate the CSVs of all completed segments into one output CSV.

        Segment file names are rebuilt from ``task_progress`` using the same
        naming scheme the fetch methods write with. Rows are de-duplicated on
        ``CID`` (first occurrence wins) and the result is written to the
        current working directory.

        Args:
            task_id: Progress key of the task (not used for file naming).
            task_progress: Progress record of the task.
            is_list: True for tasks created by ``fetch_cid_list_segments``.

        Returns:
            File name of the merged CSV, or "" if no segment file could be read.
        """
        all_dataframes = []
        completed_segments = task_progress['completed_segments']

        logger.info(f"合并 {len(completed_segments)} 个段的数据...")

        for segment_idx in sorted(completed_segments):
            if is_list:
                task_name = task_progress['task_name']
                segment_file = os.path.join(self.data_dir, f"list_segment_{task_name}_{segment_idx:04d}.csv")
            else:
                start_cid = task_progress['start_cid']
                segment_start = start_cid + segment_idx * task_progress['segment_size']
                segment_end = min(segment_start + task_progress['segment_size'] - 1, task_progress['end_cid'])
                segment_file = os.path.join(self.data_dir, f"segment_{segment_idx:04d}_{segment_start}_{segment_end}.csv")

            if not os.path.exists(segment_file):
                # The segment is marked completed but its data is gone; say so
                # instead of producing a silently incomplete merge.
                logger.warning(f"段 {segment_idx} 的数据文件不存在，已跳过: {segment_file}")
                continue

            try:
                df_segment = pd.read_csv(segment_file)
                all_dataframes.append(df_segment)
                logger.debug(f"已加载段 {segment_idx}: {len(df_segment)} 条记录")
            except Exception as e:
                logger.warning(f"加载段 {segment_idx} 失败: {e}")

        if not all_dataframes:
            logger.error("没有找到任何有效的段数据文件")
            return ""

        final_df = pd.concat(all_dataframes, ignore_index=True)

        # De-duplicate on CID, keeping the first occurrence.
        initial_count = len(final_df)
        if 'CID' in final_df.columns:
            final_df = final_df.drop_duplicates(subset=['CID'], keep='first')
        final_count = len(final_df)

        if initial_count != final_count:
            logger.info(f"去重: {initial_count} -> {final_count} 条记录")

        # The output name encodes the task, the row count and a timestamp.
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        if is_list:
            final_file = f"pubchem_data_{task_progress['task_name']}_{final_count}_{timestamp}.csv"
        else:
            start_cid = task_progress['start_cid']
            end_cid = task_progress['end_cid']
            final_file = f"pubchem_data_{start_cid}_{end_cid}_{final_count}_{timestamp}.csv"

        final_df.to_csv(final_file, index=False, encoding='utf-8')

        self._print_final_statistics(final_df, final_file)

        return final_file
    
    def _print_final_statistics(self, df: pd.DataFrame, filename: str):
        """打印最终统计信息"""
        print(f"\n=== 最终数据统计 ===")
        print(f"文件: {filename}")
        print(f"总分子数: {len(df):,}")
        print(f"有效SMILES: {df['SMILES'].notna().sum():,}")
        print(f"有分子量数据: {df['Molecular_Weight'].notna().sum():,}")
        
        if 'Melting_Point' in df.columns:
            mp_count = df['Melting_Point'].notna().sum()
            bp_count = df['Boiling_Point'].notna().sum()
            print(f"有熔点数据: {mp_count:,} ({mp_count/len(df)*100:.1f}%)")
            print(f"有沸点数据: {bp_count:,} ({bp_count/len(df)*100:.1f}%)")
        
        if df['Molecular_Weight'].notna().any():
            mw_data = df['Molecular_Weight'].dropna()
            print(f"\n分子量统计:")
            print(f"  范围: {mw_data.min():.1f} - {mw_data.max():.1f}")
            print(f"  平均: {mw_data.mean():.1f}")
            print(f"  中位数: {mw_data.median():.1f}")
    
    def _load_progress(self) -> Dict[str, Any]:
        """加载进度文件"""
        if os.path.exists(self.progress_file):
            try:
                with open(self.progress_file, 'r') as f:
                    return json.load(f)
            except Exception as e:
                logger.warning(f"加载进度文件失败: {e}")
        return {}
    
    def _save_progress(self, progress: Dict[str, Any]):
        """保存进度文件"""
        try:
            with open(self.progress_file, 'w') as f:
                json.dump(progress, f, indent=2)
        except Exception as e:
            logger.warning(f"保存进度文件失败: {e}")
    
    def show_progress(self):
        """显示当前所有任务的进度"""
        progress = self._load_progress()
        
        if not progress:
            print("没有正在进行的任务")
            return
        
        print("\n=== 任务进度 ===")
        for task_id, task_info in progress.items():
            print(f"\n任务: {task_id}")
            
            if task_info.get('completed', False):
                print(f"  状态: ✅ 已完成")
                print(f"  最终文件: {task_info.get('final_file', 'Unknown')}")
            else:
                total = task_info['total_segments']
                completed = len(task_info['completed_segments'])
                failed = len(task_info.get('failed_segments', []))
                
                print(f"  状态: 🔄 进行中")
                print(f"  进度: {completed}/{total} ({completed/total*100:.1f}%)")
                
                if failed > 0:
                    print(f"  失败段数: {failed}")
                
                # 时间估算
                if completed > 0:
                    elapsed = time.time() - task_info['start_time']
                    avg_time = elapsed / completed
                    remaining_time = avg_time * (total - completed)
                    print(f"  已耗时: {elapsed/60:.1f} 分钟")
                    print(f"  预计剩余: {remaining_time/60:.1f} 分钟")
    
    def cleanup_segments(self, task_id: str = None):
        """清理段文件（可选保留最终合并文件）"""
        if task_id:
            # 清理特定任务的段文件
            progress = self._load_progress()
            if task_id in progress and progress[task_id].get('completed', False):
                # 只清理已完成任务的段文件
                pattern = f"*{task_id}*"
                # 实现段文件清理逻辑
                logger.info(f"清理任务 {task_id} 的段文件")
        else:
            # 清理所有段文件
            logger.info("清理所有段文件...")
    
    # ===== 以下是原有的分子属性获取方法 =====
    
    def fetch_molecular_properties(self, cids: List[int], include_melting_boiling: bool = True) -> pd.DataFrame:
        """
        Fetch the basic properties of the given CIDs in batches.

        The CIDs are split into chunks of ``self.batch_size``; each chunk is
        requested through ``_fetch_batch_properties`` with a pause of
        ``self.delay`` seconds between chunks. Optionally the melting and
        boiling points are added afterwards.

        Args:
            cids: CIDs to look up.
            include_melting_boiling: Also query melting/boiling points.

        Returns:
            DataFrame with one row per record that was returned.
        """
        # Properties requested from the property endpoint.
        properties_str = ",".join((
            "MolecularWeight",
            "HeavyAtomCount",
            "MolecularFormula",
            "CanonicalSMILES",
            "IUPACName",
            "XLogP",
        ))

        cid_count = len(cids)
        step = self.batch_size
        total_batches = math.ceil(cid_count / step)

        logger.info(f"开始获取 {cid_count} 个分子的基础属性，共 {total_batches} 批")

        # Step 1: basic properties, one request per chunk.
        all_results = []
        for batch_num, offset in enumerate(range(0, cid_count, step), start=1):
            cid_batch = cids[offset:offset + step]

            logger.debug(f"处理第 {batch_num}/{total_batches} 批基础属性 ({len(cid_batch)} 个分子)")

            all_results.extend(self._fetch_batch_properties(cid_batch, properties_str))

            # Pause between chunks (not after the last one) to respect API limits.
            is_last_batch = offset + step >= cid_count
            if not is_last_batch:
                time.sleep(self.delay)

        # Step 2: optional melting/boiling points, added to the records in place.
        if include_melting_boiling and all_results:
            logger.info("开始获取熔点沸点数据...")
            self._add_melting_boiling_data(all_results)

        logger.info(f"成功获取 {len(all_results)} 个分子的完整属性数据")
        return pd.DataFrame(all_results)
    
    def _fetch_batch_properties(self, cid_batch: List[int], properties: str) -> List[Dict[str, Any]]:
        """获取一批CID的属性"""
        results = []
        
        for retry in range(self.max_retries):
            try:
                cids_str = ','.join(map(str, cid_batch))
                url = self.property_url.format(cids_str, properties)
                
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                
                data = response.json()
                
                # 处理返回的属性数据
                for prop in data['PropertyTable']['Properties']:
                    result = self._process_property_data(prop)
                    if result:  # 只添加有效记录
                        results.append(result)
                
                break  # 成功则跳出重试循环
                
            except Exception as e:
                logger.warning(f"批次获取失败 (尝试 {retry + 1}/{self.max_retries}): {e}")
                if retry == self.max_retries - 1:
                    logger.error(f"批次 {cid_batch[:3]}... 最终失败")
                else:
                    time.sleep(1)  # 重试前等待1秒
        
        return results
    
    def _process_property_data(self, prop: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """处理单个分子的属性数据"""
        try:
            result = {
                'CID': self._safe_int_convert(prop.get('CID')),
                'SMILES': prop.get('CanonicalSMILES', ''),
                'Molecular_Weight': self._safe_float_convert(prop.get('MolecularWeight')),
                'Heavy_Atom_Count': self._safe_int_convert(prop.get('HeavyAtomCount')),
                'Molecular_Formula': prop.get('MolecularFormula', ''),
                'IUPAC_Name': prop.get('IUPACName', ''),
                'XLogP': self._safe_float_convert(prop.get('XLogP')),
                'Melting_Point': None,  # 初始化为None，后续通过PUG-VIEW获取
                'Boiling_Point': None   # 初始化为None，后续通过PUG-VIEW获取
            }
            
            # 验证关键字段
            if result['CID'] is None:
                return None
                
            return result
            
        except Exception as e:
            logger.warning(f"处理属性数据失败: {e}")
            return None
    
    def _add_melting_boiling_data(self, results: List[Dict[str, Any]]):
        """
        Fill 'Melting_Point' and 'Boiling_Point' of each record in place.

        For every record with a CID, both properties are requested through
        ``_get_experimental_property``; a value is stored only when one was
        found. The method sleeps ``self.delay`` seconds after each molecule.

        Args:
            results: Records as produced by ``_process_property_data``.
        """
        total_molecules = len(results)
        logger.info(f"开始为 {total_molecules} 个分子获取熔点沸点数据")

        # (heading to query, record key to fill), in request order.
        targets = (("Melting Point", 'Melting_Point'), ("Boiling Point", 'Boiling_Point'))

        for position, record in enumerate(results, start=1):
            cid = record['CID']
            if cid is None:
                continue

            # Progress message every 10 molecules.
            if position % 10 == 0:
                logger.debug(f"熔点沸点数据获取进度: {position}/{total_molecules}")

            for heading, key in targets:
                value = self._get_experimental_property(cid, heading)
                if value is not None:
                    record[key] = value

            # Pause to respect API limits.
            time.sleep(self.delay)

        # Summary of how many values were obtained.
        mp_count = sum(r['Melting_Point'] is not None for r in results)
        bp_count = sum(r['Boiling_Point'] is not None for r in results)
        logger.info(f"熔点沸点数据获取完成: 熔点 {mp_count}/{total_molecules}, 沸点 {bp_count}/{total_molecules}")
    
    def _get_experimental_property(self, cid: int, property_name: str) -> Optional[float]:
        """
        Query PUG-VIEW for one experimental property of one compound.

        Args:
            cid: Compound identifier.
            property_name: Heading to request, e.g. "Melting Point".

        Returns:
            The parsed value, or None if the service answers 404, the request
            fails, or no value can be parsed.
        """
        try:
            # The heading goes into the query string with spaces written as '+'.
            heading = "+".join(property_name.split(" "))
            url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON?heading={heading}"

            response = requests.get(url, timeout=30)

            # 404 means there is no such data for this compound; not an error.
            if response.status_code == 404:
                return None

            response.raise_for_status()
            return self._parse_experimental_value(response.json(), property_name)

        except Exception as e:
            logger.debug(f"获取CID {cid} 的 {property_name} 失败: {e}")
            return None
    
    def _parse_experimental_value(self, data: Dict[str, Any], property_name: str) -> Optional[float]:
        """解析PUG-VIEW返回的实验数据"""
        try:
            # PUG-VIEW数据结构: Record -> Section -> Section -> Information
            record = data.get('Record', {})
            sections = record.get('Section', [])
            
            for section in sections:
                # 查找化学物理性质部分
                if 'Chemical and Physical Properties' in section.get('TOCHeading', ''):
                    subsections = section.get('Section', [])
                    
                    for subsection in subsections:
                        if 'Experimental Properties' in subsection.get('TOCHeading', ''):
                            exp_sections = subsection.get('Section', [])
                            
                            for exp_section in exp_sections:
                                heading = exp_section.get('TOCHeading', '')
                                if property_name.lower() in heading.lower():
                                    # 找到目标属性，提取数值
                                    return self._extract_temperature_value(exp_section)
            
            return None
            
        except Exception as e:
            logger.debug(f"解析 {property_name} 数据失败: {e}")
            return None
    
    def _extract_temperature_value(self, section: Dict[str, Any]) -> Optional[float]:
        """从PUG-VIEW section中提取温度数值"""
        try:
            information_list = section.get('Information', [])
            
            for info in information_list:
                value_data = info.get('Value', {})
                
                # 尝试从不同的字段提取数值
                if 'StringWithMarkup' in value_data:
                    for string_data in value_data['StringWithMarkup']:
                        text = string_data.get('String', '')
                        temp_value = self._parse_temperature_text(text)
                        if temp_value is not None:
                            return temp_value
                
                elif 'Number' in value_data:
                    numbers = value_data['Number']
                    if numbers and len(numbers) > 0:
                        return self._safe_float_convert(numbers[0])
            
            return None
            
        except Exception as e:
            logger.debug(f"提取温度数值失败: {e}")
            return None
    
    def _parse_temperature_text(self, text: str) -> Optional[float]:
        """从文本中解析温度数值"""
        try:
            # 常见的温度格式模式
            patterns = [
                r'(-?\d+\.?\d*)\s*°?[Cc]',  # 123°C, 123 C, 123°c
                r'(-?\d+\.?\d*)\s*deg\s*[Cc]',  # 123 deg C
                r'(-?\d+\.?\d*)\s*degrees?\s*[Cc]',  # 123 degrees C
                r'(-?\d+\.?\d*)\s*°?[Ff]',  # 华氏度，需要转换
                r'(-?\d+\.?\d*)\s*K',  # 开尔文，需要转换
                r'(-?\d+\.?\d*)',  # 纯数字，假设为摄氏度
            ]
            
            for pattern in patterns:
                match = re.search(pattern, text)
                if match:
                    value = float(match.group(1))
                    
                    # 单位转换
                    if '°F' in text or 'deg F' in text or 'degrees F' in text:
                        # 华氏度转摄氏度
                        value = (value - 32) * 5 / 9
                    elif 'K' in text:
                        # 开尔文转摄氏度
                        value = value - 273.15
                    
                    # 合理性检查 (熔点沸点通常在-200到2000°C之间)
                    if -200 <= value <= 2000:
                        return value
            
            return None
            
        except Exception as e:
            logger.debug(f"解析温度文本失败: {text}, {e}")
            return None
    
    def _safe_float_convert(self, value: Any) -> Optional[float]:
        """安全地转换为float"""
        try:
            if value is None or value == '' or value == 'Unknown':
                return None
            return float(value)
        except (ValueError, TypeError):
            return None
    
    def _safe_int_convert(self, value: Any) -> Optional[int]:
        """安全地转换为int"""
        try:
            if value is None or value == '':
                return None
            return int(float(value))
        except (ValueError, TypeError):
            return None


# 使用示例
if __name__ == "__main__":
    # Build the segmented fetcher.
    fetcher = SegmentedPubChemFetcher(
        batch_size=30,     # CIDs per API request
        delay=0.3,         # pause between API requests, in seconds
        max_retries=3
    )

    print("\n".join([
        "=== PubChem 分段批量获取器 ===",
        "支持断点续传，避免重复获取\n",
        # ===== Mode 1: fetch a CID range in segments =====
        "📦 模式1: CID范围分段获取",
    ]))

    try:
        final_file = fetcher.fetch_cid_segments(
            start_cid=1673000,
            end_cid=2000000,
            segment_size=5000,              # 5000 CIDs per segment
            include_melting_boiling=False,  # skipped for speed
            resume=True                     # continue from saved progress
        )
    except KeyboardInterrupt:
        print("\n⏸️  程序被中断，进度已保存，下次运行时会自动续传")
    except Exception as e:
        print(f"❌ CID范围获取失败: {e}")
    else:
        print(f"✅ CID范围获取完成: {final_file}")

    # # ===== Mode 2: fetch a custom CID list in segments =====
    # print("\n📋 模式2: 自定义CID列表分段获取")

    # # Build a larger test list of CIDs
    # test_cids = list(range(2000, 3000, 1)) + list(range(5000, 6000, 1))  # 2000 CIDs

    # try:
    #     final_file = fetcher.fetch_cid_list_segments(
    #         cid_list=test_cids,
    #         segment_size=500,            # 500 CIDs per segment
    #         include_melting_boiling=False,
    #         task_name="test_batch"       # task name
    #     )

    #     print(f"✅ 自定义列表获取完成: {final_file}")

    # except Exception as e:
    #     print(f"❌ 自定义列表获取失败: {e}")

    # ===== Show progress of all recorded tasks =====
    print("\n📊 查看当前任务进度:")
    fetcher.show_progress()

    # ===== Usage notes and feature summary =====
    print("\n".join([
        f"\n💡 使用说明:",
        f"🔄 断点续传: 程序中断后再次运行会自动从断点继续",
        f"📁 数据文件: 段数据保存在 '{fetcher.data_dir}' 目录",
        f"📊 进度文件: 进度保存在 '{fetcher.progress_file}'",
        f"🧹 清理: 可以使用 fetcher.cleanup_segments() 清理段文件",
        f"\n⚡ 性能优势:",
        f"• 避免API超时和限制",
        f"• 支持断点续传",
        f"• 自动去重合并",
        f"• 内存友好",
        f"• 进度可追踪",
    ]))

2025-06-12 09:26:38,885 - INFO - === 开始分段获取CID数据 ===
2025-06-12 09:26:38,885 - INFO - CID范围: 1673000 - 2000000 (327,001 个)
2025-06-12 09:26:38,886 - INFO - 分段大小: 5,000
2025-06-12 09:26:38,886 - INFO - 总段数: 66
2025-06-12 09:26:38,887 - INFO - 包含熔点沸点: False
2025-06-12 09:26:38,887 - INFO - 断点续传: True
2025-06-12 09:26:38,890 - INFO - 已完成段数: 0/66
2025-06-12 09:26:38,890 - INFO - 处理段 1/66: CID 1673000-1677999 (5000 个)
2025-06-12 09:26:38,890 - INFO - 开始获取 5000 个分子的基础属性，共 167 批


=== PubChem 分段批量获取器 ===
支持断点续传，避免重复获取

📦 模式1: CID范围分段获取


2025-06-12 09:30:52,590 - INFO - 成功获取 5000 个分子的完整属性数据
2025-06-12 09:30:52,639 - INFO - 段 1 完成: 获得 5000 个有效分子，已保存
2025-06-12 09:30:52,643 - INFO - 总进度: 1/66 (1.5%)
2025-06-12 09:30:52,645 - INFO - 预计剩余时间: 274.9 分钟
2025-06-12 09:30:52,645 - INFO - 处理段 2/66: CID 1678000-1682999 (5000 个)
2025-06-12 09:30:52,645 - INFO - 开始获取 5000 个分子的基础属性，共 167 批
2025-06-12 09:35:16,595 - INFO - 成功获取 5000 个分子的完整属性数据
2025-06-12 09:35:16,630 - INFO - 段 2 完成: 获得 5000 个有效分子，已保存
2025-06-12 09:35:16,633 - INFO - 总进度: 2/66 (3.0%)
2025-06-12 09:35:16,634 - INFO - 预计剩余时间: 276.1 分钟
2025-06-12 09:35:16,634 - INFO - 处理段 3/66: CID 1683000-1687999 (5000 个)
2025-06-12 09:35:16,635 - INFO - 开始获取 5000 个分子的基础属性，共 167 批
2025-06-12 09:39:34,898 - INFO - 成功获取 5000 个分子的完整属性数据
2025-06-12 09:39:34,943 - INFO - 段 3 完成: 获得 5000 个有效分子，已保存
2025-06-12 09:39:34,946 - INFO - 总进度: 3/66 (4.5%)
2025-06-12 09:39:34,947 - INFO - 预计剩余时间: 271.6 分钟
2025-06-12 09:39:34,948 - INFO - 处理段 4/66: CID 1688000-1692999 (5000 个)
2025-06-12 09:39:34,948 -


=== 最终数据统计 ===
文件: pubchem_data_1673000_2000000_326761_20250612_141320.csv
总分子数: 326,761
有效SMILES: 326,758
有分子量数据: 326,758
有熔点数据: 0 (0.0%)
有沸点数据: 0 (0.0%)

分子量统计:
  范围: 84.1 - 600.9
  平均: 419.7
  中位数: 422.9
✅ CID范围获取完成: pubchem_data_1673000_2000000_326761_20250612_141320.csv

📊 查看当前任务进度:

=== 任务进度 ===

任务: cid_0_10000
  状态: ✅ 已完成
  最终文件: pubchem_data_0_10000_9971_20250607_092337.csv

任务: cid_10000_20000
  状态: ✅ 已完成
  最终文件: pubchem_data_10000_20000_10001_20250607_095533.csv

任务: cid_20000_50000
  状态: ✅ 已完成
  最终文件: pubchem_data_20000_50000_30001_20250607_102920.csv

任务: cid_50000_100000
  状态: ✅ 已完成
  最终文件: pubchem_data_50000_100000_50001_20250607_113028.csv

任务: cid_100000_200000
  状态: ✅ 已完成
  最终文件: pubchem_data_100000_200000_100001_20250607_131440.csv

任务: cid_200000_300000
  状态: ✅ 已完成
  最终文件: pubchem_data_200000_300000_100001_20250607_162506.csv

任务: cid_300000_400000
  状态: ✅ 已完成
  最终文件: pubchem_data_300000_400000_100001_20250607_185035.csv

任务: cid_400000_500000
  状态: ✅ 已完成
  最终文件: p

#### 根据SMILES查找熔点、沸点

In [1]:
import requests
import pandas as pd
import time
import math
import re
from typing import List, Dict, Any, Optional, Tuple
import logging
from urllib.parse import quote

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class PubChemSMILESFetcher:
    """基于SMILES的PubChem批量数据获取器"""
    
    def __init__(self, batch_size: int = 50, delay: float = 0.3, max_retries: int = 3):
        """
        初始化批量获取器
        
        Args:
            batch_size: 每批处理的分子数量
            delay: 请求间延迟时间(秒)
            max_retries: 最大重试次数
        """
        self.batch_size = batch_size
        self.delay = delay
        self.max_retries = max_retries
        self.property_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}/property/{}/JSON"
        self.smiles_to_cid_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/{}/cids/JSON"
        self.batch_smiles_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/cids/JSON"
        
    def fetch_properties_from_smiles(self, smiles_list: List[str], include_melting_boiling: bool = True) -> pd.DataFrame:
        """
        根据SMILES列表获取分子属性
        
        Args:
            smiles_list: SMILES字符串列表
            include_melting_boiling: 是否包含熔点沸点数据
            
        Returns:
            包含分子属性的DataFrame
        """
        logger.info(f"开始处理 {len(smiles_list)} 个SMILES")
        
        # 第一步：将SMILES转换为CID
        smiles_to_cid_map = self._batch_convert_smiles_to_cid(smiles_list)
        
        # 统计转换结果
        successful_conversions = [(smiles, cid) for smiles, cid in smiles_to_cid_map.items() if cid is not None]
        failed_conversions = [smiles for smiles, cid in smiles_to_cid_map.items() if cid is None]
        
        logger.info(f"SMILES转CID完成: 成功 {len(successful_conversions)}, 失败 {len(failed_conversions)}")
        
        if failed_conversions:
            logger.warning(f"以下SMILES无法转换为CID: {failed_conversions[:5]}{'...' if len(failed_conversions) > 5 else ''}")
        
        # 第二步：获取CID对应的属性
        if successful_conversions:
            cids = [cid for _, cid in successful_conversions]
            properties_df = self._fetch_molecular_properties(cids, include_melting_boiling)
            
            # 第三步：将原始SMILES添加到结果中
            cid_to_smiles = {cid: smiles for smiles, cid in successful_conversions}
            properties_df['Input_SMILES'] = properties_df['CID'].map(cid_to_smiles)
            
            # 重新排列列顺序，将Input_SMILES放在最前面
            cols = ['Input_SMILES'] + [col for col in properties_df.columns if col != 'Input_SMILES']
            properties_df = properties_df[cols]
            
            # 添加失败的SMILES记录（只有SMILES，其他字段为空）
            if failed_conversions:
                failed_df = pd.DataFrame({
                    'Input_SMILES': failed_conversions,
                    'CID': [None] * len(failed_conversions),
                    'SMILES': [None] * len(failed_conversions),
                    'Molecular_Weight': [None] * len(failed_conversions),
                    'Heavy_Atom_Count': [None] * len(failed_conversions),
                    'Molecular_Formula': [None] * len(failed_conversions),
                    'IUPAC_Name': [None] * len(failed_conversions),
                    'XLogP': [None] * len(failed_conversions),
                    'Melting_Point': [None] * len(failed_conversions),
                    'Boiling_Point': [None] * len(failed_conversions)
                })
                properties_df = pd.concat([properties_df, failed_df], ignore_index=True)
            
            return properties_df
        else:
            # 所有SMILES都转换失败
            return pd.DataFrame({
                'Input_SMILES': smiles_list,
                'CID': [None] * len(smiles_list),
                'SMILES': [None] * len(smiles_list),
                'Molecular_Weight': [None] * len(smiles_list),
                'Heavy_Atom_Count': [None] * len(smiles_list),
                'Molecular_Formula': [None] * len(smiles_list),
                'IUPAC_Name': [None] * len(smiles_list),
                'XLogP': [None] * len(smiles_list),
                'Melting_Point': [None] * len(smiles_list),
                'Boiling_Point': [None] * len(smiles_list)
            })
    
    def _batch_convert_smiles_to_cid(self, smiles_list: List[str]) -> Dict[str, Optional[int]]:
        """
        批量将SMILES转换为CID
        
        Args:
            smiles_list: SMILES列表
            
        Returns:
            SMILES到CID的映射字典
        """
        smiles_to_cid = {}
        total_batches = math.ceil(len(smiles_list) / self.batch_size)
        
        logger.info(f"开始批量转换SMILES到CID，共 {total_batches} 批")
        
        for i in range(0, len(smiles_list), self.batch_size):
            batch_num = i // self.batch_size + 1
            batch_smiles = smiles_list[i:i + self.batch_size]
            
            logger.info(f"处理第 {batch_num}/{total_batches} 批SMILES转换 ({len(batch_smiles)} 个分子)")
            
            # 尝试批量转换
            batch_results = self._convert_smiles_batch(batch_smiles)
            
            # 如果批量转换失败，尝试逐个转换
            for j, smiles in enumerate(batch_smiles):
                if smiles in batch_results:
                    smiles_to_cid[smiles] = batch_results[smiles]
                else:
                    # 单独转换失败的SMILES
                    cid = self._convert_single_smiles(smiles)
                    smiles_to_cid[smiles] = cid
            
            # 添加延迟
            if i + self.batch_size < len(smiles_list):
                time.sleep(self.delay)
        
        return smiles_to_cid
    
    def _convert_smiles_batch(self, smiles_batch: List[str]) -> Dict[str, Optional[int]]:
        """
        批量转换SMILES到CID
        
        Args:
            smiles_batch: SMILES批次
            
        Returns:
            SMILES到CID的映射
        """
        results = {}
        
        for retry in range(self.max_retries):
            try:
                # 准备POST请求数据
                post_data = '\n'.join(smiles_batch)
                
                response = requests.post(
                    self.batch_smiles_url,
                    data=post_data,
                    headers={'Content-Type': 'application/x-www-form-urlencoded'},
                    timeout=30
                )
                
                if response.status_code == 200:
                    data = response.json()
                    # 解析返回的数据
                    if 'IdentifierList' in data:
                        cids = data['IdentifierList']['CID']
                        # 假设返回的CID顺序与输入SMILES顺序一致
                        for i, smiles in enumerate(smiles_batch):
                            if i < len(cids):
                                results[smiles] = cids[i]
                    break
                    
            except Exception as e:
                logger.debug(f"批量SMILES转换失败 (尝试 {retry + 1}/{self.max_retries}): {e}")
                if retry == self.max_retries - 1:
                    logger.warning(f"批量转换最终失败")
                else:
                    time.sleep(1)
        
        return results
    
    def _convert_single_smiles(self, smiles: str) -> Optional[int]:
        """
        将单个SMILES转换为CID
        
        Args:
            smiles: SMILES字符串
            
        Returns:
            CID或None
        """
        for retry in range(self.max_retries):
            try:
                # URL编码SMILES（处理特殊字符）
                encoded_smiles = quote(smiles, safe='')
                url = self.smiles_to_cid_url.format(encoded_smiles)
                
                response = requests.get(url, timeout=30)
                
                if response.status_code == 200:
                    data = response.json()
                    if 'IdentifierList' in data and 'CID' in data['IdentifierList']:
                        cids = data['IdentifierList']['CID']
                        # 返回第一个CID
                        return cids[0] if cids else None
                elif response.status_code == 404:
                    # SMILES不存在于PubChem
                    return None
                    
            except Exception as e:
                logger.debug(f"SMILES '{smiles}' 转换失败 (尝试 {retry + 1}/{self.max_retries}): {e}")
                if retry < self.max_retries - 1:
                    time.sleep(0.5)
        
        return None
    
    def _fetch_molecular_properties(self, cids: List[int], include_melting_boiling: bool = True) -> pd.DataFrame:
        """
        批量获取分子属性（内部方法，复用原代码的逻辑）
        """
        # 基础属性
        basic_properties = [
            "MolecularWeight",
            "HeavyAtomCount", 
            "MolecularFormula",
            "CanonicalSMILES",
            "IUPACName",
            "XLogP"
        ]
        
        properties_str = ",".join(basic_properties)
        
        all_results = []
        total_batches = math.ceil(len(cids) / self.batch_size)
        
        logger.info(f"开始获取 {len(cids)} 个分子的基础属性，共 {total_batches} 批")
        
        # 获取基础属性
        for i in range(0, len(cids), self.batch_size):
            batch_num = i // self.batch_size + 1
            cid_batch = cids[i:i + self.batch_size]
            
            logger.info(f"处理第 {batch_num}/{total_batches} 批基础属性 ({len(cid_batch)} 个分子)")
            
            batch_results = self._fetch_batch_properties(cid_batch, properties_str)
            all_results.extend(batch_results)
            
            if i + self.batch_size < len(cids):
                time.sleep(self.delay)
        
        # 获取熔点沸点数据
        if include_melting_boiling and all_results:
            logger.info("开始获取熔点沸点数据...")
            self._add_melting_boiling_data(all_results)
        
        logger.info(f"成功获取 {len(all_results)} 个分子的完整属性数据")
        return pd.DataFrame(all_results)
    
    def fetch_from_file(self, file_path: str, smiles_column: str = 'SMILES', 
                       include_melting_boiling: bool = True) -> pd.DataFrame:
        """
        从文件中读取SMILES并获取属性
        
        Args:
            file_path: 包含SMILES的文件路径 (支持csv, xlsx)
            smiles_column: SMILES列的列名
            include_melting_boiling: 是否包含熔点沸点
            
        Returns:
            包含分子属性的DataFrame
        """
        try:
            # 读取文件
            if file_path.endswith('.csv'):
                df = pd.read_csv(file_path)
            elif file_path.endswith('.xlsx'):
                df = pd.read_excel(file_path)
            else:
                raise ValueError(f"不支持的文件格式: {file_path}")
            
            # 检查列是否存在
            if smiles_column not in df.columns:
                raise ValueError(f"找不到列 '{smiles_column}'。可用的列: {list(df.columns)}")
            
            # 提取SMILES列并去除空值
            smiles_list = df[smiles_column].dropna().astype(str).tolist()
            
            logger.info(f"从文件 {file_path} 读取到 {len(smiles_list)} 个SMILES")
            
            # 获取属性
            results_df = self.fetch_properties_from_smiles(smiles_list, include_melting_boiling)
            
            # 如果原文件有其他列，可以合并回去
            if len(df.columns) > 1:
                # 基于SMILES进行合并
                results_df = pd.merge(
                    df, 
                    results_df, 
                    left_on=smiles_column, 
                    right_on='Input_SMILES', 
                    how='left'
                )
                # 删除重复的Input_SMILES列
                if 'Input_SMILES' in results_df.columns and smiles_column != 'Input_SMILES':
                    results_df = results_df.drop('Input_SMILES', axis=1)
            
            return results_df
            
        except Exception as e:
            logger.error(f"从文件读取SMILES失败: {e}")
            raise
    
    # 以下方法从原代码复制，保持不变
    def _fetch_batch_properties(self, cid_batch: List[int], properties: str) -> List[Dict[str, Any]]:
        """
        Fetch the requested properties for one batch of CIDs.

        The request is attempted up to ``self.max_retries`` times, pausing one
        second between failed attempts.

        Args:
            cid_batch: CIDs to query in a single request.
            properties: Comma-separated property names for the URL.

        Returns:
            Row dicts produced by ``_process_property_data``; empty when every
            attempt failed.
        """
        collected = []
        final_attempt = self.max_retries - 1

        for attempt in range(self.max_retries):
            try:
                url = self.property_url.format(','.join(str(cid) for cid in cid_batch), properties)

                response = requests.get(url, timeout=30)
                response.raise_for_status()

                payload = response.json()
                for entry in payload['PropertyTable']['Properties']:
                    row = self._process_property_data(entry)
                    if row:
                        collected.append(row)

            except Exception as e:
                logger.warning(f"批次获取失败 (尝试 {attempt + 1}/{self.max_retries}): {e}")
                if attempt == final_attempt:
                    logger.error(f"批次 {cid_batch[:3]}... 最终失败")
                else:
                    time.sleep(1)
            else:
                # The request went through; no further attempts needed.
                break

        return collected
    
    def _process_property_data(self, prop: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """处理单个分子的属性数据"""
        try:
            result = {
                'CID': self._safe_int_convert(prop.get('CID')),
                'SMILES': prop.get('CanonicalSMILES', ''),
                'Molecular_Weight': self._safe_float_convert(prop.get('MolecularWeight')),
                'Heavy_Atom_Count': self._safe_int_convert(prop.get('HeavyAtomCount')),
                'Molecular_Formula': prop.get('MolecularFormula', ''),
                'IUPAC_Name': prop.get('IUPACName', ''),
                'XLogP': self._safe_float_convert(prop.get('XLogP')),
                'Melting_Point': None,
                'Boiling_Point': None
            }
            
            if result['CID'] is None:
                return None
                
            return result
            
        except Exception as e:
            logger.warning(f"处理属性数据失败: {e}")
            return None
    
    def _add_melting_boiling_data(self, results: List[Dict[str, Any]]):
        """通过PUG-VIEW API添加熔点沸点数据"""
        total_molecules = len(results)
        logger.info(f"开始为 {total_molecules} 个分子获取熔点沸点数据")
        
        for i, result in enumerate(results):
            cid = result['CID']
            if cid is None:
                continue
                
            if (i + 1) % 10 == 0:
                logger.info(f"熔点沸点数据获取进度: {i + 1}/{total_molecules}")
            
            melting_point = self._get_experimental_property(cid, "Melting Point")
            if melting_point is not None:
                result['Melting_Point'] = melting_point
            
            boiling_point = self._get_experimental_property(cid, "Boiling Point")
            if boiling_point is not None:
                result['Boiling_Point'] = boiling_point
            
            time.sleep(self.delay)
        
        mp_count = sum(1 for r in results if r['Melting_Point'] is not None)
        bp_count = sum(1 for r in results if r['Boiling_Point'] is not None)
        logger.info(f"熔点沸点数据获取完成: 熔点 {mp_count}/{total_molecules}, 沸点 {bp_count}/{total_molecules}")
    
    def _get_experimental_property(self, cid: int, property_name: str) -> Optional[float]:
        """
        Fetch one experimental property of a compound through PUG-VIEW.

        Args:
            cid: PubChem compound id.
            property_name: Heading to request, e.g. "Melting Point".

        Returns:
            The value parsed by ``_parse_experimental_value``, or None on
            HTTP 404 or on any request/parsing error.
        """
        try:
            # Spaces in the heading are sent as '+' in the query string.
            url = (
                "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/"
                f"{cid}/JSON?heading={property_name.replace(' ', '+')}"
            )

            response = requests.get(url, timeout=30)
            if response.status_code == 404:
                return None

            response.raise_for_status()
            return self._parse_experimental_value(response.json(), property_name)

        except Exception as e:
            logger.debug(f"获取CID {cid} 的 {property_name} 失败: {e}")
            return None
    
    def _parse_experimental_value(self, data: Dict[str, Any], property_name: str) -> Optional[float]:
        """解析PUG-VIEW返回的实验数据"""
        try:
            record = data.get('Record', {})
            sections = record.get('Section', [])
            
            for section in sections:
                if 'Chemical and Physical Properties' in section.get('TOCHeading', ''):
                    subsections = section.get('Section', [])
                    
                    for subsection in subsections:
                        if 'Experimental Properties' in subsection.get('TOCHeading', ''):
                            exp_sections = subsection.get('Section', [])
                            
                            for exp_section in exp_sections:
                                heading = exp_section.get('TOCHeading', '')
                                if property_name.lower() in heading.lower():
                                    return self._extract_temperature_value(exp_section)
            
            return None
            
        except Exception as e:
            logger.debug(f"解析 {property_name} 数据失败: {e}")
            return None
    
    def _extract_temperature_value(self, section: Dict[str, Any]) -> Optional[float]:
        """从PUG-VIEW section中提取温度数值"""
        try:
            information_list = section.get('Information', [])
            
            for info in information_list:
                value_data = info.get('Value', {})
                
                if 'StringWithMarkup' in value_data:
                    for string_data in value_data['StringWithMarkup']:
                        text = string_data.get('String', '')
                        temp_value = self._parse_temperature_text(text)
                        if temp_value is not None:
                            return temp_value
                
                elif 'Number' in value_data:
                    numbers = value_data['Number']
                    if numbers and len(numbers) > 0:
                        return self._safe_float_convert(numbers[0])
            
            return None
            
        except Exception as e:
            logger.debug(f"提取温度数值失败: {e}")
            return None
    
    def _parse_temperature_text(self, text: str) -> Optional[float]:
        """从文本中解析温度数值"""
        try:
            patterns = [
                r'(-?\d+\.?\d*)\s*°?[Cc]',
                r'(-?\d+\.?\d*)\s*deg\s*[Cc]',
                r'(-?\d+\.?\d*)\s*degrees?\s*[Cc]',
                r'(-?\d+\.?\d*)\s*°?[Ff]',
                r'(-?\d+\.?\d*)\s*K',
                r'(-?\d+\.?\d*)',
            ]
            
            for pattern in patterns:
                match = re.search(pattern, text)
                if match:
                    value = float(match.group(1))
                    
                    if '°F' in text or 'deg F' in text or 'degrees F' in text:
                        value = (value - 32) * 5 / 9
                    elif 'K' in text:
                        value = value - 273.15
                    
                    if -200 <= value <= 2000:
                        return value
            
            return None
            
        except Exception as e:
            logger.debug(f"解析温度文本失败: {text}, {e}")
            return None
    
    def _safe_float_convert(self, value: Any) -> Optional[float]:
        """安全地转换为float"""
        try:
            if value is None or value == '' or value == 'Unknown':
                return None
            return float(value)
        except (ValueError, TypeError):
            return None
    
    def _safe_int_convert(self, value: Any) -> Optional[int]:
        """安全地转换为int"""
        try:
            if value is None or value == '':
                return None
            return int(float(value))
        except (ValueError, TypeError):
            return None
    
    def save_results(self, df: pd.DataFrame, output_path: str, include_stats: bool = True):
        """保存结果并显示统计信息"""
        df.to_csv(output_path, index=False, encoding='utf-8')
        logger.info(f"结果已保存到: {output_path}")
        
        if include_stats and len(df) > 0:
            self._print_statistics(df)
    
    def _print_statistics(self, df: pd.DataFrame):
        """打印数据统计信息"""
        print(f"\n=== 数据统计 ===")
        print(f"总分子数: {len(df)}")
        
        # 检查列是否存在
        if 'Input_SMILES' in df.columns:
            print(f"输入的SMILES数: {df['Input_SMILES'].notna().sum()}")
            print(f"成功转换为CID: {df['CID'].notna().sum()}")
        
        if 'SMILES' in df.columns:
            print(f"有效Canonical SMILES: {df['SMILES'].notna().sum()}")
        
        print(f"有分子量数据: {df['Molecular_Weight'].notna().sum()}")
        print(f"有熔点数据: {df['Melting_Point'].notna().sum()}")
        print(f"有沸点数据: {df['Boiling_Point'].notna().sum()}")
        
        if df['Molecular_Weight'].notna().any():
            print(f"\n分子量统计:")
            print(f"  范围: {df['Molecular_Weight'].min():.2f} - {df['Molecular_Weight'].max():.2f}")
            print(f"  平均: {df['Molecular_Weight'].mean():.2f}")
            print(f"  中位数: {df['Molecular_Weight'].median():.2f}")
        
        if df['Melting_Point'].notna().any():
            print(f"\n熔点统计 (°C):")
            mp_data = df['Melting_Point'].dropna()
            print(f"  数据覆盖率: {len(mp_data)}/{len(df)} ({len(mp_data)/len(df)*100:.1f}%)")
            print(f"  范围: {mp_data.min():.1f} - {mp_data.max():.1f}")
            print(f"  平均: {mp_data.mean():.1f}")
        
        if df['Boiling_Point'].notna().any():
            print(f"\n沸点统计 (°C):")
            bp_data = df['Boiling_Point'].dropna()
            print(f"  数据覆盖率: {len(bp_data)}/{len(df)} ({len(bp_data)/len(df)*100:.1f}%)")
            print(f"  范围: {bp_data.min():.1f} - {bp_data.max():.1f}")
            print(f"  平均: {bp_data.mean():.1f}")


# Usage examples
if __name__ == "__main__":
    # Build a fetcher with a smaller batch size than the class default.
    fetcher = PubChemSMILESFetcher(batch_size=20, delay=0.3, max_retries=3)

    # Example 1 (disabled): query an in-memory list of SMILES.
    # Melting/boiling point lookups add two extra requests per molecule.
    # example_smiles = [
    #     "CC(=O)Oc1ccccc1C(=O)O",         # aspirin
    #     "CN1C=NC2=C1C(=O)N(C(=O)N2C)C",  # caffeine
    #     "CCO",                           # ethanol
    #     "O",                             # water
    #     "C1COC(=O)O1",                   # ethylene carbonate
    #     "CC1COC(=O)O1",                  # propylene carbonate
    #     "CO",                            # methanol
    #     "CN(C)C=O",                      # dimethylformamide
    #     "INVALID_SMILES",                # invalid on purpose, exercises the failure path
    # ]
    # try:
    #     df = fetcher.fetch_properties_from_smiles(example_smiles, include_melting_boiling=True)
    #     fetcher.save_results(df, 'molecular_properties_from_smiles.csv')
    #     pd.set_option('display.max_columns', None)
    #     pd.set_option('display.width', None)
    #     print(df.head())
    # except Exception as e:
    #     logger.error(e)

    # Example 2: read SMILES from the 'SMILES' column of a CSV file and
    # save the fetched properties.
    df = fetcher.fetch_from_file(
        r'D:\Project\Code\PubChem\filter data\step 6\predict_dn.csv',
        smiles_column='SMILES',
    )
    fetcher.save_results(df, 'properties.csv')

    # Example 3 (disabled): skip melting/boiling points for a faster run.
    # df_fast = fetcher.fetch_properties_from_smiles(example_smiles, include_melting_boiling=False)

2025-06-15 13:20:12,126 - INFO - 从文件 D:\Project\Code\PubChem\filter data\step 6\predict_dn.csv 读取到 12 个SMILES
2025-06-15 13:20:12,127 - INFO - 开始处理 12 个SMILES
2025-06-15 13:20:12,127 - INFO - 开始批量转换SMILES到CID，共 1 批
2025-06-15 13:20:12,127 - INFO - 处理第 1/1 批SMILES转换 (12 个分子)
2025-06-15 13:20:31,204 - INFO - SMILES转CID完成: 成功 12, 失败 0
2025-06-15 13:20:31,204 - INFO - 开始获取 12 个分子的基础属性，共 1 批
2025-06-15 13:20:31,205 - INFO - 处理第 1/1 批基础属性 (12 个分子)
2025-06-15 13:20:32,862 - INFO - 开始获取熔点沸点数据...
2025-06-15 13:20:32,863 - INFO - 开始为 12 个分子获取熔点沸点数据
2025-06-15 13:20:54,938 - INFO - 熔点沸点数据获取进度: 10/12
2025-06-15 13:21:02,604 - INFO - 熔点沸点数据获取完成: 熔点 3/12, 沸点 3/12
2025-06-15 13:21:02,604 - INFO - 成功获取 12 个分子的完整属性数据
2025-06-15 13:21:02,639 - INFO - 结果已保存到: properties.csv



=== 数据统计 ===
总分子数: 12
有分子量数据: 12
有熔点数据: 3
有沸点数据: 3

分子量统计:
  范围: 41.05 - 144.13
  平均: 94.77
  中位数: 99.60

熔点统计 (°C):
  数据覆盖率: 3/12 (25.0%)
  范围: -91.7 - 57.8
  平均: -26.3

沸点统计 (°C):
  数据覆盖率: 3/12 (25.0%)
  范围: 81.6 - 267.2
  平均: 148.7
