### 从生成的pdf能谱拟合数据中，批量提取出拟合信息

注意：执行这一步之前，请先确保拟合结果是 OK 的。

In [34]:
import pdfplumber
import re
import os
import logging
from dataclasses import dataclass
from PIL import Image, ImageDraw, ImageFont

# 设置日志配置
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@dataclass
class ParsedData:
    """Fit parameters parsed from one fit-result PDF, plus a quality flag and HV.

    Instances are built by ``PDFDataExtractor.parse_data``. Every tuple field
    holds the two regex capture groups for that parameter as strings, or
    ``(None, None)`` when the corresponding pattern was not found in the text.
    For all fields except ``chi2_NDOF`` the two numbers are separated by a
    dash in the PDF text -- presumably (fitted value, uncertainty); confirm
    against the fit program's output format.
    """
    # Captured from the "m = a – b" entry of the PDF text.
    mu: tuple
    # Captured from the "Q0 = a – b" entry.
    Q0: tuple
    # Captured from the "s 0 = a – b" entry.
    sigma0: tuple
    # Captured from the "Q1 = a – b" entry.
    Q1: tuple
    # Captured from the "s 1 = a – b" entry.
    sigma1: tuple
    # Captured from the "a = a – b" entry.
    alpha: tuple
    # Captured from the "w = a – b" entry.
    w: tuple
    # Captured from the "c 2/NDOF = a / b" entry: (chi2, NDOF) as strings.
    chi2_NDOF: tuple
    # Path of the PDF the values were parsed from (as passed to parse_data).
    fileName: str
    # 1 when chi2/NDOF lies within the extractor's configured bounds, else 0.
    isGood: int
    # Integer captured from the file name by the extractor's hv_pattern;
    # parse_data stores None here when the pattern does not match.
    HV: int

class PDFDataExtractor:
    """Extracts fit parameters from the text layer of fit-result PDF files.

    Args:
        is_good_lower_bound: Smallest chi2/NDOF ratio still flagged as good.
        is_good_upper_bound: Largest chi2/NDOF ratio still flagged as good.
        hv_pattern: Regex applied to the file name; its first capture group
            must be the integer HV value.
    """

    # Number of characters kept after the end keyword. The numbers belonging
    # to the "NDOF = chi2 / ndof" entry follow the keyword, so the section
    # must extend past it for parse_data to see them.
    _TAIL_CHARS = 100

    # Patterns are compiled once for the class instead of on every call.
    # Each one captures exactly two numeric strings.
    _FIELD_PATTERNS = {
        "mu": re.compile(r"m\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "Q0": re.compile(r"Q0\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "sigma0": re.compile(r"s\s*0\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "Q1": re.compile(r"Q1\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "sigma1": re.compile(r"s\s*1\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "alpha": re.compile(r"a\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "w": re.compile(r"w\s*=\s*([\d\.\-]+)\s*–\s*([\d\.\-]+)"),
        "chi2_NDOF": re.compile(r"c\s*2/NDOF\s*=\s*([\d\.\-]+)\s*/\s*([\d\.\-]+)"),
    }

    def __init__(self, is_good_lower_bound=0.7, is_good_upper_bound=1.5, hv_pattern=r'led-(\d+)--00'):
        self.is_good_lower_bound = is_good_lower_bound
        self.is_good_upper_bound = is_good_upper_bound
        self.hv_pattern = hv_pattern

    def extract_text_from_pdf(self, pdf_path):
        """Return the concatenated text of all pages of a PDF file.

        Errors are logged rather than raised; whatever text was collected
        before the error (possibly an empty string) is returned.
        """
        page_texts = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    # A page without extractable text may yield None; treat it
                    # as empty instead of failing the whole document.
                    page_texts.append(page.extract_text() or "")
            logging.info(f"Successfully extracted text from {pdf_path}")
        except Exception as e:
            logging.error(f"Error extracting text from {pdf_path}: {e}")
        return "".join(page_texts)

    def find_data_between_keywords(self, text, start_keyword="seirtnE", end_keyword="NDOF"):
        """Return the text from ``start_keyword`` to shortly after ``end_keyword``.

        The slice starts at the first ``start_keyword`` and ends ``_TAIL_CHARS``
        characters after the first following ``end_keyword``. The default
        start keyword is "Entries" spelled backwards -- presumably how that
        label comes out of the PDF text extraction; confirm against a sample.

        Returns:
            The extracted section, or None when the text is empty or either
            keyword is missing.
        """
        if not text:
            return None
        start_pos = text.find(start_keyword)
        if start_pos == -1:
            return None
        # Test the raw find() result: adding the tail length first would hide
        # the -1 "not found" marker.
        end_pos = text.find(end_keyword, start_pos)
        if end_pos == -1:
            return None
        return text[start_pos:end_pos + len(end_keyword) + self._TAIL_CHARS]

    @staticmethod
    def _extract_pair(pattern, data):
        """Return the pattern's capture groups, or (None, None) if no match."""
        match = pattern.search(data)
        return match.groups() if match else (None, None)

    def parse_data(self, data, file_name):
        """Parse a data section into a ParsedData object.

        Args:
            data: Text section containing the fit-result entries.
            file_name: Path of the source PDF; also searched with
                ``hv_pattern`` to obtain the HV value.

        Returns:
            A ParsedData instance, or None if parsing raised (the error is
            logged). Parameters that are not found are stored as
            ``(None, None)``; HV is None when the file name does not match.
        """
        try:
            values = {
                name: self._extract_pair(pattern, data)
                for name, pattern in self._FIELD_PATTERNS.items()
            }

            chi2, ndof = values["chi2_NDOF"]
            chi2_ratio = float(chi2) / float(ndof) if chi2 and ndof else None
            # Compare against None explicitly so a ratio of exactly 0.0 is
            # judged by the bounds rather than treated as "missing".
            in_range = (
                chi2_ratio is not None
                and self.is_good_lower_bound <= chi2_ratio <= self.is_good_upper_bound
            )
            isGood = 1 if in_range else 0

            hv_match = re.search(self.hv_pattern, file_name)
            HV = int(hv_match.group(1)) if hv_match else None

            logging.info(f"Successfully parsed data from {file_name}")
            return ParsedData(
                values["mu"],
                values["Q0"],
                values["sigma0"],
                values["Q1"],
                values["sigma1"],
                values["alpha"],
                values["w"],
                values["chi2_NDOF"],
                file_name,
                isGood,
                HV,
            )
        except Exception as e:
            logging.error(f"Error parsing data from {file_name}: {e}")
            return None

    def extract_and_parse(self, pdf_path):
        """Extract the text of a PDF, locate the data section and parse it.

        Returns:
            A ParsedData instance, or None when the data section is not found
            or parsing fails.
        """
        text = self.extract_text_from_pdf(pdf_path)
        data_section = self.find_data_between_keywords(text)

        if data_section:
            return self.parse_data(data_section, pdf_path)
        else:
            logging.warning(f"Data section not found in the document {pdf_path}.")
            return None

def process_all_pdfs_in_directory(directory, extractor):
    """Parse every ``.pdf`` file in a directory and return the results by HV.

    Args:
        directory: Directory whose entries are scanned (not recursive).
        extractor: Object providing ``extract_and_parse(pdf_path)``.

    Returns:
        List of the truthy parse results, sorted by ascending ``HV``. Items
        whose ``HV`` is None are placed at the end.
    """
    parsed_data_list = []

    # os.listdir returns entries in arbitrary order; sorting the names makes
    # the relative order of items with equal HV reproducible.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(".pdf"):
            continue
        parsed_data = extractor.extract_and_parse(os.path.join(directory, file_name))
        if parsed_data:
            parsed_data_list.append(parsed_data)

    # HV is None when the file name did not match the HV pattern; comparing
    # None with an int raises TypeError, so sort those entries last instead.
    parsed_data_list.sort(
        key=lambda item: (item.HV is None, item.HV if item.HV is not None else 0)
    )
    return parsed_data_list


def create_combined_image(parsed_data_list, output_image_path, images_per_row=2, resolution=300, font_size=160):
    """Tile the rendered pages of all parsed PDFs into one annotated image.

    Every page of every PDF referenced by ``parsed_data_list`` is rendered and
    placed in a grid of equally sized cells. Each cell gets a blue border when
    the item's ``isGood`` equals 1 and a red one otherwise, and is labelled
    with the HV value and the file name.

    Args:
        parsed_data_list: Items providing ``fileName``, ``isGood`` and ``HV``.
        output_image_path: Where the combined image is saved.
        images_per_row: Number of cells per grid row.
        resolution: Resolution passed to the page renderer.
        font_size: Size of the HV label font; the file-name label uses a
            tenth of it.

    Returns:
        None. When no page could be rendered an error is logged and nothing
        is saved.
    """
    tiles = []
    for entry in parsed_data_list:
        try:
            with pdfplumber.open(entry.fileName) as pdf:
                for page in pdf.pages:
                    rendered = page.to_image(resolution=resolution).original
                    tiles.append((rendered, entry.isGood, entry.HV, entry.fileName))
        except Exception as e:
            logging.error(f"Error processing image from {entry.fileName}: {e}")

    if not tiles:
        logging.error("No images to combine.")
        return

    # All cells share the size of the largest rendered page.
    cell_width = max(tile[0].size[0] for tile in tiles)
    cell_height = max(tile[0].size[1] for tile in tiles)
    row_count = (len(tiles) + images_per_row - 1) // images_per_row  # ceiling division

    combined_image = Image.new('RGB', (cell_width * images_per_row, cell_height * row_count))

    # Prefer Arial; fall back to the built-in default font when it is missing.
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
        small_font = ImageFont.truetype("arial.ttf", font_size // 10)
    except IOError:
        font = ImageFont.load_default()
        small_font = ImageFont.load_default()
        logging.warning("Default font loaded as 'arial.ttf' was not found.")

    for index, (tile_image, is_good, hv, source_name) in enumerate(tiles):
        # Row-major placement: the column advances first, then the row.
        row, column = divmod(index, images_per_row)
        left = column * cell_width
        top = row * cell_height

        combined_image.paste(tile_image, (left, top))
        draw = ImageDraw.Draw(combined_image)

        if is_good == 1:
            border_color = "blue"
        else:
            border_color = "red"
        draw.rectangle([left, top, left + cell_width, top + cell_height], outline=border_color, width=5)

        # HV label near the bottom-left corner of the cell.
        hv_anchor = (left + 0.2 * font_size, top + cell_height - 1.1 * font_size)
        draw.text(hv_anchor, f"HV: {hv}", fill="black", font=font)

        # File name in the small font, to the right of and slightly lower than the HV label.
        name_anchor = (left + 4.8 * font_size, top + cell_height - 0.3*font_size)
        draw.text(name_anchor, source_name, fill="black", font=small_font)

    combined_image.save(output_image_path)
    logging.info(f"Combined image saved to {output_image_path}")


def filter_and_print(parsed_data_list, hv_threshold):
    """Print every item that is flagged good and whose HV exceeds the threshold.

    Args:
        parsed_data_list: Items providing ``isGood`` and ``HV``.
        hv_threshold: Only items with ``HV`` strictly greater than this value
            are printed.

    Items whose ``HV`` is None are skipped: they cannot be compared with the
    threshold (``None > int`` raises TypeError).
    """
    filtered_list = [
        data
        for data in parsed_data_list
        if data.isGood == 1 and data.HV is not None and data.HV > hv_threshold
    ]
    for data in filtered_list:
        print(data)



In [42]:
# Example usage
directory = "notia"  # directory containing the PDF files
hv_pattern = r'led-(\d+)--00'  # regex that captures the HV value from the file name

# Parameters used to judge and filter the fit results
hv_threshold = 1000  # HV threshold for filtering
is_good_lower_bound = 0.7  # chi2 ratio lower bound
is_good_upper_bound = 1.5  # chi2 ratio upper bound

extractor = PDFDataExtractor(
    is_good_lower_bound=is_good_lower_bound,
    is_good_upper_bound=is_good_upper_bound,
    hv_pattern=hv_pattern,
)
parsed_data_list = process_all_pdfs_in_directory(directory, extractor)

# for parsed_data in parsed_data_list:
#     print(parsed_data)

# The combined overview image is written into the same directory as the PDFs.
output_image_path = f"{directory}/combined_image.png"
create_combined_image(parsed_data_list, output_image_path)
filter_and_print(parsed_data_list, hv_threshold)


2025-01-03 12:11:59,840 - INFO - Successfully extracted text from notia\F1--pmt-notia-led-1000--00000_Gain=0.003462nVs.pdf
2025-01-03 12:11:59,840 - INFO - Successfully parsed data from notia\F1--pmt-notia-led-1000--00000_Gain=0.003462nVs.pdf
2025-01-03 12:11:59,985 - INFO - Successfully extracted text from notia\F1--pmt-notia-led-1100--00000_Gain=0.005533nVs.pdf
2025-01-03 12:11:59,986 - INFO - Successfully parsed data from notia\F1--pmt-notia-led-1100--00000_Gain=0.005533nVs.pdf
2025-01-03 12:12:00,164 - INFO - Successfully extracted text from notia\F1--pmt-notia-led-1200--00000_Gain=0.009006nVs.pdf
2025-01-03 12:12:00,165 - INFO - Successfully parsed data from notia\F1--pmt-notia-led-1200--00000_Gain=0.009006nVs.pdf
2025-01-03 12:12:00,269 - INFO - Successfully extracted text from notia\F1--pmt-notia-led-1300--00000_Gain=0.013138nVs.pdf
2025-01-03 12:12:00,270 - INFO - Successfully parsed data from notia\F1--pmt-notia-led-1300--00000_Gain=0.013138nVs.pdf
2025-01-03 12:12:00,423 - IN

ParsedData(mu=('1.310', '0.022'), Q0=('-0.00026', '0.00005'), sigma0=('0.00172', '0.00003'), Q1=('0.00528', '0.00006'), sigma1=('0.00132', '0.00008'), alpha=('628', '797'), w=('0.000', '0.090'), chi2_NDOF=('387', '400'), fileName='notia\\F1--pmt-notia-led-1100--00000_Gain=0.005533nVs.pdf', isGood=1, HV=1100)
ParsedData(mu=('1.530', '0.113'), Q0=('-0.00048', '0.00014'), sigma0=('0.00166', '0.00006'), Q1=('0.00852', '0.00015'), sigma1=('0.00197', '0.00020'), alpha=('251', '51'), w=('0.277', '0.107'), chi2_NDOF=('247', '290'), fileName='notia\\F1--pmt-notia-led-1200--00000_Gain=0.009006nVs.pdf', isGood=1, HV=1200)
ParsedData(mu=('1.435', '0.029'), Q0=('-0.00049', '0.00004'), sigma0=('0.00170', '0.00003'), Q1=('0.01264', '0.00007'), sigma1=('0.00317', '0.00010'), alpha=('168', '71'), w=('0.116', '0.035'), chi2_NDOF=('270', '233'), fileName='notia\\F1--pmt-notia-led-1300--00000_Gain=0.013138nVs.pdf', isGood=1, HV=1300)
ParsedData(mu=('1.404', '0.012'), Q0=('-0.00049', '0.00003'), sigma0=('0

### 生成适用于Mathematica做数据分析的格式

In [43]:
def format_parsed_data(parsed_data_list):
    """Collect the Q1, Q0 and HV values of all items into per-column lists.

    Args:
        parsed_data_list: Items providing ``Q1`` and ``Q0`` (indexable, two
            elements each) and ``HV``.

    Returns:
        Dict with the keys ``Q1ListM``/``Q1ListS`` (first/second element of
        each ``Q1``), ``Q0ListM``/``Q0ListS`` (same for ``Q0``) and
        ``HVList``; every value is a list in input order.
    """
    column_names = ("Q1ListM", "Q1ListS", "Q0ListM", "Q0ListS", "HVList")

    # One row per item, gathered in a single pass over the input.
    rows = [
        (item.Q1[0], item.Q1[1], item.Q0[0], item.Q0[1], item.HV)
        for item in parsed_data_list
    ]

    # Transpose rows into columns; zip(*rows) yields nothing for empty input,
    # so that case is spelled out to still produce five empty lists.
    if rows:
        columns = [list(column) for column in zip(*rows)]
    else:
        columns = [[] for _ in column_names]

    return dict(zip(column_names, columns))

formatted_data = format_parsed_data(parsed_data_list)

# Print each column as a Mathematica list assignment, e.g. `HVList = { 900, 1000 };`
for list_name in ("Q1ListM", "Q1ListS", "Q0ListM", "Q0ListS", "HVList"):
    joined_values = ", ".join(str(value) for value in formatted_data[list_name])
    print(f"{list_name} =", "{", joined_values, "};")

Q1ListM = { 0.00312, 0.00329, 0.00528, 0.00852, 0.01264, 0.02603, 0.02628 };
Q1ListS = { 0.00024, 0.00015, 0.00006, 0.00015, 0.00007, 0.00011, 0.00013 };
Q0ListM = { -0.00009, -0.00018, -0.00026, -0.00048, -0.00049, -0.00049, -0.00043 };
Q0ListS = { 0.00003, 0.00018, 0.00005, 0.00014, 0.00004, 0.00003, 0.00003 };
HVList = { 900, 1000, 1100, 1200, 1300, 1400, 1500 };


In [19]:
# Bare expression as the last line of the cell: the notebook displays the
# repr of the whole parsed list for inspection.
parsed_data_list


[ParsedData(mu=('0.137', '0.001'), Q0=('0.00115', '0.00000'), sigma0=('0.00211', '0.00002'), Q1=('0.00421', '0.00009'), sigma1=('0.00015', '0.00013'), alpha=('53', '1562'), w=('0.000', '0.005'), chi2_NDOF=('266', '162'), fileName='notia\\F1--pmt-notia-led-900--00000_Gain=0.003060nVs.pdf', isGood=0, HV=900),
 ParsedData(mu=('0.090', '0.006'), Q0=('0.00021', '0.00002'), sigma0=('0.00181', '0.00002'), Q1=('0.00341', '0.00012'), sigma1=('0.00008', '0.00108'), alpha=('3965', '3463'), w=('0.000', '0.064'), chi2_NDOF=('151', '131'), fileName='notia\\F1--pmt-notia-led-900--00001_Gain=0.003198nVs.pdf', isGood=1, HV=900),
 ParsedData(mu=('0.058', '0.007'), Q0=('-0.00009', '0.00003'), sigma0=('0.00178', '0.00002'), Q1=('0.00312', '0.00024'), sigma1=('0.00006', '0.00385'), alpha=('136', '151'), w=('0.221', '0.164'), chi2_NDOF=('137', '120'), fileName='notia\\F1--pmt-notia-led-900--00002_Gain=0.003214nVs.pdf', isGood=1, HV=900),
 ParsedData(mu=('0.327', '0.000'), Q0=('0.00140', '0.00000'), sigma0=(