In [None]:
%pip install -q git+https://github.com/THU-MIG/yolov10.git
%pip install -q supervision

In [None]:
!wget https://github.com/moured/YOLOv10-Document-Layout-Analysis/releases/download/doclaynet_weights/yolov10x_best.pt
!wget https://raw.githubusercontent.com/moured/YOLOv10-Document-Layout-Analysis/main/images/input_sample.png

In [5]:
!pip install pdf2image
!pip install pillow
!pip install python-docx
!pip install comtypes 
!pip install pdf2image

Collecting comtypes
  Downloading comtypes-1.4.7-py3-none-any.whl.metadata (6.5 kB)
Downloading comtypes-1.4.7-py3-none-any.whl (226 kB)
Installing collected packages: comtypes
Successfully installed comtypes-1.4.7


In [80]:
import cv2
import supervision as sv # pip install supervision
from ultralytics import YOLOv10
import os
from pdf2image import convert_from_path
import comtypes.client  
from pathlib import Path
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
import nltk
from nltk.corpus import stopwords
from torch.nn.functional import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score
import torch
import re


In [81]:
def pdf_to_images(pdf_path, output_folder):
    file_name = Path(pdf_path).stem  
    file_output_folder = os.path.join(output_folder, file_name)  

    if not os.path.exists(file_output_folder):
        os.makedirs(file_output_folder)

    images = convert_from_path(pdf_path)
    for i, image in enumerate(images):
        page_folder = os.path.join(file_output_folder, f"page_{i+1}")  
        if not os.path.exists(page_folder):
            os.makedirs(page_folder)

        image_path = os.path.join(page_folder, f"page_{i+1}.png")
        image.save(image_path, "PNG")
        print(f"Page {i+1} of {pdf_path} saved as {image_path}")

In [82]:
def word_to_pdf(docx_path, pdf_path):
    wdFormatPDF = 17
    word = comtypes.client.CreateObject('Word.Application')
    doc = word.Documents.Open(docx_path)
    doc.SaveAs(pdf_path, FileFormat=wdFormatPDF)
    doc.Close()
    word.Quit()


In [83]:
def word_to_images(docx_path, output_folder):
    pdf_path = os.path.splitext(docx_path)[0] + ".pdf"
    word_to_pdf(docx_path, pdf_path)
    pdf_to_images(pdf_path, output_folder)

In [84]:
def image_to_png(image_path, output_folder):
    image = cv2.imread(image_path)
    if image is None:
        print(f"Failed to load image: {image_path}")
        return
    
    file_name = Path(image_path).stem
    file_output_folder = os.path.join(output_folder, file_name)

    if not os.path.exists(file_output_folder):
        os.makedirs(file_output_folder)

    output_path = os.path.join(file_output_folder, f"{file_name}.png")
    cv2.imwrite(output_path, image)
    print(f"Image {file_name} saved as {output_path}")

In [19]:
def convert_files_in_folder(folder_path, output_folder):
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_extension = Path(file_path).suffix.lower()

            if file_extension == ".pdf":
                print(f"Detected PDF file: {file_path}")
                pdf_to_images(file_path, output_folder)
            elif file_extension in [".docx", ".doc"]:
                print(f"Detected Word file: {file_path}")
                word_to_images(file_path, output_folder)
            elif file_extension in [".png", ".jpg", ".jpeg"]:
                print(f"Detected image file: {file_path}")
                image_to_png(file_path, output_folder)
            else:
                print(f"Unsupported file type: {file_extension}. Skipping {file_path}")


In [20]:
def get_image_paths_from_folder(folder, extensions=['.png', '.jpg', '.jpeg']):
    image_paths = []
    for root, _, files in os.walk(folder):
        for file in files:
            if any(file.lower().endswith(ext) for ext in extensions):
                image_paths.append(os.path.join(root, file))
    return image_paths

In [33]:
def process_images(model, image_paths, output_root_dir, conf_threshold=0.2, iou_threshold=0.8):
    class_names = model.names

    if not os.path.exists(output_root_dir):
        os.makedirs(output_root_dir)

    for image_path in image_paths:
        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load image: {image_path}")
            continue

        results = model(source=image_path, conf=conf_threshold, iou=iou_threshold)[0]
        detections = sv.Detections.from_ultralytics(results)

        bounding_boxes = detections.xyxy
        labels = detections.class_id

        cropped_images = []
        file_name = os.path.splitext(os.path.basename(image_path))[0]
        file_root_dir = os.path.dirname(image_path)

        sorted_boxes = sorted(enumerate(bounding_boxes), key=lambda x: int(x[1][1]))

        for idx, (original_idx, box) in enumerate(sorted_boxes):
            x1, y1, x2, y2 = map(int, box)

            cropped_image = image[y1:y2, x1:x2]

            label_id = labels[original_idx]
            label_name = class_names[label_id]

            label_dir = os.path.join(file_root_dir, label_name)
            if not os.path.exists(label_dir):
                os.makedirs(label_dir)

            cropped_image_path = os.path.join(label_dir, f"cropped_{label_name}_{idx}.png")
            cv2.imwrite(cropped_image_path, cropped_image)
            cropped_images.append(cropped_image)

            # print(f"Saved cropped image for label '{label_name}' as '{cropped_image_path}'")

        bounding_box_annotator = sv.BoundingBoxAnnotator()
        label_annotator = sv.LabelAnnotator()

        annotated_image = bounding_box_annotator.annotate(scene=image, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        annotated_image_path = os.path.join(file_root_dir, f"annotated_{file_name}.png")
        cv2.imwrite(annotated_image_path, annotated_image)

        print(f"Saved annotated image as '{annotated_image_path}'")

In [86]:
model = YOLOv10('yolov10x_best.pt')
file_to_convert = "pdf_folder"
output_folder = "images_folder"

convert_files_in_folder(file_to_convert, output_folder)
image_paths = get_image_paths_from_folder(output_folder)

process_images(model=model, image_paths=image_paths, output_root_dir=output_folder)
# 结果保存在images_folder文件夹中， images_folder文件夹中包含了所有的页面， 以及裁剪后的页面信息和标注的页面信息 annotated_page_1.png 可作为前端展示的图片

Detected PDF file: pdf_folder\EconAgent(1).pdf
Page 1 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_1\page_1.png
Page 2 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_2\page_2.png
Page 3 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_3\page_3.png
Page 4 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_4\page_4.png
Page 5 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_5\page_5.png
Page 6 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_6\page_6.png
Page 7 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_7\page_7.png
Page 8 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_8\page_8.png
Page 9 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_9\page_9.png
Page 10 of pdf_folder\EconAgent(1).pdf saved as images_folder\EconAgent(1)\page_10\page_10.png
Page 11 of pdf_folder\EconAgen



Saved annotated image as 'images_folder\EconAgent(1)\page_10\annotated_page_10.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_11\page_11.png: 640x480 23 List-items, 2 Section-headers, 8 Texts, 1 Title, 17.9ms
Speed: 1.0ms preprocess, 17.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_11\annotated_page_11.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_12\page_12.png: 640x480 3 Formulas, 2 List-items, 2 Pictures, 3 Section-headers, 8 Texts, 17.9ms
Speed: 1.0ms preprocess, 17.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)




Saved annotated image as 'images_folder\EconAgent(1)\page_12\annotated_page_12.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_13\page_13.png: 640x480 1 Caption, 1 Formula, 3 Pictures, 1 Section-header, 8 Texts, 16.9ms
Speed: 2.0ms preprocess, 16.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_13\annotated_page_13.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_14\page_14.png: 640x480 3 Captions, 7 Pictures, 1 Text, 16.9ms
Speed: 2.0ms preprocess, 16.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_14\annotated_page_14.png'





image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_2\page_2.png: 640x480 5 List-items, 3 Section-headers, 9 Texts, 10.0ms
Speed: 2.0ms preprocess, 10.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_2\annotated_page_2.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_3\page_3.png: 640x480 8 Formulas, 1 Picture, 2 Section-headers, 17 Texts, 17.9ms
Speed: 2.0ms preprocess, 17.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)




Saved annotated image as 'images_folder\EconAgent(1)\page_3\annotated_page_3.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_4\page_4.png: 640x480 4 Formulas, 3 Section-headers, 13 Texts, 16.9ms
Speed: 2.0ms preprocess, 16.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_4\annotated_page_4.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_5\page_5.png: 640x480 5 List-items, 5 Section-headers, 14 Texts, 16.9ms
Speed: 2.0ms preprocess, 16.9ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_5\annotated_page_5.png'





image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_6\page_6.png: 640x480 1 Caption, 1 Picture, 1 Section-header, 11 Texts, 9.0ms
Speed: 1.0ms preprocess, 9.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_6\annotated_page_6.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_7\page_7.png: 640x480 2 Captions, 2 Pictures, 2 Section-headers, 1 Table, 9 Texts, 16.9ms
Speed: 2.0ms preprocess, 16.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)




Saved annotated image as 'images_folder\EconAgent(1)\page_7\annotated_page_7.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_8\page_8.png: 640x480 1 Caption, 4 Pictures, 1 Section-header, 11 Texts, 17.9ms
Speed: 1.0ms preprocess, 17.9ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 480)
Saved annotated image as 'images_folder\EconAgent(1)\page_8\annotated_page_8.png'

image 1/1 h:\projects\Graduate\PatternRecognitionSystems\YOLOv10-Document-Layout-Analysis-main\images_folder\EconAgent(1)\page_9\page_9.png: 640x480 6 Section-headers, 9 Texts, 18.9ms
Speed: 2.0ms preprocess, 18.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)




Saved annotated image as 'images_folder\EconAgent(1)\page_9\annotated_page_9.png'


In [35]:
def extract_text_from_images(base_folder):
    # 遍历 images_folder 中的所有子文件夹
    for folder_name in os.listdir(base_folder):
        folder_path = os.path.join(base_folder, folder_name)
        
        if os.path.isdir(folder_path):
            print(f"Processing folder: {folder_name}")
            all_text = ""
            
  
            page_folders = sorted([f for f in os.listdir(folder_path) if f.startswith('page_')])

            for page_folder in page_folders:
                page_folder_path = os.path.join(folder_path, page_folder)
                
                if os.path.isdir(page_folder_path):
                    text_folder = os.path.join(page_folder_path, 'Text')
                    
                    if os.path.exists(text_folder) and os.path.isdir(text_folder):
                        for image_file in os.listdir(text_folder):
                            image_path = os.path.join(text_folder, image_file)
                            if image_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                                try:
                                    img = Image.open(image_path)
                                    text = pytesseract.image_to_string(img)
                                    
                                    clean_text = text.replace('\n', ' ').replace('- ', '')
                                    
                                    all_text += clean_text + " "
                                except Exception as e:
                                    print(f"Error processing {image_path}: {e}")
            
            if all_text:
                output_txt_path = os.path.join(folder_path, f"{folder_name}_text_output.txt")
                with open(output_txt_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(all_text.strip())
                print(f"Saved text file: {output_txt_path}")
            else:
                print(f"No text found for folder: {folder_name}")

In [36]:
base_folder = 'images_folder'  
extract_text_from_images(base_folder)

Processing folder: EconAgent(1)
Saved text file: images_folder\EconAgent(1)\EconAgent(1)_text_output.txt


In [67]:
def summarize_large_text(text, model_name="facebook/bart-large-cnn", chunk_size=1024, summary_length=300):
    # 初始化模型和tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # 分割长文本为块
    chunks = []
    current_chunk = []
    tokens_count = 0

    for sentence in text.split('. '):  # 按句子分割文本
        sentence_tokens = tokenizer.tokenize(sentence)
        tokens_count += len(sentence_tokens)

        if tokens_count > chunk_size:  # 如果当前块超过限制大小
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            tokens_count = len(sentence_tokens)

        current_chunk.append(sentence)

    if current_chunk:  # 添加最后的块
        chunks.append(" ".join(current_chunk))

    # 逐块生成摘要
    all_summaries = []
    for chunk in chunks:
        inputs = tokenizer([chunk], max_length=chunk_size, return_tensors="pt", truncation=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=summary_length, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        all_summaries.append(summary)

    # 将所有摘要合并
    final_summary = " ".join(all_summaries)
    return final_summary

In [68]:
def summarize_text_in_folder(base_folder, model_name="facebook/bart-large-cnn", chunk_size=1024, summary_length=300):
    # 初始化模型和tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    
    # 遍历 base_folder 中的所有子文件夹
    for folder_name in os.listdir(base_folder):
        folder_path = os.path.join(base_folder, folder_name)
        
        if os.path.isdir(folder_path):
            print(f"Processing folder: {folder_name}")
            
            # 寻找同名的文本文件，例如 EcoAgent(1)_text_output.txt
            txt_file_name = f"{folder_name}_text_output.txt"
            txt_file_path = os.path.join(folder_path, txt_file_name)
            
            if os.path.exists(txt_file_path):
                # 读取文本内容
                with open(txt_file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                
                # 将长文本进行分块摘要
                summary = summarize_large_text(text, model_name, chunk_size, summary_length)
                
                # 将摘要保存到同一文件夹下，命名为 folder_name_summary.txt
                summary_file_path = os.path.join(folder_path, f"{folder_name}_summary.txt")
                with open(summary_file_path, 'w', encoding='utf-8') as summary_file:
                    summary_file.write(summary)
                
                print(f"Saved summary for {folder_name} as {summary_file_path}")
            else:
                print(f"No text file found for folder: {folder_name}")

In [69]:
base_folder = 'images_folder'  
summarize_text_in_folder(base_folder)

Processing folder: EconAgent(1)
Saved summary for EconAgent(1) as images_folder\EconAgent(1)\EconAgent(1)_summary.txt


In [30]:

# !pip install nltk
# import nltk
# nltk.download('stopwords')




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lou\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [54]:


cuda_available = torch.cuda.is_available()
print(f"CUDA Available: {cuda_available}")


if cuda_available:
    print(f"Number of CUDA Devices: {torch.cuda.device_count()}")
    print(f"Current CUDA Device: {torch.cuda.current_device()}")
    print(f"Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")

CUDA Available: True
Number of CUDA Devices: 1
Current CUDA Device: 0
Device Name: NVIDIA GeForce RTX 4070 Ti SUPER


In [55]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [23]:
test_set = [
    {
        "sentence": "Deep learning has revolutionized artificial intelligence, primarily through the development of sophisticated neural network architectures such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs). CNNs have achieved state-of-the-art performance in image recognition and computer vision tasks, while RNNs and their variants, such as Long Short-Term Memory (LSTM) and Gated Recurrent Units (GRU), have been particularly successful in sequential data tasks like natural language processing and speech recognition. The rise of generative models, including Generative Adversarial Networks (GANs) and Transformer models like GPT-3, has also opened new doors in text generation and content creation, enabling machines to create art, music, and even complex narratives.",
        "true_keywords": ["deep learning", "convolutional neural networks", "recurrent neural networks", "image recognition", "computer vision", "LSTM", "GRU", "GANs", "GPT-3", "text generation", "natural language processing"]
    },
    {
        "sentence": "The development of blockchain technology has moved beyond the scope of cryptocurrencies, finding applications in fields as diverse as healthcare, supply chain management, and finance. Blockchain's decentralized, distributed ledger system ensures data integrity and security without the need for intermediaries. Smart contracts, which execute transactions automatically when predefined conditions are met, have revolutionized the way agreements are handled in legal and business environments. Additionally, blockchain has enabled the rise of decentralized finance (DeFi), which offers financial services without traditional financial intermediaries like banks.",
        "true_keywords": ["blockchain", "cryptocurrencies", "distributed ledger", "decentralized finance", "DeFi", "smart contracts", "data integrity", "supply chain management"]
    },
    {
        "sentence": "Reinforcement learning (RL) is a subset of machine learning that focuses on training agents to make decisions by interacting with an environment. Unlike supervised learning, which requires labeled data, RL operates through the concept of rewards and penalties. Techniques such as Q-learning, policy gradients, and Deep Q-Networks (DQNs) have been used to teach agents to perform complex tasks like playing video games at superhuman levels, optimizing robotic control systems, and managing inventory in supply chains. Reinforcement learning has also been critical in advancing artificial general intelligence (AGI), as it enables machines to learn from experience in a more flexible and scalable manner.",
        "true_keywords": ["reinforcement learning", "Q-learning", "policy gradients", "Deep Q-Networks", "robotic control systems", "artificial general intelligence", "AGI"]
    },
    {
        "sentence": "Natural language processing (NLP) has undergone a transformation with the advent of large-scale pre-trained language models like BERT, GPT-3, and T5. These models use transformers, a deep learning architecture that excels at understanding context through attention mechanisms. They have achieved breakthrough performance in tasks such as text classification, sentiment analysis, question answering, and machine translation. BERT, in particular, has become a go-to model for tasks requiring deep understanding of language, while GPT-3 has pushed the boundaries of text generation with its ability to produce human-like responses in conversation and creative writing.",
        "true_keywords": ["natural language processing", "BERT", "GPT-3", "T5", "text classification", "sentiment analysis", "question answering", "transformer", "machine translation"]
    },
    {
        "sentence": "Cloud computing has fundamentally changed how organizations deploy and scale applications, enabling access to flexible and cost-efficient computing resources. Companies like Amazon Web Services (AWS), Microsoft Azure, and Google Cloud provide infrastructure as a service (IaaS), platform as a service (PaaS), and software as a service (SaaS) solutions that can handle everything from data storage to machine learning workloads. In addition to traditional cloud services, hybrid cloud models allow organizations to maintain on-premises infrastructure while leveraging the benefits of the public cloud. The rise of edge computing has further pushed the boundaries of cloud architecture by bringing computation closer to the data source, which reduces latency and improves real-time decision-making in applications like autonomous vehicles and IoT devices.",
        "true_keywords": ["cloud computing", "AWS", "Microsoft Azure", "Google Cloud", "IaaS", "PaaS", "SaaS", "hybrid cloud", "edge computing", "autonomous vehicles", "IoT devices"]
    },
    {
        "sentence": "Quantum computing holds the promise of solving computational problems that are beyond the reach of classical computers. Unlike classical bits, which are binary, quantum bits (qubits) can exist in superpositions of states, allowing quantum computers to process vast amounts of information simultaneously. Quantum algorithms, such as Shor's algorithm for factoring large integers and Grover's algorithm for searching unsorted databases, have shown potential to revolutionize fields like cryptography, materials science, and drug discovery. Companies like IBM, Google, and Intel are actively developing quantum hardware, while software frameworks like Qiskit and Cirq enable researchers to experiment with quantum algorithms in a simulated environment.",
        "true_keywords": ["quantum computing", "qubits", "superposition", "Shor's algorithm", "Grover's algorithm", "cryptography", "materials science", "drug discovery", "Qiskit", "Cirq"]
    },
    {
        "sentence": "Cybersecurity is becoming increasingly vital as businesses and governments rely more heavily on digital infrastructure. With the rise of cloud computing, IoT devices, and big data, the potential for security breaches has grown exponentially. Cyberattacks such as ransomware, phishing, and Distributed Denial of Service (DDoS) attacks can cripple organizations, leading to data loss, financial damage, and reputational harm. Defensive measures like encryption, multi-factor authentication, and firewalls are standard, but proactive strategies such as ethical hacking, penetration testing, and zero-trust architectures are gaining traction as businesses seek to protect their data in more sophisticated ways.",
        "true_keywords": ["cybersecurity", "ransomware", "phishing", "DDoS", "encryption", "multi-factor authentication", "ethical hacking", "penetration testing", "zero-trust architecture"]
    },
    {
        "sentence": "The Internet of Things (IoT) refers to the growing network of interconnected devices that communicate and share data over the internet. From smart home devices like thermostats and security cameras to industrial sensors in manufacturing plants, IoT has the potential to automate and optimize various aspects of everyday life and business operations. However, with the increasing number of devices, there are also challenges related to data privacy, security, and the management of vast amounts of real-time data. Edge computing and 5G technology are expected to play critical roles in addressing these challenges by enabling faster data processing and reducing latency.",
        "true_keywords": ["Internet of Things", "IoT", "smart home devices", "edge computing", "5G", "data privacy", "real-time data", "industrial sensors"]
    },
    {
        "sentence": "Data analytics has become the backbone of decision-making processes across various industries. By leveraging techniques such as predictive analytics, data mining, and machine learning, companies can derive insights from large datasets that help optimize operations, target customers more effectively, and reduce risks. Big data technologies like Apache Hadoop and Apache Spark enable the processing of vast amounts of unstructured data, while visualization tools like Tableau and Power BI help translate complex information into actionable insights that stakeholders can easily understand.",
        "true_keywords": ["data analytics", "predictive analytics", "data mining", "machine learning", "big data", "Apache Hadoop", "Apache Spark", "Tableau", "Power BI"]
    },
    {
        "sentence": "Autonomous systems, particularly in the realm of self-driving cars, have advanced rapidly due to breakthroughs in machine learning, sensor fusion, and control algorithms. These systems rely on a combination of LIDAR, radar, cameras, and GPS to interpret their surroundings and make real-time driving decisions. Despite significant progress, challenges remain in areas such as safety, ethics, and the legal frameworks surrounding the deployment of autonomous vehicles. Companies like Tesla, Waymo, and Uber are at the forefront of autonomous vehicle development, working to refine these technologies for widespread use.",
        "true_keywords": ["autonomous systems", "self-driving cars", "sensor fusion", "machine learning", "LIDAR", "radar", "Tesla", "Waymo", "Uber", "real-time decisions"]
    },
    {
        "sentence": "Generative Adversarial Networks (GANs) are a class of machine learning frameworks designed to generate new data instances that resemble a given dataset. Comprising two neural networks, a generator and a discriminator, GANs are used in various applications such as image generation, video synthesis, and even drug discovery. The generator creates fake data while the discriminator evaluates its authenticity, and the two networks train in a competitive setting. GANs have shown great promise in creative industries, enabling artists and designers to generate novel content that pushes the boundaries of creativity.",
        "true_keywords": ["Generative Adversarial Networks", "GANs", "image generation", "video synthesis", "drug discovery", "generator", "discriminator", "creative industries"]
    },
    {
        "sentence": "In the field of robotics, the integration of artificial intelligence has enabled robots to perform increasingly complex tasks. From autonomous drones used in delivery services to robotic arms in manufacturing plants, AI-driven robots are making industries more efficient. Advances in computer vision and natural language processing have also given rise to social robots that can interact with humans in meaningful ways, from customer service to healthcare support. However, the growing presence of robots in the workforce raises questions about job displacement and the ethical implications of human-robot interaction.",
        "true_keywords": ["robotics", "artificial intelligence", "autonomous drones", "robotic arms", "computer vision", "natural language processing", "social robots", "human-robot interaction"]
    },
    {
        "sentence": "Augmented reality (AR) and virtual reality (VR) are reshaping the way we interact with the digital world, creating immersive experiences that blend the physical and virtual environments. AR overlays digital information on the physical world, while VR creates entirely virtual environments. These technologies are being used in a wide range of industries, including gaming, healthcare, and education. For instance, AR applications are helping surgeons visualize complex medical procedures in real time, and VR is being used to train employees in safe, controlled environments. As AR and VR technologies advance, they are likely to play an increasingly prominent role in daily life and business operations.",
        "true_keywords": ["augmented reality", "virtual reality", "AR", "VR", "immersive experiences", "gaming", "healthcare", "education", "digital environments"]
    },
    {
        "sentence": "The rise of big data has led to significant advancements in how organizations manage and analyze information. Big data technologies, such as distributed file systems and NoSQL databases, enable the storage and processing of vast amounts of structured and unstructured data. Organizations are now able to harness this data to gain insights into customer behavior, market trends, and operational efficiencies. Additionally, machine learning models trained on big data are powering predictive analytics, allowing businesses to make more informed decisions in real time.",
        "true_keywords": ["big data", "distributed file systems", "NoSQL databases", "structured data", "unstructured data", "predictive analytics", "machine learning", "real-time decisions"]
    },
    {
        "sentence": "DevOps, a combination of development and operations, is a methodology that emphasizes collaboration between software developers and IT operations teams to automate and streamline the software development lifecycle. By integrating practices such as continuous integration (CI) and continuous delivery (CD), DevOps aims to improve the speed and quality of software deployment. Tools like Jenkins, Docker, and Kubernetes have become essential components of modern DevOps pipelines, enabling organizations to deploy software updates more frequently and with fewer errors. As the demand for faster development cycles grows, DevOps has become a critical practice for technology companies.",
        "true_keywords": ["DevOps", "continuous integration", "continuous delivery", "CI", "CD", "Jenkins", "Docker", "Kubernetes", "software deployment"]
    },
    {
        "sentence": "Edge AI is an emerging field that combines edge computing and artificial intelligence to enable real-time data processing and decision-making at the source of data generation. This is particularly useful for applications like autonomous drones, smart cameras, and industrial IoT devices, where low latency and immediate insights are crucial. By processing data locally on devices rather than sending it to the cloud, Edge AI reduces bandwidth usage and improves response times, making it a powerful tool for mission-critical applications. Companies like NVIDIA and Intel are actively developing hardware solutions optimized for Edge AI workloads.",
        "true_keywords": ["Edge AI", "edge computing", "artificial intelligence", "autonomous drones", "smart cameras", "industrial IoT", "low latency", "NVIDIA", "Intel"]
    },
    {
        "sentence": "The rise of quantum cryptography has provided new ways to secure communication by leveraging the principles of quantum mechanics. Unlike classical encryption methods, quantum cryptography promises unbreakable security through quantum key distribution (QKD), where keys are exchanged using quantum particles. Any attempt to intercept or tamper with the communication alters the quantum state of the particles, thereby alerting the sender and receiver to the intrusion. This technology is still in its infancy, but governments and private organizations are already investing in quantum cryptography to protect sensitive data in the future.",
        "true_keywords": ["quantum cryptography", "quantum key distribution", "QKD", "quantum mechanics", "unbreakable security", "encryption", "secure communication"]
    },
    {
        "sentence": "Federated learning is a machine learning technique that enables the training of models across multiple decentralized devices without sharing raw data. This approach is particularly valuable in scenarios where data privacy is paramount, such as healthcare and finance. Federated learning allows organizations to create robust machine learning models by leveraging data from various sources while ensuring that sensitive information remains on local devices. Google has been a pioneer in this field, using federated learning to improve the performance of its predictive text features without compromising user privacy.",
        "true_keywords": ["federated learning", "machine learning", "decentralized devices", "data privacy", "healthcare", "finance", "Google", "predictive text"]
    },
    {
        "sentence": "Robotic process automation (RPA) is transforming how businesses handle repetitive, rule-based tasks by using software robots, or 'bots', to automate processes. These bots can interact with various applications, perform data entry, process transactions, and generate reports, reducing the need for human intervention in routine tasks. RPA is being used across industries like finance, healthcare, and customer service to improve efficiency and reduce operational costs. Additionally, intelligent process automation (IPA), which incorporates AI and machine learning into RPA systems, is making automation smarter by enabling bots to handle more complex decision-making tasks.",
        "true_keywords": ["robotic process automation", "RPA", "software robots", "bots", "automation", "intelligent process automation", "AI", "machine learning"]
    },
    {
        "sentence": "Blockchain technology has the potential to revolutionize not only the financial industry but also supply chain management, healthcare, and even voting systems. By providing a decentralized and immutable ledger, blockchain ensures data transparency and security across different sectors. Smart contracts, which execute automatically when predefined conditions are met, add another layer of efficiency and trust to business operations. Furthermore, decentralized finance (DeFi) applications are reshaping traditional banking by enabling peer-to-peer lending, borrowing, and trading without the need for centralized institutions.",
        "true_keywords": ["blockchain", "decentralized", "immutable ledger", "smart contracts", "DeFi", "supply chain management", "healthcare", "voting systems"]
    },
    {
        "sentence": "Natural language processing (NLP) is an interdisciplinary field at the intersection of linguistics, computer science, and artificial intelligence that focuses on the interaction between computers and human language. Techniques such as tokenization, stemming, and lemmatization are fundamental preprocessing steps. Recent advances in NLP have been driven by transformer models like BERT, GPT-3, and T5, which excel in tasks like text classification, named entity recognition, and machine translation. With these advancements, applications like chatbots, sentiment analysis, and virtual assistants are becoming increasingly effective at understanding and generating human language.",
        "true_keywords": ["natural language processing", "NLP", "BERT", "GPT-3", "T5", "text classification", "named entity recognition", "machine translation", "chatbots"]
    },
    {
        "sentence": "The Internet of Things (IoT) is a rapidly growing network of interconnected devices that communicate with each other over the internet. These devices, which range from smart home appliances to industrial sensors, generate massive amounts of data that need to be processed and analyzed in real time. Edge computing, which processes data closer to the source rather than relying on centralized cloud servers, is emerging as a critical component of the IoT ecosystem, enabling faster response times and reduced bandwidth usage. As IoT continues to expand, security and privacy concerns become increasingly important, particularly in sectors like healthcare and smart cities.",
        "true_keywords": ["Internet of Things", "IoT", "interconnected devices", "edge computing", "real-time data", "smart home appliances", "industrial sensors", "security", "privacy"]
    },
    {
        "sentence": "Quantum computing promises to solve problems that are currently intractable for classical computers by leveraging the principles of quantum mechanics, such as superposition and entanglement. Quantum algorithms like Shor’s algorithm for factoring large numbers and Grover’s algorithm for unstructured search have shown that quantum computers can outperform classical systems for specific tasks. As companies like IBM, Google, and Rigetti race to build scalable quantum processors, researchers are exploring applications in cryptography, drug discovery, and financial modeling that could benefit from quantum speedup.",
        "true_keywords": ["quantum computing", "quantum mechanics", "superposition", "entanglement", "Shor's algorithm", "Grover's algorithm", "cryptography", "drug discovery", "financial modeling"]
    },
    {
        "sentence": "Deep learning, a subset of machine learning, has revolutionized fields like image recognition, speech synthesis, and natural language processing. Convolutional Neural Networks (CNNs) have become the standard architecture for image classification tasks, while Recurrent Neural Networks (RNNs) and their variants like Long Short-Term Memory (LSTM) and Gated Recurrent Units (GRU) are widely used for sequence-based data such as time series and text. The emergence of transformer models has further enhanced deep learning’s capabilities, allowing for more accurate predictions and complex data modeling across different domains.",
        "true_keywords": ["deep learning", "machine learning", "convolutional neural networks", "image recognition", "RNN", "LSTM", "GRU", "transformer models", "speech synthesis", "natural language processing"]
    },
    {
        "sentence": "Autonomous vehicles are one of the most prominent applications of artificial intelligence and machine learning, combining sensor data, computer vision, and real-time decision-making algorithms to navigate the environment. These vehicles rely on technologies such as LIDAR, radar, and cameras to perceive their surroundings, while deep learning models process this data to identify obstacles, pedestrians, and traffic signals. Despite the progress made by companies like Tesla and Waymo, challenges remain, particularly in ensuring safety in complex environments and dealing with the ethical implications of autonomous decision-making.",
        "true_keywords": ["autonomous vehicles", "artificial intelligence", "machine learning", "sensor data", "computer vision", "LIDAR", "radar", "deep learning", "Tesla", "Waymo"]
    },
    {
        "sentence": "Reinforcement learning (RL), a subfield of machine learning, has gained attention for its ability to train agents to make decisions by interacting with their environment. Unlike supervised learning, which relies on labeled data, RL uses rewards and penalties to guide the learning process. Techniques such as Q-learning, policy gradients, and actor-critic models are commonly used in RL to solve complex problems in robotics, game playing, and resource management. DeepMind’s AlphaGo, which defeated human world champions in the game of Go, is one of the most famous examples of RL’s potential.",
        "true_keywords": ["reinforcement learning", "RL", "Q-learning", "policy gradients", "actor-critic models", "robotics", "game playing", "resource management", "AlphaGo", "DeepMind"]
    },
    {
        "sentence": "Data science has become an essential discipline for organizations looking to extract insights from the vast amounts of data generated daily. By employing techniques such as data mining, machine learning, and predictive analytics, data scientists can uncover hidden patterns and trends that drive decision-making. Big data platforms like Apache Hadoop and Apache Spark are frequently used to process and analyze large datasets, while tools like Python’s Pandas and Scikit-learn offer powerful libraries for statistical analysis and machine learning model development.",
        "true_keywords": ["data science", "data mining", "machine learning", "predictive analytics", "Apache Hadoop", "Apache Spark", "Pandas", "Scikit-learn", "big data"]
    },
    {
        "sentence": "Blockchain technology, best known as the foundation for cryptocurrencies like Bitcoin and Ethereum, has evolved into a versatile tool for ensuring security and transparency across various industries. Beyond finance, blockchain is being applied to areas like supply chain management, where it can provide an immutable record of product provenance, and healthcare, where it secures sensitive patient data. Decentralized applications (dApps) and smart contracts are revolutionizing the way digital transactions are conducted, removing the need for trusted third parties and intermediaries.",
        "true_keywords": ["blockchain", "Bitcoin", "Ethereum", "cryptocurrencies", "supply chain management", "healthcare", "smart contracts", "decentralized applications", "dApps"]
    },
    {
        "sentence": "Cloud computing has transformed the way organizations manage their IT infrastructure, providing scalable and flexible solutions through services like Amazon Web Services (AWS), Microsoft Azure, and Google Cloud. With cloud services, companies can deploy applications and store data without the need to maintain physical servers. The flexibility offered by cloud platforms extends to hybrid cloud architectures, which combine on-premises infrastructure with public cloud services. As cloud adoption grows, edge computing is emerging as a complementary technology that processes data closer to its source, reducing latency and bandwidth costs for real-time applications like autonomous vehicles and IoT devices.",
        "true_keywords": ["cloud computing", "AWS", "Microsoft Azure", "Google Cloud", "hybrid cloud", "edge computing", "autonomous vehicles", "IoT", "data storage"]
    },
    {
        "sentence": "Artificial intelligence (AI) has become a critical tool for businesses looking to optimize operations, enhance customer experiences, and innovate in new areas. AI techniques such as machine learning, natural language processing, and computer vision enable companies to automate tasks that traditionally required human intervention. For example, AI-powered chatbots can handle customer inquiries, while computer vision systems can identify defects in manufacturing. As AI continues to evolve, ethical concerns such as bias in AI algorithms, data privacy, and job displacement are becoming more prominent topics of discussion.",
        "true_keywords": ["artificial intelligence", "AI", "machine learning", "natural language processing", "computer vision", "chatbots", "data privacy", "bias in AI", "job displacement"]
    },
    {
        "sentence": "Quantum computing, although still in its early stages, has the potential to revolutionize industries by solving problems that are currently beyond the capabilities of classical computers. Quantum computers leverage qubits and phenomena such as superposition and entanglement to perform complex calculations at unprecedented speeds. Quantum algorithms like Shor’s algorithm for factoring large integers and Grover’s algorithm for database search show promise in fields like cryptography and materials science. However, building a fault-tolerant quantum computer remains a significant challenge due to qubit instability and error correction.",
        "true_keywords": ["quantum computing", "qubits", "superposition", "entanglement", "Shor's algorithm", "Grover's algorithm", "cryptography", "materials science", "fault tolerance"]
    },
    {
        "sentence": "Robotics has advanced rapidly with the integration of artificial intelligence and machine learning, leading to the development of autonomous robots capable of performing complex tasks without human intervention. From manufacturing robots that assemble cars to drones used for aerial surveillance, AI-powered robots are becoming increasingly prevalent in various industries. Robotics is also playing a crucial role in healthcare, where robotic surgical systems assist doctors in performing delicate procedures with precision. As robots become more autonomous, ethical considerations around human-robot interaction and job displacement are emerging.",
        "true_keywords": ["robotics", "artificial intelligence", "autonomous robots", "machine learning", "drones", "manufacturing", "healthcare", "robotic surgery", "human-robot interaction"]
    },
    {
        "sentence": "Deep reinforcement learning (DRL) combines the strengths of deep learning and reinforcement learning, allowing agents to learn complex tasks by interacting with their environment. This approach has been successful in applications ranging from game playing, such as AlphaGo’s victory over human Go champions, to robotic control systems and autonomous driving. DRL models use neural networks to approximate value functions and policies, and techniques such as Deep Q-Networks (DQNs) and Proximal Policy Optimization (PPO) have become popular in the field. However, DRL remains challenging due to the instability of training and the exploration-exploitation trade-off.",
        "true_keywords": ["deep reinforcement learning", "DRL", "AlphaGo", "robotic control systems", "autonomous driving", "Deep Q-Networks", "Proximal Policy Optimization", "exploration-exploitation"]
    },
    {
        "sentence": "Augmented reality (AR) and virtual reality (VR) technologies are transforming the way we interact with digital content, providing immersive experiences that blur the lines between the real and virtual worlds. AR overlays digital information onto the physical world, while VR creates entirely simulated environments. These technologies are being used in a variety of fields, from gaming and entertainment to education and healthcare. For example, AR is being used in surgery to provide doctors with real-time, interactive visualizations of a patient’s anatomy, while VR is being employed to train employees in complex tasks in a risk-free virtual environment.",
        "true_keywords": ["augmented reality", "virtual reality", "AR", "VR", "immersive experiences", "gaming", "education", "healthcare", "surgery"]
    },
    {
        "sentence": "Cybersecurity has become an increasingly critical concern as organizations rely more on digital infrastructure to conduct their operations. Cyber threats like phishing, ransomware, and Distributed Denial of Service (DDoS) attacks can cause significant financial and reputational damage to businesses. Advanced encryption techniques, multi-factor authentication, and firewalls are standard defenses against such threats, but newer strategies like zero-trust security models and artificial intelligence-driven threat detection systems are becoming essential in identifying and mitigating cyberattacks in real time.",
        "true_keywords": ["cybersecurity", "phishing", "ransomware", "DDoS attacks", "encryption", "multi-factor authentication", "firewalls", "zero-trust security", "AI-driven threat detection"]
    },
    {
        "sentence": "Machine learning models, particularly deep learning architectures, have dramatically improved the ability of computers to recognize patterns in data. These models have been applied to a wide range of applications, from image recognition and speech processing to drug discovery and financial forecasting. However, one of the biggest challenges in machine learning is ensuring that models generalize well to new data. Overfitting, where a model performs well on training data but poorly on unseen data, remains a key issue, and techniques like cross-validation and regularization are commonly used to address it.",
        "true_keywords": ["machine learning", "deep learning", "image recognition", "speech processing", "drug discovery", "financial forecasting", "overfitting", "cross-validation", "regularization"]
    },
    {
        "sentence": "Edge computing is becoming a critical technology as more data is generated at the edge of networks, particularly by Internet of Things (IoT) devices. Rather than sending all data to centralized cloud servers for processing, edge computing enables data processing to occur closer to the source, reducing latency and bandwidth usage. This is especially important for applications like autonomous vehicles, industrial automation, and smart cities, where real-time data processing is essential. Edge computing also enhances privacy and security by keeping sensitive data local, rather than transmitting it to the cloud.",
        "true_keywords": ["edge computing", "IoT", "real-time data processing", "autonomous vehicles", "industrial automation", "smart cities", "privacy", "security", "cloud"]
    },
    {
        "sentence": "DevOps practices have transformed the way software is developed, tested, and deployed, by encouraging collaboration between development and operations teams. Continuous integration (CI) and continuous delivery (CD) pipelines are central to the DevOps approach, allowing for faster, more reliable software releases. Tools like Jenkins, Docker, and Kubernetes have become essential in automating the deployment process, ensuring that applications can scale and adapt quickly. By implementing infrastructure as code (IaC), DevOps teams can manage and configure infrastructure through code, making deployments more consistent and reducing the likelihood of configuration errors.",
        "true_keywords": ["DevOps", "continuous integration", "continuous delivery", "CI", "CD", "Jenkins", "Docker", "Kubernetes", "infrastructure as code", "IaC"]
    }
    
]


In [56]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载 DistilBERT 模型和分词器，并转移到 GPU
tokenizer = AutoTokenizer.from_pretrained("ml6team/keyphrase-extraction-distilbert-inspec")
model = AutoModelForTokenClassification.from_pretrained("ml6team/keyphrase-extraction-distilbert-inspec")
model.to(device) 

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [57]:
def extract_keywords_distilbert_inspec(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class_ids = torch.argmax(logits, dim=2)

    predicted_keywords = []
    words = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze(0).cpu())

    current_keyword = []
    for word, label_id in zip(words, predicted_class_ids[0].cpu()):
        label = model.config.id2label[label_id.item()]
        if label == 'B-KEY':
            if current_keyword:
                predicted_keywords.append(" ".join(current_keyword))
                current_keyword = []
            current_keyword.append(word)
        elif label == 'I-KEY':
            current_keyword.append(word)
        else:
            if current_keyword:
                predicted_keywords.append(" ".join(current_keyword))

    if current_keyword:
        predicted_keywords.append(" ".join(current_keyword))

    cleaned_keywords = []
    for keyword in predicted_keywords:
        cleaned_keyword = keyword.replace(' ##', '').strip()
        cleaned_keyword = re.sub(r'\s+', ' ', cleaned_keyword)  # 移除多余空格
        cleaned_keyword = re.sub(r'[^\w\s]', '', cleaned_keyword)  # 移除非单词字符（标点）
        if cleaned_keyword:  # 仅添加非空的关键字
            cleaned_keywords.append(cleaned_keyword)

    unique_keywords = list(set(cleaned_keywords))

    return unique_keywords

In [12]:
def evaluate_keywords_extraction(extraction_function):
    all_true_keywords = []
    all_predicted_keywords = []

    for item in test_set:
        sentence = item["sentence"]
        true_keywords = item["true_keywords"]

     
        predicted_keywords = extraction_function(sentence)

        
        true_keywords_lower = set([kw.lower() for kw in true_keywords])
        predicted_keywords_lower = set([kw.lower() for kw in predicted_keywords])

        true_positive = len(true_keywords_lower & predicted_keywords_lower)
        predicted_positive = len(predicted_keywords_lower)
        actual_positive = len(true_keywords_lower)

        precision = true_positive / predicted_positive if predicted_positive > 0 else 0
        recall = true_positive / actual_positive if actual_positive > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        all_true_keywords.append(true_positive)
        all_predicted_keywords.append(predicted_positive)

        print(f"Sentence: {sentence}")
        print(f"True Keywords: {true_keywords}")
        print(f"Predicted Keywords: {predicted_keywords}")
        print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1:.2f}\n")

    total_true_positive = sum(all_true_keywords)
    total_predicted_positive = sum(all_predicted_keywords)
    total_actual_positive = sum([len(item["true_keywords"]) for item in test_set])

    overall_precision = total_true_positive / total_predicted_positive if total_predicted_positive > 0 else 0
    overall_recall = total_true_positive / total_actual_positive if total_actual_positive > 0 else 0
    overall_f1 = (2 * overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

    return overall_precision, overall_recall, overall_f1

In [18]:
precision, recall, f1 = evaluate_keywords_extraction(extract_keywords_distilbert_inspec)

print(f"Overall Precision: {precision:.2f}")
print(f"Overall Recall: {recall:.2f}")
print(f"Overall F1-Score: {f1:.2f}")

Sentence: Deep learning has revolutionized artificial intelligence, primarily through the development of sophisticated neural network architectures such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs). CNNs have achieved state-of-the-art performance in image recognition and computer vision tasks, while RNNs and their variants, such as Long Short-Term Memory (LSTM) and Gated Recurrent Units (GRU), have been particularly successful in sequential data tasks like natural language processing and speech recognition. The rise of generative models, including Generative Adversarial Networks (GANs) and Transformer models like GPT-3, has also opened new doors in text generation and content creation, enabling machines to create art, music, and even complex narratives.
True Keywords: ['deep learning', 'convolutional neural networks', 'recurrent neural networks', 'image recognition', 'computer vision', 'LSTM', 'GRU', 'GANs', 'GPT-3', 'text generation', 'natural language 

In [70]:
def process_keywords_in_folder(base_folder):
    # 遍历 base_folder 中的所有子文件夹
    for folder_name in os.listdir(base_folder):
        folder_path = os.path.join(base_folder, folder_name)
        
        if os.path.isdir(folder_path):
            print(f"Processing folder: {folder_name}")
            
            # 寻找同名的摘要文本文件，例如 EcoAgent(1)_summary.txt
            summary_file_name = f"{folder_name}_summary.txt"
            summary_file_path = os.path.join(folder_path, summary_file_name)
            
            if os.path.exists(summary_file_path):
                # 读取摘要文件内容
                with open(summary_file_path, 'r', encoding='utf-8') as file:
                    summary_text = file.read()

                # 提取关键词
                keywords = extract_keywords_distilbert_inspec(summary_text)
                print(f"Keywords for {folder_name}: {keywords}")

                # 将关键词保存到文件中
                keywords_file_path = os.path.join(folder_path, f"{folder_name}_keywords.txt")
                with open(keywords_file_path, 'w', encoding='utf-8') as keyword_file:
                    for keyword in keywords:
                        keyword_file.write(keyword + '\n')

                print(f"Saved keywords to {keywords_file_path}")
            else:
                print(f"No summary file found for folder: {folder_name}")


In [71]:
base_folder = 'images_folder' 
process_keywords_in_folder(base_folder)

Processing folder: EconAgent(1)
Keywords for EconAgent(1): ['average price goods', 'government taxation', 'employment rate inflation simulation', 'econagent', 'macroeconomics', 'unemployment rate world', 'unemployment rate', 'employment rate inflation', 'macroeconomic simu lation', 'monthly wage wage', 'economic trends', 'un', 'average price', 'simulation markets', 'savings account balance', 'consumption market', 'interest rate', 'age distribution', 'essential goods', 'real consumption goods', 'artificial intelligence', 'phillips curve', 'negative correlations', 'randomly selected goods', 'gpt', 'simulation envi ronment', 'simulation', 'decision rules', 'phillips curve law', 'language model', 'human choices', 'human economic behavior', 'employment rate', 'monthly wage', 'python', 'market dynamics', 'savings account balance interest rates', 'empowered agent', 'essential goods goods price', 'randomly selected', 'deflation', 'living costs', 'phillips curve lawun']
Saved keywords to images

In [77]:
import requests

def remove_html_tags_and_cleanup(text):
    clean_text = re.sub('<.*?>', '', text)
    clean_text = re.sub(r'\n+', '\n', clean_text)
    return clean_text

def search_wikipedia(keywords):
    url = "https://en.wikipedia.org/w/api.php"
    search_results = {}

    for keyword in keywords:
        print(f"Searching for: {keyword}")
        params = {
            "action": "query",
            "list": "search",
            "srsearch": keyword,
            "format": "json"
        }
        
        response = requests.get(url, params=params)
        if response.status_code == 200:
            data = response.json()
            search_results_data = data.get('query', {}).get('search', [])
            if search_results_data:
                result = search_results_data[0]
                title = result['title']
                
                # 获取页面引言部分
                extract_params = {
                    "action": "query",
                    "prop": "extracts",
                    "exintro": True,
                    "titles": title,
                    "format": "json"
                }
                extract_response = requests.get(url, params=extract_params)
                if extract_response.status_code == 200:
                    extract_data = extract_response.json()
                    pages = extract_data.get('query', {}).get('pages', {})
                    page = next(iter(pages.values()))
                    extract = page.get('extract', '')

                    # 移除 HTML 标签并清理换行符
                    clean_extract = remove_html_tags_and_cleanup(extract)
                    search_results[keyword] = {"title": title, "extract": clean_extract}
                else:
                    print(f"Failed to fetch extract for '{title}'")
            else:
                print(f"No results found for '{keyword}'.\n")
        else:
            print(f"Failed to fetch data from Wikipedia API for keyword '{keyword}', Status Code: {response.status_code}")
    
    return search_results 

In [78]:
def process_keywords_in_folder(base_folder):
    for folder_name in os.listdir(base_folder):
        folder_path = os.path.join(base_folder, folder_name)
        
        if os.path.isdir(folder_path):
            print(f"Processing folder: {folder_name}")
            
            # 寻找同名的关键词文本文件，例如 EcoAgent(1)_keywords.txt
            keywords_file_name = f"{folder_name}_keywords.txt"
            keywords_file_path = os.path.join(folder_path, keywords_file_name)
            
            if os.path.exists(keywords_file_path):
                # 读取关键词
                with open(keywords_file_path, 'r', encoding='utf-8') as file:
                    keywords = [line.strip() for line in file if line.strip()]
                
                if keywords:
                    # 搜索关键词并获取结果
                    search_results = search_wikipedia(keywords)
                    
                    # 保存 Wikipedia 搜索结果
                    output_file_path = os.path.join(folder_path, f"{folder_name}_wikipedia_search_results.txt")
                    with open(output_file_path, 'w', encoding='utf-8') as output_file:
                        for keyword, result in search_results.items():
                            output_file.write(f"Keyword: {keyword}\n")
                            output_file.write(f"Title: {result['title']}\n")
                            output_file.write(f"Extract: {result['extract']}\n\n")
                    print(f"Saved Wikipedia search results to {output_file_path}")
            else:
                print(f"No keywords file found for folder: {folder_name}")

In [79]:
base_folder = 'images_folder'  # 根目录，例如 'images_folder'
process_keywords_in_folder(base_folder)

Processing folder: EconAgent(1)
Searching for: average price goods
Searching for: government taxation
Searching for: employment rate inflation simulation
Searching for: econagent
No results found for 'econagent'.

Searching for: macroeconomics
Searching for: unemployment rate world
Searching for: unemployment rate
Searching for: employment rate inflation
Searching for: macroeconomic simu lation
No results found for 'macroeconomic simu lation'.

Searching for: monthly wage wage
Searching for: economic trends
Searching for: un
Searching for: average price
Searching for: simulation markets
Searching for: savings account balance
Searching for: consumption market
Searching for: interest rate
Searching for: age distribution
Searching for: essential goods
Searching for: real consumption goods
Searching for: artificial intelligence
Searching for: phillips curve
Searching for: negative correlations
Searching for: randomly selected goods
Searching for: gpt
Searching for: simulation envi ronment
