In [1]:
import json
import re
from api.config.db_config import get_db_connection
from typing import Tuple

from ollama import Client
from ai.models.prompt.sdg_citation_prompt import sdg_citation_prompt

# Ollama client pointed at the local server on 127.0.0.1:11434.
client = Client(host="http://127.0.0.1:11434")

# Single module-level DB connection; get_abs() and get_description() both open
# their cursors on it.
# NOTE(review): nothing in the visible cells closes this connection — confirm it
# is released elsewhere.
conn = get_db_connection()

def get_abs(patent_number):
    """
    Fetch the English abstract of a patent from the ``patent`` table.

    Args:
        patent_number: Value matched against ``patent.number``.

    Returns:
        The ``en_abstract`` column of the first matching row, or ``None`` when
        no row matches the given number.
    """
    fetch_patent_query = """
    SELECT en_abstract
    FROM patent
    WHERE patent.number = %s
    """
    cursor = conn.cursor()
    try:
        cursor.execute(fetch_patent_query, (patent_number,))
        row = cursor.fetchone()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()

    # fetchone() yields None when nothing matched; indexing it would raise an
    # opaque TypeError, so report "no abstract" explicitly instead.
    if row is None:
        return None
    return row[0]


def get_description(patent_number):
    """
    Fetch every description paragraph stored for a patent.

    Args:
        patent_number: Value matched against ``patent_description.patent_number``.

    Returns:
        A list with one dict per fetched row, each holding
        ``"description_number"`` and ``"description_text"``; empty when no row
        matches. Rows are kept in the order the database returns them (the
        query has no ORDER BY).
    """
    fetch_description_query = """
    SELECT description_number, description_text
    FROM patent_description
    WHERE patent_number = %s;
    """
    cursor = conn.cursor()
    try:
        cursor.execute(fetch_description_query, (patent_number,))
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()

    return [
        {"description_number": row[0], "description_text": row[1]}
        for row in rows
    ]

In [2]:
def extract_sdgs(text: str) -> list[str]:
    """
    Collect the SDG numbers referenced in ``text`` and normalise them.

    Three regex passes are run, and every captured number in the range 1-17
    is kept:

    1. ``SDG`` followed by a number, optionally with a sub-target, matched
       case-insensitively (e.g. "SDG1", "sdg 2", "SDG13.4").
    2. A bare number with a sub-target (e.g. "16.1", "3.4"); only the part
       before the dot is kept.
    3. A standalone one- or two-digit number at the start of the stripped
       text, after ``,``/``;``/``:`` or after whitespace, and followed by
       ``,``/``;``, whitespace or the end of the text.

    Args:
        text (str): The text to scan.

    Returns:
        list[str]: Unique goals formatted as "SDG<number>", sorted
            numerically (e.g. ["SDG1", "SDG2"]). ["None"] is returned when
            the input is empty, is not a string, or contains no number in
            the range 1-17.
    """
    if not (text and isinstance(text, str)):
        return ["None"]

    # (pattern, subject) pairs; only the standalone-number pass runs on the
    # stripped text.
    passes = (
        (r'(?i)\bsdg\s*(\d{1,2})(?:\.\d+)?\b', text),
        (r'\b(\d{1,2})\.\d+\b', text),
        (r'(?:^|[,;:]\s*|(?<=\s))(\d{1,2})(?=\s*[,;]|\s*$|\s+)', text.strip()),
    )

    # A set drops duplicates found by more than one pass.
    goals = {
        int(digits)
        for pattern, subject in passes
        for digits in re.findall(pattern, subject)
        if 1 <= int(digits) <= 17
    }

    if not goals:
        return ["None"]
    return [f"SDG{goal}" for goal in sorted(goals)]

In [3]:
import re
from typing import Tuple

def get_citation_explanation(text: str) -> Tuple[str, str]:
    """
    Pull the summary and the citation/explanation pairs out of a tagged text.

    Anything up to and including the first ``</think>`` is discarded when
    that tag is present. The first ``<summary>`` element supplies the
    summary. Each ``<citation>`` element is kept only if an
    ``<explanation>`` element follows it with nothing but whitespace in
    between.

    Args:
        text: The text to analyze

    Returns:
        Tuple of (summary_content, formatted_citations_explanations). The
        summary is "" when no ``<summary>`` element exists. The second item
        numbers the kept pairs from 1 and joins them with blank lines; it is
        "" when no pair was found.
    """
    # Drop everything up to the first closing think tag
    if "</think>" in text:
        text = text.partition("</think>")[2]

    # First summary element, if any
    summary_hit = re.search(r'<summary>\s*(.*?)\s*</summary>', text, re.DOTALL)
    summary_content = "" if summary_hit is None else summary_hit.group(1).strip()

    # Walk the citations and keep those directly followed by an explanation
    pairs = []
    for cite in re.finditer(r'<citation>\s*(.*?)\s*</citation>', text, re.DOTALL):
        tail = text[cite.end():]
        # '^' anchors the explanation to the very start of what follows the citation
        expl = re.search(r'^\s*<explanation>\s*(.*?)\s*</explanation>', tail, re.DOTALL)
        if expl is None:
            continue
        pairs.append((cite.group(1).strip(), expl.group(1).strip()))

    # Number the pairs from 1 and separate them with a blank line
    formatted_citations_explanations = "\n\n".join(
        f"**Citation {idx}:**\n{cite_text}\n**Explanation {idx}:**\n{expl_text}"
        for idx, (cite_text, expl_text) in enumerate(pairs, 1)
    )

    return summary_content, formatted_citations_explanations



# Example usage with a sample model response
def test_function():
    """
    Run get_citation_explanation on a hard-coded sample response and print the result.

    The sample contains a <think> section, one <summary> element and two
    <citation>/<explanation> pairs. Nothing is asserted and nothing is
    returned; the parsed summary and the formatted pairs are only printed.
    """
    # Sample response text; it is passed to the parser exactly as written here.
    sample_text = """<think>
    Okay, let's tackle this query. The user wants an analysis linking the provided text to SDG 3: Good Health and Well-being. First, I need to understand what the text is about. It's a technical description of a medical device component, specifically a connector mechanism in a surgical stapling instrument. The key points are about ensuring proper electrical connections between the jaw member and the cartridge assembly to provide specifications like staple size, which is crucial for the device's functionality during surgery.

    Now, how does this relate to SDG 3? SDG 3 focuses on ensuring healthy lives and promoting well-being. The text's relevance here is that the device is used in medical procedures, which directly impact patient health outcomes. By ensuring the device works correctly, it contributes to safer surgeries, reducing complications and improving patient recovery. The connector mechanism's reliability is essential for accurate information transfer, which is part of the device's safety and effectiveness.

    Looking at the citations, I need to find parts of the text that mention the device's role in medical procedures. For example, in [description number 16], it says the connector mechanism provides specifications to the handle assembly, which affects the surgery's outcome. Another part in [description number 20] talks about the connector members being beveled for proper engagement, ensuring the device functions correctly during use. These details are crucial for the device's reliability, which in turn supports good health outcomes.

    I need to make sure not to mention other SDGs. Even though the device might relate to innovation or industry (SDG 9), the focus here is strictly on health. The summary should clearly state that the device's proper functioning contributes to safe and effective medical procedures, thus aligning with SDG 3. The citations must be exact and linked to the health aspect, not other areas.
    </think>

    <summary>This text is linked to SDG 3: Good Health and Well-being because it describes a medical device component (a surgical stapling instrument) designed to improve the accuracy and safety of surgical procedures. By ensuring reliable electrical connections between the device's parts, the mechanism enables precise communication of critical cartridge specifications (e.g., staple size, configuration) to the handle assembly, which directly impacts the quality of patient care during surgery. This contributes to safer, more effective medical interventions, aligning with the goal of promoting health and well-being through technological innovation in healthcare.</summary>

    <citation>The tool assembly 104 of the loading unit 100 includes a connector mechanism 130 (FIG. 16) having a first or male connector assembly 140 disposed within the jaw member 110 (FIG. 5) and a second or female connector assembly 160 (FIG. 6) secured to the cartridge assembly 120. As will be described in further detail below, when the cartridge assembly 120 is loaded into the jaw member 110 of the loading unit 100, the second connector assembly 160 of the connector mechanism 130 electrically couples with the first connector assembly 140 of the connector mechanism 130 to provide specifications of the cartridge assembly 120 to the handle assembly 12 (FIG. 1) of the surgical stapling instrument 10, e.g., staple size, number and/or configuration.</citation>  
    <explanation>Directly links the device's functionality to medical procedures by ensuring accurate transmission of critical surgical parameters (e.g., staple size), which enhances the safety and effectiveness of surgeries, a core aspect of SDG 3's focus on health outcomes.</explanation>  

    <citation>First and second connector members 146, 148 extend outwardly from the first connector housing 142 and are positioned and configured to engage the second connector assembly 160 (FIG. 6) of the connector mechanism 130 when the cartridge assembly 120 is received within the channel 111 of the jaw member 110.</citation>  
    <explanation>Highlights the mechanical precision required for the device to function correctly during surgery, ensuring reliable performance that supports safe medical interventions and aligns with SDG 3's objective of promoting health through technological reliability.</explanation>
    """
    
    # Parse the sample into (summary, formatted citation/explanation pairs).
    summary, citations_explanations = get_citation_explanation(sample_text)
    
    # Print both parts, separated by a 50-character "=" rule.
    print("SUMMARY:")
    print(summary)
    print("\n" + "="*50 + "\n")
    print("CITATIONS AND EXPLANATIONS:")
    print(citations_explanations)

# Run the demo so the parsed summary and citation/explanation pairs are printed as this cell's output.
test_function()

SUMMARY:
This text is linked to SDG 3: Good Health and Well-being because it describes a medical device component (a surgical stapling instrument) designed to improve the accuracy and safety of surgical procedures. By ensuring reliable electrical connections between the device's parts, the mechanism enables precise communication of critical cartridge specifications (e.g., staple size, configuration) to the handle assembly, which directly impacts the quality of patient care during surgery. This contributes to safer, more effective medical interventions, aligning with the goal of promoting health and well-being through technological innovation in healthcare.


CITATIONS AND EXPLANATIONS:
**Citation 1:**
The tool assembly 104 of the loading unit 100 includes a connector mechanism 130 (FIG. 16) having a first or male connector assembly 140 disposed within the jaw member 110 (FIG. 5) and a second or female connector assembly 160 (FIG. 6) secured to the cartridge assembly 120. As will be des

In [None]:
# Generation settings for this run, kept next to the loop that uses them.
MAX_PATENT_WORDS = 3000  # cap on whitespace-separated words of patent text sent to the model
CITATION_MODEL = "qwen3:14b"  # Make sure this model is available in your Ollama instance

# For every labeled record: rebuild the patent text from the database, ask the
# model for a summary plus citations for each labeled SDG, and append one JSON
# line per (patent, SDG) to the output file.
# NOTE(review): "ouput_data.jsonl" is misspelled; the name is kept unchanged
# because other code may already read that path.
with open("../src/ai/testsets/dataset_labeled.jsonl", "r", encoding="utf-8") as f, \
        open("ouput_data.jsonl", "w", encoding="utf-8") as f_out:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue

        record = json.loads(line)

        # Reset per-record state so a record missing a key can never reuse
        # values left over from the previous record (or hit a NameError on
        # the very first one).
        patent_number = None
        sdgs = []
        reason = ""
        patent_text = ""

        for key, value in record.items():
            if key != "reason":
                # Every key other than "reason" is treated as a patent number
                # whose value holds the SDG label text.
                patent_number = key
                abstract = get_abs(patent_number)
                sdgs = extract_sdgs(value)
                descriptions = get_description(patent_number)
                patent_text = f"[abstract]: {abstract}\n"
                for description in descriptions:
                    # Single quotes inside the f-string keep this valid on
                    # Python versions older than 3.12.
                    patent_text += (
                        f"[description number {description['description_number']}]: "
                        f"{description['description_text']}\n"
                    )
            else:
                reason = value

        # A record that carries only a "reason" has nothing to generate for.
        if patent_number is None:
            continue

        # Keep only the first MAX_PATENT_WORDS words; this also collapses all
        # whitespace (including newlines) to single spaces.
        patent_text = " ".join(patent_text.split()[:MAX_PATENT_WORDS])

        for sdg in sdgs:
            if sdg != "None":
                formatted_prompt = sdg_citation_prompt(patent_text, sdg)

                response_data = client.generate(
                    model=CITATION_MODEL,
                    prompt=formatted_prompt,
                    options={"temperature": 0.2}
                )
                raw_output_text = response_data.get('response', '').strip()
                summary_content, formatted_citations_explanations = get_citation_explanation(raw_output_text)

                print(raw_output_text)

            # If not a SDG the previous model already made an explanation
            else:
                summary_content, formatted_citations_explanations = reason, ""

            sdg_summary_detail = {
                "patent_number": patent_number,
                "sdg": sdg,
                "summary_content": summary_content,
                "formatted_citations_explanations": formatted_citations_explanations
            }

            print(sdg_summary_detail)

            # Save immediately so finished results survive an interrupted run.
            f_out.write(json.dumps(sdg_summary_detail, ensure_ascii=False) + "\n")
            f_out.flush()

{'patent_number': 'EP4191828A1', 'sdg': 'None', 'summary_content': "The provided text is a technical description of an electronic device's hardware components (e.g., housing, display, sensors, camera modules) and does not mention any sustainability initiatives, social impact, environmental considerations, or alignment with global challenges addressed by the Sustainable Development Goals (SDGs). It focuses solely on the structural and functional aspects of the device without context linking it to SDG-related themes such as innovation, infrastructure, or responsible consumption. As such, no SDG is directly relevant to this text.", 'formatted_citations_explanations': ''}


2025-05-27 05:38:06,060 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


<think>
Okay, let's tackle this query. The user provided a technical document about a medical device connector mechanism and wants an analysis linking it to SDG 3: Good Health and Well-being. 

First, I need to understand why this text is relevant to SDG 3. The document discusses a connector mechanism that ensures proper electrical coupling between a jaw member and a cartridge assembly in a surgical stapler. This is crucial for the device's functionality, which in turn affects patient safety and treatment outcomes. 

Looking at the text, the connector mechanism provides specifications like staple size, number, and configuration to the handle assembly. This ensures that the device operates correctly during surgery, reducing the risk of complications. Proper device function directly contributes to better health outcomes for patients, aligning with SDG 3's goal of promoting well-being.

Now, I need to find specific citations from the text that support this. The first relevant part is in [

2025-05-27 05:40:09,456 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


<think>
Okay, let's tackle this query. The user wants a structured analysis linking the provided text to SDG 9. First, I need to understand why the text is relevant to SDG 9. SDG 9 is about industry, innovation, and infrastructure. The text discusses CPRI link negotiation and fronthaul networks, which are part of telecommunications infrastructure. It also mentions challenges in existing infrastructure and the need for innovation in network setups.

Looking at the original text, there are parts about current deployments using CPRI, which is a standard for infrastructure. The problem with fronthaul networks (intermediate nodes causing discontinuity) points to the need for resilient infrastructure. The mention of innovation in link negotiation procedures and the use of packet-based networks like Ethernet could relate to fostering innovation and sustainable industrialization.

Now, for Part 1, the summary needs to clearly connect the text to SDG 9. I should focus on infrastructure developm

2025-05-27 05:42:18,099 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


<think>
Okay, let's tackle this query. The user wants a structured analysis linking the provided text to SDG 9. First, I need to understand why the text is connected to SDG 9. SDG 9 focuses on industry, innovation, and infrastructure. The text is about radar systems in vehicles, which involves technology and infrastructure. The key points are innovation in radar systems, infrastructure resilience through better technology, and promoting sustainable industrialization by reducing errors and improving efficiency.

Looking at the text, there's mention of monitoring radar frequency bands to avoid interference, which relates to infrastructure resilience. The use of control signals from a CAN bus as a frequency reference shows innovation in using existing infrastructure for more accurate systems. Also, the development of advanced radar systems can be part of fostering innovation in the industry.

Now, I need to find specific citations. The part where they use the CAN bus as a frequency refere

2025-05-27 05:45:54,015 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


<think>
Okay, let's tackle this query. The user provided a lengthy technical text about a waste heat recovery boiler and wants an analysis linking it to SDG 7: Affordable and Clean Energy. First, I need to understand why this text is connected to SDG 7.

SDG 7 focuses on ensuring access to affordable, reliable, sustainable, and modern energy. The key here is energy efficiency and sustainability. The waste heat recovery boiler likely recovers heat that would otherwise be lost, improving energy efficiency. That makes energy use more sustainable and could reduce costs, aligning with affordable and clean energy.

Looking through the text, there are mentions of heat pipes and connection pipes that help distribute fluid uniformly, ensuring even heat absorption. This efficiency in energy use would contribute to sustainable energy practices. Also, by recovering waste heat, the system reduces the need for additional energy sources, which supports clean energy goals.

Now, I need to find specifi

## Test with the production `CitationPatent` class

In [None]:
# from ai.models.CitationPatent import CitationPatent
# from api.config.ai_config import ai_client, ai_model

# citationPatent = CitationPatent(client=ai_client, model_name=ai_model)

# with open("../src/ai/testsets/dataset_labeled.jsonl", "r", encoding="utf-8") as f:
#     with open("ouput_data.jsonl", "w", encoding="utf-8") as f_out:
#         for line in f:
#             iteam = json.loads(line)
#             for key, value in iteam.items():
#                 if key != "reason":
#                     patent_number = key
#                     abs = get_abs(patent_number)
#                     sdgs = extract_sdgs(value)
#                     descriptions = get_description(patent_number)
#                     patent_text = f"[abstract]: {abs}\n"
#                     for description in descriptions:
#                         patent_text += f"[description number {description["description_number"]}]: {description["description_text"]}\n"
#                 else:
#                     reason = value
            
#             patent_text = " ".join(patent_text.split()[:3000])

#             for sdg in sdgs:
                
#                 citation_content, explanation_content = citationPatent.citation(patent_text, sdg, reason)

#                 sdg_summary_detail = {
#                     "patent_number": patent_number,
#                     "sdg": sdg,
#                     "sdg_reason": explanation_content,
#                     "sdg_details": citation_content
#                 }

#                 print(sdg_summary_detail)
            
#                 # Save
#                 f_out.write(json.dumps(sdg_summary_detail, ensure_ascii=False) + "\n")
#                 f_out.flush()

2025-05-27 05:08:44,243 - httpx - INFO - _send_single_request - HTTP Request: GET http://localhost:11434/api/tags "HTTP/1.1 200 OK"
2025-05-27 05:08:59,537 - httpx - INFO - _send_single_request - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


{'patent_number': 'EP4191828A1', 'sdg': 'None', 'sdg_reason': 'This passage describes the technical components of the device, focusing on hardware features without mentioning any SDG-related objectives.', 'sdg_details': 'According to an embodiment, the electronic device 100 may include at least one or more of a display 101, audio modules 103 and 104, camera modules 105, 113, 114, and 115, a key input device 107, and a connector hole 106.'}


2025-05-27 05:09:10,016 - httpx - INFO - _send_single_request - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


{'patent_number': 'EP4140416A1', 'sdg': 'SDG3', 'sdg_reason': 'The connector mechanism electrically couples with the first connector assembly to provide specifications of the cartridge (e.g., staple size, number, and configuration) to the handle assembly, ensuring accurate medical procedures and patient safety.', 'sdg_details': '[description number 16]'}
