In [9]:
import json
import re
from api.config.db_config import get_db_connection
from typing import Tuple

from ollama import Client
from ai.models.prompt.sdg_citation_prompt import sdg_citation_prompt

# Ollama client for the server at 127.0.0.1:11434; the generation loop calls
# client.generate() on it.
client = Client(host="http://127.0.0.1:11434")

# Module-level database connection shared by get_abs() and get_description().
conn = get_db_connection()

def get_abs(patent_number):
    """
    Fetch the English abstract of a patent from the database.

    Runs a parameterized SELECT against the ``patent`` table through the
    module-level ``conn`` connection.

    Args:
        patent_number: Value matched against ``patent.number``.

    Returns:
        The ``en_abstract`` column of the first matching row, or ``None``
        when no row matches the given patent number.
    """
    fetch_patent_query = """
    SELECT en_abstract
    FROM patent
    WHERE patent.number = %s
    """
    cursor = conn.cursor()
    try:
        cursor.execute(fetch_patent_query, (patent_number,))
        row = cursor.fetchone()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()
    # fetchone() yields None when nothing matched; indexing it would raise
    # a TypeError, so report "not found" as None instead.
    return row[0] if row is not None else None


def get_description(patent_number):
    """
    Fetch all description paragraphs stored for a patent.

    Runs a parameterized SELECT against the ``patent_description`` table
    through the module-level ``conn`` connection.

    Args:
        patent_number: Value matched against ``patent_description.patent_number``.

    Returns:
        A list with one dict per returned row, in the order the database
        returned them. Each dict has the keys ``"description_number"`` and
        ``"description_text"``. The list is empty when no row matches.
    """
    fetch_description_query = """
    SELECT description_number, description_text
    FROM patent_description
    WHERE patent_number = %s;
    """
    cursor = conn.cursor()
    try:
        cursor.execute(fetch_description_query, (patent_number,))
        rows = cursor.fetchall()
    finally:
        # Release the cursor even when the query or the fetch raises.
        cursor.close()
    return [
        {
            "description_number": row[0],
            "description_text": row[1],
        }
        for row in rows
    ]

In [10]:
def extract_sdgs(text: str) -> list[str]:
    """
    Find SDG references in free text and return them in a normalised form.

    Three kinds of mention are recognised:
    - the word "SDG" (any letter case) followed by a number, optionally with
      a sub-target, e.g. "SDG1", "sdg 2", "SDG13.4";
    - a bare number with a sub-target, e.g. "16.1" or "3.4", of which only
      the part before the dot is kept;
    - a standalone one- or two-digit number at the very start of the text,
      after a comma/semicolon/colon, or after whitespace, and followed by
      whitespace, a comma/semicolon, or the end of the text.

    Only goal numbers from 1 to 17 are kept.

    Args:
        text (str): The text to scan.

    Returns:
        list[str]: Unique goals formatted as "SDG<number>", ordered by goal
            number. ["None"] is returned when the input is empty, is not a
            string, or contains no goal in the 1-17 range.
    """
    if not text or not isinstance(text, str):
        return ["None"]

    # Each regex is paired with the string it is run against: the standalone
    # number pattern is applied to the stripped text, the other two to the
    # text as given.
    searches = (
        # "SDG" + number, optional sub-target (SDG1, sdg 2, SDG13.4, ...)
        (r'(?i)\bsdg\s*(\d{1,2})(?:\.\d+)?\b', text),
        # number with sub-target (16.1, 3.4, ...)
        (r'\b(\d{1,2})\.\d+\b', text),
        # standalone number at the start or after a delimiter
        (r'(?:^|[,;:]\s*|(?<=\s))(\d{1,2})(?=\s*[,;]|\s*$|\s+)', text.strip()),
    )

    # A set drops duplicates reported by more than one pattern.
    goals = {
        int(digits)
        for pattern, subject in searches
        for digits in re.findall(pattern, subject)
        if 1 <= int(digits) <= 17
    }

    labels = [f"SDG{goal}" for goal in sorted(goals)]
    return labels if labels else ["None"]

In [11]:
def get_citation_explanation(text: str) -> Tuple[str, str]:
    """
    Extract the citation and the explanation from a model response.

    Everything up to and including the first ``</think>`` tag is discarded,
    so tags that appear inside the reasoning part are ignored. When the
    response has no ``</think>`` tag at all, the whole text is searched
    instead of raising.

    Args:
        text (str): Raw response text.

    Returns:
        Tuple[str, str]: ``(citation, explanation)``, the stripped contents
            of the first ``<citation>...</citation>`` and
            ``<explanation>...</explanation>`` pairs. A missing pair yields
            an empty string in its position.
    """
    # partition() never raises: without the tag it returns (text, "", "").
    _, think_end, answer = text.partition("</think>")
    if not think_end:
        answer = text

    # DOTALL lets the tag contents span several lines.
    citation_match = re.search(r'<citation>(.*?)</citation>', answer, re.DOTALL)
    explanation_match = re.search(r'<explanation>(.*?)</explanation>', answer, re.DOTALL)

    citation = citation_match.group(1).strip() if citation_match else ""
    explanation = explanation_match.group(1).strip() if explanation_match else ""
    return citation, explanation

In [12]:
# Generate a citation and an explanation for every SDG label in the labeled
# test set and stream the results to a JSONL file, one JSON object per line.
MAX_PATENT_WORDS = 3000  # whitespace-separated words of patent text kept for the prompt
MODEL_NAME = "qwen3:8b"  # Make sure this model is available in your Ollama instance

with open("../src/ai/testsets/dataset_labeled_4B.jsonl", "r", encoding="utf-8") as f:
    # The output file name is reproduced exactly as it was, in case other code
    # reads the results under this name.
    with open("ouput_data.jsonl", "w", encoding="utf-8") as f_out:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)

            # Every record is rebuilt from scratch so that nothing carries over
            # from the previous line when a key is missing.
            # Assumes sdg_citation_prompt accepts an empty reason - TODO confirm.
            reason = record.get("reason", "")
            labeled = [(key, value) for key, value in record.items() if key != "reason"]
            if not labeled:
                # No patent entry on this line: there is nothing to generate.
                continue
            # When a record holds several non-"reason" keys, the last one is
            # the one that is processed.
            patent_number, sdg_label = labeled[-1]

            # A missing abstract must not reach the prompt as the text "None".
            abstract = get_abs(patent_number) or ""
            sdgs = extract_sdgs(sdg_label)
            descriptions = get_description(patent_number)

            text_parts = [f"{abstract}\n"]
            text_parts.extend(
                f"{description['description_number']}: {description['description_text']}\n"
                for description in descriptions
            )
            # Collapse all whitespace and cap the text at MAX_PATENT_WORDS words.
            patent_text = " ".join("".join(text_parts).split()[:MAX_PATENT_WORDS])

            for sdg in sdgs:
                formatted_prompt = sdg_citation_prompt(patent_text, sdg, reason)

                response_data = client.generate(
                    model=MODEL_NAME,
                    prompt=formatted_prompt,
                    options={"temperature": 0.2}
                )
                raw_output_text = response_data.get('response', '').strip()
                citation, explanation = get_citation_explanation(raw_output_text)

                sdg_summary_detail = {
                    "patent_number": patent_number,
                    "sdg": sdg,
                    "sdg_reason": explanation,
                    "sdg_details": citation
                }
                f_out.write(json.dumps(sdg_summary_detail, ensure_ascii=False) + "\n")
                # Flush after every record so partial results survive an interrupted run.
                f_out.flush()


2025-05-27 02:15:07,605 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-27 02:15:19,905 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-27 02:16:04,339 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-27 02:16:21,062 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-27 02:16:39,772 - httpx - INFO - _send_single_request - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"


KeyboardInterrupt: 