In [13]:
import json
import re

from ollama import Client

# Client for the Ollama server reachable at the local address below.
client = Client(host="http://127.0.0.1:11434")

# Read the labeling prompt with an explicit UTF-8 encoding so the loaded text
# does not depend on the platform's default encoding; the dataset reads in
# this notebook already specify UTF-8 the same way.
with open("sdg_label_prompt.md", "r", encoding="utf-8") as f:
    prompt_template = f.read()
        


In [None]:
def analyze_classification(classification_data, total_elements):
    """Print a per-category breakdown of counts relative to a given total.

    Every count in ``classification_data`` is divided by ``total_elements``
    (not by the sum of the counts), so the printed percentages need not add
    up to 100%. When ``total_elements`` is not positive, no division is
    performed and every share is reported as 0.00%.

    Args:
        classification_data (dict): Maps each category to its count.
        total_elements: Denominator used for every proportion.

    Returns:
        None. The report is written to standard output only.
    """
    positive_total = total_elements > 0

    # Share of the total per category; falls back to 0 when dividing is not possible.
    shares = {
        label: tally / total_elements if positive_total else 0
        for label, tally in classification_data.items()
    }

    print(f"Initial dictionary: {classification_data}")
    print(f"Total number of elements: {total_elements}")
    print("\nProportion of each category:")
    for label, share in shares.items():
        # A share of 0 renders as "0.00%", which covers the non-positive-total case too.
        print(f"{label}: {share:.2%} ({classification_data[label]}/{total_elements})")

    # Same shares again, pre-rendered as percentage strings.
    as_percent = {label: f"{share:.2%}" for label, share in shares.items()}
    print(f"\nProportions dictionary (formatted as percentages): {as_percent}")


In [15]:
def get_sdg_strings(sdg_content: str) -> list[str]:
    """Extract normalized SDG labels from a comma-separated string.

    The input is split on commas. Each piece is trimmed and lowercased, an
    optional leading "sdg" (and any whitespace after it) is dropped, and what
    remains must be an all-digit number between 1 and 17 to count. Anything
    else is silently ignored. Duplicates collapse to a single label and the
    result is ordered by SDG number.

    Args:
        sdg_content (str): Text such as '9', '7,9' or 'SDG7, sdg 9'.

    Returns:
        list[str]: Labels of the form "SDG<number>" in ascending numeric
            order, or ["none"] when the input is empty, whitespace-only, or
            yields no valid SDG number.

    Examples:
        >>> get_sdg_strings('9')
        ['SDG9']
        >>> get_sdg_strings('7,9')
        ['SDG7', 'SDG9']
        >>> get_sdg_strings('SDG7, SDG9')
        ['SDG7', 'SDG9']
        >>> get_sdg_strings('sdg5,sdg1')
        ['SDG1', 'SDG5']
        >>> get_sdg_strings('SDG1, SDG1, SDG2')
        ['SDG1', 'SDG2']
        >>> get_sdg_strings('SDG18, 0, text')
        ['none']
        >>> get_sdg_strings('')
        ['none']
        >>> get_sdg_strings('Test, SDG11, 25')
        ['SDG11']
    """
    # Nothing to parse in an empty or blank input.
    if not sdg_content or not sdg_content.strip():
        return ["none"]

    # Collect the numbers themselves; the set removes repeats such as "7, SDG7".
    sdg_numbers = set()
    for token in sdg_content.split(','):
        candidate = token.strip().lower()
        if candidate.startswith("sdg"):
            # Drop the prefix and any gap after it, so "sdg 7" and "sdg7" both yield "7".
            candidate = candidate[3:].strip()

        if not candidate.isdigit():
            continue
        try:
            value = int(candidate)
        except ValueError:
            # str.isdigit() also accepts characters (e.g. superscript digits)
            # that int() rejects; such pieces are simply not valid SDG numbers.
            continue
        if 1 <= value <= 17:
            sdg_numbers.add(value)

    if not sdg_numbers:
        return ["none"]
    # Numeric sort first, then format, so "SDG10" comes after "SDG9".
    return [f"SDG{value}" for value in sorted(sdg_numbers)]

In [None]:
# Zero-initialized tally for SDG1..SDG17 followed by the "none" bucket.
classification = {f"SDG{number}": 0 for number in range(1, 18)}
classification["none"] = 0

# Read the JSONL file line by line, counting parsed records and SDG mentions.
count_line = 0
with open("../src/ai/testsets/dataset_labeled_MAGB_test.jsonl", "r", encoding="utf-8") as dataset_file:
    for raw_line in dataset_file:
        record = json.loads(raw_line)
        count_line += 1
        for field, field_value in record.items():
            # The "reason" field is skipped; every other field is parsed for SDG labels.
            if field == "reason":
                continue
            for label in get_sdg_strings(field_value):
                classification[label] += 1

# Report each tally as a share of the number of records read.
analyze_classification(classification, count_line)


Initial dictionary: {'SDG1': 0, 'SDG2': 18, 'SDG3': 422, 'SDG4': 9, 'SDG5': 1, 'SDG6': 33, 'SDG7': 380, 'SDG8': 29, 'SDG9': 1821, 'SDG10': 6, 'SDG11': 58, 'SDG12': 210, 'SDG13': 45, 'SDG14': 4, 'SDG15': 1, 'SDG16': 13, 'SDG17': 0, 'none': 40}
Total number of elements: 3090

Proportion of each category:
SDG1: 0.00% (0/3090)
SDG2: 0.58% (18/3090)
SDG3: 13.66% (422/3090)
SDG4: 0.29% (9/3090)
SDG5: 0.03% (1/3090)
SDG6: 1.07% (33/3090)
SDG7: 12.30% (380/3090)
SDG8: 0.94% (29/3090)
SDG9: 58.93% (1821/3090)
SDG10: 0.19% (6/3090)
SDG11: 1.88% (58/3090)
SDG12: 6.80% (210/3090)
SDG13: 1.46% (45/3090)
SDG14: 0.13% (4/3090)
SDG15: 0.03% (1/3090)
SDG16: 0.42% (13/3090)
SDG17: 0.00% (0/3090)
none: 1.29% (40/3090)

Proportions dictionary (formatted as percentages): {'SDG1': '0.00%', 'SDG2': '0.58%', 'SDG3': '13.66%', 'SDG4': '0.29%', 'SDG5': '0.03%', 'SDG6': '1.07%', 'SDG7': '12.30%', 'SDG8': '0.94%', 'SDG9': '58.93%', 'SDG10': '0.19%', 'SDG11': '1.88%', 'SDG12': '6.80%', 'SDG13': '1.46%', 'SDG14': '

In [None]:
# Fresh zero-initialized tally for SDG1..SDG17 followed by the "none" bucket.
classification = {f"SDG{number}": 0 for number in range(1, 18)}
classification["none"] = 0

# Read the JSONL file line by line, counting parsed records and SDG mentions.
count_line = 0
with open("../src/ai/testsets/dataset_labeled.jsonl", "r", encoding="utf-8") as dataset_file:
    for raw_line in dataset_file:
        record = json.loads(raw_line)
        count_line += 1
        for field, field_value in record.items():
            # The "reason" field is skipped; every other field is parsed for SDG labels.
            if field == "reason":
                continue
            for label in get_sdg_strings(field_value):
                classification[label] += 1

# Report each tally as a share of the number of records read.
analyze_classification(classification, count_line)

Initial dictionary: {'SDG1': 0, 'SDG2': 15, 'SDG3': 229, 'SDG4': 5, 'SDG5': 0, 'SDG6': 9, 'SDG7': 143, 'SDG8': 7, 'SDG9': 884, 'SDG10': 5, 'SDG11': 26, 'SDG12': 86, 'SDG13': 39, 'SDG14': 2, 'SDG15': 3, 'SDG16': 6, 'SDG17': 0, 'none': 76}
Total number of elements: 1535

Proportion of each category:
SDG1: 0.00% (0/1535)
SDG2: 0.98% (15/1535)
SDG3: 14.92% (229/1535)
SDG4: 0.33% (5/1535)
SDG5: 0.00% (0/1535)
SDG6: 0.59% (9/1535)
SDG7: 9.32% (143/1535)
SDG8: 0.46% (7/1535)
SDG9: 57.59% (884/1535)
SDG10: 0.33% (5/1535)
SDG11: 1.69% (26/1535)
SDG12: 5.60% (86/1535)
SDG13: 2.54% (39/1535)
SDG14: 0.13% (2/1535)
SDG15: 0.20% (3/1535)
SDG16: 0.39% (6/1535)
SDG17: 0.00% (0/1535)
none: 4.95% (76/1535)

Proportions dictionary (formatted as percentages): {'SDG1': '0.00%', 'SDG2': '0.98%', 'SDG3': '14.92%', 'SDG4': '0.33%', 'SDG5': '0.00%', 'SDG6': '0.59%', 'SDG7': '9.32%', 'SDG8': '0.46%', 'SDG9': '57.59%', 'SDG10': '0.33%', 'SDG11': '1.69%', 'SDG12': '5.60%', 'SDG13': '2.54%', 'SDG14': '0.13%', 'SDG