In [None]:
import json

from ollama import Client

# Ollama client bound to the server at 127.0.0.1:11434 (local loopback).
client = Client(host="http://127.0.0.1:11434")

# Read the SDG labelling prompt template. The encoding is pinned to UTF-8,
# matching the dataset reads elsewhere in this notebook, so the decoded text
# does not depend on the platform's default locale encoding.
with open("sdg_label_prompt.md", "r", encoding="utf-8") as f:
    prompt_template = f.read()

In [24]:
def analyze_classification(classification_data, total_elements):
    """
    Print a per-category share report for a classification tally.

    Each category's count is divided by ``total_elements`` (supplied by the
    caller, not derived from the counts) and shown as a percentage. When
    ``total_elements`` is not positive, every share is reported as 0.00%.
    Nothing is returned; the report is written to standard output.

    Args:
        classification_data (dict): Maps each category name to its count.
        total_elements (int): Denominator used for every proportion.
    """
    has_total = total_elements > 0

    # Share per category; falls back to 0 when there is nothing to divide by.
    shares = {
        label: (tally / total_elements if has_total else 0)
        for label, tally in classification_data.items()
    }

    # Header: the raw input followed by the denominator in use.
    print(f"Initial dictionary: {classification_data}")
    print(f"Total number of elements: {total_elements}")

    # One line per category: percentage plus the raw "count/total" fraction.
    print("\nProportion of each category:")
    for label, share in shares.items():
        print(f"{label}: {share:.2%} ({classification_data[label]}/{total_elements})")

    # Same shares again, pre-formatted as percentage strings in a dict.
    as_percentages = {label: f"{share:.2%}" for label, share in shares.items()}
    print(f"\nProportions dictionary (formatted as percentages): {as_percentages}")


In [25]:
def get_sdg_strings(sdg_content: str) -> list[str]:
    """Extract normalized SDG labels from a comma-separated string.

    Every comma-separated token is trimmed and lower-cased, an optional
    leading "sdg" (plus any whitespace right after it) is dropped, and what
    remains is kept only when it is an integer from 1 to 17. Matches are
    de-duplicated, rendered as "SDG<number>", and returned in ascending
    numeric order.

    Args:
        sdg_content (str): Raw text such as '9', '7,9', 'SDG7, SDG9' or
            'sdg 5'. The "SDG" prefix is matched case-insensitively.

    Returns:
        list[str]: Sorted labels such as ["SDG7", "SDG9"], or ["none"] when
            the input is empty/whitespace-only or contains no valid SDG
            number.

    Examples:
        >>> get_sdg_strings('9')
        ['SDG9']
        >>> get_sdg_strings('sdg5,sdg1')
        ['SDG1', 'SDG5']
        >>> get_sdg_strings('SDG1, SDG1, SDG2')
        ['SDG1', 'SDG2']
        >>> get_sdg_strings('SDG18, 0, text')
        ['none']
        >>> get_sdg_strings('')
        ['none']
        >>> get_sdg_strings('Test, SDG11, 25')
        ['SDG11']
    """
    # Nothing to parse: falsy input or only whitespace.
    if not (sdg_content and sdg_content.strip()):
        return ["none"]

    # Collect the numbers themselves; the set collapses repeats such as "7, SDG7".
    sdg_numbers = set()
    for token in sdg_content.split(","):
        # "SDG 7", "sdg7" and "7" all reduce to the bare digits "7".
        candidate = token.strip().lower().removeprefix("sdg").strip()
        if not candidate.isdigit():
            continue
        try:
            value = int(candidate)
        except ValueError:
            # str.isdigit() also accepts characters such as superscript
            # digits, which int() refuses to parse; skip those tokens.
            continue
        if 1 <= value <= 17:
            sdg_numbers.add(value)

    if not sdg_numbers:
        return ["none"]

    # Numeric sort so that e.g. SDG9 precedes SDG10.
    return [f"SDG{value}" for value in sorted(sdg_numbers)]

In [26]:
# Tally how often each SDG label (or "none") occurs in the MAGB-labeled test set.
# One counter per goal SDG1..SDG17, followed by a bucket for values that yield
# no valid SDG.
classification = {f"SDG{goal}": 0 for goal in range(1, 18)}
classification["none"] = 0

count_line = 0  # number of JSONL records read; used as the denominator below
with open("../src/ai/testsets/dataset_labeled_MAGB.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        count_line += 1
        for key, value in record.items():
            # The free-text "reason" field carries no labels; every other
            # field is parsed for SDG references.
            if key == "reason":
                continue
            for label in get_sdg_strings(value):
                classification[label] += 1

# Report the shares. A record can contribute to several categories, so the
# percentages need not sum to 100%.
analyze_classification(classification, count_line)


Initial dictionary: {'SDG1': 0, 'SDG2': 23, 'SDG3': 522, 'SDG4': 10, 'SDG5': 1, 'SDG6': 38, 'SDG7': 465, 'SDG8': 35, 'SDG9': 2201, 'SDG10': 7, 'SDG11': 79, 'SDG12': 252, 'SDG13': 61, 'SDG14': 6, 'SDG15': 2, 'SDG16': 17, 'SDG17': 0, 'none': 46}
Total number of elements: 3000

Proportion of each category:
SDG1: 0.00% (0/3000)
SDG2: 0.77% (23/3000)
SDG3: 17.40% (522/3000)
SDG4: 0.33% (10/3000)
SDG5: 0.03% (1/3000)
SDG6: 1.27% (38/3000)
SDG7: 15.50% (465/3000)
SDG8: 1.17% (35/3000)
SDG9: 73.37% (2201/3000)
SDG10: 0.23% (7/3000)
SDG11: 2.63% (79/3000)
SDG12: 8.40% (252/3000)
SDG13: 2.03% (61/3000)
SDG14: 0.20% (6/3000)
SDG15: 0.07% (2/3000)
SDG16: 0.57% (17/3000)
SDG17: 0.00% (0/3000)
none: 1.53% (46/3000)

Proportions dictionary (formatted as percentages): {'SDG1': '0.00%', 'SDG2': '0.77%', 'SDG3': '17.40%', 'SDG4': '0.33%', 'SDG5': '0.03%', 'SDG6': '1.27%', 'SDG7': '15.50%', 'SDG8': '1.17%', 'SDG9': '73.37%', 'SDG10': '0.23%', 'SDG11': '2.63%', 'SDG12': '8.40%', 'SDG13': '2.03%', 'SDG14':

In [27]:
# Tally how often each SDG label (or "none") occurs in dataset_labeled.jsonl.
# Fresh counters: one per goal SDG1..SDG17, followed by a bucket for values
# that yield no valid SDG.
classification = {f"SDG{goal}": 0 for goal in range(1, 18)}
classification["none"] = 0

count_line = 0  # number of JSONL records read; used as the denominator below
with open("../src/ai/testsets/dataset_labeled.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        count_line += 1
        for key, value in record.items():
            # Skip the free-text "reason" field; parse every other field
            # for SDG references.
            if key == "reason":
                continue
            for label in get_sdg_strings(value):
                classification[label] += 1

# Report the shares. A record can contribute to several categories, so the
# percentages need not sum to 100%.
analyze_classification(classification, count_line)

Initial dictionary: {'SDG1': 0, 'SDG2': 15, 'SDG3': 229, 'SDG4': 5, 'SDG5': 0, 'SDG6': 9, 'SDG7': 143, 'SDG8': 7, 'SDG9': 884, 'SDG10': 5, 'SDG11': 26, 'SDG12': 86, 'SDG13': 39, 'SDG14': 2, 'SDG15': 3, 'SDG16': 6, 'SDG17': 0, 'none': 76}
Total number of elements: 1326

Proportion of each category:
SDG1: 0.00% (0/1326)
SDG2: 1.13% (15/1326)
SDG3: 17.27% (229/1326)
SDG4: 0.38% (5/1326)
SDG5: 0.00% (0/1326)
SDG6: 0.68% (9/1326)
SDG7: 10.78% (143/1326)
SDG8: 0.53% (7/1326)
SDG9: 66.67% (884/1326)
SDG10: 0.38% (5/1326)
SDG11: 1.96% (26/1326)
SDG12: 6.49% (86/1326)
SDG13: 2.94% (39/1326)
SDG14: 0.15% (2/1326)
SDG15: 0.23% (3/1326)
SDG16: 0.45% (6/1326)
SDG17: 0.00% (0/1326)
none: 5.73% (76/1326)

Proportions dictionary (formatted as percentages): {'SDG1': '0.00%', 'SDG2': '1.13%', 'SDG3': '17.27%', 'SDG4': '0.38%', 'SDG5': '0.00%', 'SDG6': '0.68%', 'SDG7': '10.78%', 'SDG8': '0.53%', 'SDG9': '66.67%', 'SDG10': '0.38%', 'SDG11': '1.96%', 'SDG12': '6.49%', 'SDG13': '2.94%', 'SDG14': '0.15%', 'S