In [66]:
pip install torch --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Note: you may need to restart the kernel to use updated packages.


In [67]:
!pip install transformers



In [68]:
import json
import os
import torch
import pickle
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from typing import List, Tuple

In [69]:
# ----------------------------
# CONFIGURATION
# ----------------------------

# Model identifier string; run_sdg_classification passes this same literal to
# pipeline(model=...) instead of reading this constant.
MODEL_NAME = "facebook/bart-large-mnli"
# True when PyTorch reports a usable CUDA device.
USE_GPU = torch.cuda.is_available()
# NOTE(review): presumably the number of top-scoring labels to keep per text.
# None of the cells visible here read this constant - confirm intended use.
TOP_K = 3
# NOTE(review): presumably an input-length cap. run_sdg_classification instead
# hard-codes a limit of 512 whitespace-separated words - confirm intended use.
MAX_TOKENS = 512

In [70]:
# ----------------------------
# SDG CANDIDATE LABELS (flattened)
# ----------------------------

# Titles of the 17 Sustainable Development Goals, one string per goal.
# NOTE: run_sdg_classification does not read this list; it takes its candidate
# labels from the first column of the pickled object at `goal_path`.
sdg_labels = [
    "No Poverty",
    "Zero Hunger",
    "Good Health and Well-being",
    "Quality Education",
    "Gender Equality",
    "Clean Water and Sanitation",
    "Affordable and Clean Energy",
    "Decent Work and Economic Growth",
    "Industry, Innovation and Infrastructure",
    "Reduced Inequalities",
    "Sustainable Cities and Communities",
    "Responsible Consumption and Production",
    "Climate Action",
    "Life Below Water",
    "Life on Land",
    "Peace, Justice and Strong Institutions",
    "Partnerships for the Goals"
]


In [71]:
# ----------------------------
# LOAD AND PARSE INPUT FILE
# ----------------------------

def load_dat_file(path: str) -> List[dict]:
    """Read a tab-separated file of patent records.

    Each line is stripped of surrounding whitespace and split at its first
    tab into an identifier and a text body. Lines that contain no tab after
    stripping (blank lines, lines without a separator, lines whose only tab
    was trailing) are skipped silently.

    Args:
        path: Location of the UTF-8 encoded input file.

    Returns:
        A list of ``{'ep': ..., 'text': ...}`` dictionaries in file order,
        with both values stripped of surrounding whitespace.
    """
    records: List[dict] = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            identifier, separator, body = raw_line.strip().partition('\t')
            if not separator:
                # Nothing to split on: not a valid "<ep><TAB><text>" record.
                continue
            records.append({'ep': identifier.strip(), 'text': body.strip()})
    return records

In [72]:
# ----------------------------
# CLASSIFIER FUNCTION
# ----------------------------

def classify_with_llm(text: str, labels: List[str], classifier, top_k: int = 3) -> List[Tuple[str, float]]:
    """Score `text` against `labels` and return the best-scoring pairs.

    Args:
        text: The text to classify.
        labels: Candidate labels handed to the classifier.
        classifier: Callable invoked as
            ``classifier(text, candidate_labels=labels, multi_label=True)``;
            its result must expose parallel ``'labels'`` and ``'scores'`` entries.
        top_k: Maximum number of (label, score) pairs to return.

    Returns:
        Up to `top_k` ``(label, score)`` tuples ordered by descending score.
        Pairs with equal scores keep the order the classifier returned them in.
    """
    outcome = classifier(text, candidate_labels=labels, multi_label=True)
    ranked = [(label, score) for label, score in zip(outcome['labels'], outcome['scores'])]
    # list.sort is stable, also with reverse=True, so ties keep classifier order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]


In [73]:
# ----------------------------
# MAIN FUNCTION
# ----------------------------

def run_sdg_classification(input_path, goal_path, output_path, classifier=None):
    """Classify every patent text against the SDG goal labels and save the scores as JSON.

    Args:
        input_path: UTF-8 text file holding one tab-separated "<ep> <text>"
            record per line. Lines that do not contain both parts are skipped.
        goal_path: Pickle file holding an object that supports ``.iloc``; the
            candidate labels are read from its first column.
        output_path: Destination JSON file. It receives a list of objects with
            the keys "ep", "text" and "predictions" (label/score pairs in the
            order returned by the classifier).
        classifier: Optional callable invoked as
            ``classifier(text, candidate_labels=..., multi_label=True)`` that
            returns a mapping with "labels" and "scores". When omitted, a
            "facebook/bart-large-mnli" zero-shot pipeline is created, on GPU 0
            if CUDA is available and on CPU otherwise. Passing one in lets
            callers reuse an already loaded model across several runs.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only point goal_path at files you created yourself or otherwise trust.
    with open(goal_path, 'rb') as f:
        sdg_df = pickle.load(f)
    goal_labels = sdg_df.iloc[:, 0].tolist()  # assumes SDG names are in first column

    # Load patent texts, skipping malformed lines the same way load_dat_file does.
    documents = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            # Strip first, then look for the tab: a line such as "EP1\t\n"
            # contains a tab but has none left after stripping, which used to
            # make the two-value unpack raise ValueError and abort the run.
            ep, separator, text = line.strip().partition("\t")
            if not separator:
                continue
            documents.append({"ep": ep.strip(), "text": text.strip()})

    # Load model (only when the caller did not supply a classifier)
    if classifier is None:
        classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=0 if torch.cuda.is_available() else -1,
        )

    # Classify each text
    results = []
    for doc in documents:
        # Cap the input at 512 whitespace-separated words. This is a word
        # count, not a tokenizer token count.
        truncated = " ".join(doc["text"].split()[:512])
        pred = classifier(truncated, candidate_labels=goal_labels, multi_label=True)
        results.append({
            "ep": doc["ep"],
            "text": doc["text"],  # the full text is stored, not the truncated one
            "predictions": list(zip(pred["labels"], pred["scores"]))
        })

    # Save results
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f" Classification done. Results saved to {output_path}")

### EXAMPLE RUN

In [74]:
# Create a two-record sample input file; each line is "<EP id><TAB><description>".
sample_lines = [
    "EP1234567\tA system for purifying water using solar energy.\n",
    "EP7654321\tAn AI method for detecting illegal fishing activity.\n",
]
with open("test1.dat", "w", encoding="utf-8") as sample_file:
    sample_file.writelines(sample_lines)


In [75]:
# Classify the sample patents. "sgd_goals.dat" is read with pickle.load and is
# not created by the cells shown here, so it must already exist.
run_sdg_classification(
    input_path="test1.dat",
    goal_path="sgd_goals.dat",
    output_path="results.json",
)

Device set to use cpu


 Classification done. Results saved to results.json


In [76]:
import json

# Reload the saved predictions and print every goal score per patent.
with open("results.json", "r", encoding="utf-8") as results_file:
    results = json.load(results_file)

for entry in results:
    report = [f"\nPatent: {entry['ep']}"]
    report.extend(f" - {goal}: {score:.2f}" for goal, score in entry['predictions'])
    # One print per patent; the joined text matches printing line by line.
    print("\n".join(report))


Patent: EP1234567
 - Clean Water and Sanitation: 0.95
 - Affordable and Clean Energy: 0.94
 - Responsible Consumption and Production: 0.72
 - Industry, Innovation and Infrastructure: 0.66
 - Climate Action: 0.20
 - Reduced Inequalities: 0.20
 - Partnerships for the Goals: 0.19
 - Sustainable Cities and Communities: 0.16
 - Decent Work and Economic Growth: 0.10
 - Life Below Water: 0.07
 - No Poverty: 0.05
 - Life on Land: 0.01
 - Zero Hunger: 0.01
 - Good Health and Well-being: 0.01
 - Peace, Justice and Strong Institutions: 0.00
 - Gender Equality: 0.00
 - Quality Education: 0.00

Patent: EP7654321
 - Reduced Inequalities: 0.19
 - Life Below Water: 0.13
 - No Poverty: 0.01
 - Industry, Innovation and Infrastructure: 0.01
 - Partnerships for the Goals: 0.01
 - Life on Land: 0.01
 - Zero Hunger: 0.00
 - Responsible Consumption and Production: 0.00
 - Decent Work and Economic Growth: 0.00
 - Sustainable Cities and Communities: 0.00
 - Climate Action: 0.00
 - Peace, Justice and Strong In