# Green Patent Detection (PatentSBERTa): Active Learning + LLM→Human HITL

### Part A: Baseline Model (Frozen Embeddings)

In [3]:
# Installing and importing all necessary libraries

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, classification_report


1) Loading the 50k parquet and check columns

In [6]:
from datasets import load_dataset

ds = load_dataset("AI-Growth-Lab/patents_claims_1.5m_traim_test", split="train")
print(ds)


Generating train split: 100%|██████████| 1372910/1372910 [01:13<00:00, 18727.71 examples/s]
Generating test split: 100%|██████████| 119384/119384 [00:06<00:00, 18309.65 examples/s]


Dataset({
    features: ['id', 'date', 'text', 'A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H', 'A01J', 'A01K', 'A01L', 'A01M', 'A01N', 'A21B', 'A21C', 'A21D', 'A22B', 'A22C', 'A23B', 'A23C', 'A23D', 'A23F', 'A23G', 'A23J', 'A23K', 'A23L', 'A23N', 'A23P', 'A23V', 'A23Y', 'A24B', 'A24C', 'A24D', 'A24F', 'A41B', 'A41C', 'A41D', 'A41F', 'A41G', 'A41H', 'A42B', 'A42C', 'A43B', 'A43C', 'A43D', 'A44B', 'A44C', 'A44D', 'A45B', 'A45C', 'A45D', 'A45F', 'A46B', 'A46D', 'A47B', 'A47C', 'A47D', 'A47F', 'A47G', 'A47H', 'A47J', 'A47K', 'A47L', 'A61B', 'A61C', 'A61D', 'A61F', 'A61G', 'A61H', 'A61J', 'A61K', 'A61L', 'A61M', 'A61N', 'A61P', 'A61Q', 'A62B', 'A62C', 'A62D', 'A63B', 'A63C', 'A63D', 'A63F', 'A63G', 'A63H', 'A63J', 'A63K', 'B01B', 'B01D', 'B01F', 'B01J', 'B01L', 'B02B', 'B02C', 'B03B', 'B03C', 'B03D', 'B04B', 'B04C', 'B05B', 'B05C', 'B05D', 'B06B', 'B07B', 'B07C', 'B08B', 'B09B', 'B09C', 'B21B', 'B21C', 'B21D', 'B21F', 'B21G', 'B21H', 'B21J', 'B21K', 'B21L', 'B22C', 'B22D', 'B22F', 'B23B', '

2. Load train+test and put into one pandas df

In [8]:
ds_train = load_dataset("AI-Growth-Lab/patents_claims_1.5m_traim_test", split="train")
ds_test  = load_dataset("AI-Growth-Lab/patents_claims_1.5m_traim_test", split="test")

df = pd.concat([ds_train.to_pandas(), ds_test.to_pandas()], ignore_index=True)

print(df.shape)
print(df.columns[:20])
df.head(2)


(1492294, 666)
Index(['id', 'date', 'text', 'A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H',
       'A01J', 'A01K', 'A01L', 'A01M', 'A01N', 'A21B', 'A21C', 'A21D', 'A22B',
       'A22C', 'A23B'],
      dtype='object')


Unnamed: 0,id,date,text,A01B,A01C,A01D,A01F,A01G,A01H,A01J,...,Y02B,Y02C,Y02D,Y02E,Y02P,Y02T,Y02W,Y04S,Y10S,Y10T
0,8788730,2014-07-22,1. A method for sending a keycode of a non-key...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8621421,2013-12-31,1. A method executed at least in part in a com...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
df.head()

Unnamed: 0,id,date,text,A01B,A01C,A01D,A01F,A01G,A01H,A01J,...,Y02B,Y02C,Y02D,Y02E,Y02P,Y02T,Y02W,Y04S,Y10S,Y10T
0,8788730,2014-07-22,1. A method for sending a keycode of a non-key...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8621421,2013-12-31,1. A method executed at least in part in a com...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9461433,2016-10-04,1. A light-emitting device comprising: a base;...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9229528,2016-01-05,"1. An input apparatus, comprising: a plurality...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8508147,2013-08-13,"1. A dimmer circuit, comprising: a bleeder as ...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


3. Creating is_green_silver

In [10]:
#Finding any columns starting with Y02
y02_cols = [c for c in df.columns if str(c).startswith("Y02")]
print("Y02 columns:", y02_cols[:20], " ... total:", len(y02_cols))


Y02 columns: ['Y02A', 'Y02B', 'Y02C', 'Y02D', 'Y02E', 'Y02P', 'Y02T', 'Y02W']  ... total: 8


In [11]:
#CREATING THE LABEL
if len(y02_cols) > 0:
    df["is_green_silver"] = (df[y02_cols].sum(axis=1) > 0).astype(int)
else:
    raise ValueError("No Y02* columns found. We need to inspect how green tech is encoded in this dataset.")


In [12]:
#Checking balance
print(df["is_green_silver"].value_counts())


is_green_silver
0    1379456
1     112838
Name: count, dtype: int64


4. Creating the balanced 50k sample (25k green + 25k not green)

In [13]:
SEED = 42

green = df[df["is_green_silver"] == 1].sample(25000, random_state=SEED)
nong  = df[df["is_green_silver"] == 0].sample(25000, random_state=SEED)

balanced = pd.concat([green, nong], ignore_index=True).sample(frac=1, random_state=SEED).reset_index(drop=True)

balanced["is_green_silver"].value_counts()


is_green_silver
0    25000
1    25000
Name: count, dtype: int64

5. Creating splits and save patents_50k_green.parquet

In [14]:
train = balanced.sample(frac=0.70, random_state=SEED)
rest = balanced.drop(train.index)

eval_ = rest.sample(frac=0.50, random_state=SEED)   # half of remaining = 15%
pool  = rest.drop(eval_.index)                      # remaining = 15%

train = train.copy(); eval_ = eval_.copy(); pool = pool.copy()
train["split"] = "train_silver"
eval_["split"]  = "eval_silver"
pool["split"]   = "pool_unlabeled"

final_df = pd.concat([train, pool, eval_], ignore_index=True)

final_df.to_parquet("patents_50k_green.parquet", index=False)
final_df["split"].value_counts()


split
train_silver      35000
pool_unlabeled     7500
eval_silver        7500
Name: count, dtype: int64

In [87]:
# From this point on, always load from the working dataset
final_df = pd.read_parquet("patents_50k_green.parquet")
print("Reloaded working dataset:", final_df.shape)

Reloaded working dataset: (50000, 668)


6. Computing frozen PatentSBERTa embeddings for train+eval

In [15]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

MODEL_NAME = "AI-Growth-Lab/PatentSBERTa"
TEXT_COL = "text"
LABEL_COL = "is_green_silver"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
encoder = AutoModel.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
encoder.eval()

@torch.no_grad()
def embed_texts(texts, batch_size=32, max_length=256):
    vecs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        ).to(device)
        
        out = encoder(**enc).last_hidden_state  # (B,T,H)
        mask = enc["attention_mask"].unsqueeze(-1).float()
        pooled = (out * mask).sum(1) / mask.sum(1).clamp(min=1e-9)  # mean pooling
        
        vecs.append(pooled.cpu().numpy())
    return np.vstack(vecs)

train_df = final_df[final_df["split"] == "train_silver"].copy()
eval_df  = final_df[final_df["split"] == "eval_silver"].copy()

X_train = embed_texts(train_df[TEXT_COL].astype(str).tolist())
y_train = train_df[LABEL_COL].astype(int).values

X_eval  = embed_texts(eval_df[TEXT_COL].astype(str).tolist())
y_eval  = eval_df[LABEL_COL].astype(int).values

print(X_train.shape, X_eval.shape)


100%|██████████| 1094/1094 [1:06:01<00:00,  3.62s/it]
100%|██████████| 235/235 [12:07<00:00,  3.10s/it]

(35000, 768) (7500, 768)





In [None]:
import numpy as np

#Saving
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
np.save("X_eval.npy", X_eval)
np.save("y_eval.npy", y_eval)

print("Saved embeddings + labels ✅")


Saved embeddings + labels ✅


7. Training the baseline classifier with Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_recall_fscore_support

clf = LogisticRegression(max_iter=2000, n_jobs=-1)
clf.fit(X_train, y_train)

pred = clf.predict(X_eval)

print(classification_report(y_eval, pred, digits=4))

p, r, f1, _ = precision_recall_fscore_support(y_eval, pred, average="binary")
print(f"Precision={p:.4f}  Recall={r:.4f}  F1={f1:.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable 	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after p

              precision    recall  f1-score   support

           0     0.7788    0.7975    0.7881      3758
           1     0.7916    0.7726    0.7820      3742

    accuracy                         0.7851      7500
   macro avg     0.7852    0.7850    0.7850      7500
weighted avg     0.7852    0.7851    0.7850      7500

Precision=0.7916  Recall=0.7726  F1=0.7820


The baseline model using frozen PatentSBERTa embeddings and Logistic Regression achieved an F1-score of 0.782 on the evaluation set. The performance is significantly above random guessing but still imperfect, making it suitable for uncertainty sampling. The model produces meaningful probability estimates around the decision boundary (p ≈ 0.5), which are necessary to identify high-risk examples for human labeling.

### Part B: Identify High-Risk Examples (Uncertainty Sampling)

In [62]:
MAX_SEQ_LENGTH = 256

1. Loading prepared 50k parquet (the one with splits)

In [63]:
import pandas as pd

final_df = pd.read_parquet("patents_50k_green.parquet")
final_df["split"].value_counts()


split
train_silver      35000
pool_unlabeled     7500
eval_silver        7500
Name: count, dtype: int64

2. Selecting pool_unlabeled (this is what we score)

In [19]:
pool_df = final_df[final_df["split"] == "pool_unlabeled"].copy()
print("Pool size:", len(pool_df))
pool_df.head(2)


Pool size: 7500


Unnamed: 0,id,date,text,A01B,A01C,A01D,A01F,A01G,A01H,A01J,...,Y02D,Y02E,Y02P,Y02T,Y02W,Y04S,Y10S,Y10T,is_green_silver,split
35000,8879379,2014-11-04,1. A method to detect a phase connection of a ...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pool_unlabeled
35001,8398513,2013-03-19,1. A plate-link chain for a motor vehicle driv...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pool_unlabeled


3. Creating embeddings for the pool (frozen PatentSBERTa)

In [20]:
TEXT_COL = "text"          # patent text column


In [64]:
X_pool = embed_texts(
    pool_df[TEXT_COL].astype(str).tolist(),
    batch_size=48,
    max_length=MAX_SEQ_LENGTH
)
print("X_pool:", X_pool.shape)


100%|██████████| 157/157 [16:56<00:00,  6.47s/it]

X_pool: (7500, 768)





4. Computing p_green for each pool example

In [65]:
pool_df["p_green"] = clf.predict_proba(X_pool)[:, 1]
pool_df[["p_green", TEXT_COL]].head(5)


Unnamed: 0,p_green,text
35000,0.688134,1. A method to detect a phase connection of a ...
35001,0.442564,1. A plate-link chain for a motor vehicle driv...
35002,0.288172,1. A composition for removing or reducing micr...
35003,0.530667,1. A monoclonal or polyclonal antibody selecte...
35004,0.197066,1. A method in a mobile device for determining...


5. Computing uncertainty score u (formula: u=1−2⋅∣p−0.5∣)

In [66]:
pool_df["u"] = 1 - 2 * np.abs(pool_df["p_green"] - 0.5)

# sanity check: u should be between 0 and 1
print(pool_df["u"].min(), pool_df["u"].max())


0.0022613876175343606 0.9998479878680508


6. Selecting top 100 highest-u rows (most uncertain)

In [24]:
hitl_df = pool_df.sort_values("u", ascending=False).head(100).copy()
hitl_df[["p_green", "u"]].head(10)


Unnamed: 0,p_green,u
41661,0.499996,0.999993
40522,0.500049,0.999902
36576,0.500073,0.999854
35842,0.499804,0.999609
38080,0.500281,0.999439
41968,0.499713,0.999426
38900,0.500313,0.999375
41232,0.500313,0.999375
41838,0.500322,0.999357
38933,0.500349,0.999302


7. Adding the required empty HITL columns

In [25]:
hitl_df["llm_green_suggested"] = ""
hitl_df["llm_confidence"] = ""
hitl_df["llm_rationale"] = ""
hitl_df["is_green_human"] = ""
hitl_df["notes"] = ""


8. Exporting hitl_green_100.csv with the required columns

In [26]:
export_df = hitl_df.rename(columns={"id": "doc_id"})

export_cols = [
    "doc_id",
    TEXT_COL,
    "p_green",
    "u",
    "llm_green_suggested",
    "llm_confidence",
    "llm_rationale",
    "is_green_human",
    "notes"
]

export_df[export_cols].to_csv("hitl_green_100.csv", index=False)
print("Saved ✅ hitl_green_100.csv")


Saved ✅ hitl_green_100.csv


In [27]:
# Most uncertain should have p_green close to 0.5
export_df[["p_green", "u"]].head(10)


Unnamed: 0,p_green,u
41661,0.499996,0.999993
40522,0.500049,0.999902
36576,0.500073,0.999854
35842,0.499804,0.999609
38080,0.500281,0.999439
41968,0.499713,0.999426
38900,0.500313,0.999375
41232,0.500313,0.999375
41838,0.500322,0.999357
38933,0.500349,0.999302


In [28]:
import joblib

joblib.dump(clf, "baseline_logreg.pkl")
print("Baseline model saved ✅")


Baseline model saved ✅


### PART C: Implement LLM → Human HITL (Gold Labels)

1. Loading hitl_green_100.csv and create a “labeling-only” file. This file will include ONLY doc_id and text + empty label columns.

In [None]:
hitl = pd.read_csv("hitl_green_100.csv")

# keep only claim text + required label columns (no p_green / u shown during labeling)
label_df = hitl[["doc_id", "text"]].copy()

label_df["llm_green_suggested"] = ""
label_df["llm_confidence"] = ""
label_df["llm_rationale"] = ""
label_df["is_green_human"] = ""   # to fill
label_df["notes"] = ""

label_df.to_csv("hitl_green_100_label_only.csv", index=False)
print("Saved: hitl_green_100_label_only.csv ✅")


Saved: hitl_green_100_label_only.csv ✅


2. Auto labeling with Ollama

In [33]:
import subprocess

prompt = "Reply ONLY with JSON: {\"test\": 1}"

result = subprocess.run(
    ["ollama", "run", "qwen3:latest"],
    input=prompt,
    capture_output=True,
    text=True,
)

print("RETURN CODE:", result.returncode)
print("STDOUT:", result.stdout[:300])
print("STDERR:", result.stderr[:300])


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


RETURN CODE: 0
STDOUT: Thinking...
Okay, the user wants me to reply only with a JSON object that has a key "test" and value 1. Let me make sure I understand the request correctly. They specified "Reply ONLY with JSON: {"test": 1}". So I need to output exactly that JSON without any additional text or explanations.

First, 
STDERR: [?2026h[?25l[1G⠙ [K[?25h[?2026l[?2026h[?25l[1G⠹ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠸ [K[?25h[?2026l[?2026h[?25l[1G⠴ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠦ [K[?25h[?2026l[?2026h[?25l[1G⠧ [K[?25h[?2026l[?2


In [34]:
import requests
requests.get("http://localhost:11434").status_code

200

In [35]:
import pandas as pd
import requests
import time
import numpy as np

INPUT_CSV = "hitl_green_100_label_only.csv"
OUTPUT_CSV = "hitl_green_100_with_llm.csv"
MODEL = "qwen3:latest"

SYSTEM = """You are labeling whether a patent CLAIM is GREEN technology.

RULES:
- Use ONLY the claim text provided. Do NOT use any metadata.
- Output MUST be valid JSON only.
- Fields:
  - llm_green_suggested: 0 or 1
  - llm_confidence: "low" or "medium" or "high"
  - llm_rationale: 1-3 sentences and must quote short phrases from the claim text.

Guidance:
GREEN = renewable energy, energy efficiency, storage, emissions reduction, carbon capture, recycling, waste/water treatment, clean transport.
NOT GREEN = general tech with no environmental purpose.
"""

def ollama_label(claim_text: str) -> dict:
    payload = {
        "model": MODEL,
        "prompt": SYSTEM + "\n\nClaim:\n" + claim_text + "\n\nReturn JSON now.",
        "stream": False,
        "format": "json",          # <-- forces JSON
        "options": {
            "temperature": 0.0     # <-- reduces “creative” output
        }
    }
    r = requests.post("http://localhost:11434/api/generate", json=payload, timeout=180)
    r.raise_for_status()
    data = r.json()

    # With format=json, response is JSON text, usually in data["response"]
    # Example: {"llm_green_suggested":1,"llm_confidence":"medium","llm_rationale":"..."}
    import json
    out = json.loads(data["response"])

    # Normalize + validate
    out["llm_green_suggested"] = int(out["llm_green_suggested"])
    out["llm_confidence"] = str(out["llm_confidence"]).strip().lower()
    if out["llm_confidence"] in ("med", "mid", "moderate"):
        out["llm_confidence"] = "medium"
    assert out["llm_green_suggested"] in (0, 1)
    assert out["llm_confidence"] in ("low", "medium", "high")
    out["llm_rationale"] = str(out["llm_rationale"]).strip()
    return out

df = pd.read_csv(INPUT_CSV)

# Ensure columns exist and start empty
for col in ["llm_green_suggested","llm_confidence","llm_rationale","is_green_human","notes"]:
    if col not in df.columns:
        df[col] = np.nan

df["llm_green_suggested"] = np.nan
df["llm_confidence"] = np.nan
df["llm_rationale"] = np.nan

for i, row in df.iterrows():
    claim = str(row["text"])
    print(f"Labeling {i+1}/{len(df)} doc_id={row['doc_id']}")

    try:
        out = ollama_label(claim)
        df.at[i, "llm_green_suggested"] = out["llm_green_suggested"]
        df.at[i, "llm_confidence"] = out["llm_confidence"]
        df.at[i, "llm_rationale"] = out["llm_rationale"]
    except Exception as e:
        df.at[i, "llm_rationale"] = f"ERROR: {e}"
        print("  ❌", e)

    if (i+1) % 5 == 0:
        df.to_csv(OUTPUT_CSV, index=False)
        print("  (checkpoint saved)")

df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved ✅ {OUTPUT_CSV}")

Labeling 1/100 doc_id=9349230


  df.at[i, "llm_confidence"] = out["llm_confidence"]
  df.at[i, "llm_rationale"] = out["llm_rationale"]


Labeling 2/100 doc_id=9098212
Labeling 3/100 doc_id=8679883
Labeling 4/100 doc_id=8928812
Labeling 5/100 doc_id=8975620
  (checkpoint saved)
Labeling 6/100 doc_id=8555926
Labeling 7/100 doc_id=8862535
Labeling 8/100 doc_id=9809521
Labeling 9/100 doc_id=8380927
Labeling 10/100 doc_id=9216467
  (checkpoint saved)
Labeling 11/100 doc_id=8685428
Labeling 12/100 doc_id=8944642
Labeling 13/100 doc_id=9370096
Labeling 14/100 doc_id=8788855
Labeling 15/100 doc_id=8378336
  (checkpoint saved)
Labeling 16/100 doc_id=9324203
Labeling 17/100 doc_id=9206114
Labeling 18/100 doc_id=9536210
Labeling 19/100 doc_id=8700276
Labeling 20/100 doc_id=9325463
  (checkpoint saved)
Labeling 21/100 doc_id=9150447
Labeling 22/100 doc_id=8895813
Labeling 23/100 doc_id=9501132
Labeling 24/100 doc_id=8936732
Labeling 25/100 doc_id=9086874
  (checkpoint saved)
Labeling 26/100 doc_id=8791307
Labeling 27/100 doc_id=8895142
Labeling 28/100 doc_id=9551951
Labeling 29/100 doc_id=8448150
Labeling 30/100 doc_id=8490722
  (c

In [55]:
import pandas as pd
import numpy as np

df = pd.read_csv("hitl_green_100_with_llm.csv")

filled = df["llm_green_suggested"].notna().sum()
print("Actually filled rows:", filled, "out of", len(df))

# show a few values to confirm
print(df["llm_green_suggested"].head(10))

Actually filled rows: 100 out of 100
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: llm_green_suggested, dtype: float64


In [50]:
# check info about row 27
row_27 = df.iloc[26]
print("Row 27 info:")
print(row_27)

Row 27 info:
doc_id                                                           8895142
text                   1. A carbon black having: a) a nitrogen BET su...
llm_green_suggested                                                  1.0
llm_confidence                                                      high
llm_rationale          The claim mentions carbon black with a nitroge...
is_green_human                                                       NaN
notes                                                                NaN
Name: 26, dtype: object


Compute the override ones.

In [46]:
import pandas as pd

gold = pd.read_csv("hitl_green_100_gold.csv", sep=";")

gold["llm_green_suggested"] = gold["llm_green_suggested"].astype(int)
gold["is_green_human"] = gold["is_green_human"].astype(int)

gold["override"] = gold["llm_green_suggested"] != gold["is_green_human"]

print("Overrides:", gold["override"].sum(), "out of", len(gold))


Overrides: 6 out of 100


### PART D: Final Model (Fine-Tune PatentSBERTa Once)

In [67]:
import pandas as pd

full_df = pd.read_parquet("patents_50k_green.parquet")
print(full_df.shape)
print(full_df.columns)

(50000, 668)
Index(['id', 'date', 'text', 'A01B', 'A01C', 'A01D', 'A01F', 'A01G', 'A01H',
       'A01J',
       ...
       'Y02D', 'Y02E', 'Y02P', 'Y02T', 'Y02W', 'Y04S', 'Y10S', 'Y10T',
       'is_green_silver', 'split'],
      dtype='object', length=668)


1. Merging 100 human labels and create is_green_gold. Loading labeled HITL file

In [69]:
hitl_gold = pd.read_csv("hitl_green_100_gold.csv", sep=";")  
print(hitl_gold.columns)
print(hitl_gold.shape)

Index(['doc_id', 'text', 'llm_green_suggested', 'llm_confidence',
       'llm_rationale', 'is_green_human', 'notes'],
      dtype='object')
(100, 7)


2. Creating gold label column in the full dataset

In [70]:
# --- make sure the key column matches ---
if "doc_id" in hitl_gold.columns and "id" not in hitl_gold.columns:
    hitl_gold = hitl_gold.rename(columns={"doc_id": "id"})

# keep only what we need
hitl_gold_small = hitl_gold[["id", "is_green_human"]].copy()

# merge into the full dataset
df = full_df.merge(hitl_gold_small, on="id", how="left")

# create is_green_gold = silver everywhere, overwritten by human on the 100
df["is_green_gold"] = df["is_green_silver"]
mask = df["is_green_human"].notna()
df.loc[mask, "is_green_gold"] = df.loc[mask, "is_green_human"].astype(int)

print("Rows with human labels merged in:", mask.sum())
print(df.loc[mask, ["id", "is_green_silver", "is_green_human", "is_green_gold"]].head(10))

Rows with human labels merged in: 100
            id  is_green_silver  is_green_human  is_green_gold
35124  8927615                1             0.0              0
35211  8956524                1             0.0              0
35323  9427802                0             0.0              0
35415  8673733                0             0.0              0
35459  8490722                1             0.0              0
35508  9698306                0             0.0              0
35547  9738704                1             0.0              0
35742  9367066                1             0.0              0
35757  8970930                0             0.0              0
35796  9206114                1             0.0              0


3. Building train/eval + gold_100 sets

In [71]:
# making sure labels are int
df["is_green_gold"] = df["is_green_gold"].astype(int)

# splits
train_df = df[df["split"] == "train_silver"].copy()
eval_df  = df[df["split"] == "eval_silver"].copy()

# the 100 gold rows (where we have human label)
gold_100_df = df[df["is_green_human"].notna()].copy()

print("train_df:", train_df.shape, "label mean:", train_df["is_green_gold"].mean())
print("eval_df:", eval_df.shape, "label mean:", eval_df["is_green_silver"].mean())  # eval uses silver ground truth
print("gold_100_df:", gold_100_df.shape, "label mean:", gold_100_df["is_green_gold"].mean())

train_df: (35000, 670) label mean: 0.5004
eval_df: (7500, 670) label mean: 0.49893333333333334
gold_100_df: (100, 670) label mean: 0.05


In [72]:
# how many human greens?
print("Human greens:", gold_100_df["is_green_gold"].sum(), "out of", len(gold_100_df))

# how many human overrides of SILVER?
if "is_green_silver" in gold_100_df.columns:
    overrides_silver = (gold_100_df["is_green_gold"] != gold_100_df["is_green_silver"]).sum()
    print("Overrides vs silver:", overrides_silver, "out of", len(gold_100_df))

# how many human overrides of LLM (if you have llm column in that file)
if "llm_green_suggested" in gold_100_df.columns:
    overrides_llm = (gold_100_df["is_green_gold"] != gold_100_df["llm_green_suggested"]).sum()
    print("Overrides vs LLM:", overrides_llm, "out of", len(gold_100_df))

Human greens: 5 out of 100
Overrides vs silver: 46 out of 100


4. Converting pandas dataframes to HuggingFace datasets

In [74]:
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

MODEL_NAME = "AI-Growth-Lab/PatentSBERTa"
MAX_SEQ_LENGTH = 256

# Train uses gold labels (this is the whole HITL idea)
train_hf = Dataset.from_pandas(
    train_df[["text", "is_green_gold"]].rename(columns={"is_green_gold": "label"}),
    preserve_index=False
)

# Eval must use SILVER labels (assignment requirement)
eval_hf = Dataset.from_pandas(
    eval_df[["text", "is_green_silver"]].rename(columns={"is_green_silver": "label"}),
    preserve_index=False
)

# Separate evaluation on human gold set
gold_hf = Dataset.from_pandas(
    gold_100_df[["text", "is_green_gold"]].rename(columns={"is_green_gold": "label"}),
    preserve_index=False
)

print(train_hf, eval_hf, gold_hf)

Dataset({
    features: ['text', 'label'],
    num_rows: 35000
}) Dataset({
    features: ['text', 'label'],
    num_rows: 7500
}) Dataset({
    features: ['text', 'label'],
    num_rows: 100
})


5. Tokeniing the text

In [75]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH
    )

train_hf = train_hf.map(tokenize_fn, batched=True, remove_columns=["text"])
eval_hf  = eval_hf.map(tokenize_fn, batched=True, remove_columns=["text"])
gold_hf  = gold_hf.map(tokenize_fn, batched=True, remove_columns=["text"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Tokenization done.")

Map: 100%|██████████| 35000/35000 [00:08<00:00, 4139.06 examples/s]
Map: 100%|██████████| 7500/7500 [00:01<00:00, 4668.81 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 4019.61 examples/s]

Tokenization done.





6. Loading the classification model

In [76]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at AI-Growth-Lab/PatentSBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


7. Training setup

In [78]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="patentsberta_green_ft",
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    do_train=True,
    do_eval=True,
    report_to=[]
)

8. Trainer + metrics

In [84]:
import numpy as np
from transformers import Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    # Works across HF versions
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    return {"accuracy": acc, "precision": p, "recall": r, "f1": f1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=eval_hf,          # this should be eval_silver
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [88]:
# small update