# Using LLMs for Classification

## Libraries Imported

In [64]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import torch

## Extract from CSV File

In [65]:
df = pd.read_csv("./data/PURE_test.csv")

In [66]:
df = df.drop(columns=["Unnamed: 0"])

In [67]:
df = df.rename(columns={'Requirement':'text', 'Name of Doc': 'source', 'Req/Not Req': 'label'})

In [68]:
df['y'] = np.where(df['label'] == 'Req', 1, 0)

In [69]:
df.head()

Unnamed: 0,text,source,label,y
0,System Initialization performs those functions...,nasa x38.doc,Req,1
1,"Whenever a power-on reset occurs, System Initi...",nasa x38.doc,Req,1
2,"As part of System Initialization , the Boot RO...",nasa x38.doc,Req,1
3,System Initialization shall [SRS014] initiate ...,nasa x38.doc,Req,1
4,System Initialization shall [SRS292] enable an...,nasa x38.doc,Req,1


In [70]:
PROMPT_TEMPLATE = """You are classifying software engineering sentences.

Choose the best label:

A) Req = the sentence describes a required system behavior, feature, or constraint 
   (e.g. uses "shall", "must", "the system will", or describes what the system or user can do).

B) Not_Req = the sentence is background info, assumptions, document structure, or explanation.
   It does NOT directly state a behavior or constraint the system must satisfy.

Examples:
Sentence: "The interfaces must be made customizable or user-configurable to the extent possible."
Answer: A

Sentence: "This document describes the main modules of the system."
Answer: B

Now classify this new sentence.

Sentence: "{sentence}"
Answer with exactly one letter: A or B.
Answer:
"""


In [71]:
model_name = "google/flan-t5-large"

In [72]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)

classifier = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [73]:
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

model.eval()

texts = df["text"].tolist()
labels = df["label"].tolist()
prompts = [PROMPT_TEMPLATE.format(sentence=x) for x in texts]

preds = []
truth = []

with torch.inference_mode():
    outputs = classifier(
        prompts,
        max_new_tokens=2,
        num_beams=3,
        do_sample=False,
        batch_size=16,
        truncation=True
    )

for out, y in zip(outputs, labels):
    raw = out["generated_text"]
    first = raw.strip().split()[0].upper()
    
    if first.startswith("A"):
        pred = "Req"
    elif first.startswith("B"):
        pred = "Not_Req"
    else:
        pred = "Not_Req" 

    preds.append(pred)
    # Normalize truth
    true = "Req" if str(y).lower().startswith("req") else "Not_Req"
    truth.append(true)

# Metrics
accuracy = accuracy_score(truth, preds)
precision = precision_score([t=="Req" for t in truth], [p=="Req" for p in preds])
recall = recall_score([t=="Req" for t in truth], [p=="Req" for p in preds])
f1 = f1_score([t=="Req" for t in truth], [p=="Req" for p in preds])

print("\nResults:") 
print("Accuracy:", accuracy) 
print("Precision:", precision) 
print("Recall:", recall) 
print("F1 Score:", f1)



Results:
Accuracy: 0.7861799217731421
Precision: 0.899343544857768
Recall: 0.776937618147448
F1 Score: 0.8336713995943205


In [74]:
BASE_EXAMPLES = """Examples:
Sentence: "The interfaces must be made customizable or user-configurable to the extent possible."
Answer: A

Sentence: "This document describes the main modules of the system."
Answer: B
"""

def build_meta_prompt(req, notreq):
    hard_examples_text = ""

    # misclassified true-Req → A
    for sent, _, _ in req[:3]:
        hard_examples_text += f'Sentence: "{sent}"; Answer: A\n'

    # misclassified true-Not_Req → B
    for sent, _, _ in notreq[:3]:
        hard_examples_text += f'Sentence: "{sent}"; Answer: B\n'

    prompt = f"""You are classifying software engineering sentences.

Choose the best label:

A) Req = the sentence describes a required system behavior, feature, or constraint 
   (e.g. uses "shall", "must", "the system will", or describes what the system or user can do).

B) Not_Req = the sentence is background info, assumptions, document structure, or explanation.
   It does NOT directly state a behavior or constraint the system must satisfy.

{BASE_EXAMPLES}

Examples the model got wrong previously:
{hard_examples_text}

Now classify this new sentence.

Sentence: "{{sentence}}"
Answer with exactly one letter: A or B.
Answer:
"""
    return prompt

In [75]:
prev_f1 = 0
improvement_threshold = 0.001   # stop if improvement < 0.1%

while True:
    # 1. run inference
    # 2. compute f1 score → f1

    if f1 - prev_f1 < improvement_threshold:
        break

    prev_f1 = f1

    req_inc = []
    notreq_inc = []
    
    for sent, true, pred in zip(df["text"], truth, preds):
        if true != pred and true == 'Req':
            req_inc.append((sent, true, pred))
        elif true != pred and true == 'Not_Req':
            notreq_inc.append((sent, true, pred))
    
        if len(req_inc) >= 5 or len(notreq_inc) >= 5:
            break
    
    
    print(f"Found {len(req_inc)} misclassified req examples to improve the prompt.")
    print(f"Found {len(notreq_inc)} misclassified not req examples to improve the prompt.")


    meta_prompt = build_meta_prompt(req_inc, notreq_inc)

    import torch
    from tqdm import tqdm
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
    
    model.eval()
    
    texts = df["text"].tolist()
    labels = df["label"].tolist()
    prompts = [meta_prompt.format(sentence=x) for x in texts]
    
    preds = []
    truth = []
    
    with torch.inference_mode():
        outputs = classifier(
            prompts,
            max_new_tokens=2,
            num_beams=3,
            do_sample=False,
            batch_size=16,
            truncation=True
        )
    
    for out, y in zip(outputs, labels):
        raw = out["generated_text"]
        first = raw.strip().split()[0].upper()
        
        if first.startswith("A"):
            pred = "Req"
        elif first.startswith("B"):
            pred = "Not_Req"
        else:
            pred = "Not_Req" 
            
        preds.append(pred)
        # Normalize truth
        true = "Req" if str(y).lower().startswith("req") else "Not_Req"
        truth.append(true)
    
    # Metrics
    accuracy = accuracy_score(truth, preds)
    precision = precision_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    recall = recall_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    f1 = f1_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    
    print("\nResults:") 
    print("Accuracy:", accuracy) 
    print("Precision:", precision) 
    print("Recall:", recall) 
    print("F1 Score:", f1)


Found 5 misclassified req examples to improve the prompt.
Found 0 misclassified not req examples to improve the prompt.

Results:
Accuracy: 0.7907431551499348
Precision: 0.8992416034669556
Recall: 0.7844990548204159
F1 Score: 0.8379606259464917
Found 5 misclassified req examples to improve the prompt.
Found 0 misclassified not req examples to improve the prompt.

Results:
Accuracy: 0.7900912646675359
Precision: 0.9
Recall: 0.782608695652174
F1 Score: 0.8372093023255814
