# Using LLMs for Classification

## Libraries Imported

In [172]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import torch

## Extract from CSV File

In [173]:
df = pd.read_csv("./data/PURE_test.csv")

In [174]:
df = df.drop(columns=["Unnamed: 0"])

In [175]:
df = df.rename(columns={'Requirement':'text', 'Name of Doc': 'source', 'Req/Not Req': 'label'})

In [176]:
df['y'] = np.where(df['label'] == 'Req', 1, 0)

In [177]:
df.head()

Unnamed: 0,text,source,label,y
0,System Initialization performs those functions...,nasa x38.doc,Req,1
1,"Whenever a power-on reset occurs, System Initi...",nasa x38.doc,Req,1
2,"As part of System Initialization , the Boot RO...",nasa x38.doc,Req,1
3,System Initialization shall [SRS014] initiate ...,nasa x38.doc,Req,1
4,System Initialization shall [SRS292] enable an...,nasa x38.doc,Req,1


In [178]:
PROMPT_TEMPLATE = """You are classifying software engineering sentences.

Choose the best label:

A) Req = the sentence describes a required system behavior, feature, or constraint 
   (e.g. uses "shall", "must", "the system will", or describes what the system or user can do).

B) Not_Req = the sentence is background info, assumptions, document structure, or explanation.
   It does NOT directly state a behavior or constraint the system must satisfy.

Examples:
Sentence: "The interfaces must be made customizable or user-configurable to the extent possible."
Answer: A

Sentence: "This document describes the main modules of the system."
Answer: B

Now classify this new sentence.

Sentence: "{sentence}"
Answer with exactly one letter: A or B.
Answer:
"""


In [179]:
INITIAL_PROMPT = """
Classify this sentence as Req or Not_Req. 
Requirement = a required system behavior, feature, or constraint (e.g. sentences with "shall", "must", "the system will", or describing what the system or user can do). 
Not_Req = background, document structure, assumptions, or explanations that are not required behavior. 

Examples: 
Sentence: "The interfaces must be made customizable or user-configurable to the extent possible." 
Label: Req 

Sentence: "This document describes the main modules of the system." 
Label: Not_Req 

Sentence: "{sentence}" 
Answer with exactly: Req or Not_Req. 
Label: 
"""

In [180]:
model_name = "google/flan-t5-large"

In [181]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)

classifier = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [182]:
model.eval()

texts = df["text"].tolist()
labels = df["label"].tolist()
prompts = [INITIAL_PROMPT.format(sentence=x) for x in texts]

preds = []
truth = []

with torch.inference_mode():
    outputs = classifier(
        prompts,
        max_new_tokens=2,
        num_beams=1,
        do_sample=False,
        batch_size=16,
        truncation=True
    )

for out, y in zip(outputs, labels):
    raw = out["generated_text"]
    raw_label = raw.strip().split()[0].strip().lower()
    
    if raw_label in {"req", "requirement"}:
        pred = "Req"
    elif raw_label in {"not_req", "not-req", "notreq", "non-requirement", "nonrequirement"}:
        pred = "Not_Req"
    else:
        pred = "Not_Req"

    preds.append(pred)
    true = "Req" if str(y).lower().startswith("req") else "Not_Req"
    truth.append(true)

# Metrics
accuracy = accuracy_score(truth, preds)
precision = precision_score([t=="Req" for t in truth], [p=="Req" for p in preds])
recall = recall_score([t=="Req" for t in truth], [p=="Req" for p in preds])
f1 = f1_score([t=="Req" for t in truth], [p=="Req" for p in preds])

print("\nResults:") 
print("Accuracy:", accuracy) 
print("Precision:", precision) 
print("Recall:", recall) 
print("F1 Score:", f1)



Results:
Accuracy: 0.7229465449804433
Precision: 0.9126466753585397
Recall: 0.6616257088846881
F1 Score: 0.7671232876712328


In [183]:
model.eval()

texts = df["text"].tolist()
labels = df["label"].tolist()
prompts = [PROMPT_TEMPLATE.format(sentence=x) for x in texts]

preds = []
truth = []

with torch.inference_mode():
    outputs = classifier(
        prompts,
        max_new_tokens=2,
        num_beams=1,
        do_sample=False,
        batch_size=16,
        truncation=True
    )

for out, y in zip(outputs, labels):
    raw = out["generated_text"]
    first = raw.strip().split()[0].upper()
    
    if first.startswith("A"):
        pred = "Req"
    elif first.startswith("B"):
        pred = "Not_Req"
    else:
        pred = "Not_Req" 

    preds.append(pred)
    true = "Req" if str(y).lower().startswith("req") else "Not_Req"
    truth.append(true)

# Metrics
accuracy = accuracy_score(truth, preds)
precision = precision_score([t=="Req" for t in truth], [p=="Req" for p in preds])
recall = recall_score([t=="Req" for t in truth], [p=="Req" for p in preds])
f1 = f1_score([t=="Req" for t in truth], [p=="Req" for p in preds])

print("\nResults:") 
print("Accuracy:", accuracy) 
print("Precision:", precision) 
print("Recall:", recall) 
print("F1 Score:", f1)



Results:
Accuracy: 0.7861799217731421
Precision: 0.899343544857768
Recall: 0.776937618147448
F1 Score: 0.8336713995943205


In [184]:
BASE_EXAMPLES = """Examples:
Sentence: "The interfaces must be made customizable or user-configurable to the extent possible."
Answer: A

Sentence: "This document describes the main modules of the system."
Answer: B
"""

def build_meta_prompt(req, notreq):
    hard_examples_text = ""

    for sent, _, _ in req[:3]:
        hard_examples_text += f'Sentence: "{sent}"; Answer: A\n'

    for sent, _, _ in notreq[:3]:
        hard_examples_text += f'Sentence: "{sent}"; Answer: B\n'

    prompt = f"""You are classifying software engineering sentences.

Choose the best label:

A) Req = the sentence describes a required system behavior, feature, or constraint 
   (e.g. uses "shall", "must", "the system will", or describes what the system or user can do).

B) Not_Req = the sentence is background info, assumptions, document structure, or explanation.
   It does NOT directly state a behavior or constraint the system must satisfy.

{BASE_EXAMPLES}

Examples the model got wrong previously:
{hard_examples_text}

Now classify this new sentence.

Sentence: "{{sentence}}"
Answer with exactly one letter: A or B.
Answer:
"""
    return prompt

In [185]:
prev_f1 = 0
improvement_threshold = 0.001  

while True:

    if f1 - prev_f1 < improvement_threshold:
        break

    prev_f1 = f1

    req_inc = []
    notreq_inc = []
    
    for sent, true, pred in zip(df["text"], truth, preds):
        if true != pred and true == 'Req':
            req_inc.append((sent, true, pred))
        elif true != pred and true == 'Not_Req':
            notreq_inc.append((sent, true, pred))
    
        if len(req_inc) >= 5 or len(notreq_inc) >= 5:
            break
    
    
    print(f"Found {len(req_inc)} misclassified req examples to improve the prompt.")
    print(f"Found {len(notreq_inc)} misclassified not req examples to improve the prompt.")


    meta_prompt = build_meta_prompt(req_inc, notreq_inc)
    model.eval()
    
    texts = df["text"].tolist()
    labels = df["label"].tolist()
    prompts = [meta_prompt.format(sentence=x) for x in texts]
    
    preds = []
    truth = []
    
    with torch.inference_mode():
        outputs = classifier(
            prompts,
            max_new_tokens=2,
            num_beams=3,
            do_sample=False,
            batch_size=16,
            truncation=True
        )
    
    for out, y in zip(outputs, labels):
        raw = out["generated_text"]
        first = raw.strip().split()[0].upper()
        
        if first.startswith("A"):
            pred = "Req"
        elif first.startswith("B"):
            pred = "Not_Req"
        else:
            pred = "Not_Req" 
            
        preds.append(pred)
        # Normalize truth
        true = "Req" if str(y).lower().startswith("req") else "Not_Req"
        truth.append(true)
    
    # Metrics
    accuracy = accuracy_score(truth, preds)
    precision = precision_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    recall = recall_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    f1 = f1_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    
    print("\nResults:") 
    print("Accuracy:", accuracy) 
    print("Precision:", precision) 
    print("Recall:", recall) 
    print("F1 Score:", f1)


Found 5 misclassified req examples to improve the prompt.
Found 0 misclassified not req examples to improve the prompt.

Results:
Accuracy: 0.7907431551499348
Precision: 0.8992416034669556
Recall: 0.7844990548204159
F1 Score: 0.8379606259464917
Found 5 misclassified req examples to improve the prompt.
Found 0 misclassified not req examples to improve the prompt.

Results:
Accuracy: 0.7900912646675359
Precision: 0.9
Recall: 0.782608695652174
F1 Score: 0.8372093023255814


In [186]:
texts = df["text"].tolist()
labels = df["label"].tolist()
prompts = [PROMPT_TEMPLATE.format(sentence=x) for x in texts]

temperatures = [0.1, 0.3, 0.7, 1.0]

print("Temperature Experiment:\n")

for temp in temperatures:
    print(f"Temp:{temp}")

    preds = []
    truth = []

    with torch.inference_mode():
        outputs = classifier(
            prompts,
            batch_size=16,
            truncation=True,
            do_sample=True,
            temperature=temp,
            max_new_tokens=2,
        )

    for out, y in zip(outputs, labels):
        raw = out["generated_text"]
        first = raw.strip().split()[0].upper()

        if first.startswith("A"):
            pred = "Req"
        else:
            pred = "Not_Req"

        preds.append(pred)
        truth.append("Req" if str(y).lower().startswith("req") else "Not_Req")

    # Metrics
    acc = accuracy_score(truth, preds)
    prec = precision_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    rec = recall_score([t=="Req" for t in truth], [p=="Req" for p in preds])
    f1 = f1_score([t=="Req" for t in truth], [p=="Req" for p in preds])

    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")


Temperature Experiment:

Temp:0.1
Accuracy:  0.786
Precision: 0.899
Recall:    0.777
F1 Score:  0.834
Temp:0.3
Accuracy:  0.786
Precision: 0.899
Recall:    0.777
F1 Score:  0.834
Temp:0.7
Accuracy:  0.786
Precision: 0.899
Recall:    0.777
F1 Score:  0.834
Temp:1.0
Accuracy:  0.786
Precision: 0.899
Recall:    0.777
F1 Score:  0.834


In [187]:
model_names = [
    "google/flan-t5-small",
    "google/flan-t5-base",
    "google/flan-t5-large",
]

texts = df["text"].tolist()
labels = df["label"].tolist()
prompts = [PROMPT_TEMPLATE.format(sentence=x) for x in texts]

results = []

for model_name in model_names:
    print(f"Model: {model_name}")

    # Load model + tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map="auto"
    )
    classifier = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

    preds = []
    truth = []

    model.eval()
    with torch.inference_mode():
        outputs = classifier(
            prompts,
            batch_size=16,
            truncation=True,
            max_new_tokens=2,
            num_beams=1,     # deterministic greedy decoding
            do_sample=False,
        )

    for out, y in zip(outputs, labels):
        raw = out["generated_text"]
        first = raw.strip().split()[0].upper()

        if first.startswith("A"):
            pred = "Req"
        else:
            pred = "Not_Req"

        preds.append(pred)
        truth.append("Req" if str(y).lower().startswith("req") else "Not_Req")

    # Metrics
    acc = accuracy_score(truth, preds)
    prec = precision_score([t == "Req" for t in truth], [p == "Req" for p in preds])
    rec = recall_score([t == "Req" for t in truth], [p == "Req" for p in preds])
    f1 = f1_score([t == "Req" for t in truth], [p == "Req" for p in preds])

    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall:    {rec:.3f}")
    print(f"F1 Score:  {f1:.3f}")

    results.append({
        "model": model_name,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    })

Model: google/flan-t5-small


Device set to use cuda:0


Accuracy:  0.696
Precision: 0.694
Recall:    1.000
F1 Score:  0.820
Model: google/flan-t5-base


Device set to use cuda:0


Accuracy:  0.720
Precision: 0.731
Recall:    0.940
F1 Score:  0.823
Model: google/flan-t5-large


Device set to use cuda:0


Accuracy:  0.786
Precision: 0.899
Recall:    0.777
F1 Score:  0.834


In [188]:
print("\nResults:")
for r in results:
    print(
        f"{r['model']}: "
        f"Acc={r['accuracy']:.3f}, "
        f"Prec={r['precision']:.3f}, "
        f"Rec={r['recall']:.3f}, "
        f"F1={r['f1']:.3f}"
    )


Results:
google/flan-t5-small: Acc=0.696, Prec=0.694, Rec=1.000, F1=0.820
google/flan-t5-base: Acc=0.720, Prec=0.731, Rec=0.940, F1=0.823
google/flan-t5-large: Acc=0.786, Prec=0.899, Rec=0.777, F1=0.834
