# Using LLMs for Classification

## Libraries Imported

In [28]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

## Extract from CSV File

In [29]:
# Load the PURE test split; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/PURE_test.csv")

In [30]:
# Drop the index column that was serialised into the CSV.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [31]:
# Shorten the raw CSV headers to the names the later cells read ('text', 'label').
df = df.rename(columns={'Requirement':'text', 'Name of Doc': 'source', 'Req/Not Req': 'label'})

In [32]:
# Binary target: 1 where the label is exactly 'Req', 0 for everything else.
is_requirement = df['label'] == 'Req'
df['y'] = np.where(is_requirement, 1, 0)

In [33]:
# Preview the first rows to confirm the renamed columns and the derived `y` column.
df.head()

Unnamed: 0,text,source,label,y
0,System Initialization performs those functions...,nasa x38.doc,Req,1
1,"Whenever a power-on reset occurs, System Initi...",nasa x38.doc,Req,1
2,"As part of System Initialization , the Boot RO...",nasa x38.doc,Req,1
3,System Initialization shall [SRS014] initiate ...,nasa x38.doc,Req,1
4,System Initialization shall [SRS292] enable an...,nasa x38.doc,Req,1


In [None]:
# Few-shot instruction prompt for the classifier. `{sentence}` is the only
# placeholder and is filled per input with str.format.
PROMPT_TEMPLATE = """
You are a requirements demarcator for software and systems engineering documents. Classify a single sentence as either Requirement or Non-requirement.
1. Definition of Requirement
    A Requirement contains wording strongly associated with required system behavior or capabilities. It typically includes directive phrases such as:
    the system, shall be, should be, must be, will be (referring to the system), system provides, ability to, be able to, the user, can be, at least, shall provide, able to.
    A Requirement describes behavior, features, constraints, or capabilities that a system must provide.

2. Definition of Non-requirement
    A Non-requirement contains wording associated with explanations, descriptions, document text, assumptions, or information that is not specifying mandatory system behavior. It often includes phrases such as:
    this document, of the, it is, the same, the library, there is, the requirements, to be (as description), the project, in this, by the.
    A Non-requirement provides context, references, assumptions, or descriptions without specifying required system functionality.

3. Output format
    Only output Requirement or Non-requirement.

4. Examples
    Sentence: "The help should be accessible to the users both in the offline and online mode."
    Label: Requirement

    Sentence: "The interfaces must be made customizable or user-configurable to the extent possible."
    Label: Requirement

    Sentence: "It is also assumed that the reader has a general understanding of Library services and processes."
    Label: Non-requirement

    Sentence: "The table of permitted actions in supported file systems is in section 2.2."
    Label: Non-requirement
    
Classify the following sentence as Requirement or Non-requirement:
    Sentence: "{sentence}"
    Label:
"""

In [36]:
# Checkpoint identifier passed to both the tokenizer and the model loader below.
model_name = "google/flan-t5-small"

In [37]:
# Load the seq2seq checkpoint and its matching tokenizer from the same model id.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap both in a text2text-generation pipeline; calling it with a prompt
# yields a list of dicts carrying the 'generated_text'.
classifier = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
)

Device set to use mps:0


In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


def _label_from_generation(generated_text):
    """Map raw model output to the dataset's label vocabulary.

    Only the first line of the stripped output is inspected. It is labelled
    'Req' when it starts with "requirement" (case-insensitive); anything else,
    including "non-requirement" and an empty generation, becomes 'Not_Req'.

    Args:
        generated_text: The 'generated_text' string returned by the pipeline.

    Returns:
        'Req' or 'Not_Req'.
    """
    stripped = generated_text.strip()
    # Guard the empty case: "".splitlines() is [], so indexing [0] would raise IndexError.
    first_line = stripped.splitlines()[0].strip().lower() if stripped else ""
    return 'Req' if first_line.startswith("requirement") else 'Not_Req'


preds = []
truth = []

# Iterate the two needed columns directly instead of materialising a Series per row.
for sentence, label in tqdm(zip(df['text'], df['label']), total=len(df), desc="Classifying"):
    prompt = PROMPT_TEMPLATE.format(sentence=sentence)
    output = classifier(prompt)

    # A text2text pipeline returns only the generated continuation, so there is
    # no echoed prompt to strip before parsing.
    preds.append(_label_from_generation(output[0]['generated_text']))
    truth.append(label)

# Convert to binary for metrics
binary_preds = [1 if p == 'Req' else 0 for p in preds]
binary_truth = [1 if t == 'Req' else 0 for t in truth]

# Metrics
# All metrics use the same 0/1 encoding so accuracy cannot silently diverge
# from the others if the negative label is spelled differently in the data.
accuracy = accuracy_score(binary_truth, binary_preds)
precision = precision_score(binary_truth, binary_preds)
recall = recall_score(binary_truth, binary_preds)
f1 = f1_score(binary_truth, binary_preds)
# NOTE: computed from hard 0/1 predictions rather than scores, so this value is
# the balanced accuracy ((TPR + TNR) / 2), not a threshold-free ranking metric.
auc = roc_auc_score(binary_truth, binary_preds)

print("\nResults:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC:", auc)


Classifying:   0%|          | 0/1534 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Classifying: 100%|██████████| 1534/1534 [06:33<00:00,  3.90it/s]


Results:
Accuracy: 0.3213820078226858
Precision: 0.7575757575757576
Recall: 0.023629489603024575
F1 Score: 0.045829514207149404
AUC: 0.5034113834569744





In [40]:
# ---- Collect misclassified examples ---- #
# Keep each (sentence, true label, predicted label) triple where the labels
# disagree, then cap at the first ten so the meta-prompt stays short.
misclassified = [
    (sentence, gold, guess)
    for sentence, gold, guess in zip(df["text"], truth, preds)
    if gold != guess
][:10]

print(f"\nFound {len(misclassified)} misclassified examples to improve the prompt.")



Found 10 misclassified examples to improve the prompt.


In [41]:
def build_meta_prompt(current_prompt, mistakes):
    """Assemble a prompt asking a model to rewrite the classifier prompt.

    The result has three parts: a header that embeds ``current_prompt`` between
    BEGIN/END markers, one block per misclassified example, and a fixed task
    description.

    Args:
        current_prompt: Text of the prompt currently used for classification.
        mistakes: Iterable of (sentence, true_label, predicted_label) triples.

    Returns:
        The assembled meta-prompt as a single string.
    """
    header = f"""
You are helping to design a better prompt for a classifier that distinguishes
"Requirement" vs "Non-requirement" in software specification sentences.

Here is the CURRENT prompt:

<<<BEGIN CURRENT PROMPT>>>
{current_prompt}
<<<END CURRENT PROMPT>>>

Below are example sentences that the classifier FAILED on. Each shows the sentence,
its true label, and how the model incorrectly predicted:

"""
    # One block per mistake, each framed by a leading and trailing newline.
    example_blocks = [
        f'\nSentence: "{sentence}"\nTrue label: {gold}\nModel predicted: {guess}\n'
        for sentence, gold, guess in mistakes
    ]
    footer = """
TASK:
Using the mistakes and the current instructions, rewrite the classifier prompt so that:
- Definitions are clearer
- The difference between obligation & description is emphasized
- Examples better illustrate edge cases
- The output format stays EXACTLY one of: Requirement or Non-requirement
- DO NOT return analysis. Return ONLY the new improved full prompt.

Return ONLY the improved prompt, nothing else.
"""
    return "".join([header, *example_blocks, footer])


In [42]:
meta_prompt = build_meta_prompt(PROMPT_TEMPLATE, misclassified)

# Greedy decoding (do_sample=False) is already deterministic. `temperature` is a
# sampling-only flag; passing it here only triggers the "generation flags are not
# valid and may be ignored" warning, so it is omitted.
improved = classifier(meta_prompt, max_new_tokens=800, do_sample=False)[0]['generated_text']

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [43]:
# Remove any verbatim echo of the meta-prompt and trim surrounding whitespace.
improved_only = improved.replace(meta_prompt, "").strip()

# Show the result between banner lines; sep="\n" reproduces the line breaks
# that separate print calls would emit.
print(
    "\n----- IMPROVED PROMPT -----\n",
    improved_only,
    "\n---------------------------\n",
    sep="\n",
)


----- IMPROVED PROMPT -----

Requirement

---------------------------

