# Prompt Generation

This Python notebook reads `PURE_train.csv` and extracts important linguistic features, which are then used to build a prompt for the LLM.

## Libraries Imported

In [279]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from collections import defaultdict

## Data Extraction and Cleaning

### Extract from CSV File

In [280]:
# Load the raw training data from the CSV file under ./data.
df = pd.read_csv(filepath_or_buffer="./data/PURE_train.csv")

### Drop Unused Columns

In [281]:
# Remove the "Unnamed: 0" column, which is not used by the rest of the notebook.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

### Renaming Columns

In [282]:
# Give the three columns used downstream shorter names.
df = df.rename(
    columns={
        "Requirement": "text",
        "Name of Doc": "source",
        "Req/Not Req": "label",
    }
)

In [283]:
# Binary target column: 1 where the label is exactly "Req", 0 for anything else.
is_requirement = df["label"] == "Req"
df["y"] = np.where(is_requirement, 1, 0)

In [284]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,text,source,label,y
0,The solution should provide detailed context-s...,cctns.pdf,Req,1
1,The help should be accessible to the users bot...,cctns.pdf,Req,1
2,The solution should provide an interface for t...,cctns.pdf,Req,1
3,"The solution should send alerts (e.g., email, ...",cctns.pdf,Req,1
4,The solution should enable the user to track t...,cctns.pdf,Req,1


## Extracting Linguistic Features for Prompt Engineering

### Count Vectorizer and Logistic Regression for Feature Importance

In [285]:
# Number of phrases to keep at each end of the coefficient ranking, per source.
n_top = 20

# Distinct values of the `source` column; one model is fitted per value.
sources = df["source"].unique()

In [286]:
# Per-source results: maps each source to a pair of lists
# (requirement_phrases, non_requirement_phrases), each holding
# (ngram, coefficient) tuples ordered from strongest to weakest indicator.
ling_words = {}

for src in sources:
    # analyse each source separately
    print("Source:", src)
    print("-"*20)
    subset = df[df["source"] == src]

    # a classifier can only be fitted when both classes are present
    counts = subset["y"].value_counts()
    n_req = counts.get(1, 0)
    n_non = counts.get(0, 0)

    if n_req == 0 or n_non == 0:
        print("Skipping. Not enough data for a meaningful analysis.")
        continue

    # count word n-grams of length 2 to 5 that pass the min_df=2 threshold
    vectorizer = CountVectorizer(
        ngram_range=(2, 5),
        min_df=2,
        lowercase=True
    )

    # n-gram count matrix and the matching binary labels
    X = vectorizer.fit_transform(subset['text'])
    y = subset['y']

    # fit a logistic regression; its coefficients are used as phrase weights
    lgrg = LogisticRegression()

    lgrg.fit(X, y)

    # one coefficient per n-gram feature
    coef = lgrg.coef_[0]

    # feature names, aligned with `coef`
    feature_names = vectorizer.get_feature_names_out()

    # rank the features once, in ascending order of coefficient
    order = np.argsort(coef)

    # requirement indicators: up to n_top phrases with the largest POSITIVE weight.
    # The sign check matters when a source has fewer than 2 * n_top features:
    # without it, negatively weighted phrases would leak into this list.
    req_values = []
    for i in order[::-1][:n_top]:
        if coef[i] <= 0:
            # descending order: every remaining weight is non-positive as well
            break
        req_values.append((feature_names[i], coef[i]))
        print(f"Word: {feature_names[i]}, Weight: {coef[i]}")

    print()

    # non-requirement indicators: up to n_top phrases with the most NEGATIVE weight
    nonreq_values = []
    for i in order[:n_top]:
        if coef[i] >= 0:
            # ascending order: every remaining weight is non-negative as well
            break
        nonreq_values.append((feature_names[i], coef[i]))
        print(f"Word: {feature_names[i]}, Weight: {coef[i]}")

    print()

    ling_words[src] = (req_values, nonreq_values)

Source: cctns.pdf
--------------------
Word: should be, Weight: 1.4157989627847516
Word: the system, Weight: 0.9880781162690828
Word: system should, Weight: 0.5586876624983365
Word: the system should, Weight: 0.5586876624983365
Word: the user, Weight: 0.5332939247621353
Word: the system must, Weight: 0.4260760082279166
Word: system must, Weight: 0.4260760082279166
Word: iso 9241, Weight: 0.41643844850799244
Word: in the, Weight: 0.37397087480479163
Word: to be, Weight: 0.31396944643602964
Word: to the, Weight: 0.27195057509459514
Word: use of, Weight: 0.25372232275782
Word: does not, Weight: 0.24451890650815353
Word: must be, Weight: 0.2394118480426658
Word: the same, Weight: 0.23352051299622414
Word: solution should, Weight: 0.22184496607396428
Word: to allow, Weight: 0.2059591329123451
Word: the solution should, Weight: 0.20326315681050233
Word: the solution, Weight: 0.20326315681050233
Word: should be provided, Weight: 0.198557563954228

Word: of the, Weight: -0.8658616679178002
Wor

In [287]:
# Display the per-source (requirement, non-requirement) phrase lists built in the loop above.
ling_words

{'cctns.pdf': ([('should be', np.float64(1.4157989627847516)),
   ('the system', np.float64(0.9880781162690828)),
   ('system should', np.float64(0.5586876624983365)),
   ('the system should', np.float64(0.5586876624983365)),
   ('the user', np.float64(0.5332939247621353)),
   ('the system must', np.float64(0.4260760082279166)),
   ('system must', np.float64(0.4260760082279166)),
   ('iso 9241', np.float64(0.41643844850799244)),
   ('in the', np.float64(0.37397087480479163)),
   ('to be', np.float64(0.31396944643602964)),
   ('to the', np.float64(0.27195057509459514)),
   ('use of', np.float64(0.25372232275782)),
   ('does not', np.float64(0.24451890650815353)),
   ('must be', np.float64(0.2394118480426658)),
   ('the same', np.float64(0.23352051299622414)),
   ('solution should', np.float64(0.22184496607396428)),
   ('to allow', np.float64(0.2059591329123451)),
   ('the solution should', np.float64(0.20326315681050233)),
   ('the solution', np.float64(0.20326315681050233)),
   ('shoul

In [288]:
# Cross-source totals: for every phrase, the sum of its weights and the number
# of sources in which it appeared, kept separately for the two phrase lists.
global_req_scores, global_req_counts = defaultdict(float), defaultdict(int)
global_non_scores, global_non_counts = defaultdict(float), defaultdict(int)

for req_pairs, non_pairs in ling_words.values():
    # pair each per-source list with the accumulators it feeds
    targets = (
        (req_pairs, global_req_scores, global_req_counts),
        (non_pairs, global_non_scores, global_non_counts),
    )
    for pairs, score_totals, source_counts in targets:
        for phrase, weight in pairs:
            score_totals[phrase] += weight
            source_counts[phrase] += 1


In [289]:
# Convert to sorted lists
req_phrases = sorted(
    global_req_scores.items(),
    key=lambda x: x[1],
    reverse=True
)

nonreq_phrases = sorted(
    global_non_scores.items(),
    key=lambda x: x[1]
)


In [290]:
# Keep only phrases recorded for at least two sources, as
# (phrase, total_score, source_count) tuples in the existing sort order.
req_common = [
    (phrase, total, global_req_counts[phrase])
    for phrase, total in req_phrases
    if global_req_counts[phrase] >= 2
]

nonreq_common = [
    (phrase, total, global_non_counts[phrase])
    for phrase, total in nonreq_phrases
    if global_non_counts[phrase] >= 2
]

In [291]:
def _print_indicators(title, rows, limit=30):
    """Print `title`, then one formatted line for each of the first `limit` rows."""
    print(title)
    for phrase, score, count in rows[:limit]:
        print(f"{phrase:30s} total_score={score:.3f} sources={count}")


_print_indicators("Global requirement indicators:", req_common)
_print_indicators("\nGlobal non requirement indicators:", nonreq_common)


Global requirement indicators:
the system                     total_score=9.526 sources=8
shall be                       total_score=8.911 sources=7
should be                      total_score=5.385 sources=7
system shall                   total_score=4.489 sources=4
the user                       total_score=4.446 sources=6
in the                         total_score=3.648 sources=7
will be                        total_score=3.435 sources=4
must be                        total_score=2.890 sources=5
can be                         total_score=2.618 sources=4
the application                total_score=2.534 sources=3
system provides                total_score=2.315 sources=2
ability to                     total_score=2.224 sources=2
the system shall               total_score=2.135 sources=2
at least                       total_score=2.097 sources=3
the following                  total_score=1.546 sources=2
to the                         total_score=1.309 sources=3
and the                  

In [292]:
def _indicator_frame(rows, kind):
    """Build a table from the first 30 (phrase, score, count) rows, tagged with `kind`."""
    frame = pd.DataFrame(rows[:30], columns=["Phrase", "Total Score", "Sources"])
    frame["Type"] = kind
    return frame


df_req = _indicator_frame(req_common, "Requirement")
df_non = _indicator_frame(nonreq_common, "Non-requirement")

# Stack both tables into one frame with a fresh 0..n-1 index.
df_results = pd.concat([df_req, df_non], ignore_index=True)

In [293]:
# Write the combined indicator table to disk without the row index.
df_results.to_csv(path_or_buf="linguistic_indicators.csv", index=False)