In [None]:
"""
Preprocessing of Software Requirements
Dataset source: https://data.mendeley.com/datasets/4ysx9fyzv4/1

Steps:
1. Read original dataset (Functional/Non-Functional requirements)
2. Remove missing entries and duplicate requirements
3. Clean, normalize and lemmatize text using spaCy
4. Drop rows whose cleaned text is empty
5. Split into stratified training and test sets
"""

import re
import sys
from pathlib import Path

import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Resolve the project root so that the top-level `config` module is importable.
# When `__file__` is undefined (NameError, e.g. in an interactive kernel) the
# parent of the current working directory is used instead.
try:
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    ROOT = Path.cwd().parent

# Register the root on the import path exactly once.
root_str = str(ROOT)
if root_str not in sys.path:
    sys.path.append(root_str)

from config import DATA_RAW, DATA_PROCESSED

In [17]:
# Load the small English spaCy pipeline. The named-entity recognizer and the
# dependency parser are switched off because the cleaning step below only
# reads lemmas and token flags.
DISABLED_PIPES = ["ner", "parser"]
nlp = spacy.load("en_core_web_sm", disable=DISABLED_PIPES)

In [18]:
# Load dataset
file_path = DATA_RAW / "FR_NFR_Dataset.xlsx"
if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")

# Read from the very path that was validated above, so the existence check
# and the read can never point at different files.
df = pd.read_excel(file_path, engine="openpyxl")

# Reassign instead of renaming in place so that re-running this cell always
# starts from a freshly loaded frame.
df = df.rename(columns={"Requirement Text": "requirement"})

print(f"Original dataset: {df.shape[0]} rows")

Original dataset: 6117 rows


In [10]:
# Normalize the label column name: a "Type" column, when present, becomes
# "label" (rename ignores keys that are not in the frame).
df = df.rename(columns={"Type": "label"})

# Both columns are needed by every later step; fail early if either is absent.
required_columns = {"requirement", "label"}
if not required_columns.issubset(df.columns):
    raise ValueError("Dataset must contain 'requirement' and 'label' columns")

In [11]:
# Drop rows lacking a requirement or a label, then keep only the first
# occurrence of each distinct requirement text.
df = (
    df
    .dropna(subset=["requirement", "label"])
    .drop_duplicates(subset=["requirement"])
)

In [None]:
# Text preprocessing with spaCy

# Patterns are compiled once rather than on every call; the expressions
# themselves are unchanged.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text_spacy(text: str) -> str:
    """
    Normalize a requirement sentence and return its lemmatized form.

    The text is lowercased, every character that is not an ASCII letter, a
    digit or whitespace is replaced by a space, and runs of whitespace are
    collapsed into a single space. The result is run through the module-level
    spaCy pipeline ``nlp``; tokens that are purely alphabetic and are not
    stopwords are kept and replaced by their lemma.

    Digits survive the regex step but are discarded afterwards, because only
    tokens with ``is_alpha`` set are kept.

    Args:
        text: Raw requirement text.

    Returns:
        The kept lemmas joined by single spaces. The string is empty when no
        token passes the filter.
    """
    text = text.lower()
    text = _NON_ALNUM_RE.sub(" ", text)  # keep letters and numbers
    text = _WHITESPACE_RE.sub(" ", text).strip()
    doc = nlp(text)
    return " ".join(
        token.lemma_ for token in doc if token.is_alpha and not token.is_stop
    )


# Register tqdm's `progress_apply` on pandas objects, then clean every
# requirement (cast to str first) while showing a progress bar.
tqdm.pandas(desc="Cleaning and lemmatizing text")
requirement_text = df["requirement"].astype(str)
df["clean_text"] = requirement_text.progress_apply(clean_text_spacy)

Cleaning and lemmatizing text: 100%|██████████| 5977/5977 [00:20<00:00, 285.22it/s]


In [13]:
# Keep only rows whose cleaned text still contains something after stripping
has_text = df["clean_text"].str.strip().ne("")
df = df[has_text]

# 80/20 train/test split with a fixed seed, stratified on the label column
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [14]:
# Save processed data
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (a no-op when it is already there).
output_dir = Path(DATA_PROCESSED)
output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(output_dir / "train.csv", index=False)
test_df.to_csv(output_dir / "test.csv", index=False)

# Summary
print(f"\nProcessed data saved to: {DATA_PROCESSED}")
print(f"Train set: {train_df.shape[0]} rows")
print(f"Test set:  {test_df.shape[0]} rows")
# Class counts are reported for the full filtered frame, before the split.
print("\nClass distribution:\n", df["label"].value_counts())


Processed data saved to: ../data/processed
Train set: 4781 rows
Test set:  1196 rows

Class distribution:
 label
FR     3931
NFR    2046
Name: count, dtype: int64
