In [None]:
"""
Feature Extraction for Software Requirements
Dataset: processed clean text from 02_Preprocessing.ipynb

Steps:
1. Load preprocessed train/test data
2. Extract TF-IDF features (bag-of-words com ngrams)
3. Extract BERT embeddings (DistilBERT)
4. Quick inspection of features
"""

import pickle
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoModel, AutoTokenizer

try:
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    ROOT = Path.cwd().parent

if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from config import DATA_PROCESSED, MODELS_DIR

In [3]:
# Load preprocessed data
train_df = pd.read_csv(DATA_PROCESSED / "train.csv")
test_df = pd.read_csv(DATA_PROCESSED / "test.csv")

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (4781, 3)
Test shape: (1196, 3)


In [4]:
# TF-IDF Feature Extraction
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  
    ngram_range=(1,2),  
)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['clean_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['clean_text'])

y_train = train_df['label']
y_test = test_df['label']

print(f"TF-IDF Train shape: {X_train_tfidf.shape}")
print(f"TF-IDF Test shape: {X_test_tfidf.shape}")
print("Sample TF-IDF features:", tfidf_vectorizer.get_feature_names_out()[:10])

TF-IDF Train shape: (4781, 5000)
TF-IDF Test shape: (1196, 5000)
Sample TF-IDF features: ['ability' 'ability create' 'ability modify' 'ability search'
 'ability specify' 'ability view' 'able' 'able accept' 'able access'
 'able acknowledge']


In [5]:
# BERT Embeddings (DistilBERT)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")


def get_bert_embeddings(text_list, batch_size=16):
    """
    Returns mean-pooled BERT embeddings for a list of texts.
    """
    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch_texts = text_list[i:i+batch_size]
        enc = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**enc)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        all_embeddings.append(embeddings)
    return torch.cat(all_embeddings)


X_train_bert = get_bert_embeddings(train_df['clean_text'].tolist())
X_test_bert = get_bert_embeddings(test_df['clean_text'].tolist())

print(f"BERT Train embeddings shape: {X_train_bert.shape}")
print(f"BERT Test embeddings shape: {X_test_bert.shape}")

BERT Train embeddings shape: torch.Size([4781, 768])
BERT Test embeddings shape: torch.Size([1196, 768])


In [6]:
# Quick inspection
print("\nExample TF-IDF vector (first training sample):")
print(X_train_tfidf[0].toarray())

print("\nExample BERT embedding (first training sample, first 10 values):")
print(X_train_bert[0].numpy()[:10])


Example TF-IDF vector (first training sample):
[[0. 0. 0. ... 0. 0. 0.]]

Example BERT embedding (first training sample, first 10 values):
[-0.04000298 -0.05843577  0.03065304  0.19102669  0.25054333 -0.16460642
  0.02225917  0.18777972  0.05804351 -0.17780061]


In [7]:
# Comparison: TF-IDF vs BERT (sample requirements)
sample_texts = train_df['clean_text'].tolist()[:5]
print("\nSample requirements:\n", sample_texts)

X_sample_tfidf = tfidf_vectorizer.transform(sample_texts)
X_sample_bert = get_bert_embeddings(sample_texts)

print("\nTF-IDF vectors (first sample):")
print(X_sample_tfidf[0].toarray())

print("\nBERT embedding (first sample, first 10 values):")
print(X_sample_bert[0].numpy()[:10])


Sample requirements:
 ['team member access maintain database server locally', 'system shall support multiple command line interface mode fastkramch bcturch muhturch madbtanam offer different functionality base user requirement', 'system shall able update user s location find mile radius', 'system shall deliver datum originator destination acceptable residual error rate ensure non corruption transfer datum', 'system shall assign default system datum value necessary']

TF-IDF vectors (first sample):
[[0. 0. 0. ... 0. 0. 0.]]

BERT embedding (first sample, first 10 values):
[-0.04000298 -0.05843577  0.03065304  0.19102669  0.25054333 -0.16460642
  0.02225917  0.18777972  0.05804351 -0.17780061]


In [None]:
# Save TF-IDF matrices
np.save(DATA_PROCESSED / "X_train_tfidf.npy", X_train_tfidf.toarray())
np.save(DATA_PROCESSED / "X_test_tfidf.npy", X_test_tfidf.toarray())

# Save BERT embeddings
np.save(DATA_PROCESSED / "X_train_bert.npy", X_train_bert.numpy())
np.save(DATA_PROCESSED / "X_test_bert.npy", X_test_bert.numpy())

print(f"\nSaved TF-IDF and BERT features to {DATA_PROCESSED}")

with open(MODELS_DIR / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf_vectorizer, f)



Saved TF-IDF and BERT features to /home/glaucia/RequirementsNLP/data/processed
