In [None]:
"""
Training and evaluating models for FR/NFR classification
"""

import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
) # Fine-tune BERT with HuggingFace

try:
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    ROOT = Path.cwd().parent

if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from config import DATA_PROCESSED, MODELS_DIR

In [None]:
# Load data
train_df = pd.read_csv(DATA_PROCESSED / "train.csv")
test_df = pd.read_csv(DATA_PROCESSED / "test.csv")

y_train = train_df["label"]
y_test = test_df["label"]

In [None]:
# Load precomputed features
X_train_tfidf = np.load(DATA_PROCESSED / "X_train_tfidf.npy")
X_test_tfidf = np.load(DATA_PROCESSED / "X_test_tfidf.npy")

X_train_bert = np.load(DATA_PROCESSED / "X_train_bert.npy") if (DATA_PROCESSED / "X_train_bert.npy").exists() else None
X_test_bert = np.load(DATA_PROCESSED / "X_test_bert.npy") if (DATA_PROCESSED / "X_test_bert.npy").exists() else None


In [None]:
# Train SVM
print("Training SVM...")
svm_clf = SVC(kernel="linear", probability=True)
svm_clf.fit(X_train_tfidf, y_train)

y_pred_svm = svm_clf.predict(X_test_tfidf)

print("Results (SVM - TF-IDF):")
print(classification_report(y_test, y_pred_svm))

joblib.dump(svm_clf, MODELS_DIR / "svm_tfidf.pkl")

Training SVM...
Results (SVM - TF-IDF):
              precision    recall  f1-score   support

          FR       0.89      0.93      0.91       787
         NFR       0.86      0.79      0.82       409

    accuracy                           0.88      1196
   macro avg       0.88      0.86      0.87      1196
weighted avg       0.88      0.88      0.88      1196



['/home/glaucia/RequirementsNLP/models/svm_tfidf.pkl']

In [None]:
# Train Logistic Regression
print("Training Logistic Regression (TF-IDF)...")
log_reg = LogisticRegression(max_iter=200, solver="liblinear")
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_lr))
joblib.dump(log_reg, MODELS_DIR / "log_reg_tfidf.pkl")

Training Logistic Regression (TF-IDF)...
              precision    recall  f1-score   support

          FR       0.87      0.95      0.90       787
         NFR       0.88      0.72      0.79       409

    accuracy                           0.87      1196
   macro avg       0.87      0.83      0.85      1196
weighted avg       0.87      0.87      0.87      1196



['/home/glaucia/RequirementsNLP/models/log_reg_tfidf.pkl']

In [None]:
# Train MLP with BERT embeddings
if X_train_bert is not None:
    print("Training MLP (BERT embeddings)...")
    mlp = MLPClassifier(hidden_layer_sizes=(256,), activation="relu", max_iter=30, random_state=42)
    mlp.fit(X_train_bert, y_train)
    y_pred_mlp = mlp.predict(X_test_bert)
    print(classification_report(y_test, y_pred_mlp))
    joblib.dump(mlp, MODELS_DIR / "mlp_bert.pkl")
else:
    print("BERT embeddings not found — skipping MLP model.")

Training MLP (BERT embeddings)...
              precision    recall  f1-score   support

          FR       0.86      0.93      0.89       787
         NFR       0.84      0.71      0.77       409

    accuracy                           0.86      1196
   macro avg       0.85      0.82      0.83      1196
weighted avg       0.85      0.86      0.85      1196





In [None]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Convert labels to integers for HF Trainer
label2id = {label: idx for idx, label in enumerate(train_df['label'].unique())}
train_df['label'] = train_df['label'].map(label2id)
test_df['label'] = test_df['label'].map(label2id)

train_dataset = Dataset.from_pandas(train_df[["clean_text", "label"]])
test_dataset = Dataset.from_pandas(test_df[["clean_text", "label"]])

def tokenize_function(examples):
    return tokenizer(examples["clean_text"], truncation=True, padding="max_length", max_length=128)

train_tokenized = train_dataset.map(tokenize_function, batched=True)
test_tokenized = test_dataset.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id)
)

training_args = TrainingArguments(
    output_dir="models/bert_finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("models/bert_finetuned")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 4781/4781 [00:00<00:00, 6058.03 examples/s]
Map: 100%|██████████| 1196/1196 [00:00<00:00, 4694.17 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.320217
2,0.377900,0.305366
3,0.377900,0.333691




In [8]:
print("Training completed successfully!")
print("Saved models:")
for f in MODELS_DIR.iterdir():
    print("-", f.name)

Training completed successfully!
Saved models:
- mlp_bert.pkl
- log_reg_tfidf.pkl
- svm_tfidf.pkl
