# Software Requirements Classification
## Logistic Regression

### Data Preprocessing

In [171]:
import pandas as pd

df = pd.read_csv('software_requirements_extended.csv')
df.head()

Unnamed: 0,Type,Requirement
0,PE,The system shall refresh the display every 60 ...
1,LF,The application shall match the color of the s...
2,US,If projected the data must be readable. On ...
3,A,The product shall be available during normal ...
4,US,If projected the data must be understandable...


In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Type         977 non-null    object
 1   Requirement  977 non-null    object
dtypes: object(2)
memory usage: 15.4+ KB


In [173]:
import plotly.express as px

# draw a pie chart of the different sentiments of the posts
fig = px.pie(df, names='Type', title ='Pie of types of requirements')
fig.show()

data set is small and not balanced to write a model to predict all the types of requirements. but its enough and balanced to predict if type is functional or non-functional

In [174]:
df['Type'] = df['Type'].apply(lambda x: 1 if x == 'FR' or x == 'F' else 0)
fig = px.pie(df, names='Type', title ='Pie of types of requirements')
fig.show()

In [175]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nickbres/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nickbres/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

text cleaning with lemmatezation or stemmer 

In [176]:
# Pre-load stopwords to improve efficiency
STOPWORDS = set(stopwords.words("english"))

# Expanded pattern to potentially keep useful punctuation or additional cleaning
pattern = re.compile(r"[^a-zA-Z]")

def req_to_words(req):
    """Convert requirement text into a sequence of words after lemmatization."""
    # Convert to lower case
    text = req.lower()
    # Remove unwanted characters
    text = re.sub(pattern, " ", text)
    # Tokenize
    words = text.split()
    # Remove stopwords
    words = [w for w in words if w not in STOPWORDS]
    # Apply lemmatization
    # words = [ WordNetLemmatizer().lemmatize(w) for w in words]
    # Apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    return words

# Example use
print("\nOriginal requirement ->", df['Requirement'][0])
print("\nProcessed requirement ->", req_to_words(df['Requirement'][0]))


Original requirement -> The system shall refresh the display every 60 seconds.

Processed requirement -> ['system', 'shall', 'refresh', 'display', 'everi', 'second']


In [177]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import cross_val_score

### Vectorizer

In [178]:
y = df['Type'] # Target
X = [' '.join(req_to_words(req)) for req in df['Requirement']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [179]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

### Model

In [180]:
# # looking for the best hyperparameters
# # Define the model
# model = LogisticRegression()
# 
# # Define a grid of hyperparameters to search
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],  # Values for C
#     'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Types of penalty
#     'solver': ['liblinear', 'lbfgs', 'saga'],  # Solvers
#     'max_iter': [100, 200, 300]  # Maximum iterations
# }
# 
# # Note: Not all solvers support all penalties. For example, 'liblinear' supports 'l1' and 'l2',
# # 'saga' supports 'elasticnet' along with 'l1' and 'l2'. You might need to adjust the grid based on compatibility.
# 
# # Setup the grid search
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
# 
# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)
# 
# # Print the best parameters and the best score
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

In [181]:
model = LogisticRegression(
    C=1,  # Regularization strength
    penalty='l2',  # Norm used in the penalization
    solver='liblinear',  # Optimization algorithm
    max_iter=100  # Maximum number of iterations for the optimization algorithm
)
pipe = make_pipeline(vectorizer, model)
scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
print("Accuracy for each fold: ", scores)
average_accuracy = np.mean(scores)
print("Average Cross-Validation Accuracy: {:.2f}%".format(average_accuracy * 100))
model.fit(X_train,y_train)

Accuracy for each fold:  [0.61734694 0.48979592 0.69230769 0.82564103 0.63589744]
Average Cross-Validation Accuracy: 65.22%


In [182]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Detailed performance report
print(classification_report(y_test, y_pred))

Accuracy: 85.17%
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       176
           1       0.86      0.87      0.87       215

    accuracy                           0.85       391
   macro avg       0.85      0.85      0.85       391
weighted avg       0.85      0.85      0.85       391


### Testing the model

In [183]:
def predict_req(req, expected, model, vectorizer):
    df = pd.DataFrame([req], columns=['Requirement'])
    X = [' '.join(req_to_words(req)) for req in df['Requirement']]
    X = vectorizer.transform(X)

    prediction = model.predict(X)
    predicted_label = 'Functional' if prediction[0] == 1 else 'Non-Functional'
    success = predicted_label == expected

    return predicted_label, success


In [184]:
def summarize_predictions(examples, model, vectorizer):
    results = []
    for req, expected_label in examples:
        predicted_label, success = predict_req(req, expected_label, model, vectorizer)
        results.append({
            'Requirement': req,
            'Expected Label': expected_label,
            'Predicted Label': predicted_label,
            'Success': success
        })
    return pd.DataFrame(results)

In [185]:
examples = [
    ("The system shall provide a login form that accepts a username and password.", 'Functional'),
    ("The application must ensure that response times are less than 2 seconds under normal load conditions.", 'Non-Functional'),
    ("All user passwords shall be encrypted before saving to the database.", 'Functional'),
    ("The software should be compatible with the Windows 10 operating system.", 'Non-Functional'),
    ("The interface shall refresh when new data is available.", 'Functional'),
    ("The product shall comply with international accessibility standards.", 'Non-Functional'),
    ("Database backup shall occur every 24 hours automatically.", 'Functional'),
    ("The system shall support a minimum of 500 concurrent user connections.", 'Functional'),
    ("The system’s mean time to failure shall be at least 10,000 hours.", 'Non-Functional'),
    ("Users must be able to complete the primary workflow in less than three minutes.", 'Non-Functional')
]


In [186]:
summary = summarize_predictions(examples, model, vectorizer)
summary

Unnamed: 0,Requirement,Expected Label,Predicted Label,Success
0,The system shall provide a login form that acc...,Functional,Functional,True
1,The application must ensure that response time...,Non-Functional,Non-Functional,True
2,All user passwords shall be encrypted before s...,Functional,Functional,True
3,The software should be compatible with the Win...,Non-Functional,Non-Functional,True
4,The interface shall refresh when new data is a...,Functional,Non-Functional,False
5,The product shall comply with international ac...,Non-Functional,Non-Functional,True
6,Database backup shall occur every 24 hours aut...,Functional,Non-Functional,False
7,The system shall support a minimum of 500 conc...,Functional,Non-Functional,False
8,The system’s mean time to failure shall be at ...,Non-Functional,Non-Functional,True
9,Users must be able to complete the primary wor...,Non-Functional,Non-Functional,True


In [187]:
fig = px.pie(summary, names='Success', title ='Success of the model with preprocessing')
fig.show()