# Software Requirements Classification
## Linear Regression

In [37]:
import pandas as pd

df = pd.read_csv('software_requirements_extended.csv')
df.head()

Unnamed: 0,Type,Requirement
0,PE,The system shall refresh the display every 60 ...
1,LF,The application shall match the color of the s...
2,US,If projected the data must be readable. On ...
3,A,The product shall be available during normal ...
4,US,If projected the data must be understandable...


In [38]:
df['Type'] = df['Type'].apply(lambda x: 1 if x == 'FR' or x == 'F' else 0)

In [39]:
import re
import nltk  #natural language processing

nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nickbres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
pattern  = re.compile(r"[^a-zA-Z0-9]")
def req_to_words(req):
    ''' Convert tweet text into a sequence of words '''
    # convert to lower case
    text = req.lower()
    # remove non letters
    text = re.sub(pattern, " ", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report

### Model

In [42]:
y = df['Type'] # Target
X = [' '.join(req_to_words(req)) for req in df['Requirement']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [43]:
vectorizer2 = TfidfVectorizer(analyzer=req_to_words)
X_train = vectorizer2.fit_transform(X_train)
X_test = vectorizer2.transform(X_test)

In [44]:
model2 = LinearRegression()
model2.fit(X_train, y_train)

In [45]:
# 3. Prediction and Evaluation
y_pred = model2.predict(X_test)
y_pred = [1 if i > 0.5 else 0 for i in y_pred]
print(f'Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%')
print(classification_report(y_test, y_pred))

Accuracy: 78.52%
              precision    recall  f1-score   support

           0       0.81      0.68      0.74       176
           1       0.77      0.87      0.82       215

    accuracy                           0.79       391
   macro avg       0.79      0.78      0.78       391
weighted avg       0.79      0.79      0.78       391
