In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
from collections import Counter
from nltk.stem import WordNetLemmatizer
from rake_nltk import Metric

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load dataset (Modify filename if necessary)
df_real = pd.read_csv("../datasets/PROMISE_exp.csv")
df_real.head()

Unnamed: 0,ProjectID,RequirementText,_class_,Unnamed: 3,Unnamed: 4,Requirement Type,Count
0,1,The system shall refresh the display every 60 ...,PE,,,Functional Requirement (F),444.0
1,1,The application shall match the color of the s...,LF,,,Availability (A),31.0
2,1,If projected the data must be readable. On a...,US,,,Legal (L),15.0
3,1,The product shall be available during normal b...,A,,,Look-and-feel (LF),49.0
4,1,If projected the data must be understandable....,US,,,Maintainability (MN),24.0


In [4]:
df = df_real[['RequirementText', '_class_']].copy()

df = df.reset_index(drop=True)
df.head()

Unnamed: 0,RequirementText,_class_
0,The system shall refresh the display every 60 ...,PE
1,The application shall match the color of the s...,LF
2,If projected the data must be readable. On a...,US
3,The product shall be available during normal b...,A
4,If projected the data must be understandable....,US


In [5]:
df['RequirementText'] = df['RequirementText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['RequirementText'].head()

0    The system shall refresh display every 60 seco...
1    The application shall match color schema set f...
2    If projected data must readable. On 10x10 proj...
3    The product shall available normal business ho...
4    If projected data must understandable. On 10x1...
Name: RequirementText, dtype: object

In [6]:
# Text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)  # Remove URLs
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = re.sub(r'\b(cloud-based)\b', 'cloud_based', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize and remove stopwords
    return ' '.join(words)  # Return cleaned text

# Apply text cleaning
df['cleaned_text'] = df['RequirementText'].apply(clean_text)
df.head()

Unnamed: 0,RequirementText,_class_,cleaned_text
0,The system shall refresh display every 60 seco...,PE,system shall refresh display every second
1,The application shall match color schema set f...,LF,application shall match color schema set forth...
2,If projected data must readable. On 10x10 proj...,US,projected data must readable projection screen...
3,The product shall available normal business ho...,A,product shall available normal business hour l...
4,If projected data must understandable. On 10x1...,US,projected data must understandable projection ...


In [7]:
# Save to CSV
output_filename = "../datasets/PROMISE_exp_cleaned.csv"  # You can change this path
df.to_csv(output_filename, index=False)  # index=False avoids saving row numbers
print(f"Cleaned data saved to {output_filename}")

Cleaned data saved to ../datasets/PROMISE_exp_cleaned.csv


In [8]:
# Group text by category (Label)
category_texts = df.groupby('_class_')['cleaned_text'].apply(lambda x: ' '.join(x)).to_dict()

In [9]:
# Initialize results dictionary
seed_words = {}
fixed_word_count = 30  # Ensure exactly 30 words per category

In [10]:
# Extract seed words using enhanced TF-IDF and optimized RAKE
word_weights = Counter()
category_word_scores = {}

for category, text in category_texts.items():
    # --- Enhanced TF-IDF Extraction ---
    vectorizer = TfidfVectorizer(
        max_features=50,          # Increased from 10 to capture more relevant terms
        ngram_range=(1, 2),       
        stop_words=list(stop_words),
        min_df=1,                 # Ignore terms appearing in <3 documents
        max_df=1.0                # Remove terms in >70% of documents
    )
    tfidf_matrix = vectorizer.fit_transform([text])
    tfidf_terms = vectorizer.get_feature_names_out()
    
    # --- Optimized RAKE Extraction ---
    rake = Rake(
        stopwords=stop_words,
        min_length=2,             # Minimum phrase length
        max_length=3,             # Maximum phrase length
        ranking_metric=Metric.WORD_FREQUENCY,  # Focus on term importance
        include_repeated_phrases=False  # Avoid duplicate phrases
    )
    rake.extract_keywords_from_text(text)
    rake_phrases = rake.get_ranked_phrases()[:8]  # Get more phrases for better coverage
    
    # Process RAKE phrases to handle multi-word terms
    rake_terms = []
    for phrase in rake_phrases:
        # Split phrases and lemmatize components
        processed_phrase = [lemmatizer.lemmatize(word) for word in phrase.split()]
        rake_terms.extend(processed_phrase)
    
    # Combine and filter terms
    combined_terms = list(set(tfidf_terms) | set(rake_terms))
    filtered_terms = [
        word for word in combined_terms 
        if word.isalpha() 
        and word not in stop_words
        and len(word) > 2  # Filter short words
    ]
    
    # Compute weighted frequencies
    word_frequencies = Counter(filtered_terms)
    word_weights.update(word_frequencies)
    category_word_scores[category] = word_frequencies
    
    # Dynamic word count adjustment
    category_word_count = max(min(len(filtered_terms), 15), 50)  # Flexible count
    if len(filtered_terms) < category_word_count:
        # Add contextually relevant filler words
        freq_words = [word for word, _ in Counter(text.split()).most_common(30)]
        additional_words = [w for w in freq_words 
                           if w not in filtered_terms 
                           and w not in stop_words][:category_word_count - len(filtered_terms)]
        filtered_terms.extend(additional_words)
    
    seed_words[category] = filtered_terms[:category_word_count]

# Enhanced cross-category deduplication
final_seed_words = {category: [] for category in seed_words}
word_category_map = {}

In [11]:
# Initialize word_to_category dictionary
word_to_category = {}

# Initialize final_seed_words with empty lists for each category
final_seed_words = {category: [] for category in seed_words}

for category, words in seed_words.items():
    for word in words:
        if word in word_to_category:
            prev_category = word_to_category[word]
            
            # Only proceed if word exists in previous category's list
            if word in final_seed_words[prev_category]:
                current_score = category_word_scores[category].get(word, 0)
                prev_score = category_word_scores[prev_category].get(word, 0)
                
                if current_score > prev_score:
                    # Safely remove from previous category
                    final_seed_words[prev_category].remove(word)
                    final_seed_words[category].append(word)
                    word_to_category[word] = category
        else:
            final_seed_words[category].append(word)
            word_to_category[word] = category

In [12]:
# Display the final extracted seed words
for category, words in final_seed_words.items():
    print(f"Category: {category} -> Seed Words: {words}")

Category: A -> Seed Words: ['year', 'long', 'system', 'achieve', 'wcs', 'internet', 'user', 'product', 'available', 'provide', 'per', 'support', 'technical', 'contractual', 'application', 'availability', 'time', 'customer', 'website', 'service', 'period', 'use', 'shall', 'must', 'access', 'hour', 'day', 'online', 'schedule', 'uptime']
Category: F -> Seed Words: ['game', 'member', 'able', 'class', 'case', 'player', 'administrator', 'list', 'nursing', 'include', 'part', 'data', 'site', 'lead', 'lab', 'search', 'information', 'meeting', 'program', 'allow', 'staff', 'display', 'dispute', 'student', 'view', 'section', 'clinical', 'request', 'cohort']
Category: FT -> Seed Words: ['dependent', 'result', 'failure', 'continue', 'fault', 'robust', 'preference', 'accommodate', 'tablet', 'loss', 'without', 'input', 'operate', 'tolerance', 'event', 'prevent', 'malicious', 'reliability', 'database', 'filesystems', 'crash', 'item', 'operational', 'go']
Category: L -> Seed Words: ['action', 'audit', '