In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
from collections import Counter
from nltk.stem import WordNetLemmatizer
from rake_nltk import Metric

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load dataset (Modify filename if necessary)
df = pd.read_csv("../Dataset/text_requirement.csv")

In [4]:
# Ensure correct column names
df.columns = ['Review_Text', 'Label']
df.head()

Unnamed: 0,Review_Text,Label
0,"The system can file, store and retrieve inform...",BWR
1,The system must allow a student to be tracked ...,BWR
2,"The system must support management of access, ...",BWR
3,"The system must associate CBT, WBT, and e-Lear...",BWR
4,The system must associate test and examination...,BWR


In [10]:
df['Review_Text'] = df['Review_Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
df['Review_Text'].head()

0    The system file, store retrieve information ac...
1    The system must allow student tracked UEID pro...
2    The system must support management access, vie...
3    The system must associate CBT, WBT, e-Learning...
4    The system must associate test examinations co...
Name: Review_Text, dtype: object

In [35]:
# Text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', text)  # Remove URLs
    text = re.sub(r'\b\w{1,2}\b', '', text)  # Remove short words
    text = re.sub(r'\b(cloud-based)\b', 'cloud_based', text)
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize and remove stopwords
    return ' '.join(words)  # Return cleaned text

# Apply text cleaning
df['Cleaned_Text'] = df['Review_Text'].apply(clean_text)

In [36]:
# Group text by category (Label)
category_texts = df.groupby('Label')['Cleaned_Text'].apply(lambda x: ' '.join(x)).to_dict()

In [37]:
# Initialize results dictionary
seed_words = {}
fixed_word_count = 30  # Ensure exactly 30 words per category

In [None]:
# Extract seed words using enhanced TF-IDF and optimized RAKE
word_weights = Counter()
category_word_scores = {}

for category, text in category_texts.items():
    # --- Enhanced TF-IDF Extraction ---
    vectorizer = TfidfVectorizer(
        max_features=50,          # Increased from 10 to capture more relevant terms
        ngram_range=(1, 2),       # Consider both single words and bigrams
        stop_words=list(stop_words),
        min_df=1,                 # Ignore terms appearing in <3 documents
        max_df=1.0                # Remove terms in >70% of documents
    )
    tfidf_matrix = vectorizer.fit_transform([text])
    tfidf_terms = vectorizer.get_feature_names_out()
    
    # --- Optimized RAKE Extraction ---
    rake = Rake(
        stopwords=stop_words,
        min_length=2,             # Minimum phrase length
        max_length=3,             # Maximum phrase length
        ranking_metric=Metric.WORD_FREQUENCY,  # Focus on term importance
        include_repeated_phrases=False  # Avoid duplicate phrases
    )
    rake.extract_keywords_from_text(text)
    rake_phrases = rake.get_ranked_phrases()[:8]  # Get more phrases for better coverage
    
    # Process RAKE phrases to handle multi-word terms
    rake_terms = []
    for phrase in rake_phrases:
        # Split phrases and lemmatize components
        processed_phrase = [lemmatizer.lemmatize(word) for word in phrase.split()]
        rake_terms.extend(processed_phrase)
    
    # Combine and filter terms
    combined_terms = list(set(tfidf_terms) | set(rake_terms))
    filtered_terms = [
        word for word in combined_terms 
        if word.isalpha() 
        and word not in stop_words
        and len(word) > 2  # Filter short words
    ]
    
    # Compute weighted frequencies
    word_frequencies = Counter(filtered_terms)
    word_weights.update(word_frequencies)
    category_word_scores[category] = word_frequencies
    
    # Dynamic word count adjustment
    category_word_count = max(min(len(filtered_terms), 15), 50)  # Flexible count
    if len(filtered_terms) < category_word_count:
        # Add contextually relevant filler words
        freq_words = [word for word, _ in Counter(text.split()).most_common(30)]
        additional_words = [w for w in freq_words 
                           if w not in filtered_terms 
                           and w not in stop_words][:category_word_count - len(filtered_terms)]
        filtered_terms.extend(additional_words)
    
    seed_words[category] = filtered_terms[:category_word_count]

# Enhanced cross-category deduplication
final_seed_words = {category: [] for category in seed_words}
word_category_map = {}

In [40]:
for category, words in seed_words.items():
    for word in words:
        if word in word_to_category:
            prev_category = word_to_category[word]
            
            # Only proceed if word exists in previous category's list
            if word in final_seed_words[prev_category]:
                current_score = category_word_scores[category].get(word, 0)
                prev_score = category_word_scores[prev_category].get(word, 0)
                
                if current_score > prev_score:
                    # Safely remove from previous category
                    final_seed_words[prev_category].remove(word)
                    final_seed_words[category].append(word)
                    word_to_category[word] = category
        else:
            final_seed_words[category].append(word)
            word_to_category[word] = category

In [41]:
# Display the final extracted seed words
for category, words in final_seed_words.items():
    print(f"Category: {category} -> Seed Words: {words}")

Category: BWR -> Seed Words: ['teacher', 'list', 'administrator', 'activity', 'defensive', 'available', 'shot', 'district', 'result', 'participant', 'meeting', 'process', 'material', 'staff', 'report', 'assessment', 'game', 'create', 'development', 'plan']
Category: COM -> Seed Words: ['version', 'compatible', 'internet', 'client', 'current', 'phone', 'database', 'format', 'component', 'party', 'software', 'standard', 'multiple']
Category: PE -> Seed Words: ['day', 'website', 'every', 'load', 'service', 'processing', 'maximum', 'returned', 'longer', 'lead', 'performance', 'take', 'let', 'less', 'connection', 'customer', 'application', 'movie', 'server', 'without', 'task']
Category: U -> Seed Words: ['easy', 'language', 'page', 'dispute', 'successfully', 'cardmember', 'used', 'one', 'help', 'problem']
Category: UIR -> Seed Words: ['shipping', 'profile', 'individual', 'table', 'order', 'include', 'lab', 'allows', 'different', 'case', 'detailed', 'option', 'displayed', 'screen', 'site']
C