In [1]:
!pip install numpy pandas scipy scikit-learn matplotlib seaborn nltk spacy 




[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\aida-lab\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import LabelEncoder
from scipy.stats import spearmanr, pearsonr
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings('ignore')

class TextFeatureSelector:
  
    
    def __init__(self, documents, labels):
        self.documents = documents
        self.labels = labels
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(labels)
        self.vectorizer = None
        self.tfidf_matrix = None
        self.feature_names = None
        
    def _prepare_tfidf_matrix(self, max_features=5000, ngram_range=(1, 2)):
        """Fit a TF-IDF vectorizer on the stored documents and cache the result.

        Sets ``self.vectorizer``, ``self.tfidf_matrix`` and
        ``self.feature_names``, then prints the matrix shape and feature count.

        Args:
            max_features: ``max_features`` value passed to the vectorizer.
            ngram_range: ``ngram_range`` value passed to the vectorizer.
        """
        vectorizer_options = {
            'max_features': max_features,
            'ngram_range': ngram_range,
            'stop_words': 'english',
            'lowercase': True,
            'strip_accents': 'unicode',
        }
        self.vectorizer = TfidfVectorizer(**vectorizer_options)

        fitted = self.vectorizer
        self.tfidf_matrix = fitted.fit_transform(self.documents)
        self.feature_names = fitted.get_feature_names_out()

        print(f"TF-IDF Matrix Shape: {self.tfidf_matrix.shape}")
        print(f"Number of features: {len(self.feature_names)}")
        
    def chi_squared_test(self, k=100):
        """Rank features by their chi-squared statistic against the labels.

        Builds the TF-IDF matrix on first use and prints the ten best features.

        Args:
            k: Number of top-ranked features to keep.

        Returns:
            dict with 'method', 'top_features' (list of (name, score)),
            'all_scores' (every (name, score, p_value), ranked) and
            'selected_feature_names'.
        """
        if self.tfidf_matrix is None:
            self._prepare_tfidf_matrix()

        statistics, significance = chi2(self.tfidf_matrix, self.encoded_labels)

        # Highest statistic first; the sort is stable, so ties keep
        # vocabulary order.
        ranked = sorted(
            zip(self.feature_names, statistics, significance),
            key=lambda entry: entry[1],
            reverse=True,
        )
        best = ranked[:k]

        print(f"\nChi-Squared Test Results (Top {k} features):")
        for i, (feature, score, p_val) in enumerate(best[:10]):
            print(f"{i+1:2d}. {feature:20s} | Score: {score:8.4f} | p-value: {p_val:.4e}")

        return {
            'method': 'Chi-Squared Test',
            'top_features': [(name, stat) for name, stat, _ in best],
            'all_scores': ranked,
            'selected_feature_names': [name for name, _, _ in best],
        }
    
    def information_gain(self, k=100):
        if self.tfidf_matrix is None:
            self._prepare_tfidf_matrix()
        
        def calculate_entropy(labels):
            """Calculate entropy of label distribution"""
            unique_labels, counts = np.unique(labels, return_counts=True)
            probabilities = counts / len(labels)
            return -np.sum(probabilities * np.log2(probabilities + 1e-10))
        
        def calculate_ig(feature_values, labels):
            """Calculate Information Gain for a feature"""
            total_entropy = calculate_entropy(labels)
            
            # Split data based on feature presence/absence
            feature_present = feature_values > 0
            feature_absent = feature_values == 0
            
            if np.sum(feature_present) == 0 or np.sum(feature_absent) == 0:
                return 0
            
            # Calculate weighted entropy
            p_present = np.sum(feature_present) / len(labels)
            p_absent = np.sum(feature_absent) / len(labels)
            
            entropy_present = calculate_entropy(labels[feature_present])
            entropy_absent = calculate_entropy(labels[feature_absent])
            
            weighted_entropy = p_present * entropy_present + p_absent * entropy_absent
            
            return total_entropy - weighted_entropy
        
        # Calculate Information Gain for each feature
        ig_scores = []
        for i in range(self.tfidf_matrix.shape[1]):
            feature_values = self.tfidf_matrix[:, i].toarray().flatten()
            ig_score = calculate_ig(feature_values, self.encoded_labels)
            ig_scores.append(ig_score)
        
        # Create feature ranking
        feature_scores = list(zip(self.feature_names, ig_scores))
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Select top k features
        top_features = feature_scores[:k]
        
        results = {
            'method': 'Information Gain',
            'top_features': top_features,
            'all_scores': feature_scores,
            'selected_feature_names': [feat for feat, _ in top_features]
        }
        
        print(f"\nInformation Gain Results (Top {k} features):")
        for i, (feature, score) in enumerate(top_features[:10]):
            print(f"{i+1:2d}. {feature:20s} | IG Score: {score:8.6f}")
        
        return results
    
    def spearman_correlation(self, k=100):
        if self.tfidf_matrix is None:
            self._prepare_tfidf_matrix()
        
        # Calculate Spearman correlation for each feature
        correlations = []
        p_values = []
        
        for i in range(self.tfidf_matrix.shape[1]):
            feature_values = self.tfidf_matrix[:, i].toarray().flatten()
            
            # Handle constant features
            if np.var(feature_values) == 0:
                correlations.append(0)
                p_values.append(1.0)
            else:
                corr, p_val = spearmanr(feature_values, self.encoded_labels)
                correlations.append(abs(corr) if not np.isnan(corr) else 0)
                p_values.append(p_val if not np.isnan(p_val) else 1.0)
        
        # Create feature ranking
        feature_scores = list(zip(self.feature_names, correlations, p_values))
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Select top k features
        top_features = feature_scores[:k]
        
        results = {
            'method': 'Spearman Correlation',
            'top_features': [(feat, corr) for feat, corr, _ in top_features],
            'all_scores': feature_scores,
            'selected_feature_names': [feat for feat, _, _ in top_features]
        }
        
        print(f"\nSpearman Correlation Results (Top {k} features):")
        for i, (feature, corr, p_val) in enumerate(top_features[:10]):
            print(f"{i+1:2d}. {feature:20s} | Correlation: {corr:8.6f} | p-value: {p_val:.4e}")
        
        return results
    
    def tfidf_scoring(self, k=100):
        if self.tfidf_matrix is None:
            self._prepare_tfidf_matrix()
        
        # Calculate mean TF-IDF scores
        mean_tfidf_scores = np.array(self.tfidf_matrix.mean(axis=0)).flatten()
        
        # Create feature ranking
        feature_scores = list(zip(self.feature_names, mean_tfidf_scores))
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Select top k features
        top_features = feature_scores[:k]
        
        results = {
            'method': 'TF-IDF Scoring',
            'top_features': top_features,
            'all_scores': feature_scores,
            'selected_feature_names': [feat for feat, _ in top_features]
        }
        
        print(f"\nTF-IDF Scoring Results (Top {k} features):")
        for i, (feature, score) in enumerate(top_features[:10]):
            print(f"{i+1:2d}. {feature:20s} | Mean TF-IDF: {score:8.6f}")
        
        return results
    
    def pearson_correlation(self, k=100):
        if self.tfidf_matrix is None:
            self._prepare_tfidf_matrix()
        
        # Calculate Pearson correlation for each feature
        correlations = []
        p_values = []
        
        for i in range(self.tfidf_matrix.shape[1]):
            feature_values = self.tfidf_matrix[:, i].toarray().flatten()
            
            # Handle constant features
            if np.var(feature_values) == 0:
                correlations.append(0)
                p_values.append(1.0)
            else:
                corr, p_val = pearsonr(feature_values, self.encoded_labels)
                correlations.append(abs(corr) if not np.isnan(corr) else 0)
                p_values.append(p_val if not np.isnan(p_val) else 1.0)
        
        # Create feature ranking
        feature_scores = list(zip(self.feature_names, correlations, p_values))
        feature_scores.sort(key=lambda x: x[1], reverse=True)
        
        # Select top k features
        top_features = feature_scores[:k]
        
        results = {
            'method': 'Pearson Correlation',
            'top_features': [(feat, corr) for feat, corr, _ in top_features],
            'all_scores': feature_scores,
            'selected_feature_names': [feat for feat, _, _ in top_features]
        }
        
        print(f"\nPearson Correlation Results (Top {k} features):")
        for i, (feature, corr, p_val) in enumerate(top_features[:10]):
            print(f"{i+1:2d}. {feature:20s} | Correlation: {corr:8.6f} | p-value: {p_val:.4e}")
        
        return results
    
    def compare_all_methods(self, k=100):
        """
        Compare all feature selection methods
        
        Args:
            k: Number of top features to select for each method
            
        Returns:
            Dictionary with results from all methods
        """
        print("=" * 80)
        print("COMPREHENSIVE FILTER-BASED FEATURE SELECTION COMPARISON")
        print("=" * 80)
        
        results = {}
        
        # Run all methods
        results['chi_squared'] = self.chi_squared_test(k)
        results['information_gain'] = self.information_gain(k)
        results['spearman_correlation'] = self.spearman_correlation(k)
        results['tfidf_scoring'] = self.tfidf_scoring(k)
        results['pearson_correlation'] = self.pearson_correlation(k)
        
        # Find common features across methods
        all_selected_features = []
        for method_results in results.values():
            all_selected_features.extend(method_results['selected_feature_names'])
        
        from collections import Counter
        feature_counts = Counter(all_selected_features)
        
        print(f"\n" + "=" * 80)
        print("CONSENSUS FEATURES (Selected by multiple methods):")
        print("=" * 80)
        
        consensus_features = [(feat, count) for feat, count in feature_counts.items() if count > 1]
        consensus_features.sort(key=lambda x: x[1], reverse=True)
        
        for i, (feature, count) in enumerate(consensus_features[:20]):
            print(f"{i+1:2d}. {feature:25s} | Selected by {count} methods")
        
        results['consensus_features'] = consensus_features
        
        return results
    
    def get_selected_features_matrix(self, selected_features):
        """
        Get TF-IDF matrix with only selected features
        
        Args:
            selected_features: List of feature names to include
            
        Returns:
            Reduced TF-IDF matrix and feature names
        """
        if self.tfidf_matrix is None:
            self._prepare_tfidf_matrix()
        
        # Find indices of selected features
        feature_indices = [i for i, name in enumerate(self.feature_names) 
                          if name in selected_features]
        
        # Create reduced matrix
        reduced_matrix = self.tfidf_matrix[:, feature_indices]
        reduced_feature_names = [self.feature_names[i] for i in feature_indices]
        
        print(f"Original matrix shape: {self.tfidf_matrix.shape}")
        print(f"Reduced matrix shape: {reduced_matrix.shape}")
        print(f"Dimensionality reduction: {(1 - reduced_matrix.shape[1]/self.tfidf_matrix.shape[1])*100:.1f}%")
        
        return reduced_matrix, reduced_feature_names


# Example usage and demonstration
def main():
    """
    Demonstrate the filter-based selectors on a small labelled review corpus.

    Runs every method with k=50, then, if any features were chosen by more
    than one method, builds the reduced matrix from the top 20 of them and
    prints the first ten reduced feature names.
    """
    # Fifteen short reviews, alternating positive / negative.
    sample_documents = [
        "This is a great product with excellent quality and fast delivery",
        "Poor service and terrible customer support, very disappointed",
        "Amazing features and user-friendly interface, highly recommended",
        "Waste of money, product broke after one day of use",
        "Outstanding performance and reliable functionality",
        "Horrible experience, would not recommend to anyone",
        "Excellent value for money and great customer service",
        "Defective item received, requesting immediate refund",
        "Perfect solution for my needs, very satisfied",
        "Complete disaster, poor quality and late delivery",
        "Superb product quality and exceptional customer care",
        "Terrible product, broke immediately after purchase",
        "Fantastic features and smooth user experience",
        "Awful service, unprofessional staff behavior",
        "Great investment, exceeded my expectations completely",
    ]
    sample_labels = [
        "positive", "negative", "positive",
        "negative", "positive", "negative",
        "positive", "negative", "positive",
        "negative", "positive", "negative",
        "positive", "negative", "positive",
    ]

    print("Filter-Based Feature Selection for Text Documents")
    print("=" * 60)
    print(f"Dataset: {len(sample_documents)} documents")
    print(f"Classes: {set(sample_labels)}")

    feature_selector = TextFeatureSelector(sample_documents, sample_labels)
    comparison = feature_selector.compare_all_methods(k=50)

    # Nothing more to show when no feature was picked by two or more methods.
    consensus = comparison['consensus_features']
    if not consensus:
        return

    top_consensus = [name for name, _ in consensus[:20]]
    _, reduced_features = feature_selector.get_selected_features_matrix(top_consensus)

    print(f"\nReduced feature set contains {len(reduced_features)} features:")
    print(", ".join(reduced_features[:10]) + "...")

# Entry point: run the demonstration only when this code is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()

Filter-Based Feature Selection for Text Documents
Dataset: 15 documents
Classes: {'positive', 'negative'}
COMPREHENSIVE FILTER-BASED FEATURE SELECTION COMPARISON
TF-IDF Matrix Shape: (15, 124)
Number of features: 124

Chi-Squared Test Results (Top 50 features):
 1. great                | Score:   0.6769 | p-value: 4.1064e-01
 2. terrible             | Score:   0.6748 | p-value: 4.1137e-01
 3. broke                | Score:   0.6741 | p-value: 4.1163e-01
 4. product broke        | Score:   0.6741 | p-value: 4.1163e-01
 5. poor                 | Score:   0.6306 | p-value: 4.2713e-01
 6. experience recommend | Score:   0.5242 | p-value: 4.6907e-01
 7. horrible             | Score:   0.5242 | p-value: 4.6907e-01
 8. horrible experience  | Score:   0.5242 | p-value: 4.6907e-01
 9. recommend            | Score:   0.5242 | p-value: 4.6907e-01
10. excellent            | Score:   0.4995 | p-value: 4.7973e-01

Information Gain Results (Top 50 features):
 1. great                | IG Score: 0.2128