In [29]:
"""
INFORMATION RETRIEVAL SYSTEM - PHASE 1: DATA PREPROCESSING
============================================================
This is the foundation of our IR system. We'll load and preprocess the news articles.

Author: Your Name
Course: Information Retrieval
"""

import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict, Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download required NLTK data (run once)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
    
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class DocumentPreprocessor:
    """
    This class handles all text preprocessing tasks.
    
    Why do we need preprocessing?
    - Raw text contains noise (punctuation, special characters)
    - Common words like "the", "is" don't help in retrieval
    - Different forms of words (running, runs, ran) should be treated similarly
    """
    
    def __init__(self):
        # Initialize stopwords (common words to remove)
        self.stop_words = set(stopwords.words('english'))
        
        # Initialize stemmer (reduces words to their root form)
        # Example: "running" -> "run", "better" -> "better"
        self.stemmer = PorterStemmer()
        
        print("✓ Preprocessor initialized")
        print(f"✓ Loaded {len(self.stop_words)} stopwords")
    
    def clean_text(self, text):
        """
        Step 1: Clean the raw text
        - Convert to lowercase
        - Remove special characters
        - Remove extra whitespace
        """
        if not isinstance(text, str):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        
        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    def tokenize(self, text):
        """
        Step 2: Split text into individual words (tokens)
        Example: "I love IR" -> ["I", "love", "IR"]
        """
        return word_tokenize(text)
    
    def remove_stopwords(self, tokens):
        """
        Step 3: Remove common words that don't add meaning
        Example: ["the", "quick", "brown", "fox"] -> ["quick", "brown", "fox"]
        """
        return [word for word in tokens if word not in self.stop_words]
    
    def stem_tokens(self, tokens):
        """
        Step 4: Reduce words to their root form
        Example: ["running", "runs", "ran"] -> ["run", "run", "ran"]
        """
        return [self.stemmer.stem(word) for word in tokens]
    
    def preprocess(self, text):
        """
        Complete preprocessing pipeline
        Combines all steps: clean -> tokenize -> remove stopwords -> stem
        """
        # Step 1: Clean
        cleaned = self.clean_text(text)
        
        # Step 2: Tokenize
        tokens = self.tokenize(cleaned)
        
        # Step 3: Remove stopwords
        tokens = self.remove_stopwords(tokens)
        
        # Step 4: Stem
        tokens = self.stem_tokens(tokens)
        
        # Filter out very short tokens (less than 2 characters)
        tokens = [token for token in tokens if len(token) > 1]
        
        return tokens


class DocumentCollection:
    """
    This class manages the entire document collection.
    It loads, preprocesses, and stores all documents.
    """
    
    def __init__(self, csv_path):
        """
        Initialize and load the document collection
        
        Args:
            csv_path: Path to your CSV file containing news articles
        """
        self.preprocessor = DocumentPreprocessor()
        self.documents = []  # Stores original documents
        self.processed_docs = []  # Stores preprocessed documents
        self.doc_ids = []  # Document identifiers
        
        print("\n" + "="*60)
        print("LOADING DOCUMENT COLLECTION")
        print("="*60)
        
        # Load the CSV file
        self.load_documents(csv_path)
        
    def load_documents(self, csv_path):
        """
        Load documents from CSV file
        """
        try:
            # Read the CSV file
            df = pd.read_csv(csv_path)
            
            print(f"\n✓ Loaded {len(df)} documents from CSV")
            print(f"✓ Columns available: {df.columns.tolist()}")
            
            # Assume the dataset has 'title' and 'content' columns
            # Adjust these column names based on your actual dataset
            for idx, row in df.iterrows():
                # Combine title and content for better retrieval
                # You can modify this based on your dataset structure
                doc_text = ""
                
                if 'title' in df.columns:
                    doc_text += str(row['title']) + " "
                if 'content' in df.columns or 'text' in df.columns or 'article' in df.columns:
                    content_col = 'content' if 'content' in df.columns else ('text' if 'text' in df.columns else 'article')
                    doc_text += str(row[content_col])
                
                # Store original document
                self.documents.append({
                    'id': idx,
                    'text': doc_text,
                    'original': row.to_dict()
                })
                
                self.doc_ids.append(idx)
            
            print(f"\n✓ Stored {len(self.documents)} documents")
            
            # Preprocess all documents
            self.preprocess_collection()
            
        except FileNotFoundError:
            print(f"✗ Error: File not found at {csv_path}")
            print("Please make sure you've downloaded the dataset from Kaggle")
        except Exception as e:
            print(f"✗ Error loading documents: {e}")
    
    def preprocess_collection(self):
        """
        Preprocess all documents in the collection
        This is done once and stored for efficiency
        """
        print("\n" + "-"*60)
        print("PREPROCESSING DOCUMENTS")
        print("-"*60)
        
        for idx, doc in enumerate(self.documents):
            # Preprocess the document text
            processed = self.preprocessor.preprocess(doc['text'])
            
            self.processed_docs.append({
                'id': doc['id'],
                'tokens': processed,
                'token_count': len(processed)
            })
            
            # Show progress for every 100 documents
            if (idx + 1) % 100 == 0:
                print(f"Processed {idx + 1}/{len(self.documents)} documents...")
        
        print(f"\n✓ All {len(self.processed_docs)} documents preprocessed!")
        
        # Calculate and display statistics
        self.display_statistics()
    
    def display_statistics(self):
        """
        Display useful statistics about the collection
        """
        print("\n" + "="*60)
        print("COLLECTION STATISTICS")
        print("="*60)
        
        # Calculate average document length
        avg_length = np.mean([doc['token_count'] for doc in self.processed_docs])
        
        # Calculate vocabulary size (unique terms)
        vocabulary = set()
        for doc in self.processed_docs:
            vocabulary.update(doc['tokens'])
        
        print(f"Total Documents: {len(self.documents)}")
        print(f"Average Document Length: {avg_length:.2f} tokens")
        print(f"Vocabulary Size: {len(vocabulary)} unique terms")
        
        # Show example of preprocessing
        if len(self.documents) > 0:
            print("\n" + "-"*60)
            print("PREPROCESSING EXAMPLE")
            print("-"*60)
            example_text = self.documents[0]['text'][:200]
            example_tokens = self.processed_docs[0]['tokens'][:20]
            
            print(f"Original (first 200 chars):\n{example_text}...")
            print(f"\nAfter preprocessing (first 20 tokens):\n{example_tokens}")
    
    def get_document(self, doc_id):
        """Get original document by ID"""
        return self.documents[doc_id]
    
    def get_processed_document(self, doc_id):
        """Get preprocessed document by ID"""
        return self.processed_docs[doc_id]


# ============================================================
# USAGE EXAMPLE
# ============================================================

if __name__ == "__main__":
    """
    HOW TO USE THIS CODE:
    
    1. Download the dataset from Kaggle
    2. Place the CSV file in the same directory as this script
    3. Update the file path below
    4. Run this script
    """
    
    # Path to your CSV file
    CSV_FILE_PATH = r"C:\Users\lenovo\news_articles.csv"  
    
    print("\n" + "="*60)
    print("INFORMATION RETRIEVAL SYSTEM - PHASE 1")
    print("DATA LOADING AND PREPROCESSING")
    print("="*60)
    
    # Create the document collection
    collection = DocumentCollection(CSV_FILE_PATH)
    
    print("\n" + "="*60)
    print("PHASE 1 COMPLETE!")
    print("="*60)
    print("\nYou now have:")
    print("1. Loaded all news articles")
    print("2. Cleaned and preprocessed the text")
    print("3. Created tokens (individual words)")
    print("4. Removed stopwords and stemmed words")
    print("\nNext Phase: We'll implement the Boolean Retrieval Model")
    print("="*60)


INFORMATION RETRIEVAL SYSTEM - PHASE 1
DATA LOADING AND PREPROCESSING
✓ Preprocessor initialized
✓ Loaded 198 stopwords

LOADING DOCUMENT COLLECTION
✗ Error loading documents: 'utf-8' codec can't decode byte 0xb4 in position 1644: invalid start byte

PHASE 1 COMPLETE!

You now have:
1. Loaded all news articles
2. Cleaned and preprocessed the text
3. Created tokens (individual words)
4. Removed stopwords and stemmed words

Next Phase: We'll implement the Boolean Retrieval Model
