# Document Discovery & Web Scraping Experiment

This notebook tests web scraping capabilities for TCS financial documents from screener.in and implements document classification logic.

## Objectives:
1. Test web scraping from screener.in/company/TCS/consolidated/#documents
2. Implement document classification (quarterly reports, earnings calls, etc.)
3. Validate document download and storage mechanisms
4. Test document metadata extraction

In [None]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
import logging

# Setup logging
# Configure the root logger to emit records at INFO level and above, then
# create the module-level logger used by the helper functions in later cells.
# NOTE(review): basicConfig may have no effect if the notebook environment has
# already attached handlers to the root logger — confirm INFO output appears.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Configuration
# Target page, local storage location, and retry/politeness settings shared
# by the helper functions defined in the cells below.
BASE_URL = "https://www.screener.in/company/TCS/consolidated/#documents"
DOWNLOAD_DIR = "data/raw_documents"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 1  # seconds

# Make sure the download directory exists (no error if it is already there)
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Request headers carrying a browser-like User-Agent; the literal is split
# across lines purely for readability and concatenates to a single string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

# Echo the effective settings so the notebook output records them
print(
    f"Download directory: {DOWNLOAD_DIR}",
    f"Target URL: {BASE_URL}",
    sep="\n",
)

In [None]:
def fetch_page_content(url, retries=MAX_RETRIES):
    """
    Download ``url`` and return the response body as text.

    Up to ``retries`` GET requests are made with the shared HEADERS and a
    30-second timeout. After a failed attempt the function waits
    DELAY_BETWEEN_REQUESTS multiplied by the attempt number before trying
    again; when the final attempt fails, its requests.RequestException is
    re-raised to the caller. None is returned only when ``retries`` is
    less than 1, in which case no request is made.
    """
    for attempt_number in range(1, retries + 1):
        try:
            reply = requests.get(url, headers=HEADERS, timeout=30)
            reply.raise_for_status()
            return reply.text
        except requests.RequestException as error:
            logger.warning(f"Attempt {attempt_number} failed: {error}")
            if attempt_number == retries:
                # Out of attempts: propagate the last failure.
                raise
            # Linear back-off: wait longer after each failed attempt.
            time.sleep(DELAY_BETWEEN_REQUESTS * attempt_number)
    return None

# Test page fetching
# Smoke-test fetch_page_content against BASE_URL. The success message is
# printed inside the try block on purpose: if the helper returned None,
# len() would raise and the failure branch would report it as well.
# On any failure page_content is set to None so later cells can detect the
# missing page and skip their work.
try:
    page_content = fetch_page_content(BASE_URL)
    print(f"✅ Successfully fetched page content ({len(page_content)} characters)")
except Exception as e:
    print(f"❌ Failed to fetch page: {e}")
    page_content = None

In [None]:
def extract_document_links(html_content):
    """
    Extract candidate document links from a screener.in page.

    Anchors whose ``href`` contains one of a few finance-related substrings
    are collected. An anchor can match several selectors (for example an
    href containing both "annual" and ".pdf"), so results are de-duplicated
    by resolved URL; the first occurrence is kept.

    Args:
        html_content: HTML markup of the page. Empty or None yields an
            empty list.

    Returns:
        A list of dicts, one per unique URL in discovery order, with keys:
        ``title`` (link text), ``url`` (absolute URL resolved against
        BASE_URL), ``type`` (result of classify_document_type on the link
        text) and ``discovered_at`` (ISO-format local timestamp).
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    # Look for document links (adapt selectors based on actual page structure)
    # This is a placeholder - actual selectors need to be determined by inspecting the page

    # Common patterns for financial document links
    link_patterns = [
        'a[href*="annual"]',
        'a[href*="quarterly"]',
        'a[href*=".pdf"]',
        'a[href*="result"]',
        'a[href*="earnings"]'
    ]

    documents = []
    seen_urls = set()  # resolved URLs already emitted

    for pattern in link_patterns:
        for link in soup.select(pattern):
            href = link.get('href')
            text = link.get_text(strip=True)

            # Skip anchors with no target or no visible text.
            if not (href and text):
                continue

            url = urljoin(BASE_URL, href)
            if url in seen_urls:
                # Same document already found via an earlier selector/anchor.
                continue
            seen_urls.add(url)

            documents.append({
                'title': text,
                'url': url,
                'type': classify_document_type(text),
                'discovered_at': datetime.now().isoformat()
            })

    return documents

def classify_document_type(title):
    """
    Map a document title to a coarse document category.

    The title is lower-cased and checked for keyword substrings, category
    by category in the order listed below; the first category with a
    matching keyword wins. Titles matching nothing are labelled 'other'.
    """
    # Ordered (category, keywords) rules — earlier entries take priority.
    keyword_rules = (
        ('annual_report', ('annual', 'yearly', '10-k')),
        ('quarterly_report', ('quarterly', 'q1', 'q2', 'q3', 'q4', '10-q')),
        ('earnings_call', ('earnings', 'call', 'transcript')),
        ('investor_presentation', ('presentation', 'investor')),
    )

    normalized = title.lower()
    for category, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in normalized:
                return category
    return 'other'

# Run link extraction only when the earlier fetch produced page content
if not page_content:
    print("⚠️ Cannot extract documents - no page content available")
else:
    documents = extract_document_links(page_content)
    print(f"Found {len(documents)} potential documents")

    if not documents:
        print("⚠️ No documents found - selectors may need adjustment")
    else:
        # Show the first few discovered documents as a table
        df = pd.DataFrame(documents)
        print("\nSample documents:")
        print(df.head())

In [None]:
def download_document(doc_info, download_dir):
    """
    Fetch one document and save it under ``download_dir``.

    ``doc_info`` must provide 'url' and 'type'. The file is named
    "<type>_<unix-timestamp>_<last URL path segment>", with '.pdf'
    appended when the name does not already end with it.

    Returns a dict: on success {'status': 'success', 'filepath', 'size_bytes'};
    on any error during download or writing {'status': 'failed', 'error'}
    (the error is logged rather than raised).
    """
    try:
        reply = requests.get(doc_info['url'], headers=HEADERS, timeout=60)
        reply.raise_for_status()

        # Build the local file name from the type, current time and remote name
        remote_name = os.path.basename(urlparse(doc_info['url']).path)
        filename = f"{doc_info['type']}_{int(time.time())}_{remote_name}"
        if not filename.endswith('.pdf'):
            filename = filename + '.pdf'

        filepath = os.path.join(download_dir, filename)
        with open(filepath, 'wb') as out_file:
            out_file.write(reply.content)

        logger.info(f"Downloaded: {filename}")
        outcome = {
            'status': 'success',
            'filepath': filepath,
            'size_bytes': len(reply.content)
        }
        return outcome

    except Exception as error:
        logger.error(f"Failed to download {doc_info['url']}: {error}")
        outcome = {
            'status': 'failed',
            'error': str(error)
        }
        return outcome

# Trial run of the downloader, restricted to the first 2 discovered documents
if 'documents' not in locals() or not documents:
    print("⚠️ No documents available for download testing")
else:
    test_documents = documents[:2]  # keep the trial small
    download_results = []

    for doc in test_documents:
        print(f"Attempting to download: {doc['title']}")
        download_results.append(download_document(doc, DOWNLOAD_DIR))
        time.sleep(DELAY_BETWEEN_REQUESTS)  # pause between requests to be polite

    # Summarise the outcomes
    success_count = len([r for r in download_results if r['status'] == 'success'])
    print(f"\n✅ Successfully downloaded {success_count}/{len(test_documents)} documents")

    # Results are in the same order as test_documents, so pair them up
    for doc, result in zip(test_documents, download_results):
        if result['status'] != 'success':
            print(f"  ❌ {doc['title']} -> {result['error']}")
        else:
            print(f"  📄 {doc['title']} -> {result['filepath']} ({result['size_bytes']} bytes)")

In [None]:
# Document validation and metadata extraction
def validate_downloaded_files(download_dir=None):
    """
    Validate downloaded files and extract basic metadata.

    Scans a directory for files whose name ends in '.pdf' and reports size
    and timestamp information for each. A file counts as a valid PDF only
    when it is larger than 1000 bytes AND the "%PDF-" signature appears in
    its first 1024 bytes. The signature check matters because the
    downloader saves any response body under a '.pdf' name, so an HTML
    error page would pass a size-only check.

    Args:
        download_dir: Directory to scan. Defaults to the module-level
            DOWNLOAD_DIR when omitted or None.

    Returns:
        A list of dicts (sorted by filename) with keys ``filename``,
        ``size_bytes``, ``size_mb`` (rounded to 2 decimals), ``created``
        (ISO-format timestamp) and ``is_valid_pdf`` (bool). Returns an
        empty list when the directory does not exist. Files that cannot be
        inspected are logged and skipped.
    """
    if download_dir is None:
        download_dir = DOWNLOAD_DIR

    if not os.path.exists(download_dir):
        print("Download directory not found")
        return []

    file_info = []
    # Sorted so the report is deterministic (os.listdir order is arbitrary).
    for name in sorted(os.listdir(download_dir)):
        if not name.endswith('.pdf'):
            continue

        filepath = os.path.join(download_dir, name)
        try:
            stats = os.stat(filepath)
            with open(filepath, 'rb') as handle:
                head = handle.read(1024)

            file_info.append({
                'filename': name,
                'size_bytes': stats.st_size,
                'size_mb': round(stats.st_size / (1024 * 1024), 2),
                # NOTE: st_ctime is the metadata-change time on Unix and the
                # creation time on Windows.
                'created': datetime.fromtimestamp(stats.st_ctime).isoformat(),
                'is_valid_pdf': stats.st_size > 1000 and b'%PDF-' in head
            })
        except (OSError, OverflowError, ValueError) as e:
            logger.error(f"Error processing {name}: {e}")

    return file_info

# Validate downloaded files and print a summary report
file_metadata = validate_downloaded_files()
if file_metadata:
    print(f"\n📊 Downloaded file summary:")
    df_files = pd.DataFrame(file_metadata)
    print(df_files.to_string(index=False))

    # Total from exact byte counts, converted and formatted once. Summing the
    # per-file 'size_mb' values (each already rounded to 2 decimals) would
    # accumulate rounding error and can print float artefacts.
    total_size_mb = sum(entry['size_bytes'] for entry in file_metadata) / (1024 * 1024)
    valid_files = sum(1 for entry in file_metadata if entry['is_valid_pdf'])
    print(f"\n📈 Total size: {total_size_mb:.2f} MB")
    print(f"📈 Valid PDFs: {valid_files}/{len(file_metadata)}")
else:
    print("No files found for validation")

## Experiment Results & Next Steps

### Key Findings (areas to evaluate — record the observed results after running the cells above):
1. **Web Scraping**: Test results for screener.in document discovery
2. **Document Classification**: Accuracy of automatic document type detection
3. **Download Success Rate**: Percentage of successful document downloads
4. **File Validation**: Quality and completeness of downloaded files

### Improvements Needed:
- [ ] Refine CSS selectors for better document discovery
- [ ] Add robust error handling for failed downloads
- [ ] Implement document content validation (PDF structure check)
- [ ] Add duplicate detection and handling
- [ ] Create document indexing and search capabilities

### Integration Points:
- **Table Extraction**: Feed downloaded PDFs to 02_table_extraction.ipynb
- **Financial Analysis**: Use classified documents in 03_financial_analysis.ipynb
- **RAG Implementation**: Index documents for 05_rag_implementation.ipynb