In [1]:
pip install requests beautifulsoup4 fake-useragent pillow

Collecting fake-useragent
  Downloading fake_useragent-2.2.0-py3-none-any.whl.metadata (17 kB)
Downloading fake_useragent-2.2.0-py3-none-any.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fake-useragent
Successfully installed fake-useragent-2.2.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
# -*- coding: utf-8 -*-
"""
News Article Extraction Agent
Extracts Header, Body, and Images from news article URLs
"""

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import time
from fake_useragent import UserAgent
import json
import os
from PIL import Image
import io

class NewsArticleExtractor:
    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        
    def is_valid_url(self, url):
        """Check if the URL is valid"""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except:
            return False
    
    def get_page_content(self, url):
        """Fetch the content of a web page"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response.content
        except Exception as e:
            print(f"Error fetching page: {e}")
            return None
    
    def extract_title(self, soup):
        """Extract the article title"""
        # Try different meta tags and elements that might contain the title
        selectors = [
            'meta[property="og:title"]',
            'meta[name="twitter:title"]',
            'h1',
            '.headline',
            '.title',
            '[class*="title"]',
            '[class*="headline"]',
            'title'
        ]
        
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    title = element.get('content', '').strip()
                else:
                    title = element.get_text().strip()
                
                if title and len(title) > 10 and len(title) < 200:
                    return title
        
        # Fallback to the page title
        if soup.title:
            return soup.title.get_text().strip()
        
        return "Title not found"
    
    def extract_body(self, soup):
        """Extract the main article body text"""
        # Try to find the main content area
        selectors = [
            'article',
            '.article-body',
            '.post-content',
            '.entry-content',
            '.story-content',
            '[class*="content"]',
            '[class*="body"]',
            'main'
        ]
        
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # Clean up the text
                text = element.get_text()
                text = re.sub(r'\s+', ' ', text).strip()
                
                if len(text) > 100:  # Reasonable minimum length for an article
                    return text
        
        # If no specific content area found, try to find paragraphs
        paragraphs = soup.find_all('p')
        text = ' '.join([p.get_text() for p in paragraphs])
        text = re.sub(r'\s+', ' ', text).strip()
        
        if len(text) > 100:
            return text
        
        return "Body content not found or too short"
    
    def extract_images(self, soup, base_url):
        """Extract images from the article"""
        images = []
        
        # Look for images in various locations
        img_selectors = [
            'img',
            'meta[property="og:image"]',
            'meta[name="twitter:image"]',
            'link[rel="image_src"]'
        ]
        
        for selector in img_selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    img_url = element.get('content', '')
                elif selector.startswith('link'):
                    img_url = element.get('href', '')
                else:
                    img_url = element.get('src', '')
                
                if img_url:
                    # Make URL absolute
                    img_url = urljoin(base_url, img_url)
                    
                    # Get alt text for regular img tags
                    alt_text = element.get('alt', '') if selector == 'img' else ''
                    
                    # Avoid duplicates
                    if img_url not in [img['url'] for img in images]:
                        images.append({
                            'url': img_url,
                            'alt': alt_text
                        })
        
        return images
    
    def extract_metadata(self, soup):
        """Extract additional metadata from the article"""
        metadata = {}
        
        # Publication date
        date_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="date"]',
            'meta[name="publish_date"]',
            'time',
            '[class*="date"]',
            '[class*="time"]'
        ]
        
        for selector in date_selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    date = element.get('content', '')
                else:
                    date = element.get('datetime', '') or element.get_text()
                
                if date and 'date' not in metadata:
                    metadata['date'] = date.strip()
                    break
        
        # Author
        author_selectors = [
            'meta[name="author"]',
            'meta[property="article:author"]',
            '[rel="author"]',
            '.author',
            '[class*="author"]',
            '[class*="byline"]'
        ]
        
        for selector in author_selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    author = element.get('content', '')
                else:
                    author = element.get_text()
                
                if author and 'author' not in metadata:
                    metadata['author'] = author.strip()
                    break
        
        # Description
        desc_selectors = [
            'meta[property="og:description"]',
            'meta[name="description"]',
            'meta[name="twitter:description"]'
        ]
        
        for selector in desc_selectors:
            elements = soup.select(selector)
            for element in elements:
                description = element.get('content', '')
                if description and 'description' not in metadata:
                    metadata['description'] = description.strip()
                    break
        
        return metadata
    
    def download_image(self, image_url, save_dir=None):
        """Download an image and return information about it"""
        try:
            response = self.session.get(image_url, timeout=10)
            response.raise_for_status()
            
            img_data = response.content
            img = Image.open(io.BytesIO(img_data))
            
            image_info = {
                'url': image_url,
                'format': img.format,
                'size': len(img_data),
                'dimensions': img.size,
                'mode': img.mode,
                'downloaded': True
            }
            
            # Save image if directory provided
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
                filename = os.path.join(save_dir, f"image_{int(time.time())}_{hash(image_url)}.{img.format.lower()}")
                with open(filename, 'wb') as f:
                    f.write(img_data)
                image_info['saved_path'] = filename
            
            return image_info
            
        except Exception as e:
            print(f"Error downloading image {image_url}: {e}")
            return {
                'url': image_url,
                'error': str(e),
                'downloaded': False
            }
    
    def extract_article(self, url, download_images=False, image_save_dir=None):
        """
        Extract information from a news article URL
        
        Args:
            url (str): The URL of the news article
            download_images (bool): Whether to download images
            image_save_dir (str): Directory to save downloaded images
            
        Returns:
            dict: Article information including header, body, and images
        """
        if not self.is_valid_url(url):
            return {"error": "Invalid URL"}
        
        print(f"Extracting article from: {url}")
        
        # Fetch the page content
        content = self.get_page_content(url)
        if not content:
            return {"error": "Failed to fetch page content"}
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(content, 'html.parser')
        
        # Extract article components
        title = self.extract_title(soup)
        body = self.extract_body(soup)
        images = self.extract_images(soup, url)
        metadata = self.extract_metadata(soup)
        
        # Download images if requested
        downloaded_images = []
        if download_images and images:
            print(f"Downloading {len(images)} images...")
            for img in images:
                img_info = self.download_image(img['url'], image_save_dir)
                img_info['alt'] = img.get('alt', '')
                downloaded_images.append(img_info)
        
        # Prepare result
        result = {
            "url": url,
            "title": title,
            "body": body,
            "images": downloaded_images if download_images else images,
            "metadata": metadata,
            "success": True
        }
        
        return result
    
    def process_user_input(self):
        """Interactive mode for processing URLs"""
        print("News Article Extraction Agent")
        print("Enter news article URLs to extract content (or type 'quit' to exit):")
        
        while True:
            url = input("\nEnter URL: ").strip()
            
            if url.lower() in ['quit', 'exit', 'q']:
                print("Exiting News Article Extractor.")
                break
                
            if not url:
                print("Please enter a URL.")
                continue
                
            if not self.is_valid_url(url):
                print("Invalid URL. Please enter a valid URL including http:// or https://")
                continue
            
            # Ask if user wants to download images
            download_choice = input("Download images? (y/n): ").strip().lower()
            download_images = download_choice in ['y', 'yes']
            
            image_save_dir = None
            if download_images:
                image_save_dir = input("Enter directory to save images (press Enter for current directory): ").strip()
                if not image_save_dir:
                    image_save_dir = "downloaded_images"
            
            # Extract article
            print(f"\nProcessing {url}...")
            start_time = time.time()
            
            result = self.extract_article(url, download_images, image_save_dir)
            
            processing_time = time.time() - start_time
            print(f"Processing completed in {processing_time:.2f} seconds")
            
            # Display results
            if result.get('error'):
                print(f"Error: {result['error']}")
                continue
                
            print("\n" + "="*50)
            print(f"TITLE: {result['title']}")
            print("="*50)
            
            if result['metadata']:
                print("\nMETADATA:")
                for key, value in result['metadata'].items():
                    print(f"  {key.capitalize()}: {value}")
            
            print(f"\nBODY (first 500 characters):")
            body_preview = result['body'][:500] + "..." if len(result['body']) > 500 else result['body']
            print(f"{body_preview}")
            
            print(f"\nIMAGES FOUND: {len(result['images'])}")
            for i, img in enumerate(result['images'], 1):
                print(f"  {i}. {img['url']}")
                if img.get('alt'):
                    print(f"     Alt text: {img['alt']}")
                if img.get('downloaded', False) and img['downloaded']:
                    print(f"     Downloaded: {img.get('saved_path', 'Yes')}")
                elif img.get('error'):
                    print(f"     Error: {img['error']}")
            
            # Ask if user wants to save results to file
            save_choice = input("\nSave results to JSON file? (y/n): ").strip().lower()
            if save_choice in ['y', 'yes']:
                filename = f"article_{int(time.time())}.json"
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)
                print(f"Results saved to {filename}")

# Demonstration and Testing
if __name__ == "__main__":
    # Initialize the extractor
    extractor = NewsArticleExtractor()
    
    # Start interactive mode
    extractor.process_user_input()

News Article Extraction Agent
Enter news article URLs to extract content (or type 'quit' to exit):



Enter URL:   https://mothership.sg/2025/07/soviet-era-plane/
Download images? (y/n):  n



Processing https://mothership.sg/2025/07/soviet-era-plane/...
Extracting article from: https://mothership.sg/2025/07/soviet-era-plane/
Error fetching page: 403 Client Error: Forbidden for url: https://mothership.sg/2025/07/soviet-era-plane/
Processing completed in 0.11 seconds
Error: Failed to fetch page content



Enter URL:  https://www.straitstimes.com/life/disney-k-drama-delusion-faces-backlash-over-leaving-piles-of-trash-in-jeju-after-filming
Download images? (y/n):  n



Processing https://www.straitstimes.com/life/disney-k-drama-delusion-faces-backlash-over-leaving-piles-of-trash-in-jeju-after-filming...
Extracting article from: https://www.straitstimes.com/life/disney-k-drama-delusion-faces-backlash-over-leaving-piles-of-trash-in-jeju-after-filming
Processing completed in 0.20 seconds

TITLE: Disney+ K-drama Delusion faces backlash over leaving piles of trash in Jeju after filming

METADATA:
  Date: 2025-08-29T16:25:00+08:00
  Description: The clip revealed discarded items ranging from butane gas canisters to branded coffee cup sleeves.  Read more at straitstimes.com. Read more at straitstimes.com.

BODY (first 500 characters):
Disney+ K-drama Delusion faces backlash over leaving piles of trash in Jeju after filmingSign up now: Get ST's newsletters delivered to your inboxDelusion stars South Korean actors Bae Suzy (middle) and Kim Seon-ho (left).PHOTO: DISNEY+Follow topic:South KoreaPublished Aug 29, 2025, 04:25 PMUpdated Aug 29, 2025, 04:25 PMSEOUL

KeyboardInterrupt: Interrupted by user

In [5]:
# -*- coding: utf-8 -*-
"""
Enhanced News Article Extraction Agent
With better handling of anti-scraping measures
"""

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import time
from fake_useragent import UserAgent
import json
import os
from PIL import Image
import io
import random
import cloudscraper  # For bypassing Cloudflare protection

class EnhancedNewsArticleExtractor:
    def __init__(self):
        self.ua = UserAgent()
        self.session = self._create_session()
        self.scraper = cloudscraper.create_scraper()  # For Cloudflare protection
        self.request_delay = 1  # Delay between requests in seconds
        self.max_retries = 3
        
    def _create_session(self):
        """Create a requests session with realistic headers"""
        session = requests.Session()
        session.headers.update({
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'DNT': '1',
        })
        return session
    
    def _rotate_user_agent(self):
        """Rotate to a new user agent"""
        self.session.headers['User-Agent'] = self.ua.random
    
    def is_valid_url(self, url):
        """Check if the URL is valid"""
        try:
            result = urlparse(url)
            return all([result.scheme, result.netloc])
        except:
            return False
    
    def get_page_content(self, url, retry_count=0):
        """Fetch the content of a web page with retries and agent rotation"""
        if retry_count >= self.max_retries:
            return None
            
        try:
            # Add delay to be more polite
            time.sleep(self.request_delay)
            
            # Try with regular session first
            response = self.session.get(url, timeout=15)
            
            # If we get a 403, try with cloudscraper
            if response.status_code == 403:
                print("403 detected, trying with cloudscraper...")
                response = self.scraper.get(url, timeout=15)
            
            response.raise_for_status()
            return response.content
            
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 403:
                print(f"403 Forbidden error (attempt {retry_count + 1}/{self.max_retries})")
                self._rotate_user_agent()
                return self.get_page_content(url, retry_count + 1)
            else:
                print(f"HTTP error {e.response.status_code}: {e}")
                return None
        except Exception as e:
            print(f"Error fetching page: {e}")
            return None
    
    def extract_title(self, soup):
        """Extract the article title"""
        # Try different meta tags and elements that might contain the title
        selectors = [
            'meta[property="og:title"]',
            'meta[name="twitter:title"]',
            'h1',
            '.headline',
            '.title',
            '[class*="title"]',
            '[class*="headline"]',
            'title'
        ]
        
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    title = element.get('content', '').strip()
                else:
                    title = element.get_text().strip()
                
                if title and len(title) > 10 and len(title) < 200:
                    return title
        
        # Fallback to the page title
        if soup.title:
            return soup.title.get_text().strip()
        
        return "Title not found"
    
    def extract_body(self, soup):
        """Extract the main article body text"""
        # Remove unwanted elements
        for unwanted in soup(['script', 'style', 'nav', 'footer', 'aside', 'form']):
            unwanted.decompose()
        
        # Try to find the main content area
        selectors = [
            'article',
            '.article-body',
            '.post-content',
            '.entry-content',
            '.story-content',
            '.content',
            '.body',
            'main',
            '[itemprop="articleBody"]'
        ]
        
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                # Clean up the text
                text = element.get_text()
                text = re.sub(r'\s+', ' ', text).strip()
                
                if len(text) > 100:  # Reasonable minimum length for an article
                    return text
        
        # If no specific content area found, try to find paragraphs with most text
        all_paragraphs = soup.find_all('p')
        if all_paragraphs:
            # Find the longest continuous block of paragraphs
            paragraphs_text = []
            current_block = []
            
            for p in all_paragraphs:
                p_text = p.get_text().strip()
                if len(p_text) > 20:  # Minimum paragraph length
                    current_block.append(p_text)
                else:
                    if len(current_block) > 3:  # Minimum block size
                        paragraphs_text.append(' '.join(current_block))
                    current_block = []
            
            if current_block and len(current_block) > 3:
                paragraphs_text.append(' '.join(current_block))
            
            if paragraphs_text:
                # Return the longest block
                longest_block = max(paragraphs_text, key=len)
                if len(longest_block) > 100:
                    return longest_block
        
        return "Body content not found or too short"
    
    def extract_images(self, soup, base_url):
        """Extract images from the article"""
        images = []
        
        # Look for images in various locations
        img_selectors = [
            'img',
            'meta[property="og:image"]',
            'meta[name="twitter:image"]',
            'link[rel="image_src"]'
        ]
        
        for selector in img_selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    img_url = element.get('content', '')
                elif selector.startswith('link'):
                    img_url = element.get('href', '')
                else:
                    img_url = element.get('src', '') or element.get('data-src', '')
                
                if img_url:
                    # Make URL absolute
                    img_url = urljoin(base_url, img_url)
                    
                    # Get alt text for regular img tags
                    alt_text = element.get('alt', '') if selector == 'img' else ''
                    
                    # Avoid duplicates
                    if img_url not in [img['url'] for img in images]:
                        images.append({
                            'url': img_url,
                            'alt': alt_text
                        })
        
        return images
    
    def extract_metadata(self, soup):
        """Extract additional metadata from the article"""
        metadata = {}
        
        # Publication date
        date_selectors = [
            'meta[property="article:published_time"]',
            'meta[name="date"]',
            'meta[name="publish_date"]',
            'time',
            '[class*="date"]',
            '[class*="time"]',
            '[itemprop="datePublished"]'
        ]
        
        for selector in date_selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    date = element.get('content', '')
                else:
                    date = element.get('datetime', '') or element.get_text()
                
                if date and 'date' not in metadata:
                    metadata['date'] = date.strip()
                    break
        
        # Author
        author_selectors = [
            'meta[name="author"]',
            'meta[property="article:author"]',
            '[rel="author"]',
            '.author',
            '[class*="author"]',
            '[class*="byline"]',
            '[itemprop="author"]'
        ]
        
        for selector in author_selectors:
            elements = soup.select(selector)
            for element in elements:
                if selector.startswith('meta'):
                    author = element.get('content', '')
                else:
                    author = element.get_text()
                
                if author and 'author' not in metadata:
                    metadata['author'] = author.strip()
                    break
        
        # Description
        desc_selectors = [
            'meta[property="og:description"]',
            'meta[name="description"]',
            'meta[name="twitter:description"]'
        ]
        
        for selector in desc_selectors:
            elements = soup.select(selector)
            for element in elements:
                description = element.get('content', '')
                if description and 'description' not in metadata:
                    metadata['description'] = description.strip()
                    break
        
        return metadata
    
    def download_image(self, image_url, save_dir=None):
        """Download an image and return information about it"""
        try:
            response = self.session.get(image_url, timeout=10)
            response.raise_for_status()
            
            img_data = response.content
            img = Image.open(io.BytesIO(img_data))
            
            image_info = {
                'url': image_url,
                'format': img.format,
                'size': len(img_data),
                'dimensions': img.size,
                'mode': img.mode,
                'downloaded': True
            }
            
            # Save image if directory provided
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
                filename = os.path.join(save_dir, f"image_{int(time.time())}_{hash(image_url)}.{img.format.lower()}")
                with open(filename, 'wb') as f:
                    f.write(img_data)
                image_info['saved_path'] = filename
            
            return image_info
            
        except Exception as e:
            print(f"Error downloading image {image_url}: {e}")
            return {
                'url': image_url,
                'error': str(e),
                'downloaded': False
            }
    
    def extract_article(self, url, download_images=False, image_save_dir=None):
        """
        Extract information from a news article URL
        
        Args:
            url (str): The URL of the news article
            download_images (bool): Whether to download images
            image_save_dir (str): Directory to save downloaded images
            
        Returns:
            dict: Article information including header, body, and images
        """
        if not self.is_valid_url(url):
            return {"error": "Invalid URL"}
        
        print(f"Extracting article from: {url}")
        
        # Fetch the page content
        content = self.get_page_content(url)
        if not content:
            return {"error": "Failed to fetch page content"}
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(content, 'html.parser')
        
        # Extract article components
        title = self.extract_title(soup)
        body = self.extract_body(soup)
        images = self.extract_images(soup, url)
        metadata = self.extract_metadata(soup)
        
        # Download images if requested
        downloaded_images = []
        if download_images and images:
            print(f"Downloading {len(images)} images...")
            for img in images:
                img_info = self.download_image(img['url'], image_save_dir)
                img_info['alt'] = img.get('alt', '')
                downloaded_images.append(img_info)
        
        # Prepare result
        result = {
            "url": url,
            "title": title,
            "body": body,
            "images": downloaded_images if download_images else images,
            "metadata": metadata,
            "success": True
        }
        
        return result
    
    def process_user_input(self):
        """Interactive mode for processing URLs"""
        print("Enhanced News Article Extraction Agent")
        print("Enter news article URLs to extract content (or type 'quit' to exit):")
        
        while True:
            url = input("\nEnter URL: ").strip()
            
            if url.lower() in ['quit', 'exit', 'q']:
                print("Exiting News Article Extractor.")
                break
                
            if not url:
                print("Please enter a URL.")
                continue
                
            if not self.is_valid_url(url):
                print("Invalid URL. Please enter a valid URL including http:// or https://")
                continue
            
            # Ask if user wants to download images
            download_choice = input("Download images? (y/n): ").strip().lower()
            download_images = download_choice in ['y', 'yes']
            
            image_save_dir = None
            if download_images:
                image_save_dir = input("Enter directory to save images (press Enter for current directory): ").strip()
                if not image_save_dir:
                    image_save_dir = "downloaded_images"
            
            # Extract article
            print(f"\nProcessing {url}...")
            start_time = time.time()
            
            result = self.extract_article(url, download_images, image_save_dir)
            
            processing_time = time.time() - start_time
            print(f"Processing completed in {processing_time:.2f} seconds")
            
            # Display results
            if result.get('error'):
                print(f"Error: {result['error']}")
                continue
                
            print("\n" + "="*50)
            print(f"TITLE: {result['title']}")
            print("="*50)
            
            if result['metadata']:
                print("\nMETADATA:")
                for key, value in result['metadata'].items():
                    print(f"  {key.capitalize()}: {value}")
            
            print(f"\nBODY (first 500 characters):")
            body_preview = result['body'][:500] + "..." if len(result['body']) > 500 else result['body']
            print(f"{body_preview}")
            
            print(f"\nIMAGES FOUND: {len(result['images'])}")
            for i, img in enumerate(result['images'][:5], 1):  # Show first 5 images
                print(f"  {i}. {img['url']}")
                if img.get('alt'):
                    print(f"     Alt text: {img['alt']}")
                if img.get('downloaded', False) and img['downloaded']:
                    print(f"     Downloaded: {img.get('saved_path', 'Yes')}")
                elif img.get('error'):
                    print(f"     Error: {img['error']}")
            
            if len(result['images']) > 5:
                print(f"  ... and {len(result['images']) - 5} more images")
            
            # Ask if user wants to save results to file
            save_choice = input("\nSave results to JSON file? (y/n): ").strip().lower()
            if save_choice in ['y', 'yes']:
                filename = f"article_{int(time.time())}.json"
                with open(filename, 'w', encoding='utf-8') as f:
                    json.dump(result, f, indent=2, ensure_ascii=False)
                print(f"Results saved to {filename}")

# Alternative approach using RSS feeds or APIs for difficult sites
def try_alternative_sources(url):
    """Try alternative methods for difficult-to-scrape sites"""
    print(f"Trying alternative methods for: {url}")
    
    # For Mothership.sg, we can try their RSS feed
    if "mothership.sg" in url:
        rss_url = "https://mothership.sg/feed/"
        try:
            response = requests.get(rss_url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'xml')
                items = soup.find_all('item')
                
                # Try to find the article in the RSS feed
                for item in items:
                    if url in item.find('link').text:
                        title = item.find('title').text if item.find('title') else "Title not found"
                        description = item.find('description').text if item.find('description') else "Description not found"
                        
                        return {
                            "url": url,
                            "title": title,
                            "body": description,
                            "images": [],
                            "metadata": {},
                            "source": "RSS feed",
                            "success": True
                        }
        except Exception as e:
            print(f"Error accessing RSS feed: {e}")
    
    return {"error": "Could not extract content using alternative methods"}

# Demonstration and Testing
if __name__ == "__main__":
    # Initialize the enhanced extractor
    extractor = EnhancedNewsArticleExtractor()
    
    # Test with the problematic URL
    test_url = "https://mothership.sg/2025/07/soviet-era-plane/"
    print(f"Testing with: {test_url}")
    
    # Try direct extraction first
    result = extractor.extract_article(test_url)
    
    if result.get('error'):
        print("Direct extraction failed, trying alternative methods...")
        result = try_alternative_sources(test_url)
    
    if result.get('success'):
        print(f"Successfully extracted: {result['title']}")
        print(f"Body preview: {result['body'][:200]}...")
    else:
        print(f"Failed to extract content: {result.get('error', 'Unknown error')}")
    
    # Start interactive mode
    extractor.process_user_input()

Testing with: https://mothership.sg/2025/07/soviet-era-plane/
Extracting article from: https://mothership.sg/2025/07/soviet-era-plane/
403 detected, trying with cloudscraper...
Successfully extracted: At least 46 on board Soviet-era plane killed in crash in Russia, criminal probe launched
Body preview: News July 26, 2025, 10:09 AM At least 46 on board Soviet-era plane killed in crash in Russia, criminal probe launched...
Enhanced News Article Extraction Agent
Enter news article URLs to extract content (or type 'quit' to exit):



Enter URL:  https://www.straitstimes.com/life/disney-k-drama-delusion-faces-backlash-over-leaving-piles-of-trash-in-jeju-after-filming
Download images? (y/n):  n



Processing https://www.straitstimes.com/life/disney-k-drama-delusion-faces-backlash-over-leaving-piles-of-trash-in-jeju-after-filming...
Extracting article from: https://www.straitstimes.com/life/disney-k-drama-delusion-faces-backlash-over-leaving-piles-of-trash-in-jeju-after-filming
Processing completed in 1.24 seconds

TITLE: Disney+ K-drama Delusion faces backlash over leaving piles of trash in Jeju after filming

METADATA:
  Date: 2025-08-29T16:25:00+08:00
  Description: The clip revealed discarded items ranging from butane gas canisters to branded coffee cup sleeves.  Read more at straitstimes.com. Read more at straitstimes.com.

BODY (first 500 characters):
Disney+ K-drama Delusion faces backlash over leaving piles of trash in Jeju after filmingSign up now: Get ST's newsletters delivered to your inboxDelusion stars South Korean actors Bae Suzy (middle), Kim Seon-ho (left) and director Han Jae-rim.PHOTO: DISNEY+Follow topic:South KoreaPublished Aug 29, 2025, 04:25 PMUpdated Aug 2

KeyboardInterrupt: Interrupted by user

In [3]:
pip install requests beautifulsoup4 fake-useragent pillow cloudscraper

Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting requests-toolbelt>=0.9.1 (from cloudscraper)
  Using cached requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.7/99.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
Installing collected packages: requests-toolbelt, cloudscraper
Successfully installed cloudscraper-1.2.71 requests-toolbelt-1.0.0
Note: you may need to restart the kernel to use updated packages.
