In [1]:
pip install feedparser stix2

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting stix2
  Downloading stix2-3.0.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... done
Collecting simplejson (from stix2)
  Downloading simplejson-3.19.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting stix2-patterns>=1.2.0 (from stix2)
  Downloading stix2_patterns-2.0.0-py2.py3-none-any.whl.metadata (8.3 kB)
Collecting antlr4-python3-runtime~=4.9.0 (from stix2-patterns>=1.2.0->stix2)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 117.0/117.0 kB 3.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
   ━━━━━

In [None]:
import feedparser
import json
import uuid
from stix2 import Bundle, IntrusionSet
from datetime import datetime, timezone
import subprocess
import logging

# Configure logging: records at ERROR level and above are written to a log file.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() silently does nothing when this cell is
# re-run or when the hosting environment has already installed a root handler,
# and errors would never reach the file.
logging.basicConfig(
    filename="rss_scraper_errors.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)

# Proxy configuration.
# Values are read from environment variables so real credentials never have to
# be typed into (and saved with) the notebook. The original placeholder
# literals are kept as defaults, so behaviour is unchanged when nothing is set.
import os

APP_PROXY = os.environ.get("RSS_APP_PROXY", "http://placeholder.com:85")  # proxy address
USERNAME = os.environ.get("RSS_PROXY_USERNAME", "username_ui")  # proxy user name
PASSWORD = os.environ.get("RSS_PROXY_PASSWORD", "password_ui")  # proxy password
# Put a backslash in front of every "?" in the password.
PASSWORD = PASSWORD.replace("?", "\\?")


def log_error(feed_url, message):
    """
    Record a feed-related failure at ERROR level on the root logger.

    Args:
        feed_url (str): The feed the failure relates to.
        message (str): Description of what went wrong.
    """
    # Let the logging module interpolate the two values into the message.
    logging.error("Feed: %s - %s", feed_url, message)


def fetch_feed_content_with_proxy(feed_url):
    """
    Fetch the content of an RSS feed through the configured proxy using curl.

    curl is started directly from an argument list instead of through a shell
    command string, so characters in ``feed_url`` (quotes, ``$``, backticks,
    ``;`` ...) can no longer be interpreted as shell syntax. The proxy password
    is written to curl's standard input, which is what the previous
    ``echo "<password>" | curl ...`` pipeline did.

    Args:
        feed_url (str): The URL of the RSS feed.

    Returns:
        str: The raw feed content or None if fetching fails.
    """
    # NOTE(review): confirm that curl actually picks the proxy password up from
    # stdin in this environment; this mirrors the original pipeline unchanged.
    command = [
        "curl",
        "-s",
        "-x", APP_PROXY,
        "-U", USERNAME,
        "--proxy-ntlm",
        # --url keeps a feed_url that starts with "-" from being read as an option.
        "--url", feed_url,
    ]
    try:
        return subprocess.check_output(command, input=f"{PASSWORD}\n", text=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: curl exited non-zero.
        # OSError: curl could not be started at all (e.g. it is not installed).
        log_error(feed_url, f"Exception during fetch: {e}")
        print(f"Error fetching feed: {feed_url}. Exception: {e}")
        return None


def parse_feed(feed_content):
    """
    Run feedparser over raw feed text.

    Args:
        feed_content (str): The raw feed content.

    Returns:
        dict: Whatever ``feedparser.parse`` returns, or None if it raised.
    """
    try:
        result = feedparser.parse(feed_content)
    except Exception as parse_error:
        # Report the problem on stdout and signal failure with None.
        print(f"Error parsing feed content. Exception: {parse_error}")
        return None
    else:
        return result


def retry_feed_parsing(feed_url, retries=3):
    """
    Retry fetching and parsing an RSS feed using a proxy.

    Each attempt fetches the feed and parses it; the first parse result whose
    ``bozo`` flag is not set is returned. When a feed is rejected because of the
    ``bozo`` flag, the parser's reported reason is written to the error log.

    Args:
        feed_url (str): The URL of the RSS feed.
        retries (int): The number of retry attempts.

    Returns:
        dict: The parsed feed or None if all retries fail.
    """
    for attempt in range(1, retries + 1):
        print(f"Attempt {attempt} to fetch and parse feed: {feed_url}")
        feed_content = fetch_feed_content_with_proxy(feed_url)
        feed = parse_feed(feed_content) if feed_content else None
        if feed and not feed.bozo:  # Ensure the feed is valid
            return feed
        if feed:
            # Keep the reason the feed was rejected instead of discarding it.
            log_error(
                feed_url,
                f"Attempt {attempt}: feed flagged as malformed: {feed.get('bozo_exception')}",
            )
        # Only announce a retry when another attempt will actually follow.
        if attempt < retries:
            print(f"Retrying feed: {feed_url}")
    log_error(feed_url, "Failed to fetch or parse after retries.")
    print(f"Failed to parse feed after {retries} attempts: {feed_url}")
    return None


def _entry_published_utc(entry):
    """Return the entry's publication time as an aware UTC datetime, or None if it cannot be determined."""
    from email.utils import parsedate_to_datetime

    # Prefer the time tuple feedparser already derived from the date string
    # (documented by feedparser as a UTC 9-tuple); it covers RFC 822 dates with
    # "GMT" as well as ISO 8601 dates used by Atom feeds.
    parsed_tuple = entry.get("published_parsed")
    if parsed_tuple:
        try:
            return datetime(*parsed_tuple[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError):
            pass  # fall through to parsing the raw string

    raw = entry.get("published")
    if not raw:
        return None
    try:
        parsed = parsedate_to_datetime(raw)
    except (TypeError, ValueError):
        return None
    if parsed.tzinfo is None:
        # A naive value would be interpreted as *local* time by astimezone();
        # treat it as UTC instead so the result does not depend on the machine.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def scrape_rss_feed_to_stix(feed_url):
    """
    Scrape an RSS feed and transform the data into STIX 2.1 JSON format.

    Every entry with a usable publication date becomes one IntrusionSet object
    whose ``first_seen``, ``created`` and ``modified`` are that date in UTC.
    Entries without a usable date are skipped with a message on stdout.

    Args:
        feed_url (str): The URL of the RSS feed.

    Returns:
        str: JSON string in STIX format or None if parsing fails.
    """
    feed = retry_feed_parsing(feed_url)
    if not feed:
        return None

    # Collect STIX objects
    stix_objects = []

    for entry in feed.entries:
        # An entry without a title must not abort the whole feed.
        title = entry.get("title") or "Untitled entry"

        published_date = _entry_published_utc(entry)
        if published_date is None:
            if entry.get("published"):
                print(f"Failed to parse date for entry: {title}")
            else:
                print(f"Skipping entry without a published date: {title}")
            continue

        # Same textual form as before: ISO 8601 with a trailing "Z" for UTC.
        timestamp = published_date.isoformat().replace("+00:00", "Z")

        # NOTE(review): each article is modelled as an Intrusion Set, as in the
        # original code; confirm this is the intended STIX object type.
        intrusion_set = IntrusionSet(
            id=f"intrusion-set--{uuid.uuid4()}",
            name=title,
            description=entry.get("summary", "No description available."),
            first_seen=timestamp,
            created=timestamp,
            modified=timestamp,
            resource_level="unknown",
            primary_motivation="unknown",
            aliases=[]
        )
        stix_objects.append(intrusion_set)

    # Create a STIX Bundle
    stix_bundle = Bundle(objects=stix_objects)
    return stix_bundle.serialize(pretty=True)


def process_multiple_feeds(feed_urls):
    """
    Process multiple RSS feeds and save STIX bundles for each feed.

    Each bundle is written to ``stix_bundle_<host>.json`` in the current
    directory as UTF-8. Characters in the host part that are not letters,
    digits, ``.``, ``_`` or ``-`` are replaced by ``_``, and a numeric suffix
    is added when several feeds in the same call share a host so that later
    feeds do not overwrite earlier ones.

    Args:
        feed_urls (list): List of RSS feed URLs.
    """
    import re
    from urllib.parse import urlsplit

    seen_hosts = {}  # sanitised host -> number of bundles written for it so far

    for feed_url in feed_urls:
        print(f"Processing feed: {feed_url}")
        stix_json = scrape_rss_feed_to_stix(feed_url)
        if not stix_json:
            continue

        # urlsplit() isolates the host even when the path itself contains "//".
        try:
            domain = urlsplit(feed_url).netloc
        except ValueError:
            domain = ""
        if not domain:
            # No scheme present: fall back to the text before the first "/".
            domain = feed_url.split("//")[-1].split("/")[0]
        safe_domain = re.sub(r"[^A-Za-z0-9._-]", "_", domain)

        seen_hosts[safe_domain] = seen_hosts.get(safe_domain, 0) + 1
        occurrence = seen_hosts[safe_domain]
        if occurrence == 1:
            filename = f"stix_bundle_{safe_domain}.json"
        else:
            filename = f"stix_bundle_{safe_domain}_{occurrence}.json"

        # Explicit encoding so the output does not depend on the platform default.
        with open(filename, "w", encoding="utf-8") as json_file:
            json_file.write(stix_json)
        print(f"STIX bundle saved to '{filename}'.")


if __name__ == "__main__":
    # Feeds to convert; every feed is handled independently and written to
    # its own STIX bundle file.
    rss_feed_urls = [
        "https://feeds.feedburner.com/TheHackersNews?format=xml",
        "https://www.wired.com/feed/category/security/latest/rss",
        "https://www.bleepingcomputer.com/feed/",
    ]
    process_multiple_feeds(rss_feed_urls)



In [35]:
import feedparser
import json
import uuid
from stix2 import Bundle, IntrusionSet
from datetime import datetime, timezone
import requests
import logging

# Configure logging: records at ERROR level and above are written to a log file.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() silently does nothing when logging has
# already been configured in this kernel (an earlier cell, a re-run, or the
# hosting environment), and this configuration would be ignored.
logging.basicConfig(
    filename="rss_scraper_errors.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True,
)

def log_error(feed_url, message):
    """
    Write one ERROR-level line describing a problem with a feed.

    Args:
        feed_url (str): The feed the problem relates to.
        message (str): What went wrong.
    """
    line = "Feed: {} - {}".format(feed_url, message)
    logging.error(line)

def fetch_feed_content(feed_url, timeout=30):
    """
    Fetch the content of an RSS feed.

    Args:
        feed_url (str): The URL of the RSS feed.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        str: The raw feed content or None if fetching fails.
    """
    # Present a browser-like User-Agent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(feed_url, headers=headers, timeout=timeout)
        # The body is always decoded as UTF-8, regardless of what the server declared.
        response.encoding = 'utf-8'
        if response.status_code != 200:
            log_error(feed_url, f"HTTP Status Code: {response.status_code}")
            print(f"Failed to fetch feed: {feed_url}. HTTP Status: {response.status_code}")
            return None
        return response.text
    except Exception as e:
        # Best effort: any failure (including a timeout) is logged and reported as None.
        log_error(feed_url, f"Exception during fetch: {e}")
        print(f"Error fetching feed: {feed_url}. Exception: {e}")
        return None

def parse_feed(feed_content):
    """
    Hand raw feed text to feedparser.

    Args:
        feed_content (str): The raw feed content.

    Returns:
        dict: The value returned by ``feedparser.parse``, or None if it raised.
    """
    parsed = None
    try:
        parsed = feedparser.parse(feed_content)
    except Exception as exc:
        # Failure is reported on stdout; the caller receives None.
        print(f"Error parsing feed content. Exception: {exc}")
    return parsed

def retry_feed_parsing(feed_url, retries=3):
    """
    Retry fetching and parsing an RSS feed.

    Each attempt fetches the feed and parses it; the first parse result whose
    ``bozo`` flag is not set is returned. When a feed is rejected because of the
    ``bozo`` flag, the parser's reported reason is written to the error log.

    Args:
        feed_url (str): The URL of the RSS feed.
        retries (int): The number of retry attempts.

    Returns:
        dict: The parsed feed or None if all retries fail.
    """
    for attempt in range(1, retries + 1):
        print(f"Attempt {attempt} to fetch and parse feed: {feed_url}")
        feed_content = fetch_feed_content(feed_url)
        feed = parse_feed(feed_content) if feed_content else None
        if feed and not feed.bozo:  # Ensure the feed is valid
            return feed
        if feed:
            # Keep the reason the feed was rejected instead of discarding it.
            log_error(
                feed_url,
                f"Attempt {attempt}: feed flagged as malformed: {feed.get('bozo_exception')}",
            )
        # Only announce a retry when another attempt will actually follow.
        if attempt < retries:
            print(f"Retrying feed: {feed_url}")
    log_error(feed_url, "Failed to fetch or parse after retries.")
    print(f"Failed to parse feed after {retries} attempts: {feed_url}")
    return None

def _entry_published_utc(entry):
    """Return the entry's publication time as an aware UTC datetime, or None if it cannot be determined."""
    from email.utils import parsedate_to_datetime

    # Prefer the time tuple feedparser already derived from the date string
    # (documented by feedparser as a UTC 9-tuple); it covers RFC 822 dates with
    # "GMT" as well as ISO 8601 dates used by Atom feeds.
    parsed_tuple = entry.get("published_parsed")
    if parsed_tuple:
        try:
            return datetime(*parsed_tuple[:6], tzinfo=timezone.utc)
        except (TypeError, ValueError):
            pass  # fall through to parsing the raw string

    raw = entry.get("published")
    if not raw:
        return None
    try:
        parsed = parsedate_to_datetime(raw)
    except (TypeError, ValueError):
        return None
    if parsed.tzinfo is None:
        # A naive value would be interpreted as *local* time by astimezone();
        # treat it as UTC instead so the result does not depend on the machine.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def scrape_rss_feed_to_stix(feed_url):
    """
    Scrape an RSS feed and transform the data into STIX 2.1 JSON format.

    Every entry with a usable publication date becomes one IntrusionSet object
    whose ``first_seen``, ``created`` and ``modified`` are that date in UTC.
    Entries without a usable date are skipped with a message on stdout.

    Args:
        feed_url (str): The URL of the RSS feed.

    Returns:
        str: JSON string in STIX format or None if parsing fails.
    """
    feed = retry_feed_parsing(feed_url)
    if not feed:
        return None

    # Collect STIX objects
    stix_objects = []

    for entry in feed.entries:
        # An entry without a title must not abort the whole feed.
        title = entry.get("title") or "Untitled entry"

        published_date = _entry_published_utc(entry)
        if published_date is None:
            if entry.get("published"):
                print(f"Failed to parse date for entry: {title}")
            else:
                print(f"Skipping entry without a published date: {title}")
            continue

        # Same textual form as before: ISO 8601 with a trailing "Z" for UTC.
        timestamp = published_date.isoformat().replace("+00:00", "Z")

        # NOTE(review): each article is modelled as an Intrusion Set, as in the
        # original code; confirm this is the intended STIX object type.
        intrusion_set = IntrusionSet(
            id=f"intrusion-set--{uuid.uuid4()}",
            name=title,
            description=entry.get("summary", "No description available."),
            first_seen=timestamp,
            created=timestamp,
            modified=timestamp,
            resource_level="unknown",
            primary_motivation="unknown",
            aliases=[]
        )
        stix_objects.append(intrusion_set)

    # Create a STIX Bundle
    stix_bundle = Bundle(objects=stix_objects)
    return stix_bundle.serialize(pretty=True)

def process_multiple_feeds(feed_urls):
    """
    Process multiple RSS feeds and save STIX bundles for each feed.

    Each bundle is written to ``stix_bundle_<host>.json`` in the current
    directory as UTF-8. Characters in the host part that are not letters,
    digits, ``.``, ``_`` or ``-`` are replaced by ``_``, and a numeric suffix
    is added when several feeds in the same call share a host so that later
    feeds do not overwrite earlier ones.

    Args:
        feed_urls (list): List of RSS feed URLs.
    """
    import re
    from urllib.parse import urlsplit

    seen_hosts = {}  # sanitised host -> number of bundles written for it so far

    for feed_url in feed_urls:
        print(f"Processing feed: {feed_url}")
        stix_json = scrape_rss_feed_to_stix(feed_url)
        if not stix_json:
            continue

        # urlsplit() isolates the host even when the path itself contains "//".
        try:
            domain = urlsplit(feed_url).netloc
        except ValueError:
            domain = ""
        if not domain:
            # No scheme present: fall back to the text before the first "/".
            domain = feed_url.split("//")[-1].split("/")[0]
        safe_domain = re.sub(r"[^A-Za-z0-9._-]", "_", domain)

        seen_hosts[safe_domain] = seen_hosts.get(safe_domain, 0) + 1
        occurrence = seen_hosts[safe_domain]
        if occurrence == 1:
            filename = f"stix_bundle_{safe_domain}.json"
        else:
            filename = f"stix_bundle_{safe_domain}_{occurrence}.json"

        # Explicit encoding so the output does not depend on the platform default.
        with open(filename, "w", encoding="utf-8") as json_file:
            json_file.write(stix_json)
        print(f"STIX bundle saved to '{filename}'.")

if __name__ == "__main__":
    # The feeds this cell converts; one STIX bundle file is produced per feed.
    rss_feed_urls = [
        "https://feeds.feedburner.com/TheHackersNews?format=xml",
        "https://www.wired.com/feed/category/security/latest/rss",
        "https://www.bleepingcomputer.com/feed/",
    ]
    process_multiple_feeds(rss_feed_urls)


Processing feed: https://feeds.feedburner.com/TheHackersNews?format=xml
Attempt 1 to fetch and parse feed: https://feeds.feedburner.com/TheHackersNews?format=xml
STIX bundle saved to 'stix_bundle_feeds.feedburner.com.json'.
Processing feed: https://www.wired.com/feed/category/security/latest/rss
Attempt 1 to fetch and parse feed: https://www.wired.com/feed/category/security/latest/rss
STIX bundle saved to 'stix_bundle_www.wired.com.json'.
Processing feed: https://www.bleepingcomputer.com/feed/
Attempt 1 to fetch and parse feed: https://www.bleepingcomputer.com/feed/
STIX bundle saved to 'stix_bundle_www.bleepingcomputer.com.json'.


rss_scraper/
├── __init__.py
├── config.py             
├── main.py               
├── scraper.py            
├── stix_converter.py     
├── tests/                
│   ├── __init__.py
│   ├── test_scraper.py   
│   ├── test_stix_converter.py  
├── logs/                 
│   ├── rss_scraper_errors.log
├── stix_bundles/         
├── requirements.txt      
├── setup.py              


In [None]:
def retry_feed_with_backoff(feed_url, retries=3, delay=5):
    """
    Retry fetching the feed with exponential backoff.

    The wait time starts at ``delay`` seconds and doubles after every failed
    attempt. No wait happens after the final attempt, since nothing follows it.

    Args:
        feed_url (str): The URL of the RSS feed.
        retries (int): Maximum number of retry attempts.
        delay (int): Initial delay in seconds before retrying.

    Returns:
        str: The feed content if successful, None otherwise.
    """
    # No import of `time` is visible in the notebook cells, so bring it into
    # scope here; otherwise the first failed attempt raises NameError.
    import time

    for attempt in range(1, retries + 1):
        print(f"Attempt {attempt} to fetch feed: {feed_url}")
        response = fetch_feed_content_with_proxy(feed_url)
        if response:
            return response
        if attempt < retries:
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2  # Exponential backoff
    print(f"Failed to fetch feed after {retries} attempts: {feed_url}")
    return None


In [None]:
def retry_feed_parsing(feed_url, retries=3):
    """
    Fetch a feed via ``retry_feed_with_backoff`` and parse the result once.

    Args:
        feed_url (str): The URL of the RSS feed.
        retries (int): Maximum number of fetch attempts passed to the backoff helper.

    Returns:
        dict: The parsed feed, or None when fetching failed, parsing failed,
        or the parsed feed has its ``bozo`` flag set.
    """
    raw_content = retry_feed_with_backoff(feed_url, retries=retries)
    # Parsing is only attempted when the fetch produced some content.
    parsed = parse_feed(raw_content) if raw_content else None
    if parsed and not parsed.bozo:  # Ensure the feed is valid
        return parsed

    log_error(feed_url, "Failed to fetch or parse after retries.")
    print(f"Failed to parse feed after {retries} attempts: {feed_url}")
    return None
