<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/RSS_Feed_Source.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q feedparser requests beautifulsoup4

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [10]:
import feedparser
import requests
from bs4 import BeautifulSoup
import json
import os
import time

In [11]:
# List of RSS feed URLs (deduplicated)
SECURITY_RSS_FEEDS = list(set([
    "https://www.bleepingcomputer.com/feed/",
    "https://feeds.feedburner.com/TheHackersNews",
    "https://krebsonsecurity.com/feed/",
    "https://www.darkreading.com/rss.xml",
    "https://www.securityweek.com/feed/",
    "https://www.csoonline.com/feed/",
    "https://www.threatpost.com/feed/",
    "https://www.helpnetsecurity.com/feed/",
    "https://www.infosecurity-magazine.com/rss/news/",
    "https://www.cybersecurity-insiders.com/feed/",
    "https://www.zdnet.com/topic/security/rss.xml",
    "https://www.schneier.com/feed/atom/",
    "https://www.theregister.com/security/headlines.atom",
    "https://www.govinfosecurity.com/rss/feeds/rss",
    "https://www.crowdstrike.com/blog/feed/",
    "https://www.troyhunt.com/rss/",
    "https://www.securelist.com/feed/",
    "https://www.sans.org/rss/security-headlines/",
    "https://www.fireeye.com/blog/threat-research/_jcr_content.feed",
    "https://www.symantec.com/connect/item-feeds/blog/feed/all/feed",
    "https://www.trendmicro.com/vinfo/us/security/rss/",
    "https://www.mcafee.com/blogs/feed/",
    "https://www.paloaltonetworks.com/blog/feed",
    "https://www.cisa.gov/cybersecurity-advisories/ics-advisories.xml",
    "https://www.ncsc.gov.uk/api/1/services/v1/report-rss-feed.xml",
    "https://www.recordedfuture.com/feed",
    "https://www.cybereason.com/blog/feed",
    "https://www.rapid7.com/blog/rss/",
    "https://www.akamai.com/blog/rss.xml",
    "https://www.imperva.com/blog/feed/",
    "https://www.checkpoint.com/blog/feed/",
    "https://www.fortinet.com/blog.xml",
    "https://www.proofpoint.com/us/rss.xml",
    "https://www.sophos.com/en-us/medialibrary/RSS/rss.aspx",
    "https://www.qualys.com/blog/feed/",
    "https://www.tenable.com/blog/rss.xml",
    "https://www.ibm.com/security/blog/feed/",
    "https://www.cisco.com/c/en/us/products/security/security-labs-rss-feed.html",
    "https://www.microsoft.com/security/blog/feed/",
    "https://www.oracle.com/security-alerts/rss.xml",
    "https://www.elastic.co/security-labs-rss",
    "https://www.cloudflare.com/blog/security/feed/",
    "https://www.varonis.com/blog/feed/",
    "https://www.sentinelone.com/blog/feed/",
    "https://www.carbonblack.com/blog/feed/",
    "https://www.cyberark.com/blog/feed/",
    "https://www.duo.com/blog/rss.xml",
    "https://www.secureworks.com/rss?feed=blog",
    "https://www.forcepoint.com/blog/rss.xml",
    "https://www.zscaler.com/blogs/security-research/feed",
    "https://www.broadcom.com/site/xml/rss/feed/symantec",
    "https://www.radware.com/security/rss/",
    "https://www.f5.com/labs/rss",
    "https://www.extrahop.com/company/blog/feed/",
    "https://www.vmware.com/security/advisories.xml",
    "https://www.redcanary.com/blog/feed/",
]))

In [12]:
# Function to fetch and parse RSS feeds with retries
def fetch_rss_feeds(feed_urls, max_retries=3):
    entries = []
    for url in feed_urls:
        for attempt in range(max_retries):
            try:
                print(f"Fetching {url} (Attempt {attempt + 1})...")
                feed = feedparser.parse(url, request_headers={'User-Agent': 'Mozilla/5.0'})
                for entry in feed.entries:
                    entries.append({
                        "title": entry.title,
                        "link": entry.link,
                        "published": entry.published if "published" in entry else None,
                        "summary": entry.summary if "summary" in entry else None,
                        "source": url,
                    })
                break  # Success, exit the retry loop
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {url}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # Wait before retrying
                else:
                    print(f"Max retries reached for {url}. Skipping.")
    return entries

In [13]:
# Function to fetch media and enrich data with retries
def enrich_data_with_media(entries, max_retries=3):
    enriched = []
    for entry in entries:
        for attempt in range(max_retries):
            try:
                print(f"Enriching {entry['link']} (Attempt {attempt + 1})...")
                response = requests.get(entry["link"], headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
                soup = BeautifulSoup(response.content, "html.parser")

                # Fetch the main image
                main_image = soup.find("meta", property="og:image")
                entry["main_image"] = main_image["content"] if main_image else None

                # Fetch all images on the page
                entry["all_images"] = [img["src"] for img in soup.find_all("img") if "src" in img.attrs]

                # Optionally download the images (not recommended for large-scale scraping)
                images_dir = 'downloaded_images'
                os.makedirs(images_dir, exist_ok=True)
                for img_url in entry["all_images"]:
                    try:
                        img_response = requests.get(img_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
                        img_name = os.path.join(images_dir, os.path.basename(img_url))
                        with open(img_name, 'wb') as img_file:
                            img_file.write(img_response.content)
                    except requests.exceptions.RequestException as e:
                        print(f"Failed to download image {img_url}: {e}")
                break  # Success, exit the retry loop
            except Exception as e:
                print(f"Attempt {attempt + 1} failed for {entry['link']}: {e}")
                if attempt < max_retries - 1:
                    time.sleep(2)  # Wait before retrying
                else:
                    print(f"Max retries reached for {entry['link']}. Skipping.")
        enriched.append(entry)
    return enriched

In [14]:
# Function to save data to a JSON file
def save_to_json(data, filename="security_intelligence_data.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"Data saved to {filename}")

In [None]:
# Main execution
if __name__ == "__main__":
    # Fetch RSS feeds
    print("Fetching RSS feeds...")
    rss_data = fetch_rss_feeds(SECURITY_RSS_FEEDS)

    # Enrich with media
    print("Enriching data with media...")
    enriched_data = enrich_data_with_media(rss_data)

    # Save to file
    save_to_json(enriched_data)
    print("Data collection complete!")

Fetching RSS feeds...
Fetching https://www.crowdstrike.com/blog/feed/ (Attempt 1)...
Fetching https://www.zdnet.com/topic/security/rss.xml (Attempt 1)...
Fetching https://www.schneier.com/feed/atom/ (Attempt 1)...
Fetching https://www.infosecurity-magazine.com/rss/news/ (Attempt 1)...
Fetching https://www.paloaltonetworks.com/blog/feed (Attempt 1)...
Fetching https://www.oracle.com/security-alerts/rss.xml (Attempt 1)...
Fetching https://www.sophos.com/en-us/medialibrary/RSS/rss.aspx (Attempt 1)...
