In [7]:
from bs4 import BeautifulSoup
from pprint import pprint
import requests
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

In [None]:
def get_html_sync(url: str, timeout: int = 10) -> str | None:
    """
    Fetch the HTML content of a URL with a blocking (synchronous) GET request.

    Failures are reported on stdout and signalled by returning ``None`` so
    callers can treat the fetch as best-effort.

    :param url: URL of the page to fetch
    :param timeout: request timeout in seconds, passed to ``requests.get``
    :return: the response body as text on success, ``None`` on failure
    """
    # Send a browser-like User-Agent so the request is less likely to be
    # rejected as coming from a bot.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Raises HTTPError for 4xx/5xx responses so an error page is never
        # returned as if it were the requested content.
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of every error requests raises
        # (connection failures, timeouts, invalid URLs, HTTP errors).
        # Unrelated programming errors are intentionally not swallowed here.
        print(f"❌ [{url}] - 오류 발생: {e}")
        return None

    print(f"✅ [{url}] - 동기 호출 성공")
    return response.text

# --- Example usage ---
# Fetch the ransomware.live front page and print the raw HTML.
# html_content is None when get_html_sync reports a failure.
URL = "https://www.ransomware.live/"
html_content = get_html_sync(URL)
print(html_content)

✅ [https://www.ransomware.live/] - 동기 호출 성공
<!DOCTYPE html>
<html lang="en">
<head>
<!--

██████╗  █████╗ ███╗   ██╗███████╗ ██████╗ ███╗   ███╗██╗    ██╗ █████╗ ██████╗ ███████╗   ██╗     ██╗██╗   ██╗███████╗
██╔══██╗██╔══██╗████╗  ██║██╔════╝██╔═══██╗████╗ ████║██║    ██║██╔══██╗██╔══██╗██╔════╝   ██║     ██║██║   ██║██╔════╝
██████╔╝███████║██╔██╗ ██║███████╗██║   ██║██╔████╔██║██║ █╗ ██║███████║██████╔╝█████╗     ██║     ██║██║   ██║█████╗  
██╔══██╗██╔══██║██║╚██╗██║╚════██║██║   ██║██║╚██╔╝██║██║███╗██║██╔══██║██╔══██╗██╔══╝     ██║     ██║╚██╗ ██╔╝██╔══╝  
██║  ██║██║  ██║██║ ╚████║███████║╚██████╔╝██║ ╚═╝ ██║╚███╔███╔╝██║  ██║██║  ██║███████╗██╗███████╗██║ ╚████╔╝ ███████╗
╚═╝  ╚═╝╚═╝  ╚═╝╚═╝  ╚═══╝╚══════╝ ╚═════╝ ╚═╝     ╚═╝ ╚══╝╚══╝ ╚═╝  ╚═╝╚═╝  ╚═╝╚══════╝╚═╝╚══════╝╚═╝  ╚═══╝  ╚══════╝
Non nobis, sed securitati communi

version 2025-08 - BSides Bournemouth Edition 
by Julien Mousqueton
  Contact : @JMousqueton on X (Twitter) or @Julien.io on BlueSky

-->
<meta charset="UTF

In [11]:
import re

# Result label -> element id passed to the page's animateCounter(...) calls.
_COUNTER_IDS = {
    "Total Groups": "groupsCounter",
    "Total Victims": "victimsCounter",
    "Victims This Year": "victimsThisYearCounter",
    "Victims This Month": "victimsThisMonthCounter",
}


def _extract_statistics(soup) -> dict:
    """Read the headline counters from the inline animateCounter(...) script."""
    # Only the first <script> that mentions animateCounter is inspected.
    script_text = next(
        (
            script.text
            for script in soup.find_all('script')
            if 'animateCounter' in script.text
        ),
        "",
    )

    statistics = {}
    try:
        for label, counter_id in _COUNTER_IDS.items():
            # Skip the counter's start value (\d+) and capture its target
            # value, which may contain thousands separators.
            match = re.search(
                rf"animateCounter\('{counter_id}',\s*\d+,\s*([\d,]+)",
                script_text,
            )
            # A counter that is not present in the script is reported as 0.
            statistics[label] = (
                int(match.group(1).replace(',', '')) if match else 0
            )
    except ValueError as e:
        # The capture can be commas only, which int() rejects; in that case
        # report the problem and return no statistics at all.
        print(f"통계 데이터 추출 중 오류 발생: {e}")
        return {}
    return statistics


def _extract_victim(item) -> dict | None:
    """Build one victim record from a ``.victim-item`` element.

    Returns ``None`` when the element has no company name or no group badge;
    every other field falls back to a placeholder string.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    name_tag = item.select_one('strong')
    group_tag = item.select_one('small a span.badge')
    if name_tag is None or group_tag is None:
        # Name and group are the two required fields; skip incomplete cards.
        return None

    date_container = item.select_one('div.text-body-secondary')
    date_text = date_container.get_text(" ", strip=True) if date_container else ""
    discovery_match = re.search(r"Discovery Date: ([\d-]+)", date_text)
    attack_match = re.search(r"Estimated Attack Date: ([\d-]+)", date_text)

    description_tag = item.select_one('div.bg-body-secondary')
    country_tag = item.select_one('img[style*="width: 32px"]')
    website_tag = item.select_one('a:has(i.fa-globe-americas)')
    details_link_tag = item.select_one('a[href*="/id/"]')

    if details_link_tag is not None and details_link_tag.has_attr('href'):
        # urljoin resolves site-relative hrefs against the site root and
        # leaves already-absolute hrefs untouched.
        details_url = urljoin("https://www.ransomware.live", details_link_tag['href'])
    else:
        details_url = 'Not available'

    return {
        "company_name": name_tag.get_text(strip=True),
        "ransomware_group": group_tag.get_text(strip=True),
        "discovery_date": discovery_match.group(1) if discovery_match else 'N/A',
        "estimated_attack_date": attack_match.group(1) if attack_match else 'Not available',
        "description": (
            description_tag.get_text(strip=True)
            if description_tag is not None
            else 'No description available.'
        ),
        "country": country_tag.get('alt', 'N/A') if country_tag is not None else 'N/A',
        "website": (
            website_tag.get('href', 'Not available')
            if website_tag is not None
            else 'Not available'
        ),
        "details_url": details_url,
    }


def parse_ransomware_live_data(html_content: str | None) -> dict:
    """
    Parse ransomware.live HTML into site statistics and a list of victims.

    :param html_content: HTML text of the page; ``None`` or an empty string
        is treated as an empty document
    :return: dictionary with the keys ``crawled_at_utc`` and
        ``crawled_at_kst`` (ISO 8601 strings for the same instant),
        ``statistics`` (counter label -> int, or ``{}`` if a counter value
        cannot be converted) and ``victims`` (list of victim dictionaries)
    """
    # Take a single timestamp and convert it, so both keys describe exactly
    # the same instant.
    crawl_time_utc = datetime.now(timezone.utc)
    crawl_time_kst = crawl_time_utc.astimezone(ZoneInfo("Asia/Seoul"))

    soup = BeautifulSoup(html_content or "", 'html.parser')

    victims_list = [
        victim
        for victim in map(_extract_victim, soup.select('#victim-list .victim-item'))
        if victim is not None
    ]

    return {
        "crawled_at_utc": crawl_time_utc.isoformat(),
        "crawled_at_kst": crawl_time_kst.isoformat(),
        "statistics": _extract_statistics(soup),
        "victims": victims_list,
    }

# Parse the html_content loaded in the earlier cell and pretty-print the result.
ransomware_data = parse_ransomware_live_data(html_content)
pprint(ransomware_data)

{'crawled_at_kst': '2025-10-15T22:16:24.599907+09:00',
 'crawled_at_utd': '2025-10-15T13:16:24.599907+00:00',
 'statistics': {'Total Groups': 295,
                'Total Victims': 22613,
                'Victims This Month': 409,
                'Victims This Year': 6126},
 'victims': [{'company_name': 'The North Stonington School District',
              'country': 'US',
              'description': 'North Stonington Public Schools have two public '
                             'schools and 736 students, strives to create a '
                             'safe e...',
              'details_url': 'https://www.ransomware.live/id/VGhlIE5vcnRoIFN0b25pbmd0b24gU2Nob29sIERpc3RyaWN0QGludGVybG9jaw==',
              'discovery_date': '2025-10-15',
              'estimated_attack_date': 'Not available',
              'ransomware_group': 'Interlock',
              'website': 'https://northstonington.k12.ct.us'},
             {'company_name': 'Navigator Business Solutions',
              'country'

파이썬 파일로 바꾼 뒤 실행 결과

'https://www.ransomware.live/'에서 데이터 크롤링을 시작합니다...

✅ [https://www.ransomware.live/] - HTML 콘텐츠 로드 성공

크롤링 성공! 데이터 파싱을 시작합니다...

--- 파싱 완료된 데이터 ---
```python
{'crawled_at_kst': '2025-10-15T22:20:00.271465+09:00',
 'crawled_at_utc': '2025-10-15T13:20:00.271465+00:00',
 'statistics': {'Total Groups': 295,
                'Total Victims': 22614,
                'Victims This Month': 410,
                'Victims This Year': 6127},
 'victims': [{'company_name': 'www.cbsaust.org.au',
              'country': 'AU',
              'description': 'CBS Tasmania is a not-for-profit organization '
                             'that provides personalized aged care and '
                             'disability se...',
              'details_url': 'https://www.ransomware.live/id/d3d3LmNic2F1c3Qub3JnLmF1QGx5bng=',
              'discovery_date': '2025-10-15',
              'estimated_attack_date': '2025-10-10',
              'ransomware_group': 'Lynx',
              'website': 'https://www.cbsaust.org.au'},
             {'company_name': 'The North Stonington School District',
              'country': 'US',
              'description': 'North Stonington Public Schools have two public '
                             'schools and 736 students, strives to create a '
                             'safe e...',
              'details_url': 'https://www.ransomware.live/id/VGhlIE5vcnRoIFN0b25pbmd0b24gU2Nob29sIERpc3RyaWN0QGludGVybG9jaw==',
              'discovery_date': '2025-10-15',
              'estimated_attack_date': 'Not available',
              'ransomware_group': 'Interlock',
              'website': 'https://northstonington.k12.ct.us'},
             ...
             {'company_name': 'dsv.com',
              'country': 'DK',
              'description': 'DSV is a global transport and logistics company '
                             'that provides and manages supply chain solutions '
                             'for...',
              'details_url': 'https://www.ransomware.live/id/ZHN2LmNvbUBjb2luYmFzZWNhcnRlbA==',
              'discovery_date': '2025-10-13',
              'estimated_attack_date': 'Not available',
              'ransomware_group': 'Coinbasecartel',
              'website': 'https://dsv.com'},
             {'company_name': 'Kuehne + Nagel',
              'country': 'CH',
              'description': 'With more than 82,000 employees at almost 1,300 '
                             'sites in close to 100 countries, the '
                             'Kuehne+Nagel Gr...',
              'details_url': 'https://www.ransomware.live/id/S3VlaG5lICsgTmFnZWxAY29pbmJhc2VjYXJ0ZWw=',
              'discovery_date': '2025-10-13',
              'estimated_attack_date': 'Not available',
              'ransomware_group': 'Coinbasecartel',
              'website': 'https://kuehne-nagel.com'}]}
```

🎉 프로그램이 성공적으로 실행되었습니다.