In [49]:
import csv
import zipfile
import re
from bs4 import BeautifulSoup

company_name_pattern = r'<h2>(.*?)</h2>'
country_pattern = r'<p class="attrs">Country: <span class="old">(.*?)</span></p>'
date_pattern = r'data-deadline="(.*?)"'
revenue_pattern = r'<p class="attrs">Revenue: <span class="old">(.*?)</span></p>'
industry_pattern = r'<p class="attrs">Industry: <span class="old">(.*?)</span></p>'
data_volume_pattern = r'(\d+(?:\.\d+)?)\s?(TB|Tb|GB|Gb)'
data_description_pattern = r'company:\s*(.*)</p>'  

extracted_data = []

def extract_data(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    company_name = re.search(company_name_pattern, html_content)
    country = re.search(country_pattern, html_content)
    attack_date = re.search(date_pattern, html_content)
    revenue = re.search(revenue_pattern, html_content)
    industry = re.search(industry_pattern, html_content)
    data_volume = re.findall(data_volume_pattern, html_content)
    data_description = re.search(data_description_pattern, html_content)

    if data_volume:
        data_volume = f"{data_volume[0][0]} {data_volume[0][1]}"  # Combine the number and unit
    else:
        data_volume = 'N/A'

    data = {
        'company_name': company_name.group(1) if company_name else 'N/A',
        'country': country.group(1) if country else 'N/A',
        'date_of_attack': attack_date.group(1) if attack_date else 'N/A',
        'revenue': revenue.group(1) if revenue else 'N/A',
        'industry': industry.group(1) if industry else 'N/A',
        'data_volume': data_volume,
        'data_description': data_description.group(1).strip() if data_description else 'N/A',
        
    }

    return data

zip_file_path = 'C:\\Users\\Sruthi\\OneDrive\\Sruthi\\Studies\\Uni\\Uni Courses-Sruthi-Zenbook\\Year 3 Sem 1\\Cyber Threat Intelligence\\Project\\danon\\dAn0n scrapped txt.zip'

with zipfile.ZipFile(zip_file_path, 'r') as z:
    for file_name in z.namelist():
        with z.open(file_name) as file:
            content = file.read().decode('utf-8')
            extracted_info = extract_data(content)
            extracted_data.append(extract_data(content))
            print(f"Extracted data from file {file_name}:")
            for key, value in extracted_info.items():
                print(f"{key}: {value}")
            print("\n" + "-"*50 + "\n")

csv_columns = ['company_name', 'country', 'date_of_attack', 'revenue', 'industry', 'data_volume', 'data_description']

csv_file = 'extracted_data.csv'
try:
    with open('danon_extracted_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in extracted_data:
            writer.writerow(data)
except IOError:
    print("I/O error")


Extracted data from file dAn0n scrapped txt/:
company_name: N/A
country: N/A
date_of_attack: N/A
revenue: N/A
industry: N/A
data_volume: N/A
data_description: N/A

--------------------------------------------------

Extracted data from file dAn0n scrapped txt/cyjpjnylhbcozbxn.txt:
company_name: RSH legal
country: USA
date_of_attack: Apr 29, 2024 14:04:54
revenue: $5.6M
industry: Law Firms &amp; Legal Services
data_volume: 6 TB
data_description: Financial, legal, medical information of clients and employees, personal data employees, partners and clients.

--------------------------------------------------

Extracted data from file dAn0n scrapped txt/dlvzmycbvlcrbwvp.txt:
company_name: Erler &amp; Kalinowski
country: USA
date_of_attack: May 17, 2024 09:05:43
revenue: $14.5 M
industry: Architecture, Engineering &amp; Design
data_volume: 1 TB
data_description: Financial, legal, information on employees and partners. Information on clients was also received: Personal data of clients, all si