In [55]:
import csv
import html
import re
import zipfile

from bs4 import BeautifulSoup

# Define regex patterns for extracting information
company_name_pattern = r'src="/uploads/(.*?)_'
address_pattern = r'Address:\s([^<]+)<'
date_pattern = r'<mark class="marker-yellow"><strong>(\d{2}\.\d{2}\.\d{4})'
revenue_pattern = r'Revenue\s:\s(\$[\d\.]+M)'
industry_pattern = r'<p class="attrs">Industry: <span class="old">(.*?)</span></p>'
data_volume_pattern = r'(\d+(?:\.\d+)?)\s?(TB|Tb|GB|Gb)'
data_description_pattern = r'DATA DESCRIPTIONS:\u003c/strong\u003e\u003c/mark\u003e\s([^<]+)<'  


# Initialize a list to store extracted data
extracted_data = []

# Function to extract data from HTML content
def extract_data(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract data using regex or BeautifulSoup
    company_name = re.search(company_name_pattern, html_content)
    address = re.search(address_pattern, html_content)
    attack_date = re.search(date_pattern, html_content)
    revenue = re.search(revenue_pattern, html_content)
    industry = re.search(industry_pattern, html_content)
    data_volume = re.findall(data_volume_pattern, html_content)
    data_description = re.search(data_description_pattern, html_content)

    # If data_volume is found, concatenate the number and the unit
    if data_volume:
        data_volume = f"{data_volume[0][0]} {data_volume[0][1]}"  # Combine the number and unit
    else:
        data_volume = 'N/A'

    # Store data in a dictionary
    data = {
        'company_name': company_name.group(1) if company_name else 'N/A',
        'address': address.group(1) if address else 'N/A',
        'date_of_attack': attack_date.group(1) if attack_date else 'N/A',
        'revenue': revenue.group(1) if revenue else 'N/A',
        'industry': industry.group(1) if industry else 'N/A',
        'data_volume': data_volume,
        'data_description': data_description.group(1).strip() if data_description else 'N/A',
        
    }

    return data

# Path to the zip file
zip_file_path = 'C:\\Users\\Sruthi\\OneDrive\\Sruthi\\Studies\\Uni\\Uni Courses-Sruthi-Zenbook\\Year 3 Sem 1\\Cyber Threat Intelligence\\Project\\cactus\\cactus scrapped txt.zip'


# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as z:
    # Iterate through each file in the zip
    for file_name in z.namelist():
        # Read the file content
        with z.open(file_name) as file:
            content = file.read().decode('utf-8')
            # Extract data from the content
            extracted_info = extract_data(content)
            extracted_data.append(extract_data(content))
            # Print extracted information
            print(f"Extracted data from file {file_name}:")
            for key, value in extracted_info.items():
                print(f"{key}: {value}")
            print("\n" + "-"*50 + "\n")

csv_columns = ['company_name', 'address', 'date_of_attack', 'revenue', 'industry', 'data_volume', 'data_description']

# Write extracted data to CSV
csv_file = 'extracted_data.csv'
try:
    with open('cactus_extracted_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in extracted_data:
            writer.writerow(data)
except IOError:
    print("I/O error")


Extracted data from file cactus scrapped txt/:
company_name: N/A
address: N/A
date_of_attack: N/A
revenue: N/A
industry: N/A
data_volume: N/A
data_description: N/A

--------------------------------------------------

Extracted data from file cactus scrapped txt/abzlvlslacizrznq.txt:
company_name: gocco
address: &nbsp;CALLE DE LA CALENDULA (MINIPARC III, SOTO DE LA MORALEJA) 93 28109, ALCOBENDAS, Madrid Spain
date_of_attack: 18.02.2024
revenue: $937.9M
industry: N/A
data_volume: 136 GB
data_description: N/A

--------------------------------------------------

Extracted data from file cactus scrapped txt/afkrwkdpzyshxyqi.txt:
company_name: geocom
address: 3071 Dionisio Oribe, Montevideo, Montevideo, 11600, Uruguay
date_of_attack: 17.12.2023
revenue: $63.2M
industry: N/A
data_volume: 77 GB
data_description: N/A

--------------------------------------------------

Extracted data from file cactus scrapped txt/aqbiyjvfrevsyjol.txt:
company_name: AXIOM
address: 1841 Front St Ste A, Lynden, Wa