# Extracting STIX bundle from raw text with GPT-4

In [1]:
import openai
import uuid
import json
import re
from stix2 import parse, exceptions, Bundle
from datetime import datetime

### Utilities functions

In [2]:
# Add UUIDs to 'id' fields in STIX objects, ensuring each object has a unique identifier.
def add_uuid_to_ids(stix_data):
    """
    Add UUIDs to 'id' fields in STIX objects.
    """
    for item in stix_data:
        if 'id' in item:
            object_type = item['type']
            item['id'] = f"{object_type}--{uuid.uuid4()}"
    return stix_data

#Validate a list of STIX objects against the STIX 2.1 standard, identifying any invalid objects.
def validate_stix_objects(stix_objects):
    """
    Validate STIX objects against the STIX 2.1 standard.
    """
    all_valid = True
    invalid_objects = []
    for obj in stix_objects:
        try:
            # Parse the object to validate against the STIX 2.1 standard
            stix_obj = parse(json.dumps(obj), allow_custom=True)
            print(f"Validation passed for object ID: {obj.get('id')}")
        except exceptions.STIXError as se:
            print(f"STIX parsing error for object ID {obj.get('id')}: {se}")
            invalid_objects.append(obj)
            all_valid = False
        except json.JSONDecodeError as je:
            print(f"JSON parsing error: {je}")
            invalid_objects.append(obj)
            all_valid = False
    return all_valid, invalid_objects

## OpenAI client

In [None]:
client = openai.OpenAI(
    api_key=''
)
temperature = 0.7

## Text extraction

In [4]:
# Read the extracted text file
txt_filename = "mandiant-apt1.txt"

with open(txt_filename, "r", encoding="utf-8") as txt_file:
    content = txt_file.read()

# Define token (character) limit
TOKEN_LIMIT = 15000

# Check the number of tokens
num_tokens = len(content)

# Split the text if it exceeds the limit
if num_tokens <= TOKEN_LIMIT:
    text = content
else:
    text = [content[i:i+TOKEN_LIMIT] for i in range(0, num_tokens, TOKEN_LIMIT)]

# Print results
if isinstance(text, str):
    print(f"Text stored in a single variable with {num_tokens} tokens.")
else:
    print(f"Text split into {len(text)} blocks of {TOKEN_LIMIT} tokens each.")

Text split into 10 blocks of 15000 tokens each.


## STIX Domain Objects

In [7]:
print("Generating STIX SDOs with GPT-4")

stix_sdos = []
model_responses = []

if isinstance(text, list):  

    for i in range(len(text)):
        print("\n***************************************")
        print(f"* EXTRACTING FROM TEXT BLOCK {i}/{len(text)} ... *")
        print("***************************************\n")

        system_prompt_sdo = (
            # Role
            "You are a high skilled CTI analyst who focuses on STIX 2.1 Domain Objects (SDOs), with a strong drive for accuracy and validity.\n"
            # Task 
            "You are tasked with creating new STIX 2.1 Domain Objects (SDOs) from the provided threat intelligence text and expanding the already present SDOs in the provided list.\n"
            "You should follow this process:\n"
            "1. Extract any Named Entity (NE) in the text that aligns with the definition of STIX 2.1 Domain Objects.\n"
            "2. Collect all related information to the NE that should be present in an SDO from the text.\n"
            "3. If the NE is already present in the list of already found SDOs, extend the related SDO in the list with the newly found informations.\n"
            "4. If the NE is not present in the list, create a new SDO following the specified format and include it in the list.\n"
            "5. Do not, for any reason, remove an already present NE from the SDO list.\n"
            "Finally provide the entire, updated list.\n"
            # Specifics
            "Possible SDOs include: Attack Pattern, Campaign, Course of Action, Identity, Indicator, Intrusion Set, Malware, Observed Data, Report, Threat Actor, Tool, Vulnerability, Infrastructure, Sighting, Note, Opinion, Grouping, Incident, Location, Malware Analysis.\n"
            "Create relevant SDOs in JSON format, strictly adhering to the STIX 2.1 specification.\n"
            "For id property write just SDO_type-- following this example: \"id\": \"malware--\"\n"
            "The is_family field indicates whether the malware is a family (if true) or an instance (if false). The values true or false are always enclosed in quotes.\n"
            "Don't use created_by_ref and source_ref.\n"
            "Timestamp must be in ISO 8601 format.\n"
            "The values of \"created\" and \"modified\" should be both today's date.\n"
            "The labels property in malware is used to categorize or tag the malware object with descriptive terms (e.g., \"trojan\", \"backdoor\", \"ransomware\"), Must contain at least one string.\n"
            "The threat-actor labels property should be an array of strings representing categories or descriptive terms for the threat actor.\n"
            # Context
            "The goal is to extract a STIX 2.1 bundle from the text provided, your role is to extract the SDOs found in the text which will be bundled with the relationships and observable found by others.\n"
            "By accurately extracting the SDOs, you contribute importantly to the creation of the bundle, so keep in mind to be very accurate and reliable.\n"
            # Example
            "This is an example of an SDOs:\n"
            """[
            {
                "type": "threat-actor",
                "id": "threat-actor--a63b9ab7-7253-432d-98ea-5381207c74af",
                "name": "APT42",
                "description": "APT42 is an Iranian state-sponsored cyber espionage actor targeting Western and Middle Eastern NGOs, media organizations, academia, legal services, and activists. It operates on behalf of the Islamic Revolutionary Guard Corps Intelligence Organization (IRGC-IO).",
                "labels": [
                    "state-sponsored",
                    "cyber-espionage",
                    "Iranian"
                ],
                "created": "2024-05-01T00:00:00Z",
                "modified": "2024-05-01T00:00:00Z"
            },
            {
                "type": "intrusion-set",
                "id": "intrusion-set--7e9242ea-dcab-49cf-8e6b-22919d2b361c",
                "name": "APT42 Operations",
                "description": "APT42 uses enhanced social engineering schemes to gain access to victim networks, including cloud environments, by harvesting credentials and using them to gain initial access. It exfiltrates data of strategic interest to Iran, using built-in features and open-source tools to avoid detection.",
                "labels": [
                    "intrusion-set",
                    "APT42"
                ],
                "created": "2024-05-01T00:00:00Z",
                "modified": "2024-05-01T00:00:00Z"
            },
            {
                "type": "malware",
                "id": "malware--d10904bd-10b9-4045-b514-f6c5f7d1de50",
                "name": "NICECURL",
                "description": "NICECURL is a backdoor written in VBScript that can download additional modules to be executed, including data mining and arbitrary command execution.",
                "is_family": "true",
                "labels": [
                    "backdoor",
                    "VBScript"
                ],
                "created": "2024-05-01T00:00:00Z",
                "modified": "2024-05-01T00:00:00Z"
            }
            ]\n"""
            # Notes
            "Ensure the output is a valid JSON array ([...]) containing only the updated SDOs list.\n"
            "The output should be correctly formatted as json with identation 4.\n"
            "Return only the JSON array, without any additional text, commentary, or code block delimiters (e.g., json).\n"
            "Always ensure that the output is complete, the JSON array should never be truncated.\n"
        )

        user_prompt = f"Text:\n{text[i]}"

        sdo_list = f"Already Found SDOs:\n{json.dumps(stix_sdos, indent=4)}"

        #print(system_prompt_sdo)
        #print(user_prompt)
        print(sdo_list)

        print("\nModel Response:")
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt_sdo},
                {"role": "user", "content": sdo_list},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature
        )

        raw_stix_sdo = response.choices[0].message.content

        print(raw_stix_sdo)

        raw_stix_sdo = raw_stix_sdo[raw_stix_sdo.find('['):]

        stix_sdos = json.loads(raw_stix_sdo)
        #model_responses.append(json.loads(raw_stix_sdo))

    stix_sdo_data = json.loads(raw_stix_sdo)
    if not isinstance(stix_sdo_data, list):
        raise ValueError("STIX SDOs must be a list of dictionaries.")

    stix_sdo_data = add_uuid_to_ids(stix_sdo_data)
    print("\n*****************************************")
    print("* Complete list of STIX Domain Objects: *")
    print("*****************************************\n")
    print(json.dumps(stix_sdo_data, indent=4))

Generating STIX SDOs with GPT-4

***************************************
* EXTRACTING FROM TEXT BLOCK 0/10 ... *
***************************************

Already Found SDOs:
[]

Model Response:
[
    {
        "type": "threat-actor",
        "id": "threat-actor--",
        "name": "APT1",
        "description": "APT1 is a single organization of operators that has conducted a cyber espionage campaign against a broad range of victims since at least 2006. It is based primarily in China and is likely government-sponsored. The group is known for stealing large volumes of valuable intellectual property and is one of the most prolific cyber espionage groups in terms of the sheer quantity of information stolen.",
        "labels": [
            "state-sponsored",
            "cyber-espionage",
            "Chinese"
        ],
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "identity",
        "id": "identity--",
        "name":

JSONDecodeError: Unterminated string starting at: line 167 column 24 (char 7032)

In [None]:

def generate_sdo(text):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt_sdo},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

def correct_invalid_stix(text, invalid_objects, original_stix):
    """
    Ask the LLM to correct invalid STIX output based on the original text and invalid objects.
    """
    # Create a focused invalid output section
    invalid_output = json.dumps(invalid_objects, indent=4)
    print(invalid_output)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt_sdo},
            {
                "role": "user",
                "content": f"""
                            Correct the following invalid STIX objects based on the original text and ensure consistency with the overall STIX data:
                            
                            Input text:
                            {text}
                            
                            Invalid STIX objects:
                            {invalid_output}
                            
                            Existing valid STIX data:
                            {original_stix}
                            """
            }
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

In [None]:
# Generate STIX data and ensure valid output
is_valid = False
stix_sdo = ""

print("Generating STIX SDOs with GPT-4")


print("Querying GPT-4...")
stix_sdo = generate_sdo(text)
print(f"Raw object:\n{stix_sdo}")

try:
    stix_data = json.loads(stix_sdo)
    if not isinstance(stix_data, list):
        raise ValueError("STIX SDOs must be a list of dictionaries.")
    
    stix_data = add_uuid_to_ids(stix_data)
    print("\nSTIX data after UUID insertion:")
    print(json.dumps(stix_data, indent=4))
    is_valid, invalid_objects = validate_stix_objects(stix_data)
    print(invalid_objects)
    if not is_valid:
        print("Validation failed. Requesting correction from LLM...")
        stix_sdo = correct_invalid_stix(text, invalid_objects, stix_sdo)
    else:
        print("Validation successful: all objects are valid.")
except json.JSONDecodeError as e:
    print(f"Error parsing SDOs: {e}")
    stix_sdo = correct_invalid_stix(text, [], stix_sdo)
except ValueError as e:
    print(e)
    stix_sdo = correct_invalid_stix(text, [], stix_sdo)

stix_sdo = json.dumps(stix_data, indent=4)
print(stix_sdo)

## STIX Cyber-Observable Objects

In [None]:
system_prompt_sco = (
    # Role
    "You are a high skilled CTI analyst who focuses on STIX Cyber-Observable Objects (SCOs), with a strong drive for accuracy and validity."
    # Task 
    "You are tasked with creating STIX 2.1 Cyber-Observable Objects (SCOs) from the provided threat intelligence text." # add step-by-step process
    ""
    # Specifics
    "Possible SCOs include: Artifact, Autonomous System, Directory, Domain Name, Email Address, Email Message, File, IPv4 Address, IPv6 Address, MAC Address, Mutex, Network Traffic, Process, Software, URL, User Account, Windows Registry Key, X.509 Certificate, HTTP Request, ICMP, Socket Ext, TCP Ext, Archive Ext, Raster Image Ext, NTFS Ext, PDF Ext, UNIX Account Ext, Windows PE Binary Ext, Windows Process Ext, Windows Service Ext, Windows Registry Ext, JPEG File Ext, Email MIME Component, Email MIME Multipart Type, Email MIME Message Type, Email MIME Text Type."   
    "Create relevant STIX 2.1 SCOs in JSON format based on the information provided in the text."
    "Ensure the SCO id STIX identifier must match <object-type>--<UUID>" 
    "Strictly follow the STIX 2.1 specification, ensuring no properties are used that are not defined in the specification"
    "STIX SCO objects require at least type, id and value properties"    
    "Only provide output if one or more SCOs can be identified with reasonable certainty from the text."
    "Ensure the structure and format are fully compliant with STIX 2.1."
    # Context
    "The goal is to extract a STIX 2.1 bundle from the text provided, your role is to extract the SCOs found in the text which will be bundled with the relationships and domains found by others."
    "By accurately extracting the SCOs, you contribute importantly to the creation of the bundle, so keep in mind to be very accurate and reliable."
    # Example
    "This is an example of an SCOs:"
    """[
    {
        "type": "domain-name",
        "id": "domain-name--5f2730f5-a17a-4842-9744-73aacfcefcdb",
        "value": "washinqtonpost.press"
    },
    {
        "type": "domain-name",
        "id": "domain-name--31e83530-c412-47f0-a636-f35293916b52",
        "value": "ksview.top"
    },
    {
        "type": "domain-name",
        "id": "domain-name--dde332d5-e63a-4796-965b-39e84bbf7f3a",
        "value": "honest-halcyon-fresher.buzz"
    },
    {
        "type": "domain-name",
        "id": "domain-name--ca21e8b0-0f63-4747-8039-5ebf7e9f17e1",
        "value": "n9.cl"
    },
    {
        "type": "domain-name",
        "id": "domain-name--8fdd479b-587b-434c-95f5-499165203aac",
        "value": "review.modification-check.online"
    },
    {
        "type": "domain-name",
        "id": "domain-name--1b837c95-98c9-4316-bd60-da67f956dfe9",
        "value": "nterview.site"
    },
    {
        "type": "domain-name",
        "id": "domain-name--332b3542-2f73-4254-9c02-66d2e0123e32",
        "value": "admin-stable-right.top"
    },
    {
        "type": "domain-name",
        "id": "domain-name--17c8fe4c-9bb1-451c-8f2a-d3e5f54962be",
        "value": "shortlinkview.live"
    },
    {
        "type": "domain-name",
        "id": "domain-name--8d9122d8-2bf6-4fe6-86da-d8e826e68331",
        "value": "reconsider.site"
    },
    {
        "type": "domain-name",
        "id": "domain-name--07f5ca46-0260-4fbf-bf05-c4524c34d05c",
        "value": "last-check-leave.buzz"
    },
    {
        "type": "domain-name",
        "id": "domain-name--9e628138-f801-4626-98cc-89fa5ce98ec0",
        "value": "email-daemon.online"
    },
    {
        "type": "domain-name",
        "id": "domain-name--9a66b85f-5847-43cb-b32e-36dbe52bcfa6",
        "value": "onmicrosofl.com"
    },
    {
        "type": "domain-name",
        "id": "domain-name--4eb73cd1-d832-4869-953c-b7bb8aee5ad4",
        "value": "bitly.org.il"
    },
    {
        "type": "domain-name",
        "id": "domain-name--26b853f7-ffb3-4437-b55b-e8e52e84dab8",
        "value": "youtransfer.live"
    },
    {
        "type": "domain-name",
        "id": "domain-name--76f78672-5ba5-457d-845b-918c2260ac5f",
        "value": "drive-file-share.site"
    },
    {
        "type": "domain-name",
        "id": "domain-name--8023cb8e-39a6-45c9-bbf3-d8b35a4d6a9c",
        "value": "prism-west-candy.glitch.me"
    },
    {
        "type": "domain-name",
        "id": "domain-name--7b14af40-0ee6-4afa-8962-c9b5c03090dc",
        "value": "worried-eastern-salto.glitch.me"
    },
    {
        "type": "domain-name",
        "id": "domain-name--9453ccee-c8a6-4383-a7b8-76e0e8decf64",
        "value": "tnt200.mywire.org"
    },
    {
        "type": "domain-name",
        "id": "domain-name--1b11e99a-ab73-4312-9838-fb0c9c85c2a1",
        "value": "accurate-sprout-porpoise.glitch.me"
    }
    ]"""
    # Notes
    "Ensure the output is a valid JSON array ([...]) containing only SCOs identified with high confidence."   
    "Return only the JSON array, without any additional text, commentary, or code block delimiters (e.g., json)."
)

def generate_sco(text):
    """
    Generate STIX SCOs using the LLM.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt_sco},
            {"role": "user", "content": user_prompt}
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

def correct_invalid_stix(text, invalid_objects, original_stix):
    """
    Ask the LLM to correct invalid STIX output based on the original text and invalid objects.
    """
    # Create a focused invalid output section
    invalid_output = json.dumps(invalid_objects, indent=4)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt_sco},
            {
                "role": "user",
                "content": f"""
                Correct the following invalid STIX objects based on the original text and ensure consistency with the overall STIX data:

                Input text:
                {text}

                Invalid STIX objects:
                {invalid_output}

                Existing valid STIX data:
                {original_stix}
                """
            }
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

In [None]:
is_valid = False
stix_sco = ""

print("Generating STIX SCOs with GPT-4")


print("Querying GPT-4...")
stix_sco = generate_sco(text)
print(f"Raw object:\n{stix_sco}")

try:

    stix_data = json.loads(stix_sco)
    if not isinstance(stix_data, list):
        raise ValueError("Parsed STIX SCO must be a list of dictionaries.")
    stix_data = add_uuid_to_ids(stix_data)
    print("\nSTIX SCO data after UUID insertion:")
    print(json.dumps(stix_data, indent=4))
    is_valid, invalid_objects = validate_stix_objects(stix_data)
    if not is_valid:
        print("Validation failed. Requesting correction from LLM...")
        stix_sco = correct_invalid_stix(text, invalid_objects, stix_sco)
    else:
        print("Validation successful. All objects are valid.")
        
except json.JSONDecodeError as e:
    print(f"Error parsing STIX SCO: {e}")
    stix_sco = correct_invalid_stix(text, [], stix_sco)

except ValueError as e:
    print(e)
    stix_sco = correct_invalid_stix(text, [], stix_sco)

stix_sco = json.dumps(stix_data, indent=4)
print(stix_sco)

## STIX Relationship Objects

In [None]:
stix_sco = [
    {
        "type": "ipv4-addr",
        "id": "ipv4-addr--b30b4516-5348-40b9-b83b-72a23a923a8c",
        "value": "832"
    },
    {
        "type": "ipv4-addr",
        "id": "ipv4-addr--d0d526f8-d42e-48a7-8f6e-f4aac7dc8669",
        "value": "817"
    },
    {
        "type": "ipv4-addr",
        "id": "ipv4-addr--a024c61c-f4f4-4877-8b40-44ceb762ec49",
        "value": "613"
    },
    {
        "type": "domain-name",
        "id": "domain-name--9b64dd6c-54b3-4821-b522-d4c6e1cc97aa",
        "value": "www.mandiant.com"
    },
    {
        "type": "x509-certificate",
        "id": "x509-certificate--afa42859-9f75-4911-acad-aaba69e24c04",
        "hashes": {
            "SHA-256": "13"
        }
    },
    {
        "type": "ipv4-addr",
        "id": "ipv4-addr--728b4e0d-58b8-46cd-a514-2185d1e13b29",
        "value": "849"
    },
    {
        "type": "ipv4-addr",
        "id": "ipv4-addr--694446f9-e763-45bd-ba16-dd4d091d3500",
        "value": "709"
    },
    {
        "type": "domain-name",
        "id": "domain-name--0c4d434b-66ed-4b5e-9912-b2a10c21947f",
        "value": "www.washingtonpost.com"
    },
    {
        "type": "domain-name",
        "id": "domain-name--98cc31f7-0ec7-4041-9a60-61a3fb720851",
        "value": "intelligence.house.gov"
    },
    {
        "type": "domain-name",
        "id": "domain-name--6caffd64-4596-4e7d-ba02-4e1b1600503d",
        "value": "www.rand.org"
    },
    {
        "type": "domain-name",
        "id": "domain-name--998c0f02-3297-4b11-b427-05e0afb3679f",
        "value": "www.uscc.gov"
    },
    {
        "type": "software",
        "id": "software--79f3bb17-7682-4f8b-b10a-4de225402e3e",
        "name": "Microsoft\u00ae"
    }
]
system_prompt_sro = (
    # Role
    "You are a high skilled CTI analyst who focuses on STIX Relationship Objects (SROs), with a strong drive for accuracy and validity."
    # Task 
    "You are tasked with creating a STIX 2.1 Relationship Object (SRO) based on the provided writeup about threat intelligence text SDOs and SCOs"      ""
    # Specifics
    "Remember a relationship is a link between STIX Domain Objects (SDOs), STIX Cyber-observable Objects (SCOs), or between an SDO and a SCO that describes the way in which the objects are related. Relationships can be represented using an external STIX Relationship Object (SRO) or, in some cases, through certain properties which store an identifier reference that comprises an embedded relationship, (for example the created_by_ref property)."
    "Identify Relationships: For each entity (like intrusion-set, malware, infrastructure, domain-name, file, directory), identify how they relate to each other. For example, malware might use infrastructure for command and control, or an intrusion set might leverage certain domains"
    "Use relationship Objects: Use relationship objects to connect entities. This object will specify the source and target entities and define the nature of the relationship (e.g., \"uses\", \"communicates with\")"
    "Ensure Consistent Referencing: Make sure that every object you want to relate is referenced correctly using their id in the relationship objects."
    "Pay attention to properties, don't use properties not defined in STIX 2.1 specification"
    "If you cannot identify a specific SCO from the provided text, simply do not do anything."
    "Provide output only if you can identify one or more SCOs with reasonable certainty."
    "Ensure the structure and format are fully compliant with STIX 2.1."
    # Context
    "The goal is to extract a STIX 2.1 bundle from the text provided, your role is to combine the SDOs and SCOs found by others in the text, by finding the right relationships."
    "By accurately extracting the SROs, you contribute importantly to the creation of the bundle, so keep in mind to be very accurate and reliable."
    # Example
    "This is an example of an SROs:"
    """[
    {
        "type": "relationship",
        "id": "relationship--8624e252-244b-43b2-828b-4860687979cd",
        "relationship_type": "uses",
        "source_ref": "threat-actor--a63b9ab7-7253-432d-98ea-5381207c74af",
        "target_ref": "intrusion-set--7e9242ea-dcab-49cf-8e6b-22919d2b361c",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--6a653878-988c-49fe-9463-7b5a91541d9e",
        "relationship_type": "uses",
        "source_ref": "intrusion-set--7e9242ea-dcab-49cf-8e6b-22919d2b361c",
        "target_ref": "malware--d10904bd-10b9-4045-b514-f6c5f7d1de50",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--b0dd9712-f460-422a-92d7-9d229f645a03",
        "relationship_type": "uses",
        "source_ref": "intrusion-set--7e9242ea-dcab-49cf-8e6b-22919d2b361c",
        "target_ref": "malware--00c9240a-4365-4101-8be4-66a0bc8f0530",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--d6ef9860-6982-44c1-ad34-56e52fed4ad2",
        "relationship_type": "uses",
        "source_ref": "intrusion-set--7e9242ea-dcab-49cf-8e6b-22919d2b361c",
        "target_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--21e85f01-fbac-4568-a695-86fc30cdc2d4",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--5f2730f5-a17a-4842-9744-73aacfcefcdb",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--0dcf6c28-c5ab-489d-b086-de6b8e5dfb56",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--31e83530-c412-47f0-a636-f35293916b52",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--41b7981b-9866-4b17-862a-7b614944ca0d",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--dde332d5-e63a-4796-965b-39e84bbf7f3a",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--f53ae137-d0bb-4752-b7fd-8e29efcecb0c",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--ca21e8b0-0f63-4747-8039-5ebf7e9f17e1",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--146166ac-027a-419e-9059-dc6def3e4920",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--8fdd479b-587b-434c-95f5-499165203aac",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--d00105ee-bcdf-4cba-9358-b0535cff30aa",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--1b837c95-98c9-4316-bd60-da67f956dfe9",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--2749e554-2bcd-4d15-be2b-4f8808904eac",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--332b3542-2f73-4254-9c02-66d2e0123e32",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    {
        "type": "relationship",
        "id": "relationship--10680e58-642e-4fa4-b2d5-cc9ef2bf514f",
        "relationship_type": "uses",
        "source_ref": "campaign--d00537dc-68aa-494b-b8fc-3e0ebf3204ce",
        "target_ref": "domain-name--17c8fe4c-9bb1-451c-8f2a-d3e5f54962be",
        "created": "2024-05-01T00:00:00Z",
        "modified": "2024-05-01T00:00:00Z"
    },
    
    ]"""
    # Notes
    "Ensure the output is a valid JSON array ([...]) containing only SROs identified with high confidence."  
    "Timestamp must be in ISO 8601 format."
    "The values of \"created\" and \"modified\" should be both today's date."
    "Return only the JSON array, without any additional text, commentary, or code block delimiters (e.g., json)."
    "Ensure only the JSON array is returned."
  )

user_stix_sdo_sco_text = f"""
    Text of writeup: {text},  {stix_sdo} , {stix_sco}
"""

def generate_sro(text, stix_sdo, stix_sco):
    """
    Generate STIX SROs using the LLM.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt_sro},
            {
                "role": "user",
                "content": f"""
Generate STIX 2.1 relationship objects (SROs) based on the following:

Input text:
{text}

SDO:
{stix_sdo}

SCO:
{stix_sco}
"""
            }
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

def correct_invalid_stix(text, invalid_objects, original_stix):
    """
    Ask the LLM to correct invalid STIX output based on the original text and invalid objects.
    """
    # Create a focused invalid output section
    invalid_output = json.dumps(invalid_objects, indent=4)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt_sro},
            {
                "role": "user",
                "content": f"""
Correct the following invalid STIX relationship objects (SROs) based on the original text and ensure consistency with the SDO and SCO data:

Input text:
{text}

Invalid SRO objects:
{invalid_output}

SDO:
{stix_sdo}

SCO:
{stix_sco}
"""
            }
        ],
        temperature=temperature
    )
    return response.choices[0].message.content

In [None]:
is_valid = False
stix_sro = ""

print("Generating STIX SROs with GPT-4")

while not is_valid:
    print("Querying GPT-4...")
    stix_sro = generate_sro(text, stix_sdo, stix_sco)
    print(f"Raw object:\n{stix_sro}")

    try:

        stix_data = json.loads(stix_sro)
        if not isinstance(stix_data, list):
            raise ValueError("Parsed STIX SRO must be a list of dictionaries.")

        stix_data = add_uuid_to_ids(stix_data)

        print("\nSTIX SRO data after UUID insertion:")
        print(json.dumps(stix_data, indent=4))

        is_valid, invalid_objects = validate_stix_objects(stix_data)

        if not is_valid:
            print("Validation failed. Requesting correction from LLM...")
            stix_sro = correct_invalid_stix(text, invalid_objects, stix_sro)
        else:
            print("Validation successful. All objects are valid.")

    except json.JSONDecodeError as e:
        print(f"Error parsing STIX SRO: {e}")
        stix_sro = correct_invalid_stix(text, [], stix_sro)
    except ValueError as e:
        print(e)
        stix_sro = correct_invalid_stix(text, [], stix_sro)

stix_sro = json.dumps(stix_data, indent=4)
print(stix_sro)

## Creating the STIX bundle

In [None]:
def remove_brackets(text):
    """
    Remove leading '[' and trailing ']' and format inner objects into a valid JSON array.
    """
    if not text.startswith('[') or not text.endswith(']'):
        raise ValueError("Input text does not start with '[' or end with ']'")

    # Remove brackets and split into individual JSON objects
    objects_str = text[1:-1].strip()  # Remove brackets and leading/trailing whitespace
    objects_list = objects_str.split('}{')

    # Add back the missing curly braces and format into a valid JSON array
    formatted_objects = "[" + ", ".join(["{" + obj + "}" if not obj.startswith("{") and not obj.endswith("}") else obj for obj in objects_list]) + "]"

    return formatted_objects

def create_stix_bundle(sdo_data, sco_data, sro_data):
    """
    Create a STIX 2.1 bundle from input SDO, SCO, and SRO data.

    Args:
        sdo_data (list): List of valid SDO objects.
        sco_data (list): List of valid SCO objects.
        sro_data (list): List of valid SRO objects.

    Returns:
        str: A STIX 2.1 bundle in JSON format.
    """
    # Combine SDO, SCO, and SRO data
    all_objects = sdo_data + sco_data + sro_data
    #print("all_objects:", all_objects)

    bundle = {
        "type": "bundle",
        "id": f"bundle--{uuid.uuid4()}",
        "objects": all_objects
    }

    # Return the serialized bundle
    return json.dumps(bundle, indent=4)

# Remove brackets and parse as JSON
stix_sdo_objects = json.loads(remove_brackets(stix_sdo))
stix_sco_objects = json.loads(remove_brackets(stix_sco))
stix_sro_objects = json.loads(remove_brackets(stix_sro))

# Create STIX bundle
stix_bundle = create_stix_bundle(stix_sdo_objects, stix_sco_objects, stix_sro_objects)

# Output the result
print("STIX Bundle:")
print(stix_bundle)

# Write the STIX bundle to the file
filename = f"bundle_{datetime.now().strftime('%Y_%m_%d_%H_%M')}.json"
with open(filename, 'w') as file:
    file.write(stix_bundle)
print(f"STIX bundle written to file: {filename}")

In [None]:
combine_responses_prompt = (
             # Role
             "You are a high skilled CTI analyst who focuses on STIX 2.1 Domain Objects (SDOs), with a strong drive for accuracy and validity.\n"
             # Task 
             "You are tasked with combining the provided lists of SDOs into one final list.\n"
             "You should follow this process:\n"
             "1. All SDOs with the same name should be merged into one single object if they refer to the same Named Entity (eg. APT1).\n"
             "2. Enhance each of these objects with all the information found across all lists that relate to the Named Entity.\n"
             "3. All Named Entities should be present in some form.\n"
             "4. Every object in the input lists should contain unique informations that should not be discarded.\n"
             "5. Provide a JSON array containing unique SDOs that are enriched with all the information found in the input lists.\n"
             # Specifics
             "Possible SDOs include: Attack Pattern, Campaign, Course of Action, Identity, Indicator, Intrusion Set, Malware, Observed Data, Report, Threat Actor, Tool, Vulnerability, Infrastructure, Sighting, Note, Opinion, Grouping, Incident, Location, Malware Analysis.\n"
             "Create relevant SDOs in JSON format, strictly adhering to the STIX 2.1 specification.\n"
             "For id property write just SDO_type-- following this example: \"id\": \"malware--\"\n"
             "The is_family field indicates whether the malware is a family (if true) or an instance (if false). The values true or false are always enclosed in quotes.\n"
             "Don't use created_by_ref and source_ref.\n"
             "Timestamp must be in ISO 8601 format.\n"
             "The values of \"created\" and \"modified\" should be both today's date.\n"
             "The labels property in malware is used to categorize or tag the malware object with descriptive terms (e.g., \"trojan\", \"backdoor\", \"ransomware\"), Must contain at least one string.\n"
             "The threat-actor labels property should be an array of strings representing categories or descriptive terms for the threat actor.\n"
             # Context
             "The goal is to extract a STIX 2.1 bundle from the text provided, your role is to extract the SDOs found in the text which will be bundled with the relationships and observable found by others.\n"
             "By accurately extracting the SDOs, you contribute importantly to the creation of the bundle, so keep in mind to be very accurate and reliable.\n"
             # Example
             "This is an example of an SDOs:\n"
             """[
             {
                 "type": "threat-actor",
                 "id": "threat-actor--a63b9ab7-7253-432d-98ea-5381207c74af",
                 "name": "APT42",
                 "description": "APT42 is an Iranian state-sponsored cyber espionage actor targeting Western and Middle Eastern NGOs, media organizations, academia, legal services, and activists. It operates on behalf of the Islamic Revolutionary Guard Corps Intelligence Organization (IRGC-IO).",
                 "labels": [
                     "state-sponsored",
                     "cyber-espionage",
                     "Iranian"
                 ],
                 "created": "2024-05-01T00:00:00Z",
                 "modified": "2024-05-01T00:00:00Z"
             },
             {
                 "type": "intrusion-set",
                 "id": "intrusion-set--7e9242ea-dcab-49cf-8e6b-22919d2b361c",
                 "name": "APT42 Operations",
                 "description": "APT42 uses enhanced social engineering schemes to gain access to victim networks, including cloud environments, by harvesting credentials and using them to gain initial access. It exfiltrates data of strategic interest to Iran, using built-in features and open-source tools to avoid detection.",
                 "labels": [
                     "intrusion-set",
                     "APT42"
                 ],
                 "created": "2024-05-01T00:00:00Z",
                 "modified": "2024-05-01T00:00:00Z"
             },
             {
                 "type": "malware",
                 "id": "malware--d10904bd-10b9-4045-b514-f6c5f7d1de50",
                 "name": "NICECURL",
                 "description": "NICECURL is a backdoor written in VBScript that can download additional modules to be executed, including data mining and arbitrary command execution.",
                 "is_family": "true",
                 "labels": [
                     "backdoor",
                     "VBScript"
                 ],
                 "created": "2024-05-01T00:00:00Z",
                 "modified": "2024-05-01T00:00:00Z"
             }
             ]\n"""
             # Notes
             "Ensure the output is a valid JSON array ([...]) containing only the updated SDOs list.\n"
             "Return only the JSON array, without any additional text, commentary, or code block delimiters (e.g., json).\n"
    )
        
sdo_list = f"Lists of SDOs:\n"
for response in model_responses:
    sdo_list += f"{response}\n"
print("\n*************************************")
print("* COMBINING STIX DOMAIN OBJECTS ... *")
print("*************************************\n")
    
response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": combine_responses_prompt},
            {"role": "user", "content": sdo_list},
        ],
        temperature=temperature
)
combined_sdos = response.choices[0].message.content
stix_sdo_data = json.loads(combined_sdos)
if not isinstance(stix_sdo_data, list):
    raise ValueError("STIX SDOs must be a list of dictionaries.")

stix_data = add_uuid_to_ids(stix_sdo_data)
print("\n*********************************")
print("* EXTRACTED STIX Domain Objects *")
print("*********************************\n")
print(json.dumps(stix_data, indent=4))