In [1]:
# Get the environment variables
from dotenv import load_dotenv
import os

load_dotenv()
print(f"OpenAI API key => {len(os.getenv('OPENAI_API_KEY')) * '#'}")
print(f"Langchain API Key => {len(os.getenv('LANGCHAIN_API_KEY')) * '#'}")
print(f"Langchain project name => {os.getenv('LANGCHAIN_PROJECT_NAME')}")
print(f"Langchain endpoint => {os.getenv('LANGCHAIN_ENDPOINT')}")
print(f"Langchain tracking => {os.getenv('LANGCHAIN_TRACING_V2')}")

print(f"OpenSearch HOST => {os.getenv('OPENSEARCH_HOST').split('.')[0]}.XXX.XXX.XXX")
print(f"OpenSearch PORT => {os.getenv('OPENSEARCH_PORT')}")
print(f"OpenSearch index name => {os.getenv('OPENSEARCH_INDEX_NAME')}")
print(f"OpenSearch account ID => {os.getenv('OPENSEARCH_ACCOUNT_ID')}")
print(f"OpenSearch account password => {len(os.getenv('OPENSEARCH_ACCOUNT_PASSWORD')) * '#'}")

OpenAI API key => ########################################################
Langchain API Key => ###################################################
Langchain project name => ex_cti_rag_mitre_002
Langchain endpoint => https://api.smith.langchain.com
Langchain tracking => true
OpenSearch HOST => 15.XXX.XXX.XXX
OpenSearch PORT => 9200
OpenSearch index name => ex_cti_rag_mitre_002
OpenSearch account ID => admin
OpenSearch account password => ###################


In [2]:
# Start Langsmith tracking
from langsmith import traceable
from langchain.callbacks.tracers import LangChainTracer
from langchain.callbacks.manager import CallbackManager

tracer = LangChainTracer(
    project_name=os.getenv('LANGCHAIN_PROJECT_NAME'),
)
callback_manager = CallbackManager([tracer])

# Wrap OpenAI functions
from langchain_openai import OpenAI
openai_llm = OpenAI(temperature=0, callback_manager=callback_manager)

In [23]:
# Import necessary libraries
from MITREAttackScrapper.cti.groups import MITREAttackCTIGroups
from langchain_core.documents import Document
from typing import List, Dict
from tqdm import tqdm

# Initialize an empty list to store documents
documents: List[Document] = []

# Get the list of MITRE ATT&CK Group IDs
attack_group_id_list: List[str] = [attack_group['id'] for attack_group in MITREAttackCTIGroups.get_list()]

# Iterate over a subset of the attack group IDs to fetch detailed information
for attack_group_id in tqdm(attack_group_id_list):
    attack_group_detail = MITREAttackCTIGroups.get(attack_group_id)
    
    # Extract the details of the threat group
    attack_group_id: str                                    = attack_group_detail['id']
    attack_group_name: str                                  = attack_group_detail['name']
    attack_group_description: str                           = attack_group_detail['description']
    attack_group_contributors: List[str]                    = attack_group_detail.get('contributors', [])
    attack_group_version: str                               = attack_group_detail.get('version', 'N/A')
    attack_group_created: str                               = attack_group_detail.get('created', 'N/A')
    attack_group_last_modified: str                         = attack_group_detail.get('last_modified', 'N/A')
    attack_group_url: str                                   = attack_group_detail['url']
    attack_group_associated_groups: List[Dict[str, str]]    = attack_group_detail.get('associated_group_descriptions', [])
    attack_group_techniques: List[Dict[str, str]]           = attack_group_detail.get('techniques_used', [])
    attack_group_softwares: List[Dict[str, str]]            = attack_group_detail.get('software', [])
    attack_group_references: Dict[int, Dict[str, str]]      = attack_group_detail.get('references', {})

    # Create a description related to the threat group
    description = f"{attack_group_name} (MITRE ATT&CK Group ID: {attack_group_id}) is a threat group that {attack_group_description}.\n"
    
    # Add contributors
    if attack_group_contributors:
        description += f"\nContributors: {', '.join(attack_group_contributors)}\n"
    
    # Add creation and modification dates
    description += f"\nVersion: {attack_group_version}\nCreated: {attack_group_created}\nLast Modified: {attack_group_last_modified}\n"
    
    # Add associated groups
    if attack_group_associated_groups:
        description += "\nAssociated Groups:\n"
        for assoc_group in attack_group_associated_groups:
            description += f"- {assoc_group['name']}: {assoc_group['description']}\n"
    else:
        description += "\nAssociated Groups: None\n"
    
    # Add techniques used
    if attack_group_techniques:
        description += "\nTechniques Used:\n"
        for technique in attack_group_techniques:
            description += (
                f"- {technique['main_technique_name']} (ID: {technique['main_technique_id']}): "
                f"{technique['use']}"
            )
            if 'sub_technique_id' in technique and 'sub_technique_name' in technique:
                description += f" (Sub-technique: {technique['sub_technique_name']} (ID: {technique['sub_technique_id']}))"
            description += "\n"
    else:
        description += "\nTechniques Used: None\n"

    # Add software used
    if attack_group_softwares:
        description += "\nSoftware Used:\n"
        for software in attack_group_softwares:
            description += f"- {software['name']} (ID: {software['id']})"
            if 'techniques' in software:
                description += ", associated techniques: "
                for technique in software['techniques']:
                    description += f"{technique['name']} (ID: {technique['url'].split('/')[-1]}), "
            description += "\n"
    else:
        description += "\nSoftware Used: None\n"

    # Create a document and add it to the list
    document = Document(page_content=description, 
                        metadata={"id": attack_group_id, 
                                  "name": attack_group_name,
                                  "url": attack_group_url,
                                  "reference": attack_group_references})
    documents.append(document)

# The 'documents' list now contains the detailed documents for each MITRE ATT&CK group

100%|██████████| 152/152 [02:06<00:00,  1.20it/s]


In [24]:
print(documents[0])

page_content='admin@338 (MITRE ATT&CK Group ID: G0018) is a threat group that admin@338 is a China-based cyber threat group. It has previously used newsworthy events as lures to deliver malware and has primarily targeted organizations involved in financial, economic, and trade policy, typically using publicly available RATs such as PoisonIvy, as well as some non-public backdoors. [1].

Contributors: Tatsuya Daitoku, Cyber Defense Institute, Inc.

Version: 1.2
Created: 2017-05-31
Last Modified: 2020-03-18

Associated Groups: None

Techniques Used:
- Account Discovery (ID: T1087): admin@338 actors used the following commands following exploitation of a machine with LOWBALL malware to enumerate user accounts: net user >> %temp%\download net user /domain >> %temp%\download [1] (Sub-technique: Local Account (ID: T1087.001))
- Command and Scripting Interpreter (ID: T1059): Following exploitation with LOWBALL malware, admin@338 actors created a file containing a list of commands to be execute

In [25]:
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_openai import OpenAIEmbeddings
from opensearchpy import OpenSearch
import urllib3

# Shut up the SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Delete the index if it exists
opensearch = OpenSearch(
    hosts=[f"{os.getenv('OPENSEARCH_HOST')}:{os.getenv('OPENSEARCH_PORT')}"],
    http_auth=(os.getenv('OPENSEARCH_ACCOUNT_ID'), os.getenv('OPENSEARCH_ACCOUNT_PASSWORD')),
    use_ssl = True,
    verify_certs = False,
    ignore_ssl_warnings = True
)

if opensearch.indices.exists(index=os.getenv('OPENSEARCH_INDEX_NAME')):
    opensearch.indices.delete(index=os.getenv('OPENSEARCH_INDEX_NAME'))
    print(f"Deleted the index {os.getenv('OPENSEARCH_INDEX_NAME')} because it already exists")

print(f"Creating an index with the name {os.getenv('OPENSEARCH_INDEX_NAME')}")

vectorstore = OpenSearchVectorSearch.from_documents(
    index_name=os.getenv('OPENSEARCH_INDEX_NAME'),
    documents=documents,
    embedding=OpenAIEmbeddings(),
    opensearch_url=f"http://{os.getenv('OPENSEARCH_HOST')}:{os.getenv('OPENSEARCH_PORT')}",
    http_auth=(os.getenv('OPENSEARCH_ACCOUNT_ID'), os.getenv('OPENSEARCH_ACCOUNT_PASSWORD')),
    use_ssl = True,
    verify_certs = False,
    ignore_ssl_warnings = True,
)

print(f"Index created with {len(documents)} documents")



Deleted the index ex_cti_rag_mitre_002 because it already exists
Creating an index with the name ex_cti_rag_mitre_002
Index created with 152 documents


In [26]:
from typing import List

# TODO: Update the function to handle the cite in the retrieved documents (looks like "[number]")
#       by associating with the metadata of the document, returning the metadata along with the content
#       Or, we can make a output parser that attaches the metadata to the content
#       We need to distinguish the same cite number in different documents!
@traceable(name="retrieve_relevant_documents")
def retrieve_relevant_documents(query: str, top_k: int = 3) -> List[Document]:
    return vectorstore.similarity_search(query, k=top_k)

# Example usage
user_question = "Which groups are based on the North Korean threat actor?"
relevant_docs = retrieve_relevant_documents(user_question)

print(f"Retrieved {len(relevant_docs)} relevant documents:")
for i, doc in enumerate(relevant_docs, 1):
    print(f"\nDocument {i}:")
    print(f"Content: {doc.page_content}...") 
    print(f"Metadata: {doc.metadata}")

Retrieved 3 relevant documents:

Document 1:
Content: Lazarus Group (MITRE ATT&CK Group ID: G0032) is a threat group that Lazarus Group is a North Korean state-sponsored cyber threat group that has been attributed to the Reconnaissance General Bureau.[1][2] The group has been active since at least 2009 and was reportedly responsible for the November 2014 destructive wiper attack against Sony Pictures Entertainment as part of a campaign named Operation Blockbuster by Novetta. Malware used by Lazarus Group correlates to other reported campaigns, including Operation Flame, Operation 1Mission, Operation Troy, DarkSeoul, and Ten Days of Rain.[3].

Contributors: Kyaw Pyiyt Htet, @KyawPyiytHtet, Dragos Threat Intelligence

Version: 4.0
Created: 2017-05-31
Last Modified: 2024-04-11

Associated Groups:
- Labyrinth Chollima: [4]
- HIDDEN COBRA: The U.S. Government refers to malicious cyber activity by the North Korean government as HIDDEN COBRA.[1][5]
- Guardians of Peace: [1]
- ZINC: [6]
- NICK

In [27]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.schema import StrOutputParser

# Define the prompt template
# TODO: Update the prompt template to match the specific use case; cyber threat intelligence

# TODO: Few shot prompting would be useful here
prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Answer in English.

Question: {question} 
Context: {context} 

Answer:"""
)

# Initialize the GPT-4 model
llm = ChatOpenAI(model_name="gpt-4o", temperature=0, callback_manager=callback_manager)

# Create the retrieval chain
@traceable(name="rag_qa_chain")
def rag_qa_chain(query: str) -> str:
    # Retrieve relevant documents
    relevant_docs = retrieve_relevant_documents(query)
    
    # Combine the retrieved documents into a single context string
    context = "\n\n".join([doc.page_content for doc in relevant_docs])
    
    # Create the chain
    chain = (
        {"context": lambda x: context, "question": lambda x: x}
        | prompt
        | llm
        | StrOutputParser()
    )
    
    # Run the chain
    return chain.invoke(query)

# Example usage
# user_question = "What are the primary targets of admin@338?"        # example
user_question = "What race is APT32 made up of (e.g. Americans)?"
answer = rag_qa_chain(user_question)

# TODO: Deploy this user interactions with Streamlit
print(f"User question: {user_question}")
print(f"Answer: {answer}")

Parent run be6f681f-16b9-45f2-a9c2-4767f11ee43f not found for run dfaaddf4-9ce5-47aa-af77-75f17f2131ae. Treating as a root run.


User question: What race is APT32 made up of (e.g. Americans)?
Answer: APT32 is made up of individuals suspected to be based in Vietnam.
