In [1]:
# Get the environment variables
from dotenv import load_dotenv
import os

load_dotenv()
print(f"OpenAI API key => {len(os.getenv('OPENAI_API_KEY')) * '#'}")
print(f"Langchain API Key => {len(os.getenv('LANGCHAIN_API_KEY')) * '#'}")
print(f"Langchain project name => {os.getenv('LANGCHAIN_PROJECT_NAME')}")
print(f"Langchain endpoint => {os.getenv('LANGCHAIN_ENDPOINT')}")
print(f"Langchain tracking => {os.getenv('LANGCHAIN_TRACING_V2')}")

print(f"OpenSearch HOST => {os.getenv('OPENSEARCH_HOST').split('.')[0]}.XXX.XXX.XXX")
print(f"OpenSearch PORT => {os.getenv('OPENSEARCH_PORT')}")
print(f"OpenSearch index name => {os.getenv('OPENSEARCH_INDEX_NAME')}")
print(f"OpenSearch account ID => {os.getenv('OPENSEARCH_ACCOUNT_ID')}")
print(f"OpenSearch account password => {len(os.getenv('OPENSEARCH_ACCOUNT_PASSWORD')) * '#'}")

OpenAI API key => ########################################################
Langchain API Key => ###################################################
Langchain project name => ex_cti_rag_mitre_002
Langchain endpoint => https://api.smith.langchain.com
Langchain tracking => true
OpenSearch HOST => 15.XXX.XXX.XXX
OpenSearch PORT => 9200
OpenSearch index name => mdr_threatghoul_cti_rag_mitre
OpenSearch account ID => admin
OpenSearch account password => ###################


In [2]:
# Start Langsmith tracking
from langsmith import traceable
from langchain.callbacks.tracers import LangChainTracer
from langchain.callbacks.manager import CallbackManager

tracer = LangChainTracer(
    project_name=os.getenv('LANGCHAIN_PROJECT_NAME'),
)
callback_manager = CallbackManager([tracer])

# Wrap OpenAI functions
from langchain_openai import OpenAI
openai_llm = OpenAI(temperature=0, callback_manager=callback_manager)

In [3]:
# Import necessary libraries
from MITREAttackScrapper.cti.groups import MITREAttackCTIGroups
from langchain_core.documents import Document
from typing import List, Dict
from tqdm import tqdm

# Initialize an empty list to store documents
documents: List[Document] = []

# Get the list of MITRE ATT&CK Group IDs
attack_group_id_list: List[str] = [attack_group['id'] for attack_group in MITREAttackCTIGroups.get_list()]

# Iterate over a subset of the attack group IDs to fetch detailed information
for attack_group_id in tqdm(attack_group_id_list):
    attack_group_detail = MITREAttackCTIGroups.get(attack_group_id)
    
    # Extract the details of the threat group
    attack_group_id: str                                    = attack_group_detail['id']
    attack_group_name: str                                  = attack_group_detail['name']
    attack_group_description: str                           = attack_group_detail['description']
    attack_group_contributors: List[str]                    = attack_group_detail.get('contributors', [])
    attack_group_version: str                               = attack_group_detail.get('version', 'N/A')
    attack_group_created: str                               = attack_group_detail.get('created', 'N/A')
    attack_group_last_modified: str                         = attack_group_detail.get('last_modified', 'N/A')
    attack_group_url: str                                   = attack_group_detail['url']
    attack_group_associated_groups: List[Dict[str, str]]    = attack_group_detail.get('associated_group_descriptions', [])
    attack_group_techniques: List[Dict[str, str]]           = attack_group_detail.get('techniques_used', [])
    attack_group_softwares: List[Dict[str, str]]            = attack_group_detail.get('software', [])
    attack_group_references: Dict[int, Dict[str, str]]      = attack_group_detail.get('references', {})

    # Create a description related to the threat group
    description = f"{attack_group_name} (MITRE ATT&CK Group ID: {attack_group_id}) is a threat group that {attack_group_description}.\n"
    
    # Add contributors
    if attack_group_contributors:
        description += f"\nContributors: {', '.join(attack_group_contributors)}\n"
    
    # Add creation and modification dates
    description += f"\nVersion: {attack_group_version}\nCreated: {attack_group_created}\nLast Modified: {attack_group_last_modified}\n"
    
    # Add associated groups
    if attack_group_associated_groups:
        description += "\nAssociated Groups:\n"
        for assoc_group in attack_group_associated_groups:
            description += f"- {assoc_group['name']}: {assoc_group['description']}\n"
    else:
        description += "\nAssociated Groups: None\n"
    
    # Add techniques used
    if attack_group_techniques:
        description += "\nTechniques Used:\n"
        for technique in attack_group_techniques:
            description += (
                f"- {technique['main_technique_name']} (ID: {technique['main_technique_id']}): "
                f"{technique['use']}"
            )
            if 'sub_technique_id' in technique and 'sub_technique_name' in technique:
                description += f" (Sub-technique: {technique['sub_technique_name']} (ID: {technique['sub_technique_id']}))"
            description += "\n"
    else:
        description += "\nTechniques Used: None\n"

    # Add software used
    if attack_group_softwares:
        description += "\nSoftware Used:\n"
        for software in attack_group_softwares:
            description += f"- {software['name']} (ID: {software['id']})"
            if 'techniques' in software:
                description += ", associated techniques: "
                for technique in software['techniques']:
                    description += f"{technique['name']} (ID: {technique['url'].split('/')[-1]}), "
            description += "\n"
    else:
        description += "\nSoftware Used: None\n"

    # Create a document and add it to the list
    document = Document(page_content=description, 
                        metadata={"id": attack_group_id, 
                                  "name": attack_group_name,
                                  "url": attack_group_url,
                                  "reference": attack_group_references})
    documents.append(document)

# The 'documents' list now contains the detailed documents for each MITRE ATT&CK group

100%|██████████| 152/152 [01:19<00:00,  1.90it/s]


In [4]:
print(documents[0])

page_content='admin@338 (MITRE ATT&CK Group ID: G0018) is a threat group that admin@338 is a China-based cyber threat group. It has previously used newsworthy events as lures to deliver malware and has primarily targeted organizations involved in financial, economic, and trade policy, typically using publicly available RATs such as PoisonIvy, as well as some non-public backdoors. [1].

Contributors: Tatsuya Daitoku, Cyber Defense Institute, Inc.

Version: 1.2
Created: 2017-05-31
Last Modified: 2020-03-18

Associated Groups: None

Techniques Used:
- Account Discovery (ID: T1087): admin@338 actors used the following commands following exploitation of a machine with LOWBALL malware to enumerate user accounts: net user >> %temp%\download net user /domain >> %temp%\download [1] (Sub-technique: Local Account (ID: T1087.001))
- Command and Scripting Interpreter (ID: T1059): Following exploitation with LOWBALL malware, admin@338 actors created a file containing a list of commands to be execute

In [5]:
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_openai import OpenAIEmbeddings
from opensearchpy import OpenSearch
import urllib3

# Shut up the SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Delete the index if it exists
opensearch = OpenSearch(
    hosts=[f"{os.getenv('OPENSEARCH_HOST')}:{os.getenv('OPENSEARCH_PORT')}"],
    http_auth=(os.getenv('OPENSEARCH_ACCOUNT_ID'), os.getenv('OPENSEARCH_ACCOUNT_PASSWORD')),
    use_ssl = True,
    verify_certs = False,
    ignore_ssl_warnings = True
)

if opensearch.indices.exists(index=os.getenv('OPENSEARCH_INDEX_NAME')):
    opensearch.indices.delete(index=os.getenv('OPENSEARCH_INDEX_NAME'))
    print(f"Deleted the index {os.getenv('OPENSEARCH_INDEX_NAME')} because it already exists")

print(f"Creating an index with the name {os.getenv('OPENSEARCH_INDEX_NAME')}")

vectorstore = OpenSearchVectorSearch.from_documents(
    index_name=os.getenv('OPENSEARCH_INDEX_NAME'),
    documents=documents,
    embedding=OpenAIEmbeddings(),
    opensearch_url=f"http://{os.getenv('OPENSEARCH_HOST')}:{os.getenv('OPENSEARCH_PORT')}",
    http_auth=(os.getenv('OPENSEARCH_ACCOUNT_ID'), os.getenv('OPENSEARCH_ACCOUNT_PASSWORD')),
    use_ssl = True,
    verify_certs = False,
    ignore_ssl_warnings = True,
)

print(f"Index created with {len(documents)} documents")



Deleted the index mdr_threatghoul_cti_rag_mitre because it already exists
Creating an index with the name mdr_threatghoul_cti_rag_mitre
Index created with 152 documents


In [6]:
# Extract the ontology(top keywords) from the MITRE ATT&CK group information
from MITREAttackScrapper.cti.groups import MITREAttackCTIGroups
from typing import Any
import json

# Extract the list of MITRE ATT&CK group information and save it to the cti_group.json
attack_group_information: List[Dict[str, Any]] = MITREAttackCTIGroups.get_list()

# Extract keywords(ontologies) from each description
def extract_keywords(description: str) -> List[str]:
    keywords = set()
    words = description.split()
    for word in words:
        if len(word) > 3:
            keywords.add(word.lower())

    common_keywords: List[str] = [
        'group', 'groups', 'name', 'id', 'associated', 'description', 'url', 'threat',
        'since', 'least', 'targeted', 'including', 'based', 'primarily', 'well',
        'has', 'have', 'been', 'that', 'this', 'with', 'and', 'for', 'the', 'are', 'was',
    ]

    for keyword in common_keywords:
        if keyword in keywords:
            keywords.remove(keyword)

    return list(keywords)

attack_group_ontology: Dict[str, List[str]] = {}
for attack_group in attack_group_information:
    attack_group_id: str = attack_group['id']
    attack_group_description: str = attack_group['description']
    attack_group_ontology[attack_group_id] = extract_keywords(attack_group_description)

# Print the first five ontology
from pprint import pprint
pprint(dict(list(attack_group_ontology.items())[:5]))

# A function to use ontology for query expansion
def expand_query(query: str, ontology: Dict[str, List[str]]) -> str:
    expanded_terms: List[str] = []
    for group_id, keywords in ontology.items():
        for keyword in keywords:
            if keyword in query.lower():
                expanded_terms.append(group_id)
                break

    expanded_query: str = query + " " + " ".join(expanded_terms)
    return expanded_query

{'G0018': ['policy,',
           'some',
           'malware',
           'typically',
           'backdoors.',
           'lures',
           'china-based',
           'using',
           'admin@338',
           'events',
           'involved',
           'economic,',
           'trade',
           'poisonivy,',
           'publicly',
           'organizations',
           'financial,',
           'rats',
           'cyber',
           'deliver',
           'newsworthy',
           'available',
           'previously',
           'group.',
           'non-public',
           'such',
           'used'],
 'G0130': ['iran.',
           'base',
           'iranian',
           'defense',
           'anti-censorship',
           'malware-based',
           'targeting',
           'campaigns',
           '2010',
           'website',
           'team',
           'transitioned',
           'espionage',
           'defacement',
           'ajax',
           'from',
           '2014',
       

In [7]:
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json

@traceable(name="retrieve_relevant_documents")
def retrieve_relevant_documents(query: str, top_k: int = 3) -> List[Document]:

    # Step 1: Query Expansion
    expanded_query: str = expand_query(query, attack_group_ontology)

    # Step 2: Retrieve relevant documents with 3 times the top_k
    retrieved_documents: List[Document] = vectorstore.similarity_search(expanded_query, k=top_k*3)

    # Step 3: Embed the expanded query and retrieved documents
    embedding_model: SentenceTransformer = SentenceTransformer("sentence-transformers/multi-qa-MiniLM-L6-cos-v1")
    expanded_query_embedding: List[float] = embedding_model.encode([expanded_query])[0]
    document_embeddings: List[List[float]] = embedding_model.encode([doc.page_content for doc in retrieved_documents])

    # Step 4: Calculate the cosine similarity between the expanded query and retrieved documents
    # The cosine similarity is calculated between the expanded query and the retrieved documents
    cosine_similarities: List[float] = cosine_similarity([expanded_query_embedding], document_embeddings)[0]

    # Step 5: Sort the documents based on cosine similarity and return the top_k documents
    sorted_documents: List[Document] = [doc for _, doc in sorted(zip(cosine_similarities, retrieved_documents), reverse=True)]
    return sorted_documents[:top_k]

# Example usage
user_question = "Which groups are based on the North Korean threat actor?"
relevant_docs = retrieve_relevant_documents(user_question)

print(f"Retrieved {len(relevant_docs)} relevant documents:")
for index, document in enumerate(relevant_docs):
    print(f"\nDocument {index + 1}:")
    print(f"Content: {document.page_content}...") 
    print(f"Metadata: {document.metadata}")

  from tqdm.autonotebook import tqdm, trange


Retrieved 3 relevant documents:

Document 1:
Content: Kimsuky (MITRE ATT&CK Group ID: G0094) is a threat group that Kimsuky is a North Korea-based cyber espionage group that has been active since at least 2012. The group initially focused on targeting South Korean government entities, think tanks, and individuals identified as experts in various fields, and expanded its operations to include the United States, Russia, Europe, and the UN. Kimsuky has focused its intelligence collection activities on foreign policy and national security issues related to the Korean peninsula, nuclear policy, and sanctions.[1][2][3][4][5].

Contributors: Taewoo Lee, KISA, Dongwook Kim, KISA

Version: 4.0
Created: 2019-08-26
Last Modified: 2024-04-17

Associated Groups:
- Black Banshee: [3][4]
- Velvet Chollima: [9][10][4]
- Emerald Sleet: [11]
- THALLIUM: [3][4]

Techniques Used:
- Account Manipulation (ID: T1098): Kimsuky has added accounts to specific groups with net localgroup . [12] (Sub-technique: No

In [8]:
import re

# Realign the references of the documents
# References are stored as a dictionary at the metadata of each retrieved document.
# Merge the references of the retrieved documents as the multiple retrieved document set is merged into a single document.
def realign_retrieved_documents(retrieved_documents: List[Document]) -> Dict[str, Any]:
    contents: str = ""
    references: Dict[str, Any] = {}
    reference_number: int = 1               # Start reference numbering from 1
    document_number: int = 1

    for document in retrieved_documents:
        content = document.page_content
        new_content = content

        # Replace the reference marks in the content with new unique numbers
        ref_indices = re.findall(r'\[(\d+)\]', content)
        for ref_index in ref_indices:
            original_ref_number = int(ref_index)
            if original_ref_number not in references:
                references[reference_number] = document.metadata['reference'][str(original_ref_number)]
                new_content = new_content.replace(f'[{original_ref_number}]', f'[{reference_number}]')
                reference_number += 1

        # Append the modified content to the final contents
        contents += f"Retrieved document {document_number}:\n"
        contents += new_content + "\n\n"

    return {
        "contents": contents,
        "references": references
    }

pprint(realign_retrieved_documents(relevant_docs))

{'contents': 'Retrieved document 1:\n'
             'Kimsuky (MITRE ATT&CK Group ID: G0094) is a threat group that '
             'Kimsuky is a North Korea-based cyber espionage group that has '
             'been active since at least 2012. The group initially focused on '
             'targeting South Korean government entities, think tanks, and '
             'individuals identified as experts in various fields, and '
             'expanded its operations to include the United States, Russia, '
             'Europe, and the UN. Kimsuky has focused its intelligence '
             'collection activities on foreign policy and national security '
             'issues related to the Korean peninsula, nuclear policy, and '
             'sanctions.[1][2][3][4][5].\n'
             '\n'
             'Contributors: Taewoo Lee, KISA, Dongwook Kim, KISA\n'
             '\n'
             'Version: 4.0\n'
             'Created: 2019-08-26\n'
             'Last Modified: 2024-04-17\n'
            

In [20]:
# Generate a few shot prompt template for the few-shot learning
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from typing import List, Dict

# Define the few-shot examples
examples: List[Dict[str, str]] = [
    {
        "contexts": "Retrieved document 1:\n"
                    "  - Group A is a threat group that targets the financial sector[1][2] since 2018 in United States[3].\n"
                    "    In 2018, they targeted a bank in New York[4].\n"
                    "  - Group A additionally targeted a bank in California[5], company in Texas[6], resulting in a loss of $2 million[7].\n"
                    "Retrieved document 2:\n"
                    "  - Group B is a threat group that targets the private enterprise banking sector[5][6] since 2019 in United States[7].\n"
                    "    In 2019, they targeted a bank in California[8], company in Texas[9], resulting in a loss of $1 million[10].\n"
                    "  - Group B is currently on the list of FBI's most wanted cybercriminals[11], with $1 million bounty[12].",
        "question": "Which groups usually target the financial sector?",
        "answer": """
{{
    "1, 2": "Group A targets the financial sector since 2018 in United States, and",
    "5, 6": "Group B targets the private enterprise banking sector since 2019 in United States."
}}        
"""
    },
    {
        # Write an example few-shot prompt with the context and question; in case of imaginary video-game sector
        "contexts": "Retrieved document 1:\n"
                    "  - Group A is a threat group that targets the video-game sector[1][2] since 2018 in United States[3].\n"
                    "    In 2018, they targeted a game development company in California[4].\n"
                    "  Retrieved document 2:\n"
                    "  - Group B is a threat group that targets the video-game sector[5][6] since 2019 in United States[7].\n"
                    "    FBI has identified them as a North Korean threat actor[8], and they targeted a game development company in Texas[9].\n"
                    "    Importantly, some of Group B member has been worked while hiding their identities, revealed and arrested by FBI in 2020[10].\n",
        "question": "Which groups are based on the North Korean threat actor and what happened in 2020?",
        "answer": """
{{
    "8": "Group B is identified as a North Korean threat actor",
    "10": "Some of Group B members have been revealed and arrested by FBI in 2020 while working while hiding their identities."
}}
"""
    }
]

example_prompt = PromptTemplate(
    input_variables=["contexts", "question", "answer"],
    template="Example Context: {contexts}\nExample Question: {question}\nExample Answer: {answer}\n",
)

print(example_prompt.format(**examples[0]))

Example Context: Retrieved document 1:
  - Group A is a threat group that targets the financial sector[1][2] since 2018 in United States[3].
    In 2018, they targeted a bank in New York[4].
  - Group A additionally targeted a bank in California[5], company in Texas[6], resulting in a loss of $2 million[7].
Retrieved document 2:
  - Group B is a threat group that targets the private enterprise banking sector[5][6] since 2019 in United States[7].
    In 2019, they targeted a bank in California[8], company in Texas[9], resulting in a loss of $1 million[10].
  - Group B is currently on the list of FBI's most wanted cybercriminals[11], with $1 million bounty[12].
Example Question: Which groups usually target the financial sector?
Example Answer: 
{{
    "1, 2": "Group A targets the financial sector since 2018 in United States, and",
    "5, 6": "Group B targets the private enterprise banking sector since 2019 in United States."
}}        




In [32]:
# Based on the few-shot example, generate the FewShotPromptTemplate
few_shot_prompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix="You are a security analyst and you are asked to identify the threat groups based on the given context and question.\n"
           "Based on the context with reference marks such as [1], [2], ... and question, please provide the detailed and analytical answers.\n"
           "If the answer is not clear, you can provide a brief explanation.\n"
           "Or, if the given context doesn't provide enough information, you can mention that as well(Example: 'Not enough information to answer the given question').\n"
           "Refer to the given a few examples above defining the answer format, and provide the answer in the same format.\n"
           "Note that the examle above is not a real context and the question, so you should not include them in the answer.\n"
           "Instead, you should provide the answer based on the given context and question for now."
           "\n\n"
           "Contexts: {contexts}\n"
           "Question: {question}",
    input_variables=["question", "contexts"]
)

print(few_shot_prompt.format(question="Which groups are based on the North Korean threat actor?",
                             contexts="Example context 1"))

Example Context: Retrieved document 1:
  - Group A is a threat group that targets the financial sector[1][2] since 2018 in United States[3].
    In 2018, they targeted a bank in New York[4].
  - Group A additionally targeted a bank in California[5], company in Texas[6], resulting in a loss of $2 million[7].
Retrieved document 2:
  - Group B is a threat group that targets the private enterprise banking sector[5][6] since 2019 in United States[7].
    In 2019, they targeted a bank in California[8], company in Texas[9], resulting in a loss of $1 million[10].
  - Group B is currently on the list of FBI's most wanted cybercriminals[11], with $1 million bounty[12].
Example Question: Which groups usually target the financial sector?
Example Answer: 
{
    "1, 2": "Group A targets the financial sector since 2018 in United States, and",
    "5, 6": "Group B targets the private enterprise banking sector since 2019 in United States."
}        



Example Context: Retrieved document 1:
  - Group A

In [37]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from typing import Dict, Any, Union

# Initialize the GPT-4 model
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Function to generate the answer using the LLM with JSON output
@traceable(name="rag_qa_chain")
def rag_qa_chain(query: str) -> Dict[str, Union[Dict[str, str], Any]]:
    # Retrieve relevant documents
    relevant_docs = retrieve_relevant_documents(query)

    # Realign retrieved documents to handle references
    realigned_docs = realign_retrieved_documents(relevant_docs)

    # Initialize the GPT-4 model
    llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

    # Create the chain with JSONOutputParser
    chain = (
        {"contexts": lambda x: realigned_docs["contents"], "question": lambda x: query}
        | few_shot_prompt
        | llm
        | JsonOutputParser()
    )
    
    # Run the chain
    answer = chain.invoke(query)
    
    # Combine answer with references
    return {
        "answer": answer,
        "references": realigned_docs["references"]
    }

# Example usage
user_question = "Which groups are based on the North Korean threat actor?"
answer = rag_qa_chain(user_question)

import json 
print(f"User question: {user_question}")
print(f"Answer: {json.dumps(answer, indent=2)}")

User question: Which groups are based on the North Korean threat actor?
Answer: {
  "answer": {
    "1, 2, 3, 4, 5": "Kimsuky is a North Korea-based cyber espionage group that has been active since at least 2012."
  },
  "references": "[1]: {'text': 'Alyac. (2019, April 3). Kimsuky Organization Steals Operation Stealth Power. Retrieved August 13, 2019.', 'url': 'https://blog.alyac.co.kr/2234'}\n[2]: {'text': \"BRI. (2019, April). Kimsuky unveils APT campaign 'Smoke Screen' aimed at Korea and America. Retrieved October 7, 2019.\", 'url': 'https://brica.de/alerts/alert/public/1255063/kimsuky-unveils-apt-campaign-smoke-screen-aimed-at-korea-and-america/'}\n[3]: {'text': 'Dahan, A. et al. (2020, November 2). Back to the Future: Inside the Kimsuky KGH Spyware Suite. Retrieved November 6, 2020.', 'url': 'https://www.cybereason.com/blog/back-to-the-future-inside-the-kimsuky-kgh-spyware-suite'}\n[4]: {'text': 'Jazi, H. (2021, June 1). Kimsuky APT continues to target South Korean government usi