In [2]:
import requests
import json
import pandas as pd
import random
from itertools import combinations

def fetch_mitre_techniques():
    """
    Fetches MITRE ATT&CK techniques using their Enterprise ATT&CK API
    """
    url = "https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json"

    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        all_techniques = []

        for obj in data['objects']:
            if obj.get('type') == 'attack-pattern':
                tactics = []
                for kill_chain_phase in obj.get('kill_chain_phases', []):
                    if kill_chain_phase.get('kill_chain_name') == 'mitre-attack':
                        tactics.append(kill_chain_phase.get('phase_name'))

                description = obj.get('description', 'No description available')
                description = ' '.join(description.split())

                technique = {
                    'id': obj.get('external_references', [{}])[0].get('external_id', 'N/A'),
                    'name': obj.get('name', 'Unknown'),
                    'description': description,
                    'tactics': ', '.join(tactics)
                }
                all_techniques.append(technique)

        return all_techniques

    except requests.exceptions.RequestException as e:
        print(f"Error fetching MITRE data: {e}")
        return None

def generate_question_from_template(df, row, template_num):
    """
    Generates a single question based on the template number
    """
    try:
        if template_num == 1:
            options = [row['name']] + list(df.sample(3)['name'])
            question = f"Based on the following attack technique description, identify the MITRE ATT&CK technique: '{row['description']}'"

        elif template_num == 2:
            tactics = list(df['tactics'].str.split(', ').explode().unique())
            correct_tactic = row['tactics'].split(', ')[0]
            other_tactics = [t for t in tactics if t != correct_tactic]
            options = [correct_tactic] + random.sample(other_tactics, 3)
            question = f"Which tactic is most commonly associated with the technique '{row['name']}'?"

        elif template_num == 3:
            options = [
                f"To {row['description'].split('.')[0].lower()}",
                "To gain initial access to the system",
                "To maintain persistent access",
                "To extract sensitive data"
            ]
            question = f"An attacker is utilizing '{row['name']}'. What is the primary objective of this technique?"

        elif template_num == 4:
            same_tactic_techniques = list(df[df['tactics'].str.contains(row['tactics'].split(', ')[0])]['name'])
            other_techniques = list(df['name'])
            correct_next = random.choice(same_tactic_techniques)
            options = [correct_next] + random.sample([t for t in other_techniques if t != correct_next], 3)
            question = f"If an attacker has successfully implemented '{row['name']}', which of the following techniques would they most likely use next to advance their attack?"

        elif template_num == 5:
            options = [
                f"Monitor for {row['name'].lower()} activity",
                "Monitor all network traffic",
                "Monitor system resource usage",
                "Monitor user authentication logs"
            ]
            question = f"When monitoring for the technique '{row['name']}', which aspect should security analysts primarily focus on?"

        correct_answer = options[0]
        random.shuffle(options)
        correct_letter = 'ABCD'[options.index(correct_answer)]

        return {
            'Question': question.replace('\n', ' '),  # Ensures single-line question
            'A': options[0],
            'B': options[1],
            'C': options[2],
            'D': options[3],
            'correct_answer': correct_letter
        }

    except Exception as e:
        print(f"Error generating question: {e}")
        return None

def generate_questions(df, num_questions=1000):
    """
    Generates questions in A, B, C, D format for LLM benchmarking
    """
    questions = []

    while len(questions) < num_questions:
        row = df.sample(1).iloc[0]
        template_num = random.randint(1, 5)

        question = generate_question_from_template(df, row, template_num)
        if question:
            questions.append(question)

        if len(questions) % 100 == 0:
            print(f"Generated {len(questions)} questions...")

    return questions

# Main execution
print("Fetching MITRE ATT&CK data...")
techniques = fetch_mitre_techniques()

if techniques:
    # Create DataFrame
    df = pd.DataFrame(techniques)

    # Save MITRE data
    df.to_csv('mitre_techniques.tsv', sep='\t', index=False)
    print("Saved MITRE techniques to mitre_techniques.tsv")

    # Generate questions
    print("\nGenerating questions...")
    questions = generate_questions(df)
    questions_df = pd.DataFrame(questions)

    # Save questions
    questions_df.to_csv('mitre_questions.tsv', sep='\t', index=False)
    questions_df.to_json('mitre_questions.json', orient='records', indent=2)

    print(f"\nGenerated {len(questions)} questions and saved to:")
    print("1. mitre_questions.tsv")
    print("2. mitre_questions.json")

    # Display sample questions
    print("\nSample questions:")
    for i, question in enumerate(questions[:3], 1):
        print(f"\nQuestion {i}:")
        print(f"Q: {question['Question']}")
        print(f"A: {question['A']}")
        print(f"B: {question['B']}")
        print(f"C: {question['C']}")
        print(f"D: {question['D']}")
        print(f"Correct Answer: {question['correct_answer']}")

# Load the TSV file
file_path = "mitre_questions.tsv"
df = pd.read_csv(file_path, sep='\t')

# Add the 'Prompt' column with a constant value
df['Prompt'] = (
    "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge "
    "benchmark dataset. Your task is to choose the best option among the four provided. Return your answer "
    "as a single uppercase letter: A, B, C, or D. **Question:** Which of the following mitigations involves "
    "preventing applications from running that haven't been downloaded from legitimate repositories? "
    "**Options:** A) Audit B) Execution Prevention C) Operating System Configuration D) User Account "
    "Control **Important:** The last line of your answer should contain only the single letter corresponding "
    "to the best option, with no additional text., Emphasis on a singular letter output"
)

# Rename the columns A, B, C, D to Option A, Option B, Option C, Option D
df = df.rename(columns={
    "A": "Option A",
    "B": "Option B",
    "C": "Option C",
    "D": "Option D"
})

# Save the modified DataFrame to a new TSV file
df.to_csv("MCQ_cleaned.tsv", sep='\t', index=False)

# Display the first few rows of the modified DataFrame
print(df.head())


Fetching MITRE ATT&CK data...
Saved MITRE techniques to mitre_techniques.tsv

Generating questions...
Generated 100 questions...
Generated 200 questions...
Generated 300 questions...
Generated 400 questions...
Generated 500 questions...
Generated 600 questions...
Generated 700 questions...
Generated 800 questions...
Generated 900 questions...
Generated 1000 questions...

Generated 1000 questions and saved to:
1. mitre_questions.tsv
2. mitre_questions.json

Sample questions:

Question 1:
Q: An attacker is utilizing 'Network Trust Dependencies'. What is the primary objective of this technique?
A: To adversaries may gather information about the victim's network trust dependencies that can be used during targeting
B: To gain initial access to the system
C: To maintain persistent access
D: To extract sensitive data
Correct Answer: A

Question 2:
Q: An attacker is utilizing 'Application Layer Protocol'. What is the primary objective of this technique?
A: To maintain persistent access
B: To e

In [1]:
import requests
import json
import pandas as pd
import random
from itertools import combinations

def fetch_mitre_techniques():
    """
    Fetches MITRE ATT&CK techniques using their Enterprise ATT&CK API
    """
    url = "https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json"

    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        all_techniques = []

        for obj in data['objects']:
            if obj.get('type') == 'attack-pattern':
                tactics = []
                for kill_chain_phase in obj.get('kill_chain_phases', []):
                    if kill_chain_phase.get('kill_chain_name') == 'mitre-attack':
                        tactics.append(kill_chain_phase.get('phase_name'))

                description = obj.get('description', 'No description available')
                description = ' '.join(description.split())

                technique = {
                    'id': obj.get('external_references', [{}])[0].get('external_id', 'N/A'),
                    'name': obj.get('name', 'Unknown'),
                    'description': description,
                    'tactics': ', '.join(tactics)
                }
                all_techniques.append(technique)

        return all_techniques

    except requests.exceptions.RequestException as e:
        print(f"Error fetching MITRE data: {e}")
        return None

def generate_question_from_template(df, row, template_num):
    """
    Generates a single question based on the template number
    """
    try:
        if template_num == 1:
            options = [row['name']] + list(df.sample(3)['name'])
            question = f"Based on the following attack technique description, identify the MITRE ATT&CK technique:\n\n'{row['description']}'"

        elif template_num == 2:
            tactics = list(df['tactics'].str.split(', ').explode().unique())
            correct_tactic = row['tactics'].split(', ')[0]
            other_tactics = [t for t in tactics if t != correct_tactic]
            options = [correct_tactic] + random.sample(other_tactics, 3)
            question = f"Which tactic is most commonly associated with the technique '{row['name']}'?"

        elif template_num == 3:
            options = [
                f"To {row['description'].split('.')[0].lower()}",
                "To gain initial access to the system",
                "To maintain persistent access",
                "To extract sensitive data"
            ]
            question = f"An attacker is utilizing '{row['name']}'. What is the primary objective of this technique?"

        elif template_num == 4:
            same_tactic_techniques = list(df[df['tactics'].str.contains(row['tactics'].split(', ')[0])]['name'])
            other_techniques = list(df['name'])
            correct_next = random.choice(same_tactic_techniques)
            options = [correct_next] + random.sample([t for t in other_techniques if t != correct_next], 3)
            question = f"If an attacker has successfully implemented '{row['name']}', which of the following techniques would they most likely use next to advance their attack?"

        elif template_num == 5:
            options = [
                f"Monitor for {row['name'].lower()} activity",
                "Monitor all network traffic",
                "Monitor system resource usage",
                "Monitor user authentication logs"
            ]
            question = f"When monitoring for the technique '{row['name']}', which aspect should security analysts primarily focus on?"

        correct_answer = options[0]
        random.shuffle(options)
        correct_letter = 'ABCD'[options.index(correct_answer)]

        return {
            'Question': question,
            'A': options[0],
            'B': options[1],
            'C': options[2],
            'D': options[3],
            'correct_answer': correct_letter
        }

    except Exception as e:
        print(f"Error generating question: {e}")
        return None

def generate_questions(df, num_questions=8000):
    """
    Generates questions in A, B, C, D format for LLM benchmarking
    """
    questions = []

    while len(questions) < num_questions:
        row = df.sample(1).iloc[0]
        template_num = random.randint(1, 5)

        question = generate_question_from_template(df, row, template_num)
        if question:
            questions.append(question)

        if len(questions) % 1000 == 0:
            print(f"Generated {len(questions)} questions...")

    return questions

# Main execution
print("Fetching MITRE ATT&CK data...")
techniques = fetch_mitre_techniques()

if techniques:
    # Create DataFrame
    df = pd.DataFrame(techniques)

    # Save MITRE data
    df.to_csv('mitre_techniques.tsv', sep='\t', index=False)
    print("Saved MITRE techniques to mitre_techniques.tsv")

    # Generate questions
    print("\nGenerating questions...")
    questions = generate_questions(df)
    questions_df = pd.DataFrame(questions)

    # Save questions
    questions_df.to_csv('mitre_questions.tsv', sep='\t', index=False)
    questions_df.to_json('mitre_questions.json', orient='records', indent=2)

    print(f"\nGenerated {len(questions)} questions and saved to:")
    print("1. mitre_questions.tsv")
    print("2. mitre_questions.json")

    # Display sample questions
    print("\nSample questions:")
    for i, question in enumerate(questions[:3], 1):
        print(f"\nQuestion {i}:")
        print(f"Q: {question['Question']}")
        print(f"A: {question['A']}")
        print(f"B: {question['B']}")
        print(f"C: {question['C']}")
        print(f"D: {question['D']}")
        print(f"Correct Answer: {question['correct_answer']}")

# Load the TSV file
file_path = "mitre_questions.tsv"
df = pd.read_csv(file_path, sep='\t')

# Add the 'Prompt' column with a constant value
df['Prompt'] = (
    "You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge "
    "benchmark dataset. Your task is to choose the best option among the four provided. Return your answer "
    "as a single uppercase letter: A, B, C, or D. **Question:** Which of the following mitigations involves "
    "preventing applications from running that haven't been downloaded from legitimate repositories? "
    "**Options:** A) Audit B) Execution Prevention C) Operating System Configuration D) User Account "
    "Control **Important:** The last line of your answer should contain only the single letter corresponding "
    "to the best option, with no additional text., Emphasis on a singular letter output"
)

# Rename the columns A, B, C, D to Option A, Option B, Option C, Option D
df = df.rename(columns={
    "A": "Option A",
    "B": "Option B",
    "C": "Option C",
    "D": "Option D"
})

# Save the modified DataFrame to a new TSV file
df.to_csv("MCQ_cleaned.tsv", sep='\t', index=False)

# Display the first few rows of the modified DataFrame
print(df.head())


Fetching MITRE ATT&CK data...
Saved MITRE techniques to mitre_techniques.tsv

Generating questions...
Generated 1000 questions...
Generated 2000 questions...
Generated 3000 questions...
Generated 4000 questions...
Generated 5000 questions...
Generated 6000 questions...
Generated 7000 questions...
Generated 8000 questions...

Generated 8000 questions and saved to:
1. mitre_questions.tsv
2. mitre_questions.json

Sample questions:

Question 1:
Q: When monitoring for the technique 'Stripped Payloads', which aspect should security analysts primarily focus on?
A: Monitor all network traffic
B: Monitor user authentication logs
C: Monitor system resource usage
D: Monitor for stripped payloads activity
Correct Answer: D

Question 2:
Q: An attacker is utilizing 'Plist Modification'. What is the primary objective of this technique?
A: To gain initial access to the system
B: To maintain persistent access
C: To property list (plist) files contain all of the information that macos and os x uses to c