# Section Header

In [None]:
!pip install langchain openai transformers typing python-dotenv google-search-results splunk-sdk tiktoken PyPDF2 faiss-cpu

In [None]:
import os
import json
from dotenv import load_dotenv
from typing import Literal, Dict, Optional, Any, List, Type
from bs4 import BeautifulSoup
import requests

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain
from langchain.chains.summarize import load_summarize_chain
from langchain.agents import initialize_agent, Tool, load_tools
from langchain.agents import AgentType
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from langchain.schema import SystemMessage
from langchain.prompts import MessagesPlaceholder
from langchain.memory import ConversationSummaryBufferMemory
from langchain.agents import ZeroShotAgent, Tool, AgentExecutor
from langchain.embeddings import OpenAIEmbeddings
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

import xml.etree.ElementTree as ET

import splunklib.client as client
import splunklib.results as results
from splunklib.binding import HTTPError
import pandas as pd

import urllib3
urllib3.disable_warnings()

In [None]:
# Credentials and tracing configuration for the notebook.
# Each service key is written into the process environment (replace the
# placeholder with a real key) and then read back into a module-level name
# that the helper functions below use.

# OpenAI
os.environ["OPENAI_API_KEY"] = "sk-"
openai_api_key = os.environ.get("OPENAI_API_KEY")

# SerpAPI
os.environ["SERPAPI_API_KEY"] = ""
serpapi_api_key = os.environ.get("SERPAPI_API_KEY")

# Serper (google.serper.dev)
os.environ["SERPER_API_KEY"] = ""
serper_api_key = os.environ.get("SERPER_API_KEY")

# Browserless
os.environ["BROWSERLESS_API_KEY"] = ""
browserless_api_key = os.environ.get("BROWSERLESS_API_KEY")

# LangChain tracing settings
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
        "LANGCHAIN_PROJECT": "SplunkGPT",
        "LANGCHAIN_API_KEY": "ls__",
    }
)


# Prompts, Chains, and Helpers

## Helper Classes

In [None]:
#
# Helper Functions/Classes
#
# Argument schema for the scrape_website tool; ScrapeWebsiteTool references it
# through its args_schema attribute.
class ScrapeWebsiteInput(BaseModel):
    """Inputs for scrape_website"""
    # Forwarded to scrape_website() as the focus of the summary.
    objective: str = Field(
        description="The objective & task that users give to the agent")
    # Address of the page that scrape_website() fetches.
    url: str = Field(description="The url of the website to be scraped")
# Agent tool wrapper around scrape_website(); arguments are described by
# ScrapeWebsiteInput (objective + url).
class ScrapeWebsiteTool(BaseTool):
    name = "scrape_website"
    description = "useful when you need to get data from a website url, passing both url and objective to the function; DO NOT make up any url, the url should only be from the search results"
    args_schema: Type[BaseModel] = ScrapeWebsiteInput

    def _run(self, objective: str, url: str):
        """Scrape `url` and return the result of scrape_website(objective, url)."""
        return scrape_website(objective, url)

    async def _arun(self, objective: str, url: str):
        """Async entry point; not supported.

        The signature mirrors _run so that an async invocation with the schema's
        arguments fails with NotImplementedError rather than a TypeError about
        an unexpected `objective` argument.
        """
        raise NotImplementedError("scrape_website does not support async")
def search(query):
    '''
    Purpose: run a Google search for `query` through the Serper API
    (https://google.serper.dev/search), authenticated with serper_api_key.

    returns: the raw response body as a string (response.text); it is not
    parsed or status-checked here, so error payloads are returned as-is.

    raises: requests exceptions (e.g. a timeout) if the request cannot complete.
    '''
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": query})
    headers = {
        'X-API-KEY': serper_api_key,
        'Content-Type': 'application/json'
    }
    # requests has no default timeout; without one a stalled connection would
    # block the calling agent forever.
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    return response.text
def scrape_website(objective: str, url: str):
    '''
    Fetch a page through the browserless.io content API and summarize its text.

    objective is the original objective & task that the user gave to the agent
    (it is forwarded to summary() as the focus of the summary); url is the url
    of the website to be scraped.

    returns: the output of summary() when the request succeeds with HTTP 200,
    otherwise None (the failing status code is printed).

    raises: requests exceptions (e.g. a timeout) if the request cannot complete.
    '''
    print("Scraping website...")
    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }
    # The content endpoint expects a JSON body naming the page to render.
    data_json = json.dumps({"url": url})
    post_url = f"https://chrome.browserless.io/content?token={browserless_api_key}"
    # requests has no default timeout; bound the wait so a stalled render
    # cannot hang the agent indefinitely.
    response = requests.post(post_url, headers=headers, data=data_json, timeout=60)

    if response.status_code != 200:
        print(f"HTTP request failed with status code {response.status_code}")
        return None

    # Strip the markup and always summarize, regardless of the text length.
    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text()
    return summary(objective, text)
def summary(objective, content):
    '''
    Purpose: summarize `content` with respect to `objective` using a
    map_reduce summarize chain; the same prompt is used for the map step and
    the combine step.

    returns: whatever the chain's run() call produces for the split documents
    '''
    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

    # Break the content into large overlapping chunks, splitting on blank lines
    # first and single newlines second.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"],
        chunk_size=10000,
        chunk_overlap=500,
    )
    chunks = splitter.create_documents([content])

    summary_template = """
    Write a summary of the following text for {objective}. It is important that you include relevant Windows Event ID, Field Names, expected values for given fields.
    These will be important when using the summary as context to build a Splunk SPL detection query.

    TEXT:
    "{text}"
    SUMMARY:
    """
    prompt = PromptTemplate(
        template=summary_template,
        input_variables=["text", "objective"],
    )

    chain = load_summarize_chain(
        llm=chat_model,
        chain_type='map_reduce',
        map_prompt=prompt,
        combine_prompt=prompt,
        verbose=True,
    )
    return chain.run(input_documents=chunks, objective=objective)

def run_splunk_search(search_query):
    """
    Run a Splunk oneshot search and return a reader over its results.

    Parameters:
    - search_query (str): Splunk search query (e.g., "search index=_internal *")

    Returns:
    - splunklib.results.JSONResultsReader wrapping the oneshot response on
      success.
    - str: "Error at position ..." when Splunk rejects the query with an
      HTTPError whose message contains "Error at position".
    - HTTPError: the exception object itself for any other HTTPError (it is
      also printed).
    """
    # Create a Splunk service instance (placeholder host/credentials — update
    # before use).
    service = client.connect(
        host='splunk.domaintoupdate.com',
        username='admin',
        password='UPDATEME',
        autologin=True)

    # NOTE(review): the time-range key here is "earliest" while the other bound
    # uses "latest_time"; the Splunk search parameter is presumably
    # "earliest_time" — confirm before relying on the 7-day window.
    kwargs_export = {
        "earliest": '-7d',
        "latest_time": "now",
        "output_mode": 'json'
    }
    try:
        # Only the oneshot search is issued. A separate jobs.create() call would
        # start a second search job whose results are never read.
        oneshot_results = service.jobs.oneshot(search_query, **kwargs_export)
        return results.JSONResultsReader(oneshot_results)

    except HTTPError as e:
        error_message = str(e)
        # Keep only the part of the message after "Error at position", if any.
        error_portion = error_message.split("Error at position", 1)
        if len(error_portion) > 1:
            return f"Error at position {error_portion[1]}"
        else:
            print("Error:", e)
            return e

### END HELPER ###

## Prompts

In [None]:
#
# Prompts
#

# Initial Tasks Creation
# Creates the first list of tasks: asks the model to turn the detection
# {objective} into a JSON checklist, each task tagged with the agent to run it.
# Literal JSON braces in the sample are doubled ({{ }}) so that only
# {objective} is treated as a template variable.
# NOTE(review): the template says both "no more than 4 tasks" and "no more than
# 7 tasks", and says the final step should always be splunk_executor_agent while
# the sample ends with analysis_agent — confirm the intended rules.
tasks_initializer_prompt = PromptTemplate(
    input_variables=["objective"],
    template="""
    You are an AI agent responsible for creating a detailed JSON checklist of tasks that will guide other AI agents to complete a given objective.
    Your task is to analyze the provided objective and initial background research and generate a well-structured checklist with a clear starting point and end point,
    as well as tasks broken down to be very specific, clear, and executable by other agents without the context of other tasks. Limit the number of tasks to no more than 4 tasks.

    The current agents work as follows:
    - spl_writer_agent: Writes the intial Splunk SPL snippets.
    - spl_filter_agent: Edits the provided Splunk SPL query. Can also modify existing SPL queries to filter for additional fields to meet the requirements of the task.
    - spl_statistical_analysis_agent: Applies a statistical analysis for the provided Splunk SPL query.
    - spl_refactor_agent: Responsible for refactoring the choosen index, source, and field names for the existing SPL Query to meet the requirements of the task.
    - splunk_executor_agent: Executes Splunk search API queries for tasks.
    - analysis_agent: Responsible for analyzing the results from the of executing the Splunk SPL query

    Here is the detection objective you need to create a checklist for: {objective}.

    To generate the checklist, follow these steps:

    1. Analyze the objective to identify the high-level requirements and goals of the project. This will help you understand the scope and create a comprehensive checklist.

    2. Break down the objective into smaller, highly specific tasks that can be worked on independently by other agents.
    Ensure that the tasks are designed to be executed by the available agents (spl_writer_agent, spl_filter_agent, spl_statistical_analysis_agent, spl_refactor_agent, splunk_executor_agent, and analysis_agent).

    3. Assign a unique ID to each task for easy tracking and organization. This will help the agents to identify and refer to specific tasks in the checklist.

    4. Organize the tasks in a logical order, with a clear starting point and end point.
    The starting point should represent the initial research or understanding necessary for the detection, while the end point should signify the completion of the objective and any finalization steps.

    5. Provide the current context for each task, which should be sufficient for the agents to understand and execute the task without referring to other tasks in the checklist.
    This will help agents avoid task duplication.

    6. Pay close attention to the Windows Event ID, Field Names, and Data and make sure the tasks implement all necessary pieces needed to construct a valid detection.

    7. Compile the tasks into a well-structured JSON format, ensuring that it is easy to read and parse by other AI agents. The JSON should only include fields such as task ID and description.

    REMEMBER EACH AGENT WILL ONLY SEE A SINGLE TASK.
    ASK YOURSELF WHAT INFORMATION YOU NEED TO INCLUDE IN THE CONTEXT OF EACH TASK TO MAKE SURE THE AGENT CAN EXECUTE THE TASK WITHOUT SEEING THE OTHER TASKS OR WHAT WAS ACCOMPLISHED IN OTHER TASKS.

    Make sure tasks are not duplicated.

    Do not take long and complex routes, minimize tasks and steps as much as possible. The final step should always be splunk_executor_agent. Select no more than 7 tasks.

    Here is a sample JSON output for a checklist:

            {{
                "tasks": [
                    {{
                    "id": 1,
                    "description": "Write a Splunk SPL query to query to detect the Kerberoasting",
                    "agent": "spl_writer_agent"
                    }},
                    {{
                    "id": 2,
                    "description": "Edit the existing SPL query to filter for Ticket_Encryption_Type=0x17",
                    "agent": "spl_filter_agent"
                    }},
                    {{
                    "id": 3,
                    "description": "Refactor the existing SPL query to ensure the proper index, source, and field names are used",
                    "agent": "spl_refactor_agent"
                    }},
                    {{
                    "id": 4,
                    "description": "Apply a statistical analysis of the current SPL query using SPL commands such as stats, where, or table to detect anomolies where a high number of RC4 tickets for an account with an SPN were requested by an IP ",
                    "agent": "spl_statistical_analysis_agent"
                    }},
                    {{
                    "id": 5,
                    "description": "Run a splunk SPL search using the developed SPL query identifity Kerberoasting",
                    "agent": "splunk_executor_agent"
                    }},
                    {{
                    "id": 6,
                    "description": "Analyze results of the Splunk search to determin the source of the Kerberoasting",
                    "agent": "analysis_agent"
                    }},
                    ...
                    {{
                    "id": N,
                    "description": "..."
                    }}
                ]
            }}
    The tasks will be executed by one of the following agents: spl_writer_agent, spl_filter_agent, spl_statistical_analysis_agent, spl_refactor_agent, splunk_executor_agent, and analysis_agent. ALL tasks MUST start either with the following phrases:
    'Write a Splunk SPL query to...', 'Edit existing SPL to...', 'Run a splunk SPL search to...','Apply a statistical analysis...', or 'Analyze results of...' depending on the agent that will execute the task.
    RETURN JSON ONLY:

    """
)

# Gather Details for each Task
# This will look at the description of the task and add details to the tasks.
# It will Google if needed but make that decision itself through ReACT chain.
#
# Baby Original Prompt:
#
# Template variables: objective, task_list_json, detection_procedures,
# splunk_info, schema (each listed once).
tasks_details_agent = PromptTemplate(
    input_variables=["objective", "task_list_json", "detection_procedures", "splunk_info", "schema"],
    template="""
      You are an AI agent responsible for improving a list of tasks in JSON format and adding ALL the necessary details to each task.
      These tasks will be executed individually by agents that have no idea about other tasks.
      It is FUNDAMENTAL that each task has enough details so that an individual isolated agent can execute.
      The metadata of the task is the only information the agents will have.

      Each task should contain the details necessary to execute it.
      For example, if it creates a function, it needs to contain the details about the arguments to be used in that function and this needs to be consistent across all tasks.

      Look at all tasks at once, and update the task description adding details to it for each task so that it can be executed by an agent without seeing the other tasks and to ensure consistency across all tasks.
      DETAILS ARE CRUCIAL.

      For example, if one task references a Windows EventCode it should have the index and source.
      If another task applies statistical analysis on the event data, it should have the details about the field names and possible values.

      RETURN JSON OUTPUTS ONLY.

      Here is the overall objective you need to refactor the tasks for:
      {objective}.

      Here is the task list you need to improve:
      {task_list_json}

      Here are the current detection procedures from the web you need to reference for the tasks:
      ---(Start detection procedures)---
      {detection_procedures}
      ---(End detection procedures)---

      Here is the index and source information in a list data type format:
      {splunk_info}

      Here are the field names and example values for the given event code. The information is in a dictionary data type format. Ensure that the same fieldname format is chosen as what is shown in the sample data:
      {schema}

      RETURN THE SAME TASK LIST but with the description improved to contain the details you are adding for each task in the list. DO NOT MAKE OTHER MODIFICATIONS TO THE LIST. Your input should go in the 'description' field of each task.

      RETURN JSON ONLY:
    """
)

# Variant of the task-details prompt that takes only the objective, the task
# list and the detection procedures (no Splunk index/source or schema input).
tasks_details_agent_testing = PromptTemplate(
    input_variables=["objective", "task_list_json", "detection_procedures"],
    template="""
      You are an AI agent responsible for improving a list of tasks in JSON format and adding ALL the necessary details to each task.
      These tasks will be executed individually by agents that have no idea about other tasks.
      It is FUNDAMENTAL that each task has enough details so that an individual isolated agent can execute.
      The metadata of the task is the only information the agents will have.

      Each task should contain the details necessary to execute it.
      For example, if it creates a function, it needs to contain the details about the arguments to be used in that function and this needs to be consistent across all tasks.

      Look at all tasks at once, and update the task description adding details to it for each task so that it can be executed by an agent without seeing the other tasks and to ensure consistency across all tasks.
      DETAILS ARE CRUCIAL.

      For example, if one task references a Windows EventCode it should have the index and source.
      If another task applies statistical analysis on the event data, it should have the details about the field names and possible values.

      RETURN JSON OUTPUTS ONLY.

      Here is the overall objective you need to refactor the tasks for:
      {objective}.

      Here is the task list you need to improve:
      {task_list_json}

      Here are the current detection procedures from the web you need to reference for the tasks:
      ---(Start detection procedures)---
      {detection_procedures}
      ---(End detection procedures)---

      RETURN THE SAME TASK LIST but with the description improved to contain the details you are adding for each task in the list. DO NOT MAKE OTHER MODIFICATIONS TO THE LIST. Your input should go in the 'description' field of each task.

      RETURN JSON ONLY:
    """
)

# Create the Context
# This will provide Context about specific fields/events if needed.
# It will have access to a schema lookup tool for windows event codes.
#
# Template variables: objective, task_list_json, detection_procedures. The
# prompt asks for the same task list back with an added 'isolated_context'
# string on every task.
tasks_context_agent = PromptTemplate(
    input_variables=["objective","task_list_json","detection_procedures"],
    template="""
      You are an AI agent responsible for improving a list of tasks in JSON format and adding ALL the necessary context
      to it from the current detection procedures from the web or any knowledge you have for detection.
      These tasks will be executed individually by agents who have no idea about other tasks or what indexes, sources, or fields exist within the Splunk Database.
      It is FUNDAMENTAL that each task has enough context so that an individual isolated agent can execute. The metadata of the task is the only information the agents will have.

      Look at all tasks at once, and add the necessary context to each task so that it can be executed by an agent without seeing the other tasks.
      Remember, one agent can only see one task and has no idea about what happened in other tasks.
      CONTEXT IS CRUCIAL. For example, if one filters for specific fields within an index and source type.
      The second task should build on that to conduct the calculations or statistics.
      Note that you should identify when calculations or statistics need to happen and specify this in the context.
      Also, you should identify when parts of the SPL command already exist and specify this very clearly because the agents sometimes duplicate things not knowing.

      RETURN JSON OUTPUTS ONLY.

      Here is the overall objective you need to refactor the tasks for: {objective}.
      Here is the task list you need to improve: {task_list_json}
      Here are the current detection procedures from the web you need to reference for the tasks:
      ---(Start detection procedures)---
      {detection_procedures}
      ---(End detection procedures)---


      RETURN THE SAME TASK LIST but with a new field called 'isolated_context' for each task in the list.
      This field should be a string with the context you are adding. DO NOT MAKE OTHER MODIFICATIONS TO THE LIST.

      RETURN JSON ONLY:
    """
  )


# Human Input
# Prompt that rewrites a single {task} according to {human_feedback}. Per the
# template text, the model returns the task unchanged when the feedback is
# empty, the literal <IGNORE_TASK> when the feedback says to drop the task, and
# otherwise a plain-text replacement task beginning with one of the required
# phrases.
'''
Human input context for each task
'''
tasks_human_agent = PromptTemplate(
    input_variables=["task","human_feedback"],
    template="""You are an AI agent responsible for getting human input to improve the quality of tasks in a software project.
    Your goal is to analyze the provided task and adapt it based on the human's suggestions.
    The tasks should  start with either 'Write a Splunk SPL query to...', 'Edit existing SPL to...', 'Run a splunk SPL search to...','Apply a statistical analysis...', or 'Analyze results of...'  depending on the agent that will execute the task.

    For context, this task will be executed by other AI agents with the following characteristics:
    - spl_writer_agent: Writes the intial Splunk SPL snippets.
    - spl_filter_agent: Edits the provided Splunk SPL query. Can also modify existing SPL queries to filter for additional fields to meet the requirements of the task.
    - spl_statistical_analysis_agent: Applies a statistical analysis for the provided Splunk SPL query.
    - spl_refactor_agent: Responsible for refactoring the choosen index, source, and field names for the existing SPL Query to meet the requirements of the task.
    - splunk_executor_agent: Executes Splunk search API queries for tasks.
    - analysis_agent: Responsible for analyzing the results from the of executing the Splunk SPL query

    The current task is:
    {task}

    The human feedback is:
    {human_feedback}

    If the human feedback is empty, return the task as is. If the human feedback is saying to ignore the task, return the following string: <IGNORE_TASK>

    Note that your output will replace the existing task, so make sure that your output is a valid task that starts with one of the required phrases
    ('Write a Splunk SPL query to...', 'Edit existing SPL to...', 'Run a splunk SPL search to...','Apply a statistical analysis...', or 'Analyze results of...' ).

    Please adjust the task based on the human feedback while ensuring it starts with one of the required phrases
    ('Write a Splunk SPL query to...', 'Edit existing SPL to...', 'Run a splunk SPL search to...','Apply a statistical analysis...', or 'Analyze results of...' ).
    Return the improved task as a plain text output and nothing else. Write only the new task."""
  )
# Agent Assignment
# Used to assign a single task to one of the available agents. The template
# takes objective, task and recommendation and asks for a JSON reply of the
# form {"agent": "agent_name"} (braces are doubled in the template so they are
# emitted literally).
task_assigner_agent = PromptTemplate(
    input_variables=["objective","task","recommendation"],
    template="""You are an AI agent responsible for choosing the best agent to work on a given task.
    Your goal is to analyze the provided major objective of the project and a single task from the JSON checklist generated by the previous agent, and choose the best agent to work on the task.

    The overall objective is: {objective}
    The current task is: {task}

    Use this recommendation to guide you: {recommendation}

    The available agents are:
    - spl_writer_agent: Writes the intial Splunk SPL snippets.
    - spl_filter_agent: Edits the provided Splunk SPL query. Can also modify existing SPL queries to filter for additional fields to meet the requirements of the task.
    - spl_statistical_analysis_agent: Applies a statistical analysis for the provided Splunk SPL query.
    - spl_refactor_agent: Responsible for refactoring the choosen index, source, and field names for the existing SPL Query to meet the requirements of the task.
    - splunk_executor_agent: Executes Splunk search API queries for tasks.
    - analysis_agent: Responsible for analyzing the results from the of executing the Splunk SPL query

    Please consider the task description and the overall objective when choosing the most appropriate agent. Keep in mind that writing a new SPL query, modifying an existing SPL query, and executing an SPL query are different tasks. If the task only involves running an SPL query that already exists, the splunk_executor_agent should be used for this purpose. The spl_writer_agent should only be used when the task requires writing a new SPL query. The spl_filter_agent and spl_refactor_agent should only be used when the task requires modifying an existing SPL query.

    In summary, to execute splunk spl, use splunk_executor_agent, to write splunk spl queries, use spl_writer_agent, to modify existing spl, use spl_filter_agent, to apply statistical analysis, use spl_statistical_analysis_agent, to refactor existing spl queries, use spl_refactor_agent, to analyze the results of a splunk search, use the analysis_agent.

    Choose the most appropriate agent to work on the task and return a JSON output with the following format: {{"agent": "agent_name"}}.
    ONLY return JSON output.
    """
  )

# SPL Execution

# SPL Writer
# Prompt for writing the initial SPL query. Template variables: objective, task,
# splunk_info (index/source/sourcetype list), schema (field names and example
# values) and isolated_context.
# NOTE(review): the template asks to "include explanations/comments" and later
# says "Never include comments" and "JUST RETURN SPL QUERY" — confirm which
# instruction is intended.
spl_writer_agent = PromptTemplate(
    input_variables=["objective", "task","splunk_info" ,"schema","isolated_context"],
    template="""
      You are a world-class detection engineer and an expert in cyber security,
      threat hunting, and data science.

      For reference, your high level objective is:
      {objective}

      Write the Splunk SPL Query but include explanations/comments.
      Provide no information about who you are and focus on writing the detection query.
      Ensure the query is bug and error free. Respond in a well-formatted markdown with ONLY the SPL code.
      Ensure code blocks are used for the SPL query sections and that the code blocks always start with "spl".

      Approach problems step by step.
      Every query should be formatted to search "All Time" in your Splunk index.
      If there are thresholds in the Splunk SPL query that are used to detect suspicious activity, provide a value you believe would best fit that threshold.
      If there is analysis to be done on a given field use one of the Splunk SPL search commands detect suspicious activity, and provide a value you believe would best fit that threshold.
      Never include comments in the Splunk SPL query or in the code block.

      Your job is to write a Splunk SPL query to accomplish the current task:
      {task}

      Here is the index, source, and sourcetype information in a list data type format:
      {splunk_info}

      Ensure that the current field names in the SPL query match the format from your Splunk server.
      Here are the field names and example values for a given event code. The information is in a dictionary data type format:
      {schema}

      It is important you use this context as a reference for the other pieces of the SPL query that are relevant to your task. PAY ATTENTION TO THIS:
      {isolated_context}


      Respond with only the SPL needed to complete the task. IMPORTANT: JUST RETURN SPL QUERY, YOUR OUTPUT WILL BE ADDED DIRECTLY TO THE SEARCH BY OTHER AGENT. BE MINDFUL OF THIS
      YOUR RESPONSE:
    """
)
# Variant of the SPL writer prompt that takes only objective, task and
# isolated_context (no splunk_info or schema inputs).
# NOTE(review): the template asks to "include explanations/comments" and later
# says "Never include comments" — confirm which instruction is intended.
spl_writer_agent_testing = PromptTemplate(
    input_variables=["objective", "task","isolated_context"],
    template="""
      You are a world-class detection engineer, an expert in cyber security,
      threat hunting, and data science.

      For reference, your high level objective is to:
      {objective}

      Write the Splunk SPL Query but include explanations/comments.
      Provide no information about who you are and focus on writing the detection query.
      Ensure the query is bug and error free. Respond in a well-formatted markdown with ONLY the SPL code.
      Ensure code blocks are used for the SPL query sections and that the code blocks always start with "spl".

      Approach problems step by step.
      If there are time based thresholds in the Splunk SPL query that are used to detect suspicious activity, provide a value you believe would best fit that threshold.
      If there is analysis to be done on a given field use one of the Splunk SPL search commands detect suspicious activity, and provide a value you believe would best fit that threshold.
      Never include comments in the Splunk SPL query or in the code block.

      Your job is to write a Splunk SPL query to accomplish the current task:
      {task}

      Reference the following information from google when building your query:
      {isolated_context}

      Respond with only the SPL needed to complete the task. IMPORTANT: JUST RETURN SPL QUERY, YOUR OUTPUT WILL BE ADDED DIRECTLY TO THE SEARCH BY OTHER AGENT. BE MINDFUL OF THIS
      YOUR RESPONSE:
    """
)
# Prompt for updating an existing SPL query. Template variables: objective,
# task, previous_query (the SPL to update) and isolated_context.
# NOTE(review): the template asks to "include explanations/comments" and later
# says "Never include comments" — confirm which instruction is intended.
spl_filter_agent = PromptTemplate(
    input_variables=["objective", "task","previous_query", "isolated_context"],
    template="""
      You are a world-class detection engineer, an expert in cyber security,
      threat hunting, and data science.

      For reference, your high level objective is to:
      {objective}

      Update the provided Splunk SPL Query but include explanations/comments.
      Provide no information about who you are and focus on writing the detection query.
      Ensure the query is bug and error free. Respond in a well-formatted markdown with ONLY the SPL code.
      Ensure code blocks are used for the SPL query sections and that the code blocks always start with "spl".

      Approach problems step by step.
      If there are time based thresholds in the Splunk SPL query that are used to detect suspicious activity, provide a value you believe would best fit that threshold.
      If there is analysis to be done on a given field use one of the Splunk SPL search commands detect suspicious activity, and provide a value you believe would best fit that threshold.
      Never include comments in the Splunk SPL query or in the code block.

      Your job is to update the provided Splunk SPL query to accomplish the current task:
      {task}

      Here is the provided Splunk SPL query:
      {previous_query}

      Reference the following information from google when building your query:
      {isolated_context}

      Respond with only the SPL needed to complete the task. IMPORTANT: JUST RETURN SPL QUERY, YOUR OUTPUT WILL BE ADDED DIRECTLY TO THE SEARCH BY OTHER AGENT. BE MINDFUL OF THIS
      YOUR RESPONSE:
    """
)
# Prompt used for the statistical-analysis step. Template variables: objective,
# task, previous_query (the SPL to update) and isolated_context.
# NOTE(review): the template text is the same as spl_filter_agent's and does not
# mention statistics itself; any statistical instruction has to come from {task}.
spl_statistical_analysis_agent = PromptTemplate(
    input_variables=["objective", "task","previous_query", "isolated_context"],
    template="""
      You are a world-class detection engineer, an expert in cyber security,
      threat hunting, and data science.

      For reference, your high level objective is to:
      {objective}

      Update the provided Splunk SPL Query but include explanations/comments.
      Provide no information about who you are and focus on writing the detection query.
      Ensure the query is bug and error free. Respond in a well-formatted markdown with ONLY the SPL code.
      Ensure code blocks are used for the SPL query sections and that the code blocks always start with "spl".

      Approach problems step by step.
      If there are time based thresholds in the Splunk SPL query that are used to detect suspicious activity, provide a value you believe would best fit that threshold.
      If there is analysis to be done on a given field use one of the Splunk SPL search commands detect suspicious activity, and provide a value you believe would best fit that threshold.
      Never include comments in the Splunk SPL query or in the code block.

      Your job is to update the provided Splunk SPL query to accomplish the current task:
      {task}

      Here is the provided Splunk SPL query:
      {previous_query}

      Reference the following information from google when building your query:
      {isolated_context}

      Respond with only the SPL needed to complete the task. IMPORTANT: JUST RETURN SPL QUERY, YOUR OUTPUT WILL BE ADDED DIRECTLY TO THE SEARCH BY OTHER AGENT. BE MINDFUL OF THIS
      YOUR RESPONSE:
    """
)

# SPL Refactor
'''
Outside the for loop last step is the summary
and it will be the final output
takes results from all the SPL Coding stuff/all tasks in loop
'''

# Fix bad fields: prompt that rewrites an SPL query so that its index, source,
# sourcetype and field names match what the Splunk server actually contains.
# Inputs:
#   existing_spl - the query to correct
#   objective    - the overall detection objective
#   splunk_info  - index/source information retrieved from the Splunk server
#   schema       - field names retrieved from the Splunk server per event code
spl_normalize_agent = PromptTemplate(
    input_variables=["existing_spl","objective", "splunk_info","schema"],
    template="""
    You are a world-class detection engineer and an expert in cyber security, threat hunting, and data science.
    You have been provided an SPL query that does not return any results due to one or more of the field names used not matching the field names in your Splunk server.
    Ensure that the SPL query starts with index=
    If the current SPL query starts with anything other than index, remove those strings from your final splunk query.
    Correct the field names in the SPL query to match the fields in your Splunk server.

    The current SPL query provided:
    {existing_spl}

    Ensure the index and source selected contain the data to answer the objective:
    {objective}

    Ensure the selected fields match the information in your current Splunk server or else the query will not return any data.
    You ran two Splunk queries to retrieve the index, source, and fields for a given event code in your Splunk server and the information was returned.

    Select the index and source that will contain the information from your Splunk SPL query.
    For reference here is the information from your Splunk server:
    {splunk_info}

    Ensure the current fields match the fields from your splunk server for the given event code. For reference here are the fields in a list format from your Splunk server for each event code:
    {schema}

    Only modify the index, source, sourcetype and field names to match the information from your Splunk server. Do not modify the value inside the fields.
    Respond with only the SPL query needed to complete the task. IMPORTANT: JUST RETURN SPL QUERY, YOUR OUTPUT WILL BE ADDED DIRECTLY TO THE SEARCH BY OTHER AGENT. BE MINDFUL OF THIS

    YOUR RESPONSE:
    """
)


# Research Agent
'''
Add some background research
'''
# System message for the research agent. It instructs the model to research
# Windows attacks and write a fact-based detection guide (event codes and field
# names) from Windows security event logs, without writing any SPL itself.
research_system_template = SystemMessage(
    content="""
    You are a world class researcher, who can do detailed research on windows attacks and produce facts based detection procedures using only windows security event logs;
    you do not make things up, you will try as hard as possible to gather facts & data to back up the research;
    you will priortize searching for specific event log id, field names, and values for field names

    Write a comprehensive guide to build a Splunk SPL Query including event codes and field names.
    DO NOT build any actual SPL Query.

    Please make sure you complete the objective above with the following rules:
    1/ You should do enough research to gather as much information as possible about the objective
    2/ If there are url of relevant links & articles, you will scrape it to gather more information
    3/ After scraping & search, you should think "is there any new things i should search & scraping based on the data I collected to increase research quality?" If answer is yes, continue; But don't do this more than 3 iteratins
    4/ You should not make things up, you should only write facts & data that you have gathered
    5/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research
    """
)


# Prompt that extracts the two most relevant Windows Event IDs from a written
# detection procedure. The model is asked to answer with a bare list such as
# "[4769, 4688]"; the caller parses that text, so the closing instruction must
# be unambiguous.
event_id_prompt = PromptTemplate(
    input_variables=["detect_procedure"],
    template="""
    Your task is to assist users in building comprehensive Splunk SPL Queries tailored to their specific needs. You will focus on extracting the top two (2) relevant Event ID to be incorporated into the SPL query, ensuring that the detection logic is accurately represented. DO NOT build any actual SPL Query. You can analyze detection procedure and capable of EventCode Extraction.

    EventCode Extraction: Understand the user's detection logic and extract or recommend the top two (2) relevant Windows Event ID that need to be included in the SPL query. Only Extract the two most relevant Event IDs.

    Here is the current detection procedure:
    --- (Start Detection procedure) ---
    {detect_procedure}
    --- (End Detection procedure) ---
    Your output must be a list of EventCode numbers. Here is an example:
    [4769, 4688]
    RETURN LIST ONLY:
    """
)
# Analyze results: prompt that turns the objective, the executed query and its
# raw results into a short summary and response recommendation.
summarize_splunk_results = PromptTemplate(
    input_variables=["objective", "query", "results"],
    template="""You are a world-class detection engineer and an expert in cyber security,
    threat hunting, and data science. You have been asked to perform the following objective:
    {objective}.

    The query you created was the following:
    {query}

    The results of the query are:
    {results}

    Provide a summary to your manager to describe the results of the query based on the goal provided. You do not need to provide a summary of the query, only provide a brief summary of the result and how you would recommend the security team should respond.
    Be as clear and concise as possible when describing the results of the query and the recommendation you provide.
    """,
)
# Human-in-the-loop prompt: applies a reviewer's free-text instruction to the
# current SPL query, or returns the query as-is when the reviewer signals that
# no updates are required.
splunk_human_input_agent = PromptTemplate(
    input_variables=["human_input", "query"],
    template="""You are a world-class detection engineer and an expert in cyber security,
    threat hunting, and data science.

    You have created an amazing Splunk SPL query but have been asked to modify the command by your superior:
    {human_input}.

    The query you created was the following:
    {query}

    If your superior says "No updates required" or a variation of that comment to signify the Splunk SPL query does not need modification, then simply return the Splunk SPL query you created.

    If your superior provides instructions for modifying the Splunk SPL query, modify your Splunk SPL query and respond with only the SPL query needed to complete the task. Do not explain the query. Do not add comments.
    YOUR RESPONSE:
    """,
)

## Chains

In [None]:
# Index the local blog-post text in FAISS so it can be queried by the agent.
loader = TextLoader('/content/BlogPostSplunkGPT.txt')
doc = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=400,
)
docs = text_splitter.split_documents(doc)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
docsearch = FAISS.from_documents(docs, embeddings)

# Retrieval QA over the FAISS index; the same LLM also drives the agent below.
research_llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
qa = RetrievalQA.from_chain_type(
    llm=research_llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

# Tools offered to the research agent: internet search, website scraping and
# question answering over the local document index.
research_tools = [
    Tool(
        name="Internet_Search",
        description="Internet Search: useful for when you need to answer questions about current events, internet data. You should ask targeted questions",
        func=search,
    ),
    ScrapeWebsiteTool(),
    Tool(
        name="Local_Search",
        description="Local Search: useful for when you need to answer questions about current events, using local data. You should ask targeted questions",
        func=qa.run,
    ),
]

# The research system message is passed to the agent via agent_kwargs.
agent_kwargs = {"system_message": research_system_template}
research_chain = initialize_agent(
    research_tools,
    research_llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=False,
    agent_kwargs=agent_kwargs,
)



In [None]:
def _build_llm(temperature=0.0):
    """Return a ChatOpenAI client with the settings shared by every chain below.

    Args:
        temperature: Sampling temperature; the only setting that differs
            between chains.

    Returns:
        A new ChatOpenAI instance (gpt-3.5-turbo-16k, 2000 max tokens,
        frequency penalty 0.2, presence penalty 0).
    """
    return ChatOpenAI(
        model_name="gpt-3.5-turbo-16k",
        temperature=temperature,
        max_tokens=2000,
        model_kwargs={'frequency_penalty': 0.2, 'presence_penalty': 0},
    )


# Each chain gets its own LLM instance paired with its prompt template.
start_llm = _build_llm()
start_chain = LLMChain(llm=start_llm, prompt=tasks_initializer_prompt, verbose=False)

detial_llm = _build_llm()
detial_chain = LLMChain(llm=detial_llm, prompt=tasks_details_agent, verbose=False)

tasks_context_llm = _build_llm()
tasks_context_chain = LLMChain(llm=tasks_context_llm, prompt=tasks_context_agent, verbose=False)

tasks_human_llm = _build_llm()
tasks_human_chain = LLMChain(llm=tasks_human_llm, prompt=tasks_human_agent, verbose=False)

task_assigner_llm = _build_llm()
task_assigner_chain = LLMChain(llm=task_assigner_llm, prompt=task_assigner_agent, verbose=False)

spl_writer_llm = _build_llm()
spl_writer_chain = LLMChain(llm=spl_writer_llm, prompt=spl_writer_agent, verbose=False)

# The refactor chain must run on its own LLM (temperature 0.2), not on the
# writer's LLM; otherwise spl_refactor_llm is created but never used.
spl_refactor_llm = _build_llm(temperature=0.2)
spl_refactor_chain = LLMChain(llm=spl_refactor_llm, prompt=spl_normalize_agent, verbose=False)

event_id_llm = _build_llm()
event_id_chain = LLMChain(llm=event_id_llm, prompt=event_id_prompt, verbose=False)

spl_summary_llm = _build_llm(temperature=0.2)
spl_summary_chain = LLMChain(llm=spl_summary_llm, prompt=summarize_splunk_results, verbose=False)

spl_writer_agent_testing_llm = _build_llm(temperature=0.2)
spl_writer_agent_testing_chain = LLMChain(llm=spl_writer_agent_testing_llm, prompt=spl_writer_agent_testing, verbose=False)

tasks_details_agent_testing_llm = _build_llm()
tasks_details_agent_testing_chain = LLMChain(llm=tasks_details_agent_testing_llm, prompt=tasks_details_agent_testing, verbose=False)

spl_filter_agent_llm = _build_llm()
spl_filter_agent_chain = LLMChain(llm=spl_filter_agent_llm, prompt=spl_filter_agent, verbose=False)

# Defined once; an identical second definition of this pair was redundant.
spl_statistical_analysis_agent_llm = _build_llm()
spl_statistical_analysis_chain = LLMChain(llm=spl_statistical_analysis_agent_llm, prompt=spl_statistical_analysis_agent, verbose=False)

splunk_human_input_agent_llm = _build_llm()
splunk_human_input_agent_chain = LLMChain(llm=splunk_human_input_agent_llm, prompt=splunk_human_input_agent, verbose=False)



# Main Logic

In [None]:
# Seed inputs for the run: the user's request and the objective derived from it.
user_input = "detect a BsidesAugusta attack"
objective = "Build a Splunk SPL Query to " + user_input

# Local vs. remote research switch.
# NOTE(review): the original inline comment read "Defaults to Remote" while the
# value is True — confirm the intended default.
local_research = True

# Ask the task-initializer chain for its initial response to the objective.
initial_response = start_chain.run(objective)
print(initial_response)

In [None]:
#
# Task research and detection procedures
#

local_search = True

# The question is prefixed with the wording used in the research tools'
# descriptions ("Local Search" / "Internet Search").
prefix = "Local Search " if local_search else "Internet Search "

research_question = prefix + " for current detection procedures that " + user_input
content = research_chain({"input": research_question})
# Keep only the agent's final answer text.
actual_content = content['output']

In [None]:
# Display the research agent's final answer.
print(actual_content)

In [None]:
import re

#
# Splunk index and source info
#
# One result row per index, listing the sources seen in that index.
search_query = "| tstats values(source) as source by index"
data = run_splunk_search(search_query)
index_list = list(data)
splunk_info = index_list
print(f"Splunk Info: {splunk_info}")

#
# Schema info
#
# Maps each event code (as a string) to the field names Splunk reports for it.
schema = {}
eventID = event_id_chain.run(actual_content)

# The model is asked for a list such as "[4769, 4688]", but its spacing and any
# surrounding text are not guaranteed. Take the digit runs from the first
# bracketed list (or from the whole reply if there is none) instead of slicing
# off the first/last character and splitting on ", ". This also guarantees that
# only digits are interpolated into the SPL below.
bracketed = re.search(r"\[([^\]]*)\]", eventID)
items = re.findall(r"\d+", bracketed.group(1) if bracketed else eventID)
print(items)

for event_code in items:
    search_query = f'search index="main" EventCode={event_code} | fieldsummary |table field '
    field_data = run_splunk_search(search_query)
    all_fields = []
    for fields in field_data:
        # Diagnostic messages may be returned in the results, therefore ignore them
        if isinstance(fields, dict):
            all_fields.extend(fields.values())
    # Only record event codes for which Splunk returned at least one field.
    if all_fields:
        schema[event_code] = all_fields

for event_code, field_names in schema.items():
    print(event_code, field_names)
print()


In [None]:
# Display the objective string that is passed to the chains.
print(objective)

In [None]:
# Display the initial response returned by start_chain.
print(initial_response)

In [None]:
# Task detail: run the "testing" variant of the task-details chain on the
# initial response and the research output. The earlier detial_chain call,
# which additionally took splunk_info and schema, is not used here.
detial_response = tasks_details_agent_testing_chain.predict(
    objective=objective,
    task_list_json=initial_response,
    detection_procedures=actual_content,
)
print(detial_response)

In [None]:
# Task context: run the task-context chain on the detailed task list and the
# research output.
context_response = tasks_context_chain.predict(
    objective=objective,
    task_list_json=detial_response,
    detection_procedures=actual_content,
)
print(context_response)

In [None]:
# Task list: parse the context chain's JSON reply and preview every planned
# task (description, isolated context and recommended agent).
task_json = json.loads(context_response)
for task in task_json["tasks"]:
    task_description = task["description"]
    task_isolated_context = task["isolated_context"]
    task_agent_reccomendation = task["agent"]
    # print() has no colour argument; passing "yellow" only printed that word.
    print("*****TASK*****")
    print(task_description)
    print(task_isolated_context)
    print(task_agent_reccomendation)

In [None]:
# Walk the planned tasks in order and hand each one to the chain named in its
# "agent" field. `context_response` starts out as the JSON task plan and is
# then reused to carry the SPL query from one agent to the next; the cells
# that follow read the final query from it.
task_json = json.loads(context_response)
task_list_json = task_json["tasks"]

for task in task_list_json:
    task_description = task["description"]
    task_isolated_context = task["isolated_context"]
    chosen_agent = task["agent"]

    if chosen_agent == "splunk_executor_agent":
        # TODO: execute the Splunk SPL, send the results to a task re-creation
        # agent to judge whether they are good enough, and if not add a new
        # task to the list or hand back to the writer.
        print("splunk_executor_agent")
        while True:
            human_answer = input("Would you like to modify the final query before executing? (Yes/No): ")
            wants_change = human_answer.lower() in ["yes", "y"]
            if wants_change:
                human_input = input("What would you like to modify? ")
            else:
                # The prompt tells the model to return the query when it sees
                # this phrase.
                human_input = "No updates required"

            context_response = splunk_human_input_agent_chain.predict(
                human_input=human_input,
                query=context_response,
            )
            print(f"Output from SPL Executer Agent:\n\n{context_response}\n\nIs it good?\n\n")

            # Keep asking until the reviewer declines further changes.
            if not wants_change:
                break

    elif chosen_agent == "spl_writer_agent":
        # Writes a query from scratch, so it takes no previous query.
        context_response = spl_writer_agent_testing_chain.predict(
            objective=objective,
            task=task_description,
            isolated_context=task_isolated_context,
        )
        print(f"Output from SPL Writer Agent:\n\n{context_response}\n\nIs it good?\n\n")

    elif chosen_agent == "spl_filter_agent":
        print("spl_filter_agent")
        context_response = spl_filter_agent_chain.predict(
            objective=objective,
            task=task_description,
            previous_query=context_response,
            isolated_context=task_isolated_context,
        )
        print(f"Output from SPL Filter Agent:\n\n{context_response}\n\nIs it good?\n\n")

    elif chosen_agent == "spl_statistical_analysis_agent":
        context_response = spl_statistical_analysis_chain.predict(
            objective=objective,
            task=task_description,
            previous_query=context_response,
            isolated_context=task_isolated_context,
        )
        print(f"Output from SPL Stats Agent:\n\n{context_response}\n\nIs it good?\n\n")

    elif chosen_agent == "spl_refactor_agent":
        # Aligns index/source/field names with the Splunk server's metadata.
        context_response = spl_refactor_chain.predict(
            existing_spl=context_response,
            objective=objective,
            splunk_info=splunk_info,
            schema=schema,
        )
        print(f"Output from SPL Refactor Agent:\n\n{context_response}\n\nIs it good'er?\n\n")

    else:
        # Agent names come from model output, so surface anything unexpected
        # instead of silently dropping the task.
        print(f"Skipping task with unrecognized agent {chosen_agent!r}: {task_description}")

In [None]:
# Display the current value of context_response (the query produced by the
# task loop once that cell has run).
print(context_response)

In [None]:
# Run the final query and summarise its results.
#
# run_splunk_search is called elsewhere in this notebook with strings that
# start either with the `search` command or with a leading pipe. Only prepend
# "search " when the generated query has neither; otherwise the query would
# become "search search ..." or "search | ...".
generated_spl = context_response.strip()
if generated_spl.startswith("|") or generated_spl.lower().startswith("search "):
    final_query = generated_spl
else:
    final_query = "search " + generated_spl

final_data = run_splunk_search(final_query)

# Materialise the result stream so it can be passed to the summary prompt.
splunk_results = list(final_data)

summary_response = spl_summary_chain.predict(
    objective=objective,
    query=context_response,
    results=splunk_results,
)
print(summary_response)