In [1]:
#0. install dependencies
!pip install pymisp langchain chromadb sentence-transformers torch transformers hf_xet

from pymisp import PyMISP
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions
from transformers import pipeline

Defaulting to user installation because normal site-packages is not writeable


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#1. configuration
MISP_URL = "https://127.0.0.1"
MISP_KEY = "YOUR_MISP_KEY"
VERIFY_CERT = False
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"    # embedding model
LLM_NAME = "HuggingFaceH4/zephyr-7b-beta"                # LLM model

In [3]:
#2. connect to MISP and pull data
misp = PyMISP(MISP_URL, MISP_KEY, VERIFY_CERT)
events = misp.search(controller='events', limit=50)

docs = []
for event in events:
    event_info = event['Event']['info']
    attributes = event['Event']['Attribute']
    attr_texts = [f"{a['type']}:{a['value']}" for a in attributes]
    content = f"Event: {event_info}\n" + "\n".join(attr_texts)
    docs.append(content)

print(f"Pulled {len(docs)} events from MISP.")



Pulled 50 events from MISP.


In [4]:
#3. chunk the documents
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = []
for doc in docs:
    chunks.extend(splitter.split_text(doc))

In [5]:
#4. create local chromaDB vector store
client = chromadb.Client()
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL_NAME)

collection = client.create_collection(name="misp_rag", embedding_function=embedding_fn)
for i, chunk in enumerate(chunks):
    collection.add(documents=[chunk], ids=[str(i)])

print(f"Stored {len(chunks)} chunks in ChromaDB.")

Stored 7055 chunks in ChromaDB.


In [6]:
# log in to hugging face
HF_TOKEN = 'YOUR_HUGGING_FACE_TOKEN'  # Hugging Face token
if HF_TOKEN:
    from huggingface_hub import login
    login(token=HF_TOKEN)

In [7]:
#5. retrieval and generation
def retrieve_and_generate(query):
    results = collection.query(query_texts=[query], n_results=3)
    retrieved_docs = "\n".join(results['documents'][0])

    llm = pipeline("text-generation", model=LLM_NAME, device=-1) #CPU
    prompt = f"Answer the question using the following threat intel:\n{retrieved_docs}\n\nQuestion: {query}\nAnswer:"
    response = llm(prompt, max_length=512, do_sample=True, temperature=0.3)
    return response[0]['generated_text']

answer = retrieve_and_generate("What IP addresses are linked to ransomware?")
print("\n--- ANSWER ---\n", answer)

Fetching 8 files: 100%|██████████| 8/8 [13:15<00:00, 99.46s/it]   
Loading checkpoint shards: 100%|██████████| 8/8 [00:00<00:00, 51.58it/s]
Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- ANSWER ---
 Answer the question using the following threat intel:
ip-dst:111.181.67.117
ip-dst:111.181.67.146
ip-dst:111.181.67.148
ip-dst:111.181.67.169
ip-dst:111.181.67.176
ip-dst:111.181.67.192
ip-dst:111.181.67.193
ip-dst:111.181.67.208
ip-dst:111.181.67.215
ip-dst:111.181.67.216
ip-dst:111.181.67.253
ip-dst:111.181.68.42
ip-dst:111.181.68.44
ip-dst:111.181.68.64
ip-dst:111.181.68.68
ip-dst:111.181.68.78
ip-dst:111.181.68.134
ip-dst:111.181.68.136
ip-dst:111.181.68.148
ip-dst:111.181.68.172
ip-dst:111.181.68.183
ip-dst:111.181.68.190
ip-dst:114.224.171.181
ip-dst:114.224.171.183
ip-dst:114.224.171.185
ip-dst:114.224.171.188
ip-dst:114.224.171.189
ip-dst:114.224.171.192
ip-dst:114.224.171.199
ip-dst:114.224.171.210
ip-dst:114.224.171.219
ip-dst:114.224.171.220
ip-dst:114.224.171.222
ip-dst:114.224.171.224
ip-dst:114.224.171.228
ip-dst:114.224.171.232
ip-dst:114.224.171.239
ip-dst:114.224.171.244
ip-dst:114.224.171.252
ip-dst:114.224.171.253
ip-dst:114.224.172.4
ip-dst:114.224.