In [1]:
!pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading faiss_gpu_cu12-1.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.11.0


In [2]:
import json
import faiss
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer

2025-08-11 23:00:02.244575: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754953202.437023      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754953202.487851      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# --- Load JSON corpus ---
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open("/kaggle/input/misp-data/misp_events.json", "r", encoding="utf-8") as f:
    corpus = json.load(f)

In [4]:
# --- Flatten each nested event record into one searchable string ---
def _event_to_text(record):
    """Return the event's "info" followed by all of its attribute values.

    The two parts are joined by a single space and the result is stripped of
    leading/trailing whitespace. Missing "info", "Attribute" or "value"
    entries fall back to an empty string / empty list.
    """
    event = record["Event"]
    values = [attribute.get("value", "") for attribute in event.get("Attribute", [])]
    joined_values = " ".join(values)
    return f"{event.get('info', '')} {joined_values}".strip()


texts = [_event_to_text(record) for record in corpus]

In [5]:
# Embedding checkpoint (chosen as fast and light)
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Encode every prepared text; a progress bar is displayed while encoding
embeddings = model.encode(
    texts,
    show_progress_bar=True,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [6]:
# Build FAISS index
# FAISS expects a C-contiguous float32 matrix. Enforcing that here keeps the
# stored vectors in the same dtype that the query path casts to before search.
embeddings_f32 = np.ascontiguousarray(embeddings, dtype="float32")
embedding_dim = embeddings_f32.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_f32)

# Retrieval maps FAISS ids straight back to positions in `texts`, so the two
# must stay the same length.
assert index.ntotal == len(texts), "index size does not match number of texts"

print(f"Indexed {len(texts)} chunks.")

Indexed 500 chunks.


In [7]:
# --- Load the text-generation LLM ---
# device_map="auto" leaves device placement to the library.
# NOTE(review): no dtype is passed here; confirm the default precision fits
# the available GPU memory.
LLM_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
llm = pipeline(
    "text-generation",
    model=LLM_MODEL_NAME,
    device_map="auto",
)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Device set to use cuda:0


In [10]:
# --- Query function ---
def retrieve_and_generate(query, top_k=5):
    """Answer ``query`` with the LLM, using the nearest indexed texts as context.

    Relies on the module-level ``model`` (embedder), ``index`` (FAISS index),
    ``texts`` (documents, positionally aligned with the index) and ``llm``
    (text-generation pipeline).

    Args:
        query: Question text; it is embedded and used as the search vector.
        top_k: Number of nearest neighbours requested from ``index``.

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace.
    """
    query_emb = model.encode([query]).astype("float32")
    _distances, neighbour_ids = index.search(query_emb, top_k)

    # FAISS pads the id row with -1 when fewer than top_k neighbours exist.
    # texts[-1] would silently inject the last document, so skip those slots.
    retrieved_docs = [texts[i] for i in neighbour_ids[0] if i >= 0]

    context = "\n".join(retrieved_docs)
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"

    # do_sample=True makes generation stochastic; temperature is kept low.
    result = llm(prompt, max_new_tokens=300, do_sample=True, temperature=0.3)
    generated = result[0]["generated_text"]
    # Drop the first len(prompt) characters (assumes the pipeline echoes the
    # prompt at the start of its output) and return only the continuation.
    return generated[len(prompt):].strip()

In [11]:
# --- Example query against the indexed events ---
question = "List IP addresses related to ransomware"
answer = retrieve_and_generate(question)
print(answer)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Here are some IP addresses related to ransomware:

- 162.244.80.235
- 85.93.88.165
- 185.141.63.120
- 82.118.21.1
- 195.242.213.155
- 185.82.217.131
- 91.132.138.221
- 91.132.138.213
- 193.194.126.195
- 182.82.219.201
- 185.82.219.201
- 138.199.47.184

These IP addresses are associated with various ransomware attacks and campaigns, as reported by CISA and other sources. It's important to monitor and block these IP addresses to prevent ransomware infections and data breaches.
