In [3]:
import torch

In [5]:
from sentence_transformers import SentenceTransformer

In [6]:
import pandas as pd
import numpy as np

In [7]:
glossary = pd.read_csv("sans_glossary_terms.csv")  # Replace with your file path
terms = glossary["Term"].tolist()
definitions = glossary["Definition"].tolist()

In [8]:
combined_texts = [f"{term}: {definition}" for term, definition in zip(terms, definitions)]

In [9]:
combined_texts

['Access Control: Access ControlAccess Control ensures that resources are only granted to those users who are entitled to them.',
 'Access Control List (ACL): Access Control List (ACL)A mechanism that implements access control for a system resource by listing the identities of the system entities that are permitted to access the resource.',
 'Access Control Service: Access Control ServiceA security service that provides protection of system resources against unauthorized access. The two basic mechanisms for implementing this service are ACLs and tickets.',
 'Access Management Access: Access Management AccessManagement is the maintenance of access information which consists of four tasks: account \nadministration, maintenance, monitoring, and revocation.',
 'Access Matrix: Access MatrixAn Access Matrix uses rows to represent subjects and columns to represent objects with privileges listed in each cell.',
 'Account Harvesting: Account HarvestingAccount Harvesting is the process of collec

In [11]:
import hf_xet

In [12]:
model = SentenceTransformer('all-MiniLM-L6-v2')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  35%|###4      | 31.5M/90.9M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:  12%|#1        | 10.5M/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
embeddings = model.encode(combined_texts, convert_to_numpy=True)

np.save("embeddings.npy", embeddings)
glossary.to_csv("glossary_terms.csv", index=False)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
def get_best_match(user_query):
    query_embedding = model.encode([user_query], convert_to_numpy=True)
    similarities = cosine_similarity(query_embedding, embeddings)
    best_match_index = np.argmax(similarities)
    return terms[best_match_index], definitions[best_match_index]

In [19]:
def chatbot():
    print("Welcome to the Cybersecurity Glossary Chatbot!")
    print("Ask about a cybersecurity term (or type 'exit' to quit):")

    while True:
        user_query = input("Your question: ")
        if user_query.lower() == 'exit':
            print("Goodbye!")
            break

        term, definition = get_best_match(user_query)
        print(f"\nTerm: {term}\nDefinition: {definition}\n")

In [24]:
import pandas as pd
import random

# 1. Load your SANS glossary CSV (must have columns "Term" and "Definition")
glossary_df = pd.read_csv("sans_glossary_terms.csv")

# 2. 30 templates focused solely on requesting a term's definition
templates = [
    "What is `{term}`?",
    "Define `{term}`.",
    "What does `{term}` mean?",
    "Give the definition of `{term}`.",
    "What is the definition of `{term}`?",
    "Could you define `{term}`?",
    "Please define `{term}`.",
    "How do you define `{term}`?",
    "Explain what `{term}` means.",
    "I’d like the definition of `{term}`.",
    "What's the meaning of `{term}`?",
    "What does the term `{term}` refer to?",
    "How would you describe `{term}`?",
    "What does `{term}` stand for?",
    "Give me the meaning of `{term}`.",
    "Please explain `{term}`.",
    "Describe the meaning of `{term}`.",
    "What’s the explanation for `{term}`?",
    "What is the exact meaning of `{term}`?",
    "What is meant by `{term}`?",
    "How is `{term}` defined?",
    "In cybersecurity, what does `{term}` mean?",
    "Provide the definition of `{term}`.",
    "What is the meaning of the term `{term}`?",
    "Tell me the definition of `{term}`.",
    "Help me with the definition of `{term}`.",
    "How would you define `{term}`?",
    "Could you explain the meaning of `{term}`?",
    "What’s `{term}` defined as?",
    "What does `{term}` mean in cybersecurity?"
]

# 3. Generate one question per template for each term
all_qs = []
for term in glossary_df["Term"].tolist():
    for tpl in templates:
        question = tpl.format(term=term)
        all_qs.append((question, term))

# 4. Shuffle and select 200 questions
random.shuffle(all_qs)
test_set = all_qs[:200]

# 5. Save to CSV
test_df = pd.DataFrame(test_set, columns=["question", "correct_term"])
test_df.to_csv("glossary_test_questions.csv", index=False)

In [25]:
test_df.head()

Unnamed: 0,question,correct_term
0,Help me with the definition of `Password Authe...,Password Authentication Protocol (PAP)
1,Explain what `Fragment Offset` means.,Fragment Offset
2,Please define `Bastion Host`.,Bastion Host
3,Tell me the definition of `Byte`.,Byte
4,Describe the meaning of `Crimeware`.,Crimeware


In [36]:
def get_best_match_terms(user_query):
    query_embedding = model.encode([user_query], convert_to_numpy=True)
    similarities = cosine_similarity(query_embedding, embeddings)
    best_match_index = np.argmax(similarities)
    return terms[best_match_index]

In [37]:
def chatbot_predict(user_query):
    term = get_best_match_terms(user_query)
    return term  # Return the predicted term

In [38]:
test_df["Predicted Term"] = test_df["question"].apply(chatbot_predict)

In [39]:
print("\nPredictions DataFrame:")
test_df[["question", "correct_term", "Predicted Term"]]


Predictions DataFrame:


Unnamed: 0,question,correct_term,Predicted Term
0,Help me with the definition of `Password Authe...,Password Authentication Protocol (PAP),Password Authentication Protocol (PAP)
1,Explain what `Fragment Offset` means.,Fragment Offset,Fragment Offset
2,Please define `Bastion Host`.,Bastion Host,Bastion Host
3,Tell me the definition of `Byte`.,Byte,Byte
4,Describe the meaning of `Crimeware`.,Crimeware,Crimeware
...,...,...,...
195,How would you describe `Corruption`?,Corruption,Corruption
196,What does `Internet Standard` stand for?,Internet Standard,Internet Standard
197,What does the term `Private Addressing` refer to?,Private Addressing,Private Addressing
198,I’d like the definition of `Virus`.,Virus,Virus


In [41]:
accuracy = (test_df["correct_term"] == test_df["Predicted Term"]).mean()
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Accuracy: 96.00%


In [51]:
incorrect_df = test_df[test_df["correct_term"] != test_df["Predicted Term"]]
print("\nMisclassified Examples DataFrame:")
incorrect_df


Misclassified Examples DataFrame:


Unnamed: 0,question,correct_term,Predicted Term
51,Give the definition of `Sniffer`.,Sniffer,Sniffing
73,Explain what `Reverse Proxy` means.,Reverse Proxy,HTTP Proxy
87,Please define `Switch`.,Switch,Switched Network
97,Provide the definition of `Fingerprinting`.,Fingerprinting,TCP Fingerprinting
105,Please define `Firewall`.,Firewall,Personal Firewalls
130,What is meant by `Domain Name`?,Domain Name,Domain
173,What is the definition of `Hash Functions`?,Hash Functions,Hash Function
185,Tell me the definition of `Host-Based ID`.,Host-Based ID,Host


In [52]:
chatbot()

Welcome to the Cybersecurity Glossary Chatbot!
Ask about a cybersecurity term (or type 'exit' to quit):


Your question:  what is a Firewall



Term: Personal Firewalls
Definition: Personal FirewallsPersonal firewalls are those firewalls that are installed and run on individual PCs.



Your question:  what is a hash function



Term: Hash Function
Definition: Hash FunctionAn algorithm that computes a value based on a data object thereby mapping the data object to a smaller data object.



Your question:  what is sniffing



Term: Sniffing
Definition: SniffingA synonym for "passive wiretapping."



Your question:  exit


Goodbye!
