# 0. Setup

In [3]:
from typing import List
import os
import numpy as np
import pandas as pd
import fitz
from scipy.spatial.distance import cdist
from utils import embed, complete_prompt
import openai
from secret_keys import OPENAI_API_KEY
openai.api_key = OPENAI_API_KEY
from IPython.display import display


In [4]:
def extract_all_text_w_metadata(pdf_location: str, chunk_size: int = 600, chunk_overlap: int = 100) -> List[dict]:
    """
    Extract all text from a PDF file and split it into overlapping character chunks,
    attaching metadata (originating file name and chunk position) to every chunk.

    pdf_location: Path to the PDF file. Backslash and forward-slash separators are both accepted.
    chunk_size: Amount of characters in a text chunk. Must be positive.
    chunk_overlap: Amount of characters overlapping between chunks. Must be smaller than chunk_size.

    Returns a list of dicts, one per chunk, with the keys "originating_document",
    "text_chunk" and "chunk_id" (the 0-based position of the chunk in the document).

    Raises ValueError when chunk_size is not positive or chunk_overlap >= chunk_size.
    """
    # Validate before touching the file: a zero stride crashes inside range() with an
    # unhelpful message and a negative stride silently produces no chunks at all.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    stride = chunk_size - chunk_overlap

    # Take the last path component regardless of which separator the caller used.
    doc_name = pdf_location.replace("\\", "/").rsplit("/", 1)[-1]

    doc = fitz.open(pdf_location)
    try:
        page_texts = [page.get_text().strip().replace("\n", " ") for page in doc]
    finally:
        # Release the file handle even when text extraction fails.
        doc.close()
    all_text = " ".join(page_texts)

    # Split into overlapping chunks
    chunks = [all_text[start : start + chunk_size] for start in range(0, len(all_text), stride)]
    return [
        {"originating_document": doc_name, "text_chunk": chunk, "chunk_id": i}
        for i, chunk in enumerate(chunks)
    ]

def embed_security_policies(policy_chunks: List[str]):
    """
    Embed a list of policy text chunks and stack the resulting vectors into one array.

    policy_chunks: Text chunks to embed; handed unchanged to `embed`.

    Returns a numpy array built from the "embedding" entry of every item found under
    the "data" key of the `embed` response, in response order.
    """
    response = embed(policy_chunks)
    vectors = []
    for item in response["data"]:
        vectors.append(item["embedding"])
    return np.array(vectors)


def get_relevant_policy_chunks(
    question: str,
    policy_chunks: pd.DataFrame,
    chunk_embeddings: np.ndarray,
    nr_chunks: int = 6,
):
    """
    Retrieve the policy chunks whose embeddings lie closest to the embedded question
    (the security control specification at hand).

    question: Text to embed and search for.
    policy_chunks: DataFrame with the columns "text_chunk", "originating_document" and
        "chunk_id"; rows are looked up by position, so row i must correspond to row i
        of chunk_embeddings.
    chunk_embeddings: One embedding vector per row of policy_chunks.
    nr_chunks: How many of the closest chunks to return.

    Returns a tuple (chunks_str, metadata_str): a bullet list of the selected chunk
    texts, and a bullet list naming the document and chunk id of each selected chunk.
    """
    query_vector = np.array(embed(question)["data"][0]["embedding"])
    # cdist(..., "cosine") yields cosine *distance*, so the smallest values are the best matches.
    cosine_distances = cdist(query_vector[np.newaxis, ...], chunk_embeddings, "cosine")[0]
    closest_positions = np.argsort(cosine_distances)[:nr_chunks]
    closest_rows = policy_chunks.iloc[closest_positions]

    chunk_lines = []
    source_lines = ["Sources obtained from: \n"]
    for _, row in closest_rows.iterrows():
        chunk_lines.append(f"- {row['text_chunk']} \n")
        source_lines.append(
            f"- Document: {row['originating_document']} / location: {row['chunk_id']}\n"
        )
    return "".join(chunk_lines), "".join(source_lines)

# 1. Embed the security policies

In [5]:
# Folder holding the sample policy PDFs. Built with os.path.join so the literal carries no
# backslash ("\s" is an invalid escape sequence) and the separator follows the host OS.
SAMPLE_POLICY_LOC = os.path.join("data", "sample_policies")

In [6]:
# Collect the chunk records of every PDF found in the sample policy folder.
all_text = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order reproducible.
for sample_policy in sorted(os.listdir(SAMPLE_POLICY_LOC)):
    # Compare the real file extension, case-insensitively, so "X.PDF" is included and a
    # file that is merely named "pdf" is not.
    if os.path.splitext(sample_policy)[1].lower() != ".pdf":
        continue
    # os.path.join avoids the invalid "\{" escape and the hard-coded Windows separator.
    all_text.extend(extract_all_text_w_metadata(os.path.join(SAMPLE_POLICY_LOC, sample_policy)))
all_text_df = pd.DataFrame(all_text)

In [7]:
# Preview the chunk table (columns: originating_document, text_chunk, chunk_id).
display(all_text_df)

Unnamed: 0,originating_document,text_chunk,chunk_id
0,acceptable_use_policy.pdf,Version 3.0 DWP Acceptable Use Policy Cont...,0
1,acceptable_use_policy.pdf,licy applies to .................................,1
2,acceptable_use_policy.pdf,/Voice Communication.............................,2
3,acceptable_use_policy.pdf,on The Acceptable Use Policy (AUP) aims to p...,3
4,acceptable_use_policy.pdf,ties for the appropriate use of DWP’s informa...,4
...,...,...,...
296,vulnerability_management_policy.pdf,ception requests will invoke the DWP Security ...,38
297,vulnerability_management_policy.pdf,"rposes, if appropriate with security log audit...",39
298,vulnerability_management_policy.pdf,PR.IP-12 DE.CM-8 DE.DP-2 RS.AN-5 ...,40
299,vulnerability_management_policy.pdf,to vulnerabilities disclosed to the organizat...,41


In [8]:
# Embed every text chunk; row i of `embeddings` has to belong to row i of `all_text_df`.
embedding_objects = embed(all_text_df["text_chunk"].to_list())
embeddings = np.array([embed_obj["embedding"] for embed_obj in embedding_objects["data"]])
# The retrieval step maps embedding positions back onto all_text_df rows by position,
# so a length mismatch would attribute answers to the wrong chunks (or fail later on).
assert len(embeddings) == len(all_text_df), (
    f"got {len(embeddings)} embeddings for {len(all_text_df)} text chunks"
)

In [9]:
# Sanity check: (number of embedded chunks, length of each embedding vector).
embeddings.shape

(301, 1536)

# 2. Perform an ISO 27002 audit of the security policies

In [10]:
# Load the ISO 27002 control checklist. The path is built with os.path.join so the
# literal has no backslash ("\I" is an invalid escape sequence) and works on any OS.
iso_controls = pd.read_excel(os.path.join("data", "ISO_controls.xlsx"))
display(iso_controls)

Unnamed: 0,Security clause,Security category,ISO 27002 2013 reference,Security control,Security control specification,Description,Documentation
0,Information Security Policies,Management direction for information security,5.1.1,Policies for information security,Whether the organisation has comprehensive pol...,,
1,Information Security Policies,Management direction for information security,5.1.2,Review of the Information Security Policy,Whether the Security Policy has an owner defin...,,
2,Organisation of Information Security,Internal Organisation,6.1.1,Information security roles and responsibilities,Whether information security for information a...,,
3,Organisation of Information Security,Internal Organisation,6.1.2,Segregation of duties,Whether conflicting duties and areas of respon...,,
4,Organisation of Information Security,Internal Organisation,6.1.3,Contact with authorities,Whether there are policies or procedures in pl...,,
...,...,...,...,...,...,...,...
109,Compliance,Compliance with legal and contractual requirem...,18.1.4,Privacy and protection of personally identifia...,Whether there is a policy and appropriate proc...,,
110,Compliance,Compliance with legal and contractual requirem...,18.1.5,Regulation of Cryptographic Controls,Whether effective regulation of cryptographic ...,,
111,Compliance,Information security reviews,18.2.1,Independent review of information security,"Whether policies, processes, procedures, contr...",,
112,Compliance,Information security reviews,18.2.2,Compliance with security policies and standards,Whether information systems were regularly che...,,


In [11]:
# System message passed along with every audit prompt.
SYSTEM_ROLE = "You are a cybersecurity auditor performing an ISO 27002 2013 audit."


def get_prompt(
    relevant_chunks,
    clause: str,
    category: str,
    reference: str,
    control: str,
    specification: str,
):
    """
    Build the audit prompt for a single security control.

    relevant_chunks: Policy excerpts to present as implementation evidence; inserted
        verbatim under the first heading.
    clause, category, reference, control, specification: Fields describing the
        control; each is inserted verbatim after its label.

    Returns the prompt text: the evidence section, the control description, and the
    instruction to assess whether the specification is implemented.
    """
    sections = [
        "Security policy implementation descriptions:",
        f"{relevant_chunks}",
        "",
        f"Security clause: {clause}",
        f"Security category: {category}",
        f"ISO 27002 2013 reference: {reference}",
        f"Security control: {control}",
        f"Security control specification: {specification}",
        "",
        "Assess whether the security control specification is implemented, based on the given Security policy implementation descriptions.",
        "",  # keeps the trailing newline of the prompt
    ]
    return "\n".join(sections)

Focus on just the access control clauses

In [12]:
# Keep only the controls filed under the "Access control" clause and preview the first three.
access_control_audit = iso_controls.loc[iso_controls["Security clause"].eq("Access control")]
display(access_control_audit.head(3))

Unnamed: 0,Security clause,Security category,ISO 27002 2013 reference,Security control,Security control specification,Description,Documentation
25,Access control,Business requirements of access control,9.1.1,Access Control Policy,Whether asset owners have determined appropria...,,
26,Access control,Business requirements of access control,9.1.2,Access to networks and network services,Whether users are only able to gain access to ...,,
27,Access control,User access management,9.2.1,User registration and de-registration,Whether the organisation has a formal registra...,,


In [15]:
# Audit every access-control row: retrieve the most relevant policy chunks, build the
# prompt, ask the model for an assessment and record the answer together with its sources.
audited_df = []
for row_position, (line_idx, line) in enumerate(access_control_audit.iterrows()):
    question = line["Security control specification"]
    relevant_chunks, chunk_metadata = get_relevant_policy_chunks(
        question=question, policy_chunks=all_text_df, chunk_embeddings=embeddings
    )
    prompt = get_prompt(
        relevant_chunks,
        line["Security clause"],
        line["Security category"],
        line["ISO 27002 2013 reference"],
        line["Security control"],
        question,
    )
    # Show one example prompt for inspection. Keyed on the position in the loop rather
    # than on a hard-coded index label, so it keeps working if the control sheet changes.
    if row_position == 0:
        print(prompt)
    response = complete_prompt(prompt, system_role=SYSTEM_ROLE)
    # `line` already is the current row; looking it up again by label is redundant and
    # would yield a DataFrame instead of a row if index labels were not unique.
    line_dict = line.to_dict()
    line_dict["Description"] = response
    line_dict["Documentation"] = chunk_metadata
    audited_df.append(line_dict)

Security policy implementation descriptions:
- ted information asset access controls and  procedures, and that security responsibilities have been allocated and  accepted, and log system and service user activity to determine individual  accountability.  5.1.4.2. Ensure the effective use of cryptography, especially where  interconnections between systems or services exist.  5.1.4.3. Ensure users are accountable for safeguarding their authentication  information,  5.1.4.4. Ensure correct and secure operations of information processing  facilities by regulating, monitoring and reviewing the implementation of  protective measures,  5.1.4.5. E 
- ntify DWP information assets and define responsibilities to ensure  that information receives an appropriate level of protection in accordance with  its importance to the organisation and to the citizen, and its hosting location,  5.1.3.2. Ensure DWP has appropriate structures and processes to enable the  Department to understand the use of and mo

In [45]:
# Turn the per-control result dicts into a DataFrame and show it.
# Note: the name `audited_df` is rebound here from a list to a DataFrame.
audited_df = pd.DataFrame(audited_df)
display(audited_df)

Unnamed: 0,Security clause,Security category,ISO 27002 2013 reference,Security control,Security control specification,Description,Documentation
0,Access control,Business requirements of access control,9.1.1,Access Control Policy,Whether asset owners have determined appropria...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: informati...
1,Access control,Business requirements of access control,9.1.2,Access to networks and network services,Whether users are only able to gain access to ...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: privilege...
2,Access control,User access management,9.2.1,User registration and de-registration,Whether the organisation has a formal registra...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: privilege...
3,Access control,User access management,9.2.2,User access provisioning,Whether there is a documented procedure for ap...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: privilege...
4,Access control,User access management,9.2.3,Management of privileged access rights,Whether the allocation and use of any privileg...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: privilege...
5,Access control,User access management,9.2.4,Management of secret authentication informatio...,The allocation and reallocation of secret auth...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: informati...
6,Access control,User access management,9.2.5,Review of user access rights,Whether there exists a process to review user ...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: privilege...
7,Access control,User access management,9.2.6,Removal or adjustment of access rights,Whether procedures are clearly established for...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: personnel...
8,Access control,User responsibilities,9.3.1,Use of secret authentication information,Whether there are any guidelines in place to g...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: acceptabl...
9,Access control,System and application access control,9.4.1,Information access restriction,Whether access to information held in shares o...,Based on the given Security policy implementat...,Sources obtained from: \n- Document: privilege...


# 3. Inspect some audit responses

In [67]:
def print_line(line):
    """
    Print one audit result: the control specification, the recorded assessment and
    the sources it was based on.

    line: Row-like object indexable by the keys 'Security control specification',
        'Description' and 'Documentation'.
    """
    report = "\n".join(
        [
            f"Question: {line['Security control specification']}",
            "        ",
            f"Response: {line['Description']}",
            "",
            f"{line['Documentation']}",
            "",
        ]
    )
    print(report)

In [68]:
# Inspect the audit result stored at row position 0.
print_line(audited_df.iloc[0])

Question: Whether asset owners have determined appropriate access control rules, access rights and restrictions for specific user roles. The strictness of the access rules must reflect the associated information security risks
        
Response: Based on the given Security policy implementation descriptions, it appears that the Access Control Policy security control specification is partially implemented. The policy mentions the need for documented information asset access controls and procedures, the allocation and acceptance of security responsibilities, and the use of cryptography for interconnections between systems or services. It also mentions the need for a formal authorization process for privileged user roles, appropriate background checks and clearance for privileged users, and the use of identity and access management policies for managing privileged users. However, it is unclear whether asset owners have determined appropriate access control rules, access rights, and restri

In [69]:
# Inspect the audit result stored at row position 6.
print_line(audited_df.iloc[6])

Question: Whether there exists a process to review user access rights at regular intervals - e.g. Special privilege review every 3 months, normal privileges every 6 months
        
Response: Based on the given Security policy implementation descriptions, it is stated that privileged user access rights must be monitored and reviewed and revalidated on a monthly basis to confirm that the levels of access are still required for the role. This indicates that there is a process in place to review user access rights at regular intervals, which aligns with the security control specification of reviewing user access rights every 3-6 months. Therefore, it can be assessed that the security control specification of reviewing user access rights is implemented.

Sources obtained from: 
- Document: privileged_users_security_policy.pdf / location: 8
- Document: privileged_users_security_policy.pdf / location: 7
- Document: privileged_users_security_policy.pdf / location: 6
- Document: privileged_user

In [70]:
# Inspect the audit result stored at row position 10.
print_line(audited_df.iloc[10])

Question: Whether access to information system is attainable only via a secure log-on process
        
Response: There is no clear indication in the given Security policy implementation descriptions that specifically addresses the implementation of the security control specification for secure log-on procedures. However, the following descriptions may indirectly relate to the control:

- "Ensure users are accountable for safeguarding their authentication information" (5.1.4.3)
- "Ensure correct and secure operations of information processing facilities by regulating, monitoring and reviewing the implementation of protective measures" (5.1.4.4)

Therefore, further investigation and clarification may be necessary to determine if the security control specification for secure log-on procedures is fully implemented.

Sources obtained from: 
- Document: information_security_policy.pdf / location: 22
- Document: vulnerability_management_policy.pdf / location: 39
- Document: information_securi