# Workflow

<img src="https://images.ctfassets.net/xjan103pcp94/4PX0l1ruKqfH17YvUiMFPw/c60a7a665125cb8056bebcc146c23b76/image8.png" alt="Example Image" width="400" height="300">

1. Pass the query to the embedding model to semantically represent it as an embedded query vector.
2. Pass the embedded query vector to our vector DB.
3. Retrieve the top-k relevant contexts – measured by distance between the query embedding and all the embedded chunks in our knowledge base.
4. Pass the query text and retrieved context text to our LLM.
5. The LLM will generate a response using the provided content.

# Downloading Dataset

In [None]:
import requests
from tqdm import tqdm

def download_atlasv2():
    """Download the ATLASv2 archive and unpack it into ``atlas_dataset/``.

    The archive is streamed to ``atlasv2.tar.gz`` in the current working
    directory while a tqdm progress bar is updated, and is then extracted
    with the system ``tar`` command. If the server does not answer with
    HTTP 200, a message is printed and nothing is extracted.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        subprocess.CalledProcessError: If ``tar`` exits with a non-zero status.
    """
    # Imported locally so the function does not depend on other notebook cells.
    import os
    import subprocess

    # NOTE(review): this looks like a pre-signed, time-limited link - confirm it is still valid.
    url = "https://public.boxcloud.com/d/1/b1!DF-5pZffA7zR1_FcUVCjx4PJaM3r8Lx6kBOjMXR9DG-geexQh9pfYrZNUzBLVe5vu620bSdhYCFpRhlwpF8DRlkV1VXDUIytUAH0lEvnJsN2Gzu4IVGKYqvRCwEQw02lbOAxCKDHJ7tKVn-C2OQSZs6Cr2kCOHM5TLQxIfWzrJzTU4dxHicqi-tDUeMF4xo2WKxIH8aNtVmx5TIMa_9tWaOb3npDRkUvk7MGCuk6CFsQSTNigqvmC2gSRtBnrVtdnCDPZF_yNBBLCpyOVFAEeR18pnqSdCw0HS4nkDHeA9yZ1RF4t6eYRM4xK8DrWAWMdDmT2_qdw4g4wi8WSjXeCCUmE3kOaUcmd3pXgS46qtb8lZdmw4PTriZUt9szpaRe5AgxO7F_Up0b8mbNZWDJ8kF7G9gHR6vToa28kWq_TwF7nOEtbdzPHZeL-vAk73D2cCJTmJB4einKCrsGLe4R-MOzKcWj7wu2Fjt1IvS6aqVRVy1uvwIRiWrUyhHDxSxsFGezirHsGKbBQej5Cytn40BaOlmFHMvK3S9vfpL8m2XjqmBW_q6sw4HnNYMBKTb9DzKrQb2aj3apEyuANeh-VSImgX09kiIWM70Q-QpUAI794N8bnJ3iykg6kjWro-3EZb44wOjuvMNGUp-HOzAbwY2P_Gk2y-Opo3UkXwoJV66n0LyMVgt8tLI0Dm7Q_VEsyw9u-b_mn3-uxblZR4fllyfCur7Ew8R2jiIZcry-o5iE2M1lIL7zlDy3-yCMhN75VXMI6EVvtuIwaHmLMubvu2N62nGVJ2n039LVAkByCoGzRBNIOXVgBy-aY49xnScn0F_fzyjQIK7VdsObJnO9Tg5bjurfRQ_ZQNx-g4CPG-vne2FkuI1ft7bLEcIMmVk0zn67VHwmcQ4rcBFHW_d_Vw2OfozrS0QjpcOIX1oC7h_MZIHb2SMzLKu3VyhqroZkYSz-Eebl3ieQuF8NO9d4kRJin_OBxki1_7IFSa239hcMoMk3x3hmHmsfeOb3msEtCdcXvcQOdt-inyjuNlvd_uhqs-p2LWcW32GBpAfYpnP26wMT-DHo1lX7k4R4BEsxWIelC-_-GnGpx7R7wsxvHmamyjjKeNfv_RvUeBYGqC67q64HAZY3ggQyyuzCanzXukzevdO5CkmxB61cNAE4OfKYcuPpITWE9XStCjA_hQ7xsnSz6tG0v69Ul9-qWxrmlChf9aubNe6HyBifPh7l7udwRnaVumiTMN0HaM232VdT-t5HoetaqSCnKIj46ql4iEZOqCuA/download"
    filename = "atlasv2.tar.gz"
    extract_dir = "atlas_dataset/"

    # A single streamed GET replaces the former HEAD + GET pair: redirects are
    # followed by default and the final response carries Content-Length itself.
    # The timeout stops a stalled connection from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print(f"Failed to download file. Status code: {response.status_code}")
            # Do not try to extract an archive that was never downloaded.
            return

        # A missing Content-Length becomes None so tqdm shows a plain counter.
        file_size = int(response.headers.get('content-length', 0)) or None

        with open(filename, 'wb') as file, tqdm(
            desc=filename,
            total=file_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                file.write(chunk)
                bar.update(len(chunk))

    print(f"File downloaded successfully and saved as {filename}")

    # tar -C requires the target directory to exist already.
    os.makedirs(extract_dir, exist_ok=True)
    # Argument list (no shell) with check=True so a failed extraction is reported
    # here rather than as a missing-file error in a later cell.
    subprocess.run(["tar", "-xzvf", filename, "-C", extract_dir], check=True)

# Analyzing Events

In [None]:
import pandas as pd
import json

# Supported event types mapped to
# (edge label, object kind, field holding the object id, field holding the object name).
_EVENT_SPECS = {
    'endpoint.event.crossproc': ('crossproc', 'PROCESS', 'crossproc_guid', 'crossproc_name'),
    'endpoint.event.procstart': ('procstart', 'PROCESS', 'childproc_guid', 'childproc_name'),
    'endpoint.event.filemod': ('filemod', 'FILE', 'filemod_name', 'filemod_name'),
    'endpoint.event.netconn': ('netconn', 'SOCKET', 'remote_ip', 'remote_ip'),
}

# Column order of the returned frame; also used when no event matched.
_EVENT_COLUMNS = [
    'action', 'actorID', 'objectID', 'object',
    'actorname', 'objectname', 'timestamp',
]


def load_data(file_path):
    """Load a JSON-lines event log into a DataFrame sorted by timestamp.

    Only records whose ``type`` is one of the keys of ``_EVENT_SPECS`` are
    kept; every other record is ignored. Each kept record becomes one row
    describing an actor process (``process_guid`` / ``process_path``) acting
    on an object (another process, a file, or a remote IP).

    Args:
        file_path: Path to a file containing one JSON object per line.
            Blank lines are skipped.

    Returns:
        pandas.DataFrame with the columns ``action``, ``actorID``,
        ``objectID``, ``object``, ``actorname``, ``objectname`` and
        ``timestamp`` (parsed from ``device_timestamp``), sorted by
        ``timestamp``. The frame is empty, but has these columns, when no
        record matches.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If a timestamp does not match the expected format.
    """
    log_data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank (e.g. trailing) line would make json.loads raise.
            if not line.strip():
                continue
            doc = json.loads(line)

            spec = _EVENT_SPECS.get(doc.get('type'))
            if spec is None:
                continue

            edge_label, object_kind, id_field, name_field = spec
            log_data.append({
                'action': edge_label,
                'actorID': doc.get('process_guid'),
                'objectID': doc.get(id_field),
                'object': object_kind,
                'actorname': doc.get('process_path'),
                'objectname': doc.get(name_field),
                'timestamp': doc.get('device_timestamp'),
            })

    # Explicit columns keep the 'timestamp' lookup valid for an empty result.
    df = pd.DataFrame(log_data, columns=_EVENT_COLUMNS)
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S.%f %z UTC')
    df = df.sort_values(by='timestamp')
    return df

In [None]:
# JSONL event log of host h1 from the benign part of the extracted dataset.
host1 = "atlas_dataset/atlasv2/data/benign/h1/cbc-edr/edr-h1-benign.jsonl"

In [None]:
def format_event(row):
    """Render one event row as an "<actor> performed <action> ..." sentence.

    ``row`` must support lookup by the keys ``actorname``, ``action``,
    ``object`` and ``objectname``.
    """
    actor, verb = row['actorname'], row['action']
    kind, target = row['object'], row['objectname']
    return f"{actor} performed {verb} action on {kind} named {target}"

# Turn the benign host log into one text document per actor process:
# render every event as a sentence, keep each distinct sentence once,
# then join the sentences of each actor with '. '.
df = load_data(host1)
df = df.assign(event=df.apply(format_event, axis=1)).drop_duplicates(subset='event')

documents = (
    df.groupby('actorname')['event']
    .apply(lambda sentences: '. '.join(sentences))
    .to_dict()
)

In [None]:
# Keep only the per-process document texts; the process-path keys are dropped.
process_docs = list(documents.values())

# GPT-4 Response - Benign Process

# Analysis of Process `smss.exe`

Based on the provided system event details, it is likely that the actions described are part of normal system behavior and not indicative of an attack. Here is a breakdown of the reasoning:

1. **Process Start and File Modifications**:
   - `smss.exe` (Session Manager Subsystem) is a legitimate system process responsible for handling sessions on the system. Its execution is normal for Windows operation.
   - File modifications in directories like `c:\windows\system32\logfiles\wmi\rtbackup\` and `c:\windows\rescache\` are typical system activities. The `rescache` directory is used for resource caching, and modifications to files within it (e.g., `rescache.dir`, `rescache.hit`, `segment0.cmf`, `segment1.cmf`) occur during regular system maintenance and operations.

2. **Network Connections**:
   - Connections to `192.168.223.1` and `192.168.223.130` are within the private IP range, commonly used in internal networks. Such connections might be part of normal communication within a local network.
   - The `0000:0000:0000:0000:0000:0000:0000:0001` address refers to the loopback address in IPv6, which is typically used for testing and internal communication.

While none of these activities individually or collectively raise immediate red flags for malicious behavior, it is crucial to have a broader context to make a fully informed decision. For instance, continuous monitoring and correlation with other system events, historical patterns, and known attack vectors can provide more conclusive evidence.

**Decision**: **Benign**

However, it is advisable to continuously monitor for any unusual patterns or deviations from normal behavior.


In [None]:
# JSONL event log of host h1 for scenario s1 from the attack part of the extracted dataset.
attack_host = "atlas_dataset/atlasv2/data/attack/h1/cbc-edr/edr-h1-s1.jsonl"

In [None]:
def format_event(row):
    """Render one event row as an "<actor> performed <action> ..." sentence.

    ``row`` must support lookup by the keys ``actorname``, ``action``,
    ``object`` and ``objectname``.
    """
    actor, verb = row['actorname'], row['action']
    kind, target = row['object'], row['objectname']
    return f"{actor} performed {verb} action on {kind} named {target}"

# Turn the attack host log into one text document per actor process:
# render every event as a sentence, keep each distinct sentence once,
# then join the sentences of each actor with '. '.
df = load_data(attack_host)
df = df.assign(event=df.apply(format_event, axis=1)).drop_duplicates(subset='event')

documents = (
    df.groupby('actorname')['event']
    .apply(lambda sentences: '. '.join(sentences))
    .to_dict()
)

In [None]:
# Process whose document is used as the attack sample.
pname = 'c:\\users\\aalsahee\\payload.exe'
# "<process path> -> <joined event sentences>"; a KeyError here means the
# path is not among the actor names grouped into `documents`.
attack_sample =f"{pname} -> {documents[pname]}"

In [None]:
# Show each actor process next to the character length of its document.
for actor, text in documents.items():
    print(actor, len(text))

SYSTEM 79
c:\program files\common files\microsoft shared\officesoftwareprotectionplatform\osppsvc.exe 353
c:\program files\confer\scanner\scanhost.exe 371
c:\program files\confer\scanner\upd.exe 132093
c:\program files\microsoft office\office14\excel.exe 1701
c:\program files\mozilla firefox\firefox.exe 713524
c:\program files\mozilla firefox\plugin-container.exe 2797
c:\program files\vmware\vmware tools\tpautoconnsvc.exe 9924
c:\program files\wireshark\dumpcap.exe 190
c:\program files\wireshark\tshark.exe 311
c:\python27\python.exe 143
c:\users\aalsahee\payload.exe 7784
c:\windows\explorer.exe 2769
c:\windows\system32\audiodg.exe 361
c:\windows\system32\cmd.exe 411
c:\windows\system32\conhost.exe 872
c:\windows\system32\consent.exe 107
c:\windows\system32\csrss.exe 105
c:\windows\system32\dllhost.exe 158
c:\windows\system32\lsass.exe 1195
c:\windows\system32\lsm.exe 337
c:\windows\system32\mmc.exe 2049
c:\windows\system32\searchindexer.exe 1171
c:\windows\system32\services.exe 3992
c:

# GPT-4 Response - Attack Process

# Analysis of Process `c:\users\aalsahee\payload.exe`

## 1. Process Name and Location
- The name `payload.exe` and its location in a user directory (not a standard system or program directory) raise immediate red flags. The term "payload" is commonly associated with malicious software, although this alone isn't definitive proof of maliciousness.

## 2. File Modification in Temp Directory
- The process is modifying multiple files in a temporary directory (`c:\users\aalsahee\appdata\local\temp\_mei15042`). This behavior is typical of software that unpacks or stages components during execution.
- The files include a mix of `.pyd` (Python extension modules), `.dll` (Dynamic Link Libraries), and other executable content. The creation of these files could indicate the unpacking of a self-contained Python application, often seen with tools like PyInstaller or malicious software using such tools to bundle and obfuscate code.

## 3. Network Connection
- The process established a network connection to `10.193.66.115`. While the IP address itself isn't immediately indicative of a threat, the combination of network activity with the other behaviors (especially from a process named "payload.exe") is highly suspicious.
- This suggests possible data exfiltration or command-and-control communication.

## 4. Process Start Event
- The process starts itself (`procstart action on PROCESS named c:\users\aalsahee\payload.exe`), which could be part of a self-extracting or self-executing script.

# Decision
The activities observed are **highly suspicious** and likely indicative of a malicious process. The use of a generic name like `payload.exe`, file modifications in the user's temp directory, the presence of many `.dll` and `.pyd` files being unpacked, and the network connection are all consistent with behaviors seen in malware.

Given this evidence, the process is likely **malicious**. Immediate investigation and containment (e.g., isolating the affected system) would be advisable.


## APT Kill Chain Stages Involved

### 1. Installation
- The process `payload.exe` is performing extensive file modifications in a temporary directory, including creating and modifying `.dll` and `.pyd` files.
- This activity suggests that the attacker has successfully executed a payload on the victim's system, likely establishing a foothold by installing necessary components to further their objectives.
- This stage involves the deployment of the malicious software (the payload) that will enable persistence or further malicious activities.

### 2. Command and Control (C2)
- The process has made a network connection to an external IP address (`10.193.66.115`).
- This behavior is typical of the C2 phase, where the installed malware communicates with a remote server controlled by the attacker.
- This connection could be used to receive instructions, exfiltrate data, or stage further attacks.

## Conclusion
Given the process activities, the attacker has moved past the initial stages of reconnaissance, weaponization, and delivery, and has reached the **Installation** stage, with indications that the **Command and Control (C2)** stage is either beginning or already in progress.

At this point, the attacker likely has established a foothold on the system and is preparing to or has already begun communicating with external servers for further exploitation or data exfiltration. Immediate action to disrupt this chain, such as isolating the affected system, is critical to prevent further progression.


# To-Do List:

1. Link alerts into a temporally ordered alert chain.
2. Construct a causal document for each alert process node.
3. Filter out irrelevant or benign information from the documents.
4. Map system entity sets to specific stages of the attack kill chain using the documents.
5. Generate a report detailing the attack from the initial compromise to the current stage.

<img src="https://i.ibb.co/WK2L7h7/Screenshot-2024-08-19-at-1-06-58-PM.png" alt="Example Image">
