# Imports

In [None]:
# Imports
from flowprint.flowprint import FlowPrint
from flowprint.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os.path as p
import os
from datetime import datetime

# Note:
# The model requires Wireshark to be installed for it to work properly.
# Wireshark is a network protocol analyzer that the model depends on for handling pcap files.

# Flow extraction

In [None]:
def load_data(path):
    """
    Collect pcap file paths and their application labels from a dataset directory.

    Expected layout:
    - 'path' is a main folder containing one subfolder per application. Each
      subfolder holds the pcap files captured for that application, and every
      file inside it is labelled with the subfolder's name.
    - For backward compatibility, a file placed directly inside 'path' is also
      accepted and is labelled with its own name without the extension.

    Parameters:
    - path (str): The path to the main dataset directory.

    Process:
    - Entries are visited in sorted order so the lists are built in the same
      order on every run (os.listdir() returns entries in arbitrary order).
    - Hidden entries (names starting with '.') are skipped at both levels.
    - Only one level of subfolders is scanned; directories nested deeper are ignored.
    - For each file found:
        1. The full file path is appended to the global list 'AllPcaps'.
        2. Its label is appended to the global list 'AllLabels'.

    Note:
    - The lists 'AllPcaps' and 'AllLabels' are expected to be defined globally
      before calling this function; they are extended in place.

    Raises:
    - OSError: if 'path' does not exist or cannot be listed (raised by os.listdir).

    Example:
    If the directory structure is:
    /path/to/main_folder/
        ├── AppA/
        │   ├── capture1.pcap
        │   └── capture2.pcap
        ├── AppB/
            ├── capture1.pcap
            └── capture2.pcap
    Then after calling `load_data('/path/to/main_folder/')`, 'AllPcaps' will contain the
    paths to the four pcap files and 'AllLabels' will contain 'AppA', 'AppA', 'AppB', 'AppB'.
    """
    for entry in sorted(os.listdir(path)):
        if entry.startswith('.'):
            continue
        entry_path = p.join(path, entry)

        if p.isdir(entry_path):
            # Application subfolder: every capture inside shares the folder name as label.
            for capture in sorted(os.listdir(entry_path)):
                if capture.startswith('.'):
                    continue
                capture_path = p.join(entry_path, capture)
                # Skip nested directories; they cannot be read as pcap files.
                if p.isfile(capture_path):
                    AllPcaps.append(capture_path)
                    AllLabels.append(entry)
        else:
            # Flat layout: the file itself is the capture, labelled by its stem.
            AllPcaps.append(entry_path)
            AllLabels.append(p.splitext(entry)[0])

In [None]:
# Root directory of the dataset that load_data() will scan
dataset = 'dataset_path'
# Two independent, initially empty lists that load_data() appends to:
# pcap file paths and the label for each of those files
AllPcaps, AllLabels = [], []

In [None]:
# Fill the global AllPcaps / AllLabels lists from the dataset directory
load_data(dataset)

In [None]:
# Instantiate the Preprocessor with verbose=True
preprocessor = Preprocessor(verbose=True)

# Turn the collected pcap files and their labels into flows (X) and labels (y)
X, y = preprocessor.process(files=AllPcaps, labels=AllLabels)

In [None]:
# Tag the output file with the current time so successive runs get distinct names.
# Hyphens are used instead of colons because ':' is not a valid file-name
# character on Windows.
time = datetime.now().strftime('%H-%M-%S')
filename = './flows_' + time + '.p'
# Save flows and labels to the timestamped file, e.g. './flows_20-01-40.p'
preprocessor.save(filename, X, y)

### Load flow

In [None]:
# Load flows from file 'flows.p'
# flow_path = 'name_of_file'
# X, y = preprocessor.load(flow_path)

# Fingerprint generation

In [None]:
# Hold out 30% of the flows for testing; the fixed random_state makes the
# shuffle repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Instantiate FlowPrint with explicit batch, window, correlation and similarity settings
flowprint = FlowPrint(batch=300, window=30, correlation=0.1, similarity=0.9)

In [None]:
# Fit FlowPrint with the training flows and their labels
flowprint.fit(X_train, y_train)

# Create fingerprints for the held-out test flows
fp_test = flowprint.fingerprint(X_test)
# Predict the best matching fingerprint for each test fingerprint
# NOTE(review): y_pred is not read by any later cell visible in this notebook
y_pred = flowprint.predict(fp_test)

In [None]:
# Load fingerprints from file 'fingerprints.json'
# This both returns the fingerprints and stores them in the flowprint object
# fingerprints = flowprint.load('./tests/fingerprints_20:01:40.json')

In [None]:
# Build a fresh FlowPrint object
flowprint = FlowPrint(batch=300, window=30, correlation=0.1, similarity=0.9)

# Train it on the training flows and labels
flowprint.fit(X_train, y_train)

# Recognise which app produced each test fingerprint
y_recognize = flowprint.recognize(fp_test)
# Detect previously unseen apps:
# +1 marks a flow from a known app, -1 marks a flow from an unknown app
y_detect = flowprint.detect(fp_test)

In [None]:
# Print the classification report comparing the true test labels (y_test)
# with the recognised labels (y_recognize), using 4 digits of precision
print(classification_report(y_test, y_recognize, digits=4))