#  README

In [None]:
'''
To test this model, the dataset must be organized as a main folder containing subfolders named after each application. 
Inside each subfolder, place the pcap files corresponding to the network traffic of that application. 
The folder structure should look like this:

/path/to/main_folder/
        ├── AppA/
        │   ├── capture1.pcap
        │   └── capture2.pcap
        ├── AppB/
            ├── capture1.pcap
            └── capture2.pcap

This structure ensures that the model can correctly associate each pcap file with its corresponding application during testing.
'''

# Imports

In [1]:
# Imports
from flowprint.flowprint import FlowPrint
from flowprint.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os.path as p
import os
from datetime import datetime

# Note:
# The model requires WireShark to be installed for it to work properly.
# WireShark is a network protocol analyzer that the model depends on for handling or pcap files.

# Flow extraction

In [2]:
def load_data(path):
    """
    This function loads pcap file paths and their corresponding labels from a specified directory.
    
    Assumptions:
    - The dataset directory structure must consist of a main folder (specified by 'path')
      containing subfolders named after the applications. Each subfolder should contain 
      pcap files related to that specific application.
    
    Parameters:
    - path (str): The path to the main directory containing subdirectories with pcap files.
    
    Process:
    - The function iterates over all files in the specified directory (excluding hidden files).
    - For each valid file found:
        1. The full file path is appended to the global list 'AllPcaps'.
        2. The file name (without the extension) is extracted and appended to the global list 'AllLabels'.
           This name typically corresponds to the application's name.
    
    Note:
    - The lists 'AllPcaps' and 'AllLabels' are expected to be defined globally before calling this function.
    
    Example:
    If the directory structure is:
    /path/to/main_folder/
        ├── AppA/
        │   ├── capture1.pcap
        │   └── capture2.pcap
        ├── AppB/
            ├── capture1.pcap
            └── capture2.pcap
    Then after calling `load_data('/path/to/main_folder/')`, 'AllPcaps' will contain the paths to each pcap file,
    and 'AllLabels' will contain 'capture1', 'capture2', etc., as labels.
    """

    all_pcaps = []
    all_labels = []

    for dirs in os.listdir(path):
        if dirs.startswith('.'):
                continue
        for file in os.listdir(p.join(path, dirs)):
            if file.startswith('.'):
                continue
            file_path = p.join(path, dirs,file)
            all_pcaps.append(file_path)
            all_labels.append(dirs)

    return all_pcaps, all_labels

In [3]:
dataset = '/home/leonardo/GitHub/mappgraph/data/flow/old'

# Load data
AllPcaps, AllLabels = load_data(dataset)

In [4]:
print(AllPcaps)

['/home/leonardo/GitHub/mappgraph/data/flow/old/Splitwise.SplitwiseMobile/Splitwise.SplitwiseMobile.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/yarn-chat-text-stories/yarn-chat-text-stories.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/mxtech.videoplayer.ad/mxtech.videoplayer.ad.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/club factory/club.fromfactory.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/club factory/club-factory-fair-price.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/abtnprojects.ambatana/abtnprojects.ambatana.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/ndtv/ndtv.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/tiexue.mobile.topnews/tiexue.mobile.topnews.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/naukriApp.appModules.login/naukriApp.appModules.login.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/tanchishedazuozhan/tanchishedazuozhan.pcap', '/home/leonardo/GitHub/mappgraph/data/flow/old/flipagram/fli

In [None]:
# Create Preprocessor object
preprocessor = Preprocessor(verbose=True)

# Create Flows and labels
X, y = preprocessor.process(files =AllPcaps,
                            labels=AllLabels,)

In [6]:
# get the time
time = datetime.now().strftime('%H:%M:%S')
filename = './flows_'+ time +'.p'
# Save flows and labels to file 'flows.p'
preprocessor.save(filename, X, y)

### Load flow

In [7]:
# Load flows from file 'flows.p'
# flow_path = 'name_of_file'
# X, y = preprocessor.load(flow_path)

# Fingerprint generation

In [8]:
# Split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [9]:
# Create FlowPrint object
flowprint = FlowPrint(
    batch       = 300,
    window      = 30,
    correlation = 0.1,
    similarity  = 0.9
)

In [10]:
# Fit FlowPrint with flows and labels
flowprint.fit(X_train, y_train)

# Create fingerprints for test data
fp_test = flowprint.fingerprint(X_test)
# Predict best matching fingerprints for each test fingerprint
y_pred = flowprint.predict(fp_test)

In [11]:
# Load fingerprints from file 'fingerprints.json'
# This returns both the fingerprints and stores them in the flowprint object
# fingerprints = flowprint.load('./tests/fingerprints_20:01:40.json')

In [12]:
# Create FlowPrint object
flowprint = FlowPrint(
    batch       = 300,
    window      = 30,
    correlation = 0.1,
    similarity  = 0.9
)

# Fit FlowPrint with flows and labels
flowprint.fit(X_train, y_train)

# Recognise which app produced each flow
y_recognize = flowprint.recognize(fp_test)
# Detect previously unseen apps
# +1 if a flow belongs to a known app, -1 if a flow belongs to an unknown app
y_detect    = flowprint.detect(fp_test)

In [13]:
# Print report with 4 digit precision
print(classification_report(y_test, y_recognize, digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


                                                   precision    recall  f1-score   support

                                              1mg     0.9778    0.9167    0.9462        48
                                      58tongcheng     1.0000    0.9870    0.9935        77
                                      8-ball-pool     1.0000    0.9821    0.9910        56
                                         Buzzfeed     0.9643    0.9310    0.9474        29
                               HinKhoj.Dictionary     0.0000    0.0000    0.0000        14
                                            Qunar     0.9836    0.9836    0.9836        61
                                         Snapchat     0.2500    0.8571    0.3871         7
                        Splitwise.SplitwiseMobile     0.0000    0.0000    0.0000        11
                        SugarMommasDating_4791482     1.0000    0.9884    0.9942        86
                                         UCMobile     1.0000    1.0000    1.0000       16