In [13]:
from pathlib import Path
import subprocess
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import shutil
import json
import time
import re

In [14]:
# Base directories: both systems live side by side under one project root.
PROJECT_ROOT = Path("/Users/book_kuno/Desktop/Final")
ML_SYSTEM = PROJECT_ROOT / "ML_System"      # training/testing notebooks and datasets
SIM_SYSTEM = PROJECT_ROOT / "Sim_System5"   # attack-simulation playbook and its output

In [15]:
# Function 1: Train models using ML_System/scripts/integrator.ipynb
def train_models():
    """Execute the training notebook ``ML_System/scripts/integrator.ipynb``.

    The notebook is parsed as an nbformat v4 document and run in-process with
    nbconvert's ``ExecutePreprocessor``, using the notebook's own directory as
    the working directory. The executed notebook object is discarded; nothing
    is written back to disk by this function.

    Raises:
        Any exception raised while opening, parsing or executing the notebook
        propagates to the caller (nothing is caught here).
    """
    print("Step 1: Starting training using integrator.ipynb...")
    integrator_path = ML_SYSTEM / "scripts" / "integrator.ipynb"
    # Notebook files are UTF-8 JSON; do not depend on the platform's default encoding.
    with open(integrator_path, encoding="utf-8") as f:
        nb = nbformat.read(f, as_version=4)
    # timeout is in seconds (nbconvert applies it to each cell execution).
    ep = ExecutePreprocessor(timeout=1200)
    # 'path' sets the working directory so relative paths inside the notebook resolve.
    ep.preprocess(nb, {'metadata': {'path': str(integrator_path.parent)}})
    print("Step 1: Training completed.\n")


In [16]:
# Function 2: Simulate attack and generate test CSV in Sim_System
def wait_and_copy_csv(timeout=10000, poll_interval=30, src_csv=None, dest_dir=None):
    """Wait for the simulation's capture file to appear, then copy it for testing.

    Args:
        timeout: Upper bound, in seconds of accumulated sleeping, to wait for
            the source file to exist.
        poll_interval: Seconds to sleep between existence checks.
        src_csv: File to wait for. Defaults to ``SIM_SYSTEM / "out" / "capture.csv"``.
        dest_dir: Directory that receives ``capture.csv`` (created if missing).
            Defaults to ``ML_SYSTEM / "sim_data"``.

    Returns:
        bool: True if the file was copied, False if the wait timed out or the
        copy failed. Failures are also reported with ``print``.

    Note:
        Only existence is checked. A capture.csv left over from an earlier run
        is copied immediately, and a file that is still being written is
        copied as-is.
    """
    # Fall back to the notebook-level base directories instead of repeating absolute paths.
    src_csv = SIM_SYSTEM / "out" / "capture.csv" if src_csv is None else Path(src_csv)
    dest_dir = ML_SYSTEM / "sim_data" if dest_dir is None else Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest_csv = dest_dir / "capture.csv"

    print(f"Attempting to copy file from {src_csv} to {dest_csv}...")

    elapsed = 0
    while not src_csv.exists() and elapsed < timeout:
        print(f"capture.csv not found yet. Waiting for {poll_interval} seconds...")
        time.sleep(poll_interval)
        elapsed += poll_interval

    if not src_csv.exists():
        print("Timeout reached. capture.csv was not found in the expected location.")
        return False

    try:
        shutil.copy(src_csv, dest_csv)
    except OSError as e:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"Failed to copy file. Error: {e}")
        return False

    print(f"Successfully copied capture.csv to {dest_dir}.")
    return True

def simulate_attack_and_export():
    """Run the attack-simulation playbook and import the resulting capture.csv.

    Executes ``ansible-playbook`` on ``SIM_SYSTEM / "ddos_attack_sequence.yml"``,
    prints the captured stdout, and, if the playbook exits with status 0, calls
    ``wait_and_copy_csv()`` to bring the capture into the ML system.

    Returns:
        None. Failures (non-zero exit status, or the executable not being
        runnable) are reported with ``print`` only, so the caller cannot
        detect them from the return value.
    """
    print("Step 2: Starting simulation in Sim_System...")

    # Path to the YAML orchestration file, derived from the shared base directory
    attack_sequence_file = SIM_SYSTEM / "ddos_attack_sequence.yml"
    print(f"Step 2: Located YAML orchestration file at: {attack_sequence_file}")

    # Execute the YAML using ansible-playbook (adjust if you're using a different tool)
    print("Step 2: Executing ansible-playbook command...")
    try:
        result = subprocess.run(
            ["ansible-playbook", str(attack_sequence_file)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError as e:
        # Raised when ansible-playbook is not installed / not on PATH / not executable.
        print("Step 2: Simulation failed with error:\n", e)
        return

    # Print output for debugging
    print("Step 2: Simulation output:\n", result.stdout)
    if result.returncode != 0:
        print("Step 2: Simulation failed with error:\n", result.stderr)
        return
    print("Step 2: ansible-playbook executed successfully.")

    # Now, call our waiting-and-copying function
    wait_and_copy_csv()

In [17]:
# Function 3: Run all model test notebooks using the test data
def test_models_on_sim_data():
    """Execute every ``*.ipynb`` notebook directly inside ``ML_System/test``.

    Each notebook is parsed as nbformat v4 and run with a fresh
    ``ExecutePreprocessor`` whose working directory is the test directory.
    The executed notebook objects are discarded by this function.

    Raises:
        Any exception raised while opening, parsing or executing a notebook
        propagates immediately; notebooks later in the order are then not run.
    """
    print("Step 3: Testing all trained models on simulated data...")
    test_dir = ML_SYSTEM / "test"
    # glob() order is filesystem-dependent; sort so runs are reproducible.
    for notebook in sorted(test_dir.glob("*.ipynb")):
        print(f"  - Running {notebook.name}")
        # Notebook files are UTF-8 JSON; do not depend on the platform's default encoding.
        with open(notebook, encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)
        ep = ExecutePreprocessor(timeout=600)
        ep.preprocess(nb, {'metadata': {'path': str(test_dir)}})
    print("Step 3: Testing complete.\n")

In [18]:
# Function 4: Analyze test results and collect bad models + attacks
def analyze_results_and_filter_bad_models():
    print("Step 4: Analyzing latest test results...")

    # Directory where test results are stored
    test_results_dir = Path("/Users/book_kuno/Desktop/Final/ML_System/test/test_results")

    # Get all subdirectories with the expected timestamp format
    test_folders = [d for d in test_results_dir.iterdir() if d.is_dir()]
    if not test_folders:
        print("Step 4: No test result folders found.")
        return None

    # Sort the folders lexicographically (YYYYMMDD_HHMMSS is naturally sortable) and pick the latest
    latest_folder = sorted(test_folders)[-1]
    print(f"Step 4: Latest test results folder: {latest_folder}")

    bad_models = []
    # Loop over each notebook file (*.ipynb) in the latest folder
    for nb_file in latest_folder.glob("*.ipynb"):
        print(f"Analyzing file: {nb_file.name}")
        # Extract model name assuming pattern "test-<model>_executed.ipynb"
        model_name = nb_file.name
        if model_name.startswith("test-"):
            model_name = model_name[len("test-"):]
            if model_name.endswith("_executed.ipynb"):
                model_name = model_name[:-len("_executed.ipynb")]
        
        # Read the notebook
        try:
            nb = nbformat.read(nb_file, as_version=4)
        except Exception as e:
            print(f"  - Failed to read {nb_file.name}: {e}")
            continue

        acc_value = None
        for cell in nb.cells:
            if cell.cell_type == "code":
                for output in cell.get("outputs", []):
                    if "text" in output:
                        text_output = output["text"]
                        # Updated regex to capture the number following "Test Accuracy:"
                        match = re.search(r"Test Accuracy:\s*([0-9]*\.?[0-9]+)", text_output)
                        if match:
                            try:
                                acc_value = float(match.group(1))
                                break
                            except ValueError:
                                continue
            if acc_value is not None:
                break

        if acc_value is not None:
            print(f"  - {nb_file.name}: Extracted accuracy = {acc_value}")
            if acc_value < 0.99:
                bad_models.append(model_name)
                print(f"    -> Model {model_name} is bad (acc < 0.99)")
            else:
                print(f"    -> Model {model_name} is good (acc >= 0.99)")
        else:
            print(f"  - {nb_file.name}: Accuracy value not found.")

    if not bad_models:
        print("Step 4: All models performed well. Ending loop.\n")
        return None
    else:
        print(f"Step 4: Bad models found: {bad_models}\n")
        return bad_models


In [None]:
# Function 5: Send bad models info
def send_bad_models_info(bad_models, iteration):
    """Write the bad-model names to a per-iteration text file and hand it to Sim_System.

    The file ``bad_models_<iteration>.txt`` (one model name per line) is first
    written under ``ML_SYSTEM`` and then copied under the same name into
    ``SIM_SYSTEM``. Nothing is written when ``bad_models`` is empty or None.
    Any error while writing or copying is printed rather than raised.
    """
    print("Step 5: Sending bad models information to Sim_System...")
    if not bad_models:
        print("Step 5: No bad models to send.")
        return

    # The iteration number keeps reports from different loop passes apart.
    report_name = f"bad_models_{iteration}.txt"
    local_report = ML_SYSTEM / report_name
    try:
        with open(local_report, "w") as handle:
            handle.writelines(model + "\n" for model in bad_models)
        # Hand the same file over to the simulation side.
        remote_report = SIM_SYSTEM / report_name
        shutil.copy(local_report, remote_report)
        print(f"Step 5: Bad models info sent successfully to Sim_System as {remote_report}.\n")
    except Exception as err:
        print("Step 5: Failed to send bad models info:", err, "\n")


In [20]:
# Function 6: Generate new training data
def generate_new_training_data(loop_iteration, settle_seconds=60):
    """Re-run the simulation and store its capture as a new training dataset.

    Runs ``ansible-playbook`` on ``SIM_SYSTEM / "ddos_attack_sequence.yml"``,
    waits ``settle_seconds``, then copies ``SIM_SYSTEM/out/capture.csv`` to
    ``ML_SYSTEM/ddos_datasets/capture_<loop_iteration>.csv``.

    Args:
        loop_iteration: Value embedded in the destination file name so each
            loop pass produces a distinct dataset file.
        settle_seconds: Seconds to wait after the playbook finishes before
            copying the capture file.

    Returns:
        None. Failures (playbook could not be started, non-zero exit status,
        missing capture file, copy error) are reported with ``print`` only.

    Note:
        Only the existence of capture.csv is checked after the wait; a file
        left over from an earlier run would be copied as well.
    """
    print("Step 6: Triggering simulation for new training data generation...")

    # Path to the YAML orchestration file
    attack_sequence_file = SIM_SYSTEM / "ddos_attack_sequence.yml"
    print(f"Step 6: Located YAML orchestration file at: {attack_sequence_file}")

    # Execute the YAML using ansible-playbook
    try:
        result = subprocess.run(
            ["ansible-playbook", str(attack_sequence_file)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError as e:
        # Raised when ansible-playbook is not installed / not on PATH / not executable.
        print("Step 6: Simulation failed with error:\n", e)
        return

    print("Step 6: Simulation output:\n", result.stdout)
    if result.returncode != 0:
        print("Step 6: Simulation failed with error:\n", result.stderr)
        return
    print("Step 6: ansible-playbook executed successfully for new training data generation.")

    # Give the simulation time to finish writing capture.csv
    print(f"Step 6: Waiting {settle_seconds} seconds for new capture.csv to be fully generated...")
    time.sleep(settle_seconds)

    # Copy the new capture.csv from Sim_System/out to ML_System/ddos_datasets with a unique name
    src_csv = SIM_SYSTEM / "out" / "capture.csv"
    dest_dir = ML_SYSTEM / "ddos_datasets"
    dest_dir.mkdir(parents=True, exist_ok=True)
    dest_csv = dest_dir / f"capture_{loop_iteration}.csv"

    if not src_csv.exists():
        print(f"Step 6: capture.csv not found at {src_csv}.\n")
        return

    try:
        shutil.copy(src_csv, dest_csv)
    except OSError as e:
        print(f"Step 6: Failed to copy new capture.csv due to error: {e}\n")
    else:
        print(f"Step 6: New capture.csv copied to {dest_csv}.\n")

In [21]:
# Function 7: Merge datasets
def merge_datasets(datasets_dir=None):
    """Merge every CSV in the datasets directory into a single '2018.csv'.

    All ``*.csv`` files (including an existing ``2018.csv``) are loaded in
    name order and concatenated. The result is written to ``2018.csv`` and the
    source files that were actually merged are removed. Files that could not
    be read are reported and left on disk, so their data is not lost.

    Args:
        datasets_dir: Directory to merge. Defaults to ``ML_SYSTEM / "ddos_datasets"``.

    Returns:
        None. Problems are reported with ``print``.
    """
    print("Step 7: Merging all training datasets into a single file '2018.csv'...")
    import pandas as pd  # Importing here in case it's not already imported

    datasets_dir = ML_SYSTEM / "ddos_datasets" if datasets_dir is None else Path(datasets_dir)
    # glob() order is filesystem-dependent; sort so the merged row order is reproducible.
    csv_files = sorted(datasets_dir.glob("*.csv"))
    if not csv_files:
        print("Step 7: No CSV files found in ddos_datasets to merge.\n")
        return

    dataframes = []
    merged_sources = []  # only files whose rows really ended up in the merge
    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
        except Exception as e:
            print(f"Step 7: Failed to read {csv_file.name}: {e}")
            continue
        dataframes.append(df)
        merged_sources.append(csv_file)
        print(f"Step 7: Loaded {csv_file.name} with {len(df)} records.")

    if not dataframes:
        print("Step 7: No valid CSV data to merge.\n")
        return

    merged_df = pd.concat(dataframes, ignore_index=True)
    output_file = datasets_dir / "2018.csv"
    # Write to a scratch file first (its name does not match *.csv) and swap it
    # in afterwards, so a failed write cannot corrupt an existing 2018.csv.
    scratch_file = datasets_dir / "2018.csv.tmp"
    try:
        merged_df.to_csv(scratch_file, index=False)
        scratch_file.replace(output_file)
        print(f"Step 7: Merged dataset saved as {output_file} with {len(merged_df)} total records.")
    except Exception as e:
        print(f"Step 7: Failed to save merged dataset: {e}")
        try:
            scratch_file.unlink()
        except OSError:
            pass  # scratch file was never created or is already gone
        return

    # Remove the merged source files; never the merged output or unread files.
    for csv_file in merged_sources:
        if csv_file != output_file:
            try:
                csv_file.unlink()
                print(f"Step 7: Removed {csv_file.name}")
            except Exception as e:
                print(f"Step 7: Failed to remove {csv_file.name}: {e}")
    print("Step 7: Dataset merge complete.\n")

In [22]:
# Progress marker shown before the orchestration loop starts. It only confirms
# that this cell ran; it does not verify that the seven step functions above
# were defined or that the configured directories exist.
print("Controller setup complete. Ready to proceed with orchestration loop.")

Controller setup complete. Ready to proceed with orchestration loop.


In [None]:
# Orchestration loop: train -> simulate -> test -> analyze, then feed the
# failing models back to the simulation and extend the training data.
# Stops early once the analysis step reports nothing (None).
MAX_ITERATIONS = 5

for iteration in range(1, MAX_ITERATIONS + 1):
    print(f"\n==================== Iteration {iteration} ====================")

    train_models()                  # Step 1: (re)train on the current datasets
    simulate_attack_and_export()    # Step 2: run the simulation, import capture.csv
    test_models_on_sim_data()       # Step 3: execute the test notebooks

    # Step 4: names of models below the accuracy threshold, or None.
    # Note: despite the variable name, these are model names, not attacks.
    bad_attacks = analyze_results_and_filter_bad_models()

    if bad_attacks is None:
        print(f"All models passed the threshold. Ending early at iteration {iteration}.\n")
        break

    send_bad_models_info(bad_attacks, iteration)            # Step 5: report to Sim_System
    generate_new_training_data(loop_iteration=iteration)    # Step 6: new capture -> dataset
    merge_datasets()                                        # Step 7: fold into 2018.csv

    print(f"================== End of Iteration {iteration} ==================\n")
else:
    # Runs only when the loop was never broken out of, i.e. models were still
    # failing after the last allowed iteration.
    print("Reached maximum number of iterations. Stopping.\n")


Step 1: Starting training using integrator.ipynb...


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


Step 1: Training completed.

Step 2: Starting simulation in Sim_System...
Step 2: Located YAML orchestration file at: /Users/book_kuno/Desktop/Final/Sim_System5/ddos_attack_sequence.yml
Step 2: Executing ansible-playbook command...
Step 2: Simulation output:
 
PLAY [Build and run DDoS attack simulation] ************************************

TASK [Gathering Facts] *********************************************************
ok: [localhost]

TASK [Build victim Docker image] ***********************************************
changed: [localhost]

TASK [Remove existing victim container (if any)] *******************************
changed: [localhost]

TASK [Delete previous attack timeline log] *************************************
ok: [localhost]

TASK [Save tcpdump start timestamp] ********************************************
changed: [localhost]

TASK [Start victim container] **************************************************
changed: [localhost]

TASK [Run attacker simulations] *****************

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


Step 3: Testing complete.

Step 4: Analyzing latest test results...
Step 4: Latest test results folder: /Users/book_kuno/Desktop/Final/ML_System/test/test_results/20250403_221006
Analyzing file: test-svm_executed.ipynb
  - test-svm_executed.ipynb: Extracted accuracy = 0.0207
    -> Model svm is bad (acc < 0.99)
Analyzing file: test-rfc_executed.ipynb
  - test-rfc_executed.ipynb: Extracted accuracy = 0.8946
    -> Model rfc is bad (acc < 0.99)
Step 4: Bad models found: ['svm', 'rfc']

Step 5: Sending bad models information to Sim_System...
Step 5: Bad models info sent successfully to Sim_System.



TypeError: generate_new_training_data() missing 1 required positional argument: 'loop_iteration'