## Join CSV Files

## **IoTD20**

In [2]:
import os
import pandas as pd

def concatenate_csv_files(main_directory,save_directory):
    """
    Concatenate all CSV files in subdirectories of the main directory into a single CSV file.
    
    Parameters:
    main_directory (str): Path to the main directory containing subdirectories with CSV files.
    
    The function saves the concatenated CSV file in the main directory, named as the main directory name + '_all.csv'.
    """
    
    # Get the main directory name for the output file
    main_directory_name = os.path.basename(os.path.normpath(main_directory))

    # Initialize an empty list to hold DataFrames
    data_frames = []
    header_saved = False
    column_order = []

    total_length = 0  # Initialize total length counter
    # Walk through each subfolder in the main directory
    for subdir, _, files in os.walk(main_directory):
        for file in files:
            # Check if the file is a CSV file
            if file.endswith('.csv'):
                file_path = os.path.join(subdir, file)
                # Read the CSV file and append the DataFrame to the list
                if not header_saved:
                    # Read the first CSV file with headers
                    df = pd.read_csv(file_path)
                    header_saved = True
                    column_order = df.columns.tolist()  # Save the column order
                    print(column_order)
                else:
                    # Read subsequent CSV files
                    df = pd.read_csv(file_path)
                    # Reorder the columns of the DataFrame to match the column order of the first DataFrame
                    df = df.reindex(column_order, axis=1)
                data_frames.append(df)
                total_length += len(df)  # Add length of current DataFrame to total length

    # Concatenate all DataFrames in the list into a single DataFrame
    concatenated_df = pd.concat(data_frames, ignore_index=True)

    # Save the concatenated DataFrame to a new CSV file in the main directory
    output_file = os.path.join(save_directory, f'{main_directory_name}_all.csv')
    concatenated_df.to_csv(output_file, index=False)

    print(f'Total length of concatenated CSV: {total_length}')
    print(f'All CSV files have been concatenated and saved to {output_file}')

Lo he hecho para todas las carpetas de IoTD20 excepto MITM

In [25]:
main_directory = "/root/bbdd/logs-zeek/iotd20-logs/logs-scan"
save_directory = "/root/bbdd/logs-zeek/iotd20-logs/all-labeled"
concatenate_csv_files(main_directory,save_directory)

['ts', 'startTime', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'orig_bytes_mean', 'resp_bytes_mean', 'orig_bytes_mean_nocero', 'resp_bytes_mean_nocero', 'orig_bytes_std_nocero', 'resp_bytes_std_nocero', 'orig_bytes_min', 'resp_bytes_min', 'orig_bytes_max', 'resp_bytes_max', 'orig_pkts_nocero', 'resp_pkts_nocero', 'orig_pkts_cero', 'resp_pkts_cero', 'time_mean', 'time_std', 'time_min', 'time_max', 'orig_time_mean', 'orig_time_min', 'orig_time_max', 'resp_time_mean', 'resp_time_min', 'resp_time_max', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'history', 'orig_bytes_std', 'resp_bytes_std', 'orig_time_std', 'resp_time_std', 'binary-label', 'label', 'detailed-label']
Total length of concatenated CSV: 17756
All CSV files have been concatenated and saved to /root/bbdd/logs-zeek/iotd20-logs/all-labeled/logs-scan_all.csv


In [26]:
main_directory = "/root/bbdd/logs-zeek/iotd20-logs/logs-original"
save_directory = "/root/bbdd/logs-zeek/iotd20-logs/all-labeled"
concatenate_csv_files(main_directory,save_directory)

['ts', 'startTime', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'orig_bytes_mean', 'resp_bytes_mean', 'orig_bytes_std', 'resp_bytes_std', 'orig_bytes_mean_nocero', 'resp_bytes_mean_nocero', 'orig_bytes_std_nocero', 'resp_bytes_std_nocero', 'orig_bytes_min', 'resp_bytes_min', 'orig_bytes_max', 'resp_bytes_max', 'orig_pkts_nocero', 'resp_pkts_nocero', 'orig_pkts_cero', 'resp_pkts_cero', 'time_mean', 'time_std', 'time_min', 'time_max', 'orig_time_mean', 'orig_time_min', 'orig_time_max', 'resp_time_mean', 'resp_time_min', 'resp_time_max', 'orig_time_std', 'resp_time_std', 'service', 'binary-label', 'label', 'detailed-label']
Total length of concatenated CSV: 43666
All CSV files have been concatenated and saved to /root/bbdd/logs-zeek/iotd20-logs/all-labeled/logs-original_all.csv


In [27]:
main_directory = "/root/bbdd/logs-zeek/iotd20-logs/logs-dos"
save_directory = "/root/bbdd/logs-zeek/iotd20-logs/all-labeled"
concatenate_csv_files(main_directory,save_directory)

['ts', 'startTime', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'orig_bytes_mean', 'resp_bytes_mean', 'orig_bytes_std', 'orig_bytes_mean_nocero', 'resp_bytes_mean_nocero', 'orig_bytes_std_nocero', 'resp_bytes_std_nocero', 'orig_bytes_min', 'resp_bytes_min', 'orig_bytes_max', 'resp_bytes_max', 'orig_pkts_nocero', 'resp_pkts_nocero', 'orig_pkts_cero', 'resp_pkts_cero', 'time_mean', 'time_std', 'time_min', 'time_max', 'orig_time_mean', 'orig_time_std', 'orig_time_min', 'orig_time_max', 'resp_time_mean', 'resp_time_min', 'resp_time_max', 'resp_bytes_std', 'resp_time_std', 'binary-label', 'label', 'detailed-label']
Total length of concatenated CSV: 61367
All CSV files have been concatenated and saved to /root/bbdd/logs-zeek/iotd20-logs/all-labeled/logs-dos_all.csv


In [30]:
main_directory = "/root/bbdd/logs-zeek/iotd20-logs/all-labeled"
save_directory = "/root/bbdd/logs-zeek/iotd20-logs"
concatenate_csv_files(main_directory,save_directory)

['ts', 'startTime', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'orig_bytes_mean', 'resp_bytes_mean', 'orig_bytes_std', 'resp_bytes_std', 'orig_bytes_mean_nocero', 'resp_bytes_mean_nocero', 'orig_bytes_std_nocero', 'resp_bytes_std_nocero', 'orig_bytes_min', 'resp_bytes_min', 'orig_bytes_max', 'resp_bytes_max', 'orig_pkts_nocero', 'resp_pkts_nocero', 'orig_pkts_cero', 'resp_pkts_cero', 'time_mean', 'time_std', 'time_min', 'time_max', 'orig_time_mean', 'orig_time_min', 'orig_time_max', 'resp_time_mean', 'resp_time_min', 'resp_time_max', 'orig_time_std', 'resp_time_std', 'service', 'binary-label', 'label', 'detailed-label']
Total length of concatenated CSV: 123185
All CSV files have been concatenated and saved to /root/bbdd/logs-zeek/iotd20-logs/all-labeled_all.csv


## **IoT-23**

In [1]:
import os
import pandas as pd

def concatenate_csv_files(main_directory, save_directory, chunk_size=50000):
    """
    Concatenate all CSV files in subdirectories of the main directory into a single CSV file.

    Parameters:
    main_directory (str): Path to the main directory containing subdirectories with CSV files.
    save_directory (str): Path to the directory where the concatenated CSV file will be saved.
    chunk_size (int): Number of rows per chunk to read from each CSV file.
    """
    
    # Get the main directory name for the output file
    main_directory_name = os.path.basename(os.path.normpath(main_directory))
    output_file = os.path.join(save_directory, f'{main_directory_name}_all.csv')
    
    # Initialize a flag to indicate whether to write header
    header_written = False
    column_order = []
    
    total_length = 0  # Initialize total length counter
    
    # Walk through each subfolder in the main directory
    for subdir, _, files in os.walk(main_directory):
        for file in files:
            # Check if the file is a CSV file
            if file.endswith('.csv'):
                file_path = os.path.join(subdir, file)
                
                # Process the CSV file in chunks
                for chunk in pd.read_csv(file_path, chunksize=chunk_size):
                    if not header_written:
                        # Write the first chunk with headers and save column order
                        chunk.to_csv(output_file, mode='w', header=True, index=False)
                        header_written = True
                        column_order = chunk.columns.tolist()
                    else:
                        # Ensure the chunk has the same column order and write without headers
                        chunk = chunk.reindex(columns=column_order)
                        chunk.to_csv(output_file, mode='a', header=False, index=False)
                    
                    total_length += len(chunk)  # Add length of current chunk to total length

    print(f'Total length of concatenated CSV: {total_length}')
    print(f'All CSV files have been concatenated and saved to {output_file}')


In [2]:
main_directory = "/root/bbdd/logs-zeek/iot-23-logs/labeled-csv/"
save_directory = "/root/bbdd/logs-zeek/iot-23-logs/"
concatenate_csv_files(main_directory,save_directory)

  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_csv(file_path, chunksize=chunk_size):
  for chunk in pd.read_c

Total length of concatenated CSV: 128693449
All CSV files have been concatenated and saved to /root/bbdd/logs-zeek/iot-23-logs/labeled-csv_all.csv
