# Background
Dataset: https://www.takakura.com/Kyoto_data/new_data201704/

In [8]:
# Necessary imports
import pandas as pd
import os
import warnings
from shutil import rmtree

# Pre-processing

### Converting the data from txt format to CSV format to make the data easier to work with

In [11]:
# The daily files contain columns with mixed value types, which makes pandas
# emit a DtypeWarning on read; silence that one warning category only.
warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)

# Column names, in file order, as listed in the data documentation.
# Passed as `names=` when reading the tab-delimited daily files.
columns = [
    # Connection features
    "Duration (seconds)", "Service Type",
    "Source bytes", "Destination bytes",
    "Count", "Same srv rate",
    "Serror rate", "Srv serror rate",
    "Dst host count", "Dst host srv count",
    "Dst host same src port rate",
    "Dst host serror rate", "Dst host srv serror rate",
    "Flag",
    # Detection results and label
    "IDS detection", "Malware detection", "Ashula detection",
    "Label",
    # Endpoints, start time and protocol
    "Source IP Address", "Source Port Number",
    "Destination IP Address", "Destination Port Number",
    "Start Time",
    "Protocol",
]

# Start from an empty output folder: remove any previous Data/CSVs/ tree
# (removal errors, e.g. the folder not existing yet, are ignored) ...
rmtree("Data/CSVs/", ignore_errors=True)
# ... then create it afresh. The parent "Data" folder must already exist.
os.mkdir("Data/CSVs/")

# Month number -> name used for the output CSV.
# Built once up front instead of being rebuilt on every loop iteration.
month_encoder = {
    1: "January",
    2: "February",
    3: "March",
    4: "April",
    5: "May",
    6: "June",
    7: "July",
    8: "August",
    9: "September",
    10: "October",
    11: "November",
    12: "December"
}

# Merge every daily file of a month (Data/01 ... Data/12) into a single
# Data/CSVs/<Month>.csv, for all twelve months of the year.
for month in range(1, 13):
    print(f"Creating {month_encoder[month]}.csv...")
    output_file = f"Data/CSVs/{month_encoder[month]}.csv"
    # Months are zero-padded to 2 digits in the file structure
    str_month = f"{month:02d}"
    month_dir = f"Data/{str_month}"

    # Decide between "create with header" and "append" from what this run has
    # written, not from os.path.exists(): a stale <Month>.csv left over from a
    # previous run is then overwritten instead of getting duplicate rows.
    is_first_file = True

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the merged CSV reproducible across runs and machines.
    for data_file in sorted(os.listdir(month_dir)):
        file_path = os.path.join(month_dir, data_file)
        # Skip anything that is not a data file: sub-directories and hidden
        # entries (e.g. .ipynb_checkpoints, .DS_Store) would break read_csv
        # or inject garbage rows.
        if data_file.startswith(".") or not os.path.isfile(file_path):
            continue
        # Read the TXT using tab delimiters since that's how it's structured
        day_data = pd.read_csv(file_path, delimiter="\t", names=columns, index_col=False)
        # First file of the month creates the CSV with a header row;
        # every later file is appended without one.
        day_data.to_csv(
            output_file,
            mode='w' if is_first_file else 'a',
            header=is_first_file,
            index=False,
        )
        is_first_file = False

Creating January.csv...
Creating February.csv...
Creating March.csv...
Creating April.csv...
Creating May.csv...
Creating June.csv...
Creating July.csv...
Creating August.csv...
Creating September.csv...
Creating October.csv...
Creating November.csv...
Creating December.csv...


# Modeling

# Post-processing

# Analysis + Accuracy