# DATA PRE-PROCESSING

In [1]:
import math

import pandas as pd

# Paths of the raw training and testing CSV files
train_path, test_path = "UNSW_NB15_training-set.csv", "UNSW_NB15_testing-set.csv"

# Read each file into its own DataFrame (training first, then testing)
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))


HANDLING MISSING VALUES

In [2]:
# Function to handle missing values
def handle_missing_values(
    df,
    max_col_missing_pct=40,
    min_row_fill_ratio=0.7,
    mode_cardinality_limit=10,
):
    """Return a cleaned copy of ``df`` with missing values dropped or imputed.

    The input frame is never modified; every step works on a new frame, so
    the cell that calls this function can be re-run safely.

    Steps, in order:
      1. Replace the placeholder strings '-', 'NA', 'N/A', 'unknown', 'null'
         and 'NULL' with ``pd.NA``.
      2. Drop columns whose missing percentage is strictly greater than
         ``max_col_missing_pct``.
      3. Drop rows that have fewer than ``ceil(n_columns * min_row_fill_ratio)``
         non-missing values (``n_columns`` counted after step 2).
      4. Fill remaining gaps in numeric columns with the column median.
      5. Fill remaining gaps in object columns with the column mode when the
         column has fewer than ``mode_cardinality_limit`` distinct values,
         otherwise with the literal string "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Left untouched.
    max_col_missing_pct : float, default 40
        Columns with a higher percentage of missing values are dropped.
    min_row_fill_ratio : float, default 0.7
        Minimum fraction of non-missing values a row needs to be kept.
    mode_cardinality_limit : int, default 10
        Object columns with fewer distinct values than this are mode-imputed.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The index is not reset after row removal.
    """
    placeholders = ['-', 'NA', 'N/A', 'unknown', 'null', 'NULL']

    # Step 1: normalise placeholders; replace() without inplace returns a new
    # frame, which keeps the caller's data intact.
    cleaned = df.replace(placeholders, pd.NA)

    # Step 2: drop columns that are mostly missing.
    missing_percent = (cleaned.isnull().sum() / len(cleaned)) * 100
    cols_to_drop = missing_percent[missing_percent > max_col_missing_pct].index
    cleaned = cleaned.drop(columns=cols_to_drop)

    # Step 3: drop sparse rows. `thresh` is documented as an integer count of
    # required non-missing values; ceil() keeps the same rows as comparing the
    # integer count against the fractional product.
    min_non_null = math.ceil(len(cleaned.columns) * min_row_fill_ratio)
    cleaned = cleaned.dropna(thresh=min_non_null)

    # Step 4: split remaining columns by dtype.
    num_cols = cleaned.select_dtypes(include=['number']).columns
    cat_cols = cleaned.select_dtypes(include=['object']).columns

    # Steps 5-6: collect one fill value per column, then apply them in a single
    # fillna() call instead of assigning column by column.
    fill_values = {}
    for col in num_cols:
        if cleaned[col].isnull().any():
            median = cleaned[col].median()
            if pd.notna(median):  # an all-missing column has no median
                fill_values[col] = median

    for col in cat_cols:
        if cleaned[col].isnull().any():
            modes = cleaned[col].mode()
            # mode() is empty when the column holds no non-missing value, so
            # indexing [0] would raise; fall back to the "Unknown" label.
            if cleaned[col].nunique() < mode_cardinality_limit and not modes.empty:
                fill_values[col] = modes[0]
            else:
                fill_values[col] = "Unknown"

    if fill_values:
        cleaned = cleaned.fillna(value=fill_values)

    return cleaned


Saving the new datasets after handling missing values

In [3]:
# Run the missing-value handler on the training split, then the testing split
df_train_cleaned, df_test_cleaned = (
    handle_missing_values(_split) for _split in (df_train, df_test)
)

# Write each cleaned split to its own CSV file, leaving out the row index
for _out_path, _cleaned_split in (
    ("UNSW_NB15_training_cleaned_new.csv", df_train_cleaned),
    ("UNSW_NB15_testing_cleaned_new.csv", df_test_cleaned),
):
    _cleaned_split.to_csv(_out_path, index=False)

print("✅ Missing values handled and datasets saved to new files!")

✅ Missing values handled and datasets saved to new files!


Checking how well we handled missing values

In [5]:
# Function to check missing values (including placeholders for missing data)
def check_missing_values(df):
    """Count missing entries in ``df``, treating placeholder strings as missing.

    The strings '-', 'NA', 'N/A', 'unknown', 'null' and 'NULL' are mapped to
    ``pd.NA`` on a replaced copy before counting; ``df`` itself is not changed.

    Returns
    -------
    tuple
        ``(total, per_column)``: the overall number of missing entries and a
        Series holding the number of missing entries for each column.
    """
    placeholder_tokens = ['-', 'NA', 'N/A', 'unknown', 'null', 'NULL']
    null_mask = df.replace(placeholder_tokens, pd.NA).isnull()

    # Per-column counts first; the grand total is simply their sum
    per_column_counts = null_mask.sum()
    grand_total = per_column_counts.sum()

    return grand_total, per_column_counts

# Re-read the raw inputs and the cleaned CSV files written earlier
df_train_original, df_test_original = pd.read_csv(train_path), pd.read_csv(test_path)
df_train_cleaned, df_test_cleaned = (
    pd.read_csv("UNSW_NB15_training_cleaned_new.csv"),
    pd.read_csv("UNSW_NB15_testing_cleaned_new.csv"),
)

# Missing-value totals and per-column counts for the raw frames ...
(train_original_missing_count,
 train_original_missing_per_column) = check_missing_values(df_train_original)
(test_original_missing_count,
 test_original_missing_per_column) = check_missing_values(df_test_original)

# ... and for the cleaned frames
(train_cleaned_missing_count,
 train_cleaned_missing_per_column) = check_missing_values(df_train_cleaned)
(test_cleaned_missing_count,
 test_cleaned_missing_per_column) = check_missing_values(df_test_cleaned)

# Print the report: raw training, raw testing, cleaned training, cleaned testing
for _message in (
    f"Original training dataset - Total missing values: {train_original_missing_count}",
    f"Missing values per column in original training dataset:\n{train_original_missing_per_column}",
    f"\nOriginal testing dataset - Total missing values: {test_original_missing_count}",
    f"Missing values per column in original testing dataset:\n{test_original_missing_per_column}",
    f"\nCleaned training dataset - Total missing values: {train_cleaned_missing_count}",
    f"Missing values per column in cleaned training dataset:\n{train_cleaned_missing_per_column}",
    f"\nCleaned testing dataset - Total missing values: {test_cleaned_missing_count}",
    f"Missing values per column in cleaned testing dataset:\n{test_cleaned_missing_per_column}",
):
    print(_message)


Original training dataset - Total missing values: 47153
Missing values per column in original training dataset:
id                       0
dur                      0
proto                    0
service              47153
state                    0
spkts                    0
dpkts                    0
sbytes                   0
dbytes                   0
rate                     0
sttl                     0
dttl                     0
sload                    0
dload                    0
sloss                    0
dloss                    0
sinpkt                   0
dinpkt                   0
sjit                     0
djit                     0
swin                     0
stcpb                    0
dtcpb                    0
dwin                     0
tcprtt                   0
synack                   0
ackdat                   0
smean                    0
dmean                    0
trans_depth              0
response_body_len        0
ct_srv_src               0
ct_state_ttl            