In [None]:
# Task 1: Understanding the UNSW-NB15 Dataset

# Import necessary libraries
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the dataset
data_path = '../data/UNSW-NB15.csv'  # Adjust the path if necessary
df = pd.read_csv(data_path)

# Display the first few rows of the dataset.
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell (and never in a plain script), so print it explicitly.
print(df.head())

# Display basic information about the dataset (info() prints by itself)
df.info()

# Display summary statistics (printed for the same reason as head() above)
print(df.describe())

# Check for missing values: per-column null counts, showing only the columns
# that actually contain nulls (an empty Series means nothing is missing).
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# Plot how many rows fall under each value of the target column 'label'
label_fig, label_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='label', data=df, ax=label_ax)
label_ax.set_title('Distribution of Labels')
label_ax.set_xlabel('Label')
label_ax.set_ylabel('Count')
plt.show()

# Visualize correlations between numerical features.
# Restrict to numeric/bool columns before calling corr(): with pandas >= 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on string columns instead. select_dtypes works the same on every version.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 10))
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Visualize the distribution of a few numerical features.
# NOTE(review): 'srcip' and 'dstip' look like IP-address (string) columns, and
# the port columns may also be parsed as strings -- confirm against the data.
# DataFrame.hist() silently drops non-numeric columns (or raises if none are
# left), so keep only the candidates that exist and are numeric, and report
# anything skipped instead of hiding it.
candidate_features = ['srcip', 'sport', 'dstip', 'dsport', 'sbytes']
numerical_features = [
    col for col in candidate_features
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
]
skipped_features = [
    col for col in candidate_features if col not in numerical_features
]
if skipped_features:
    print(f'Skipping missing or non-numeric columns: {skipped_features}')

if numerical_features:
    df[numerical_features].hist(bins=30, figsize=(15, 10))
    plt.suptitle('Histograms of Numerical Features')
    plt.show()
else:
    print('No numeric columns available for histograms.')

# Pairwise scatter/density grid for a handful of columns, coloured by 'label'
pairplot_columns = ['sbytes', 'dbytes', 'sttl', 'dttl', 'label']
pairplot_frame = df[pairplot_columns]
sns.pairplot(pairplot_frame, hue='label')
plt.show()

# Save the dataset for the following steps.
# NOTE: no cleaning is performed in this cell; the frame is written as loaded.
# Create the output directory first -- to_csv() does not create missing
# parent directories and would fail on a fresh checkout.
output_path = Path('../data/processed/UNSW-NB15_cleaned.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
