# Model Training Test - UNSW-NB15 Dataset
This notebook tests model training on a real cybersecurity dataset (UNSW-NB15).

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings from pandas/scikit-learn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Reaching this line means all of the imports above resolved.
print("Libraries imported successfully!")

## 1. Load UNSW-NB15 Training Dataset

In [None]:
# Read the UNSW-NB15 training split into a DataFrame.
# NOTE(review): the CSV is read from inside `.venv/`; confirm this is the
# intended location for datasets.
train_csv = '.venv/datasets/UNSW_NB15_training-set.csv'
df_train = pd.read_csv(train_csv)

# Quick sanity check: dimensions, column names, and a preview of the rows.
print("Dataset shape:", df_train.shape)
print("\nColumns:", df_train.columns.tolist())
print("\nFirst few rows:")
df_train.head()

## 2. Dataset Analysis

In [None]:
# Overview of the raw data: dtypes / non-null counts, the columns with the
# most missing values, and summary statistics for the numeric columns.
separator = "=" * 50

print("Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it must be
# called directly; wrapping it in print() would emit a stray "None" line.
df_train.info()

print("\n" + separator)
print("Missing Values:")
# Ten columns with the highest number of missing entries.
missing_per_column = df_train.isnull().sum().sort_values(ascending=False)
print(missing_per_column.head(10))

print("\n" + separator)
print("Dataset Statistics:")
# Last expression of the cell so the notebook renders the table.
df_train.describe()

In [None]:
# Pick the target column: take the first known label name that exists, in
# priority order, and fall back to the last column when none is present.
preferred_targets = ('label', 'Label', 'attack_cat')
target_col = next(
    (name for name in preferred_targets if name in df_train.columns),
    df_train.columns[-1],
)

# Per-class row counts, reused for both the printout and the bar chart.
class_counts = df_train[target_col].value_counts()

print(f"Target column: {target_col}")
print("\nTarget distribution:")
print(class_counts)

# Bar chart of the class counts.
plt.figure(figsize=(10, 5))
class_counts.plot(kind='bar')
plt.title('Target Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Prepare data for training: drop leaky columns, impute, encode, and split
# the frame into the feature matrix X and the target vector y.
df = df_train.copy()

# Remove columns that must not be used as features:
#   * the label columns that are NOT the chosen target -- e.g. when the target
#     is 'label', 'attack_cat' encodes the same ground truth and would leak the
#     answer into X, producing misleadingly perfect scores;
#   * 'id', a row identifier that carries no predictive signal of its own.
non_feature_cols = {'id', 'label', 'Label', 'attack_cat'} - {target_col}
leaky_cols = [col for col in df.columns if col in non_feature_cols]
df = df.drop(columns=leaky_cols)
print(f"Dropped leaky/non-feature columns: {leaky_cols}")

# Handle missing values: numeric columns are filled with their median.
# (Missing categorical values are left as-is; factorize maps them to -1.)
df = df.fillna(df.median(numeric_only=True))

# Identify categorical feature columns (the target is handled separately).
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

print(f"Categorical columns: {categorical_cols}")

# Encode categorical variables as integer codes. Columns with 50 or more
# distinct values are dropped instead of encoded.
high_cardinality_cols = []
for col in categorical_cols:
    if df[col].nunique() < 50:
        df[col] = pd.factorize(df[col])[0]
    else:
        high_cardinality_cols.append(col)
df = df.drop(columns=high_cardinality_cols)

# Separate features and target
X = df.drop(target_col, axis=1)
y = df[target_col]

# If the target is categorical, encode it as integer codes. Keep it a pandas
# Series aligned with X (rather than a bare ndarray) so that later cells can
# index it with .iloc after the train/test split.
if y.dtype == 'object':
    y = pd.Series(pd.factorize(y)[0], index=y.index, name=target_col)

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Number of classes: {len(np.unique(y))}")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the split is stratified on y and
# seeded so that it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))
print("\nTraining target distribution:")
train_class_counts = pd.Series(y_train).value_counts()
print(train_class_counts)

## 5. Train Random Forest Model

In [None]:
print("Training Random Forest model...")

# Hyperparameters for the forest, gathered in one place.
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}

# Build the classifier and fit it on the training split.
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

print("\nModel training completed!")

## 6. Model Evaluation

In [None]:
# Predict on the held-out split.
y_pred = model.predict(X_test)

# Precision, recall and F1 share the same weighted averaging options;
# zero_division=0 is passed so undefined ratios are scored as 0.
weighted_opts = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_opts)
recall = recall_score(y_test, y_pred, **weighted_opts)
f1 = f1_score(y_test, y_pred, **weighted_opts)

# Print the metrics inside a banner.
banner = "=" * 60
print(banner)
print("MODEL PERFORMANCE METRICS")
print(banner)
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")
print(banner)

In [None]:
# Per-class precision / recall / F1 breakdown for the test predictions.
report_text = classification_report(y_test, y_pred)
print("\nDetailed Classification Report:")
print(report_text)

## 7. Confusion Matrix Visualization

In [None]:
# Count matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit axes object.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

## 8. Feature Importance Analysis

In [None]:
# Pair every feature name with the forest's importance score, then rank
# from most to least important.
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Top 20 Most Important Features:")
print(feature_importance.head(20))

# Horizontal bar chart of the 15 highest-ranked features.
top_features = feature_importance.head(15)
plt.figure(figsize=(12, 8))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 15 Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## 9. Save Trained Model

In [None]:
import joblib
import os

# Make sure the output directory exists (no error if it already does).
os.makedirs('models', exist_ok=True)

# Serialize the fitted model to disk with joblib.
model_path = 'models/unsw_nb15_threat_model.pkl'
joblib.dump(model, model_path)

# Report where the model was written and its on-disk size in megabytes.
size_mb = os.path.getsize(model_path) / 1024 / 1024
print(f"Model saved to: {model_path}")
print(f"Model file size: {size_mb:.2f} MB")

## 10. Test Model Predictions

In [None]:
# Spot-check the model on a few randomly chosen rows of the test split.

# Never request more rows than the test split holds; sampling without
# replacement would otherwise raise ValueError on small datasets.
n_samples = min(10, len(X_test))

# Seeded generator so the same rows are picked on every run, consistent with
# the random_state=42 used for the split and the model.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(X_test), size=n_samples, replace=False)

samples = X_test.iloc[sample_indices]
# y_test may be a pandas Series or a NumPy array (the latter when the target
# was factorized), so index positionally through an array view that works
# for both instead of relying on .iloc.
true_labels = np.asarray(y_test)[sample_indices]
predictions = model.predict(samples)

print("Sample Predictions:")
print("="*50)
for i, (true, pred) in enumerate(zip(true_labels, predictions), start=1):
    status = "✓" if true == pred else "✗"
    print(f"{status} Sample {i}: True={true}, Predicted={pred}")

# Accuracy over just the sampled rows.
accuracy_sample = accuracy_score(true_labels, predictions)
print("="*50)
print(f"Sample accuracy: {accuracy_sample*100:.2f}%")

## Summary

This notebook demonstrates:
1. ✅ Loading UNSW-NB15 dataset successfully
2. ✅ Data preprocessing (missing-value imputation and categorical encoding)
3. ✅ Training Random Forest classifier
4. ✅ Model evaluation with multiple metrics
5. ✅ Feature importance analysis
6. ✅ Model persistence (saving)

**The model trains successfully on real cybersecurity data!**