# Network Intrusion Detection System (NIDS) using Self-Taught Learning
## Implementation with NSL-KDD Dataset

### Objectives:
1. Implement 3 classification scenarios:
   - Binary Classification (Normal vs Anomaly)
   - 5-Class Classification (Normal + 4 Attack Categories)
   - 23-Class Classification (Normal + 22 Attacks)

In [None]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load NSL-KDD Dataset
def load_nsl_kdd_data(path='NSL-KDD/KDDTrain+.txt', test_path='NSL-KDD/KDDTest+.txt'):
    """Read the NSL-KDD training and test files into labelled DataFrames.

    Both files are parsed as header-less CSV and given the same 43 column
    names: 41 traffic features followed by ``attack_type`` and
    ``difficulty_level``.

    Args:
        path: Location of the training file.
        test_path: Location of the test file.

    Returns:
        A ``(train_data, test_data)`` tuple of ``pandas.DataFrame`` objects.
    """
    # Per-connection basic features.
    basic = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    ]
    # Content features.
    content = [
        'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login',
    ]
    # Traffic features.
    traffic = [
        'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
        'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
        'srv_diff_host_rate',
    ]
    # Destination-host features.
    host = [
        'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate',
    ]
    column_names = basic + content + traffic + host + ['attack_type', 'difficulty_level']

    def read_split(source):
        # The files carry no header row, so the names are supplied explicitly.
        return pd.read_csv(source, header=None, names=column_names)

    return read_split(path), read_split(test_path)

In [None]:
# Preprocessing Function
def preprocess_data(train_data, test_data, classification_type='binary'):
    """Encode, label and scale the NSL-KDD frames for one classification scenario.

    The input frames are copied first, so the caller can reuse the same raw
    frames for several scenarios.

    Args:
        train_data: Training DataFrame containing the categorical columns
            ``protocol_type``, ``service`` and ``flag`` plus ``attack_type``
            and ``difficulty_level``. It is not modified.
        test_data: Test DataFrame with the same columns. It is not modified.
        classification_type: One of
            ``'binary'``  - 0 for ``'normal'``, 1 for any other attack_type;
            ``'5class'``  - 0 normal, 1 DoS, 2 Probe, 3 R2L, 4 U2R;
            ``'23class'`` - one integer code per distinct ``attack_type``.

    Returns:
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The feature
        matrices are standardised with statistics fitted on the training
        split only; the targets are the ``label`` columns.

    Raises:
        ValueError: If ``classification_type`` is not a supported scenario,
            or if ``'5class'`` meets an ``attack_type`` it cannot place in a
            category.
    """
    supported = ('binary', '5class', '23class')
    if classification_type not in supported:
        raise ValueError(
            f"Unknown classification_type {classification_type!r}; "
            f"expected one of {supported}"
        )

    # Copy so repeated calls on the same raw frames stay independent.
    train_data = train_data.copy()
    test_data = test_data.copy()

    # Encode categorical features. The encoder is fitted on the values of
    # both splits so a category present only in the test file does not make
    # transform() fail. When the test split has no extra categories the codes
    # are the same as those from fitting on the training split alone.
    categorical_columns = ['protocol_type', 'service', 'flag']
    for col in categorical_columns:
        le = LabelEncoder()
        le.fit(pd.concat([train_data[col], test_data[col]], ignore_index=True))
        train_data[col] = le.transform(train_data[col])
        test_data[col] = le.transform(test_data[col])

    # Create attack classification labels
    if classification_type == 'binary':
        train_data['label'] = (train_data['attack_type'] != 'normal').astype(int)
        test_data['label'] = (test_data['attack_type'] != 'normal').astype(int)
    elif classification_type == '5class':
        category_codes = {'normal': 0, 'DoS': 1, 'Probe': 2, 'R2L': 3, 'U2R': 4}
        # attack_type holds individual attack names, so each one has to be
        # folded into its category before it can be given a category code.
        # NOTE(review): the placement of the test-only attacks (e.g. worm,
        # httptunnel) follows the commonly used NSL-KDD grouping; confirm it
        # against the reference you report results for.
        attacks_by_category = {
            'DoS': [
                'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
                'apache2', 'mailbomb', 'processtable', 'udpstorm', 'worm',
            ],
            'Probe': [
                'ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint',
            ],
            'R2L': [
                'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
                'warezclient', 'warezmaster', 'named', 'sendmail',
                'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'httptunnel',
            ],
            'U2R': [
                'buffer_overflow', 'loadmodule', 'perl', 'rootkit',
                'ps', 'sqlattack', 'xterm',
            ],
        }
        # Category names themselves stay valid keys, so data that is already
        # labelled by category keeps working.
        attack_map = dict(category_codes)
        for category, attacks in attacks_by_category.items():
            for attack in attacks:
                attack_map[attack] = category_codes[category]

        for frame in (train_data, test_data):
            labels = frame['attack_type'].map(attack_map)
            unmapped = frame.loc[labels.isna(), 'attack_type']
            if not unmapped.empty:
                # Fail loudly instead of passing NaN labels on to the models.
                names = sorted({str(value) for value in unmapped})
                raise ValueError(
                    f"Cannot assign a 5-class category to attack_type values: {names}"
                )
            frame['label'] = labels.astype('int64')
    else:  # '23class'
        # Full attack type classification. Fitted on both splits because the
        # test file can contain attack types that never occur in training;
        # those get their own codes instead of raising in transform().
        le = LabelEncoder()
        le.fit(pd.concat(
            [train_data['attack_type'], test_data['attack_type']],
            ignore_index=True,
        ))
        train_data['label'] = le.transform(train_data['attack_type'])
        test_data['label'] = le.transform(test_data['attack_type'])

    # Select features and labels
    non_feature_columns = ['attack_type', 'label', 'difficulty_level']
    X_train = train_data.drop(non_feature_columns, axis=1)
    y_train = train_data['label']
    X_test = test_data.drop(non_feature_columns, axis=1)
    y_test = test_data['label']

    # Scale features using training-split statistics only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

In [None]:
# Model Training and Evaluation Function
def train_and_evaluate_models(X_train, X_test, y_train, y_test, classification_type):
    """Fit each candidate classifier and score it on the test split.

    Three models are trained on ``(X_train, y_train)``: a random forest, an
    RBF-kernel SVM and a multi-layer perceptron. Each is then evaluated on
    ``(X_test, y_test)``.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        classification_type: Name of the scenario being run. It is accepted
            for interface compatibility and is not used by this function.

    Returns:
        A dict keyed by model name. Each value is a dict with the keys
        ``'Accuracy'``, ``'Confusion Matrix'`` and ``'Classification Report'``.
    """
    # Initialize models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        # Only predict() is used below, so probability estimates are left off:
        # enabling them makes fit() run an additional internal cross-validation
        # whose output is never read here.
        'SVM': SVC(kernel='rbf'),
        # Seeded like the random forest so repeated runs give the same scores.
        'Neural Network': MLPClassifier(max_iter=1000, random_state=42),
    }

    results = {}

    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)

        # Predictions
        y_pred = model.predict(X_test)

        # Evaluation metrics
        results[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Confusion Matrix': confusion_matrix(y_test, y_pred),
            'Classification Report': classification_report(y_test, y_pred),
        }

    return results

In [None]:
# Main Execution
def _print_results(results):
    """Print the accuracy and classification report of every evaluated model."""
    for model_name, model_results in results.items():
        print(f"\nModel: {model_name}")
        print(f"Accuracy: {model_results['Accuracy']}")
        print("Classification Report:\n", model_results['Classification Report'])


def main():
    """Run every classification scenario end to end and print the scores.

    The dataset is loaded once. Each scenario ('binary', '5class',
    '23class') is then preprocessed, used to train and evaluate the models,
    and reported on standard output.
    """
    train_data, test_data = load_nsl_kdd_data()

    for classification_type in ('binary', '5class', '23class'):
        print(f"\n--- {classification_type.upper()} Classification Scenario ---")

        # Build the feature matrices and labels for this scenario.
        splits = preprocess_data(train_data, test_data, classification_type)
        X_train, X_test, y_train, y_test = splits

        # Fit the models and collect their test-set metrics.
        scenario_results = train_and_evaluate_models(
            X_train, X_test, y_train, y_test, classification_type
        )

        _print_results(scenario_results)


if __name__ == '__main__':
    main()

## Evaluation Metrics and Approaches

### Performance Metrics
1. Accuracy: Overall correct predictions
2. Confusion Matrix: Detailed breakdown of predictions
3. Precision, Recall, F1-Score for each class

### Cross-Validation
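- Models are trained on KDDTrain+ and evaluated on the separate KDDTest+ file (hold-out evaluation)
- n-fold cross-validation is not yet applied in the code above: `cross_val_score` is imported but never called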

### Model Selection
1. Random Forest Classifier
2. Support Vector Machine (SVM)
3. Multi-Layer Perceptron Neural Network

### Key Considerations
- Separate training and test data collected in different environments
- Preprocessing includes feature scaling and encoding
- Multiple classification scenarios implemented