Train

In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier  # Import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

# Function to load and label datasets
def load_data(normal_path, xss_path):
    with open(normal_path, 'r', encoding='utf-8', errors='ignore') as file:
        normal_urls = [line.strip() for line in file if line.strip()]

    with open(xss_path, 'r', encoding='utf-8', errors='ignore') as file:
        xss_urls = [line.strip() for line in file if line.strip()]

    normal_labels = [0] * len(normal_urls)  # Label 0 for normal
    xss_labels = [1] * len(xss_urls)        # Label 1 for XSS

    urls = normal_urls + xss_urls
    labels = normal_labels + xss_labels

    return pd.DataFrame({'url': urls, 'label': labels})

# Load the data
data = load_data('C:\\Users\\Omen\\Desktop\\XSS\\Train_NonXSS.txt', 'C:\\Users\\Omen\\Desktop\\XSS\\Train_XSS.txt')

# Sample a small portion of the data for quicker testing (optional)
# Uncomment the following line to sample 10% of the data
# data = data.sample(frac=0.1, random_state=42)

# Preprocess the data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(data['url']).toarray()

# Additional custom features
data['url_length'] = data['url'].apply(len)
data['special_char_count'] = data['url'].apply(lambda x: sum(1 for char in x if char in ['<', '>', '"', '&']))
data['keyword_presence'] = data['url'].apply(lambda x: 1 if any(kw in x.lower() for kw in ['script', 'alert', 'img', 'onerror']) else 0)

# Combine TF-IDF features with custom features
X_custom = np.hstack([X_tfidf, data[['url_length', 'special_char_count', 'keyword_presence']].values])
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_custom, y, test_size=0.3, random_state=42)

# Apply SMOTE to oversample the minority class (XSS)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define and tune the MLPClassifier using GridSearchCV with fewer folds and early stopping
param_grid = {
    'hidden_layer_sizes': [(100,), (128,)],  # Smaller layers for faster training
    'alpha': [0.001],  # Keep alpha fixed to reduce complexity
    'max_iter': [300]   # Maximum iterations
}

mlp_clf = MLPClassifier(random_state=42, early_stopping=True, verbose=True)  # Early stopping added

# Perform Grid Search to tune hyperparameters with fewer folds
grid_search = GridSearchCV(mlp_clf, param_grid, cv=2, scoring='accuracy', verbose=10)
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters found from the grid search
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Train the best model on resampled data
best_mlp_clf = grid_search.best_estimator_
best_mlp_clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test data with a custom decision threshold
y_pred_proba = best_mlp_clf.predict_proba(X_test)[:, 1]
threshold = 0.4  # Adjust threshold
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer for later use
joblib.dump(best_mlp_clf, 'mlpc_xss_model_with_custom_features.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer_with_custom_features.pkl')


Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2; 1/2] START alpha=0.001, hidden_layer_sizes=(100,), max_iter=300........
Iteration 1, loss = 0.32531906
Validation score: 0.992516
Iteration 2, loss = 0.08577751
Validation score: 0.980979
Iteration 3, loss = 0.03834355
Validation score: 0.998441
Iteration 4, loss = 0.02649224
Validation score: 0.998129
Iteration 5, loss = 0.03579210
Validation score: 0.998129
Iteration 6, loss = 0.02140377
Validation score: 0.998441
Iteration 7, loss = 0.01718755
Validation score: 0.997505
Iteration 8, loss = 0.02373639
Validation score: 0.989710
Iteration 9, loss = 0.03021476
Validation score: 0.998129
Iteration 10, loss = 0.01482936
Validation score: 0.998129
Iteration 11, loss = 0.01865769
Validation score: 0.998129
Iteration 12, loss = 0.02569481
Validation score: 0.993140
Iteration 13, loss = 0.01455128
Validation score: 0.998129
Iteration 14, loss = 0.01887197
Validation score: 0.997817
Validation score did not improve more than 

['tfidf_vectorizer_with_custom_features.pkl']

Test without featured

In [3]:
import numpy as np
import joblib
from sklearn.metrics import classification_report, accuracy_score

# Function to process data and return TF-IDF features plus additional features (URL length, special chars, keyword presence)
def process_data(urls, vectorizer):
    # Use TF-IDF features (vectorizer was trained with max_features=5000)
    X_test_real_tfidf = vectorizer.transform(urls).toarray()
    
    # Calculate additional features
    url_length = np.array([len(url) for url in urls])
    special_char_count = np.array([sum(1 for char in url if char in ['<', '>', '"', '&']) for url in urls])
    keyword_presence = np.array([1 if any(kw in url.lower() for kw in ['script', 'alert', 'img', 'onerror']) else 0 for url in urls])

    # Combine TF-IDF features with the additional features
    X_test_real_features = np.hstack([X_test_real_tfidf, 
                                      url_length.reshape(-1, 1), 
                                      special_char_count.reshape(-1, 1), 
                                      keyword_presence.reshape(-1, 1)])
    
    return X_test_real_features

# Test Model Function to count detected XSS payloads and show the percentage
def test_model(test_data_path, vectorizer_path='tfidf_vectorizer_with_custom_features.pkl', model_path='mlpc_xss_model_with_custom_features.pkl', threshold=0.95):
    print("Loading model and vectorizer")
    
    # Load the pre-trained MLPClassifier model
    model = joblib.load(model_path)
    
    # Load the vectorizer
    vectorizer = joblib.load(vectorizer_path)

    # Read the test data
    with open(test_data_path, 'r', encoding='utf-8', errors='ignore') as file:
        urls = [line.strip() for line in file if line.strip()]

    # Process the data (use TF-IDF features and additional features)
    X_test_real_features = process_data(urls, vectorizer)
    
    # Predict the probabilities using the MLPClassifier model
    y_pred_proba = model.predict_proba(X_test_real_features)[:, 1]
    
    # Apply the threshold to the predicted probabilities
    y_pred = (y_pred_proba >= threshold).astype(int)

    total_xss_detected = np.sum(y_pred)
    total_payloads = len(y_pred)

    # Calculate the percentage of XSS payloads detected
    percentage_detected = (total_xss_detected / total_payloads) * 100 if total_payloads > 0 else 0

    # Print the number and percentage of XSS detected
    print(f"Total XSS payloads detected: {total_xss_detected}/{total_payloads} ({percentage_detected:.2f}%)")
    
    return y_pred

# Main Execution
if __name__ == "__main__":
    # Path to the test file (all URLs in this file are XSS)
    test_data_path = r'C:\Users\Omen\Desktop\XSS\xxs_payloads_04.txt'
    
    # Call the function to count and print the number and percentage of detected XSS
    y_pred = test_model(
        test_data_path=test_data_path,  
        vectorizer_path='tfidf_vectorizer_with_custom_features.pkl',     
        model_path='mlpc_xss_model_with_custom_features.pkl', 
        threshold=0.95  # Adjust the threshold as needed
    )


Loading model and vectorizer
Total XSS payloads detected: 42186/42760 (98.66%)


Test with prefetch with multiprocessing

In [14]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
import joblib

# Function to calculate additional features
def calculate_features(urls):
    url_length = np.array([len(url) for url in urls])
    special_char_count = np.array([sum(1 for char in url if char in ['<', '>', '"', '&']) for url in urls])
    keyword_presence = np.array([1 if any(kw in url.lower() for kw in ['script', 'alert', 'img', 'onerror']) else 0 for url in urls])
    return url_length, special_char_count, keyword_presence

# Function to process data and return TF-IDF features plus additional features
def process_data(urls, vectorizer):
    # Use TF-IDF features with max_features=5000 to ensure it matches the model
    X_test_real_tfidf = vectorizer.transform(urls).toarray()
    
    # Calculate additional features
    url_length, special_char_count, keyword_presence = calculate_features(urls)

    # Combine TF-IDF features with the additional features (to total 5003)
    X_test_real_features = np.hstack([X_test_real_tfidf, 
                                      url_length.reshape(-1, 1), 
                                      special_char_count.reshape(-1, 1), 
                                      keyword_presence.reshape(-1, 1)])  # 3 custom features
    
    return X_test_real_features

# Worker function for processing batches of data in parallel with thresholding
def worker_process(url_chunk, vectorizer, model, threshold=0.5):
    try:
        # Process the chunk of URLs and get predictions
        X_test_real_features = process_data(url_chunk, vectorizer)
        # Get probability predictions from the model
        y_pred_proba = model.predict_proba(X_test_real_features)[:, 1]
        # Apply threshold
        predictions = (y_pred_proba >= threshold).astype(int)
        return np.sum(predictions), len(predictions)  # Return number of XSS detected and total
    except Exception as e:
        print(f"Error in worker process: {e}")
        return 0, 0

# Test Model Function with multithreading, batch processing, and threshold
def test_model(test_data_path, vectorizer_path='tfidf_vectorizer.pkl', model_path='mlpc_xss_model_with_custom_features.pkl', num_threads=16, chunk_size=10000, threshold=0.95):
    print("Loading model and vectorizer")
    
    # Load the pre-trained model
    model = joblib.load(model_path)  # Assuming you're using a scikit-learn MLP model
    
    # Load the vectorizer
    vectorizer = joblib.load(vectorizer_path)

    # Initialize counts for detected XSS and total payloads
    total_xss_detected = 0
    total_payloads = 0

    futures = []

    # Open the test data file
    with open(test_data_path, 'r', encoding='utf-8', errors='ignore') as file:
        urls = [line.strip() for line in file if line.strip()]
    
    # Use multithreading to process the data in parallel
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for i in range(0, len(urls), chunk_size):
            batch_chunk = urls[i:i + chunk_size]
            # Submit the batch chunk for processing
            futures.append(executor.submit(worker_process, batch_chunk, vectorizer, model, threshold))
        
        # Collect results as they complete
        for future in as_completed(futures):
            xss_detected, payloads = future.result()
            total_xss_detected += xss_detected
            total_payloads += payloads

    # Calculate the final percentage of XSS detected
    if total_payloads > 0:
        percentage_detected = (total_xss_detected / total_payloads) * 100
    else:
        percentage_detected = 0
    
    # Print the total number and percentage of XSS detected
    print(f"Total XSS payloads detected: {total_xss_detected}/{total_payloads} ({percentage_detected:.2f}%)")

# Main Execution
if __name__ == "__main__":
    # Path to the test file (all URLs in this file are XSS or normal)
    test_data_path = r'C:\Users\Omen\Desktop\XSS\Test_Dataset\For Speed\XSS100k.txt'
    
    # Call the function to count and print the number and percentage of detected XSS
    test_model(
        test_data_path=test_data_path,  
        vectorizer_path='tfidf_vectorizer_with_custom_features.pkl',     
        model_path='mlpc_xss_model_with_custom_features.pkl', 
        num_threads=16,  # Number of threads for parallel processing
        chunk_size=10000,  # Process in chunks to manage memory
        threshold=0.95  # Set your threshold here
    )


Loading model and vectorizer
Total XSS payloads detected: 98700/100000 (98.70%)
