In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib

# Load the dataset from a .csv file (adjust the path for your environment).
# latin1 can decode any byte sequence; presumably chosen because the file is
# not valid UTF-8 -- TODO confirm.
df = pd.read_csv('/home/saja/algorithms/XSS_enhanced_dataset.csv', encoding='latin1')

# Feature columns fed to the model, and the column holding the class label.
features = ['Length', 'Tag_Count', 'Special_Char_Count', 'JS_Keyword_Count']
target = 'Label'

# Split the dataset into features (X) and target (y)
X = df[features]
y = df[target]

# Hold out 20% of the rows for testing (80% training). The split is done
# BEFORE scaling so that the scaler never sees the held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. The classifier has to be trained on the scaled values: the
# fitted scaler is persisted next to the model further down, so the model and
# the scaler must describe the same feature space.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled view of the full feature matrix, kept under its original name for any
# later cell that still refers to it. It is not used for training.
X_scaled = scaler.transform(X)

# Initialize the Random Forest Classifier (fixed seed for reproducible runs)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the scaled training set
rf_classifier.fit(X_train, y_train)

# Make predictions on the scaled test set
y_pred = rf_classifier.predict(X_test)

# Headline accuracy followed by the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Confusion matrix, flattened row by row. Unpacking into exactly four names
# only works when the matrix is 2x2, i.e. for a two-class problem.
cm = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = cm.ravel()
print("Confusion Matrix:\n")
for cell_name, cell_count in (
    ("True Negative (TN)", TN),
    ("False Positive (FP)", FP),
    ("False Negative (FN)", FN),
    ("True Positive (TP)", TP),
):
    print(f"{cell_name}: {cell_count}")

# Per-feature importance reported by the fitted forest, paired with the
# feature names in the order they were listed for training.
feature_importances = rf_classifier.feature_importances_
for column, weight in zip(features, feature_importances):
    print(f"Feature: {column}, Importance: {weight:.4f}")



# Persist the trained classifier to disk with joblib.
joblib.dump(rf_classifier, '/home/saja/algorithms/Models/random_forest_model.pkl')
# Disabled: no `vectorizer` object is created in this cell, so there is nothing to save here.
# joblib.dump(vectorizer, '/home/saja/algorithms/Models/random_forest_vectorizer.pkl')
# Persist the fitted StandardScaler alongside the model.
# NOTE(review): presumably the consumer loads both files and scales inputs
# before predicting -- confirm that the model was trained on scaled features.
joblib.dump(scaler, '/home/saja/algorithms/Models/random_forest_scaler.pkl')


Accuracy: 0.9973983739837399

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3872
           1       1.00      0.99      1.00      2278

    accuracy                           1.00      6150
   macro avg       1.00      1.00      1.00      6150
weighted avg       1.00      1.00      1.00      6150

Confusion Matrix:

True Negative (TN): 3870
False Positive (FP): 2
False Negative (FN): 14
True Positive (TP): 2264
Feature: query_len, Importance: 0.0396
Feature: num_words_query, Importance: 0.0809
Feature: no_single_qts, Importance: 0.0284
Feature: no_double_qts, Importance: 0.0320
Feature: no_punct, Importance: 0.1322
Feature: no_single_cmnt, Importance: 0.0611
Feature: no_mult_cmnt, Importance: 0.0006
Feature: no_space, Importance: 0.0663
Feature: no_perc, Importance: 0.0031
Feature: no_log_opt, Importance: 0.0798
Feature: no_arith, Importance: 0.0391
Feature: no_null, Importance: 0.0131
Feature: no_hexa, I