In [3]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score , confusion_matrix

# Output locations for the trained model, the TF-IDF vectorizer, and the scaler
model_path = '/home/haneen/GP-latest/Models/DecisionTree_xss_model.pkl'
vectorizer_path = '/home/haneen/GP-latest/Models/DecisionTree_xss_vectorizer.pkl'
scaler_path = '/home/haneen/GP-latest/Models/DecisionTree_xss_scaler.pkl'

# Step 1: Read the CSV into a DataFrame and print its first rows as a sanity check
dataset_path = '/home/haneen/GP-latest/Dataset/XSS_enhanced_dataset.csv'
data = pd.read_csv(dataset_path, encoding='latin1')
preview = data.head()
print(preview)

# Step 2: Separate the raw inputs from the target
X_text = data['Sentence'].astype(str)  # Ensure text is string format
y = data['Label']  # Target labels
# Every column other than the text and the label is treated as a numeric feature.
# NOTE(review): StandardScaler needs these to be numeric and NaN-free -- confirm
# against the dataset schema.
numeric_features = data.drop(columns=['Sentence', 'Label'])

# Step 3: Split into training and testing sets BEFORE fitting any transformer.
# Fitting the vectorizer/scaler on the full dataset would let test-set vocabulary,
# IDF weights and mean/variance leak into training and inflate the test metrics.
# The same test_size and random_state as before are used, and all three inputs
# are split in one call so their rows stay aligned.
(
    X_text_train, X_text_test,
    numeric_train, numeric_test,
    y_train, y_test,
) = train_test_split(X_text, numeric_features, y, test_size=0.2, random_state=42)

# Step 4: Process text data with TfidfVectorizer (fit on the training text only)
vectorizer = TfidfVectorizer(max_features=10000)
X_text_train_tfidf = vectorizer.fit_transform(X_text_train)
X_text_test_tfidf = vectorizer.transform(X_text_test)

# Step 5: Process numerical features (statistics learned from the training rows only)
scaler = StandardScaler()
numeric_train_scaled = scaler.fit_transform(numeric_train)
numeric_test_scaled = scaler.transform(numeric_test)

# Step 6: Combine text and numeric features column-wise.
# The TF-IDF matrices are densified so they can be stacked with the scaled arrays.
X_train = np.hstack((X_text_train_tfidf.toarray(), numeric_train_scaled))
X_test = np.hstack((X_text_test_tfidf.toarray(), numeric_test_scaled))

# Step 7: Build the decision tree with a fixed seed and fit it on the training split
# (fit() returns the fitted estimator, so construction and training are chained)
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Step 8: Predict labels for the held-out test split
y_pred = clf.predict(X_test)

# Step 9: Evaluate the Model on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n")
if cm.shape == (2, 2):
    # Binary case: flattening the 2x2 matrix row by row gives TN, FP, FN, TP.
    TN, FP, FN, TP = cm.ravel()
    print(f"True Negative (TN): {TN}")
    print(f"False Positive (FP): {FP}")
    print(f"False Negative (FN): {FN}")
    print(f"True Positive (TP): {TP}")
else:
    # The matrix is not 2x2 (e.g. only one class present, or more than two labels).
    # Unpacking four values would raise ValueError and abort the run before the
    # artifacts are saved, so print the raw matrix instead.
    print(cm)

# Step 10: Persist the fitted model, vectorizer, and scaler to their configured paths
for fitted_object, destination in ((clf, model_path), (vectorizer, vectorizer_path)):
    joblib.dump(fitted_object, destination)
# Kept as a bare trailing expression so an interactive session still echoes its result.
joblib.dump(scaler, scaler_path)


                                               Query  Label  query_len  \
0                  " or pg_sleep  (  __time__  )  --      1         33   
1  create user name identified by pass123 tempora...      1         90   
2   and 1  =  utl_inaddr.get_host_address   (    ...      1        218   
3   select * from users where id  =  '1' or @ @1 ...      1         90   
4   select * from users where id  =  1 or 1#"  ( ...      1         85   

   num_words_query  no_single_qts  no_double_qts  no_punct  no_single_cmnt  \
0                7              0              1        10               1   
1               12              0              0         1               0   
2               35              3              0        25               0   
3               20              3              0        13               1   
4               18              0              1        10               1   

   no_mult_cmnt  no_space  no_perc  no_log_opt  no_arith  no_null  no_hexa  \
0       

['../Models/DecisionTree_scaler.pkl']