In [2]:
# Load the data from the three CSV files - normalTrafficTraining.csv, normalTrafficTest.csv, and anomalousTrafficTest.csv
import pandas as pd
from sklearn.model_selection import train_test_split

normalTrafficTraining = pd.read_csv('normalTrafficTraining.csv')
normalTrafficTest = pd.read_csv('normalTrafficTest.csv')
anomalousTrafficTest = pd.read_csv('anomalousTrafficTest.csv')

# Add a column to each dataset to indicate the type of traffic
normalTrafficTraining['class'] = 0
normalTrafficTest['class'] = 0
anomalousTrafficTest['class'] = 1

# Combine the normalTrafficTraining and normalTrafficTest datasets
normalTraffic = pd.concat([normalTrafficTraining, normalTrafficTest])

# Combine the normalTraffic and anomalousTrafficTest datasets
allTraffic = pd.concat([normalTraffic, anomalousTrafficTest])

# Save the combined dataset to a new CSV file
# allTraffic.to_csv('allTraffic.csv', index=False)


In [3]:
allTraffic = pd.read_csv('allTraffic.csv')
# Convert 'Method' to category (or numeric: 0 for GET, 1 for POST)
allTraffic['Method'] = allTraffic['Method'].astype('category')

# Convert 'URL' and 'Cookie' to string
allTraffic['URL'] = allTraffic['URL'].astype(str)
allTraffic['Cookie'] = allTraffic['Cookie'].astype(str)

# Convert 'ContentLen', 'ReqLen', 'MaxByteValReq' to integers or floats
allTraffic['ContentLen'] = allTraffic['ContentLen'].astype(int, errors='ignore')
allTraffic['ReqLen'] = allTraffic['ReqLen'].astype(int, errors='ignore')
allTraffic['MaxByteValReq'] = allTraffic['MaxByteValReq'].astype(float, errors='ignore')

# Convert 'Payload' to string
allTraffic['Payload'] = allTraffic['Payload'].astype(str)

# Convert 'ArgLen', 'NumArgs', 'NumDigitsArgs', 'PathLen', 'NumLettersArgs', 
# 'NumLettersPath', 'NumSpecialCharsPath' to integers
allTraffic['ArgLen'] = allTraffic['ArgLen'].astype(int)
allTraffic['NumArgs'] = allTraffic['NumArgs'].astype(int)
allTraffic['NumDigitsArgs'] = allTraffic['NumDigitsArgs'].astype(int)
allTraffic['PathLen'] = allTraffic['PathLen'].astype(int)
allTraffic['NumLettersArgs'] = allTraffic['NumLettersArgs'].astype(int)
allTraffic['NumLettersPath'] = allTraffic['NumLettersPath'].astype(int)
allTraffic['NumSpecialCharsPath'] = allTraffic['NumSpecialCharsPath'].astype(int)

# Convert 'class' to a categorical or integer type for classification (0 for normal, 1 for anomalous)
allTraffic['class'] = allTraffic['class'].astype('category')

allTraffic['Content_present'] = allTraffic['ContentLen'].notnull().astype(int)

# Print the allTraffic types to verify
print(allTraffic.dtypes)


Method                 category
URL                      object
Cookie                   object
ContentLen              float64
Payload                  object
ReqLen                    int64
ArgLen                    int64
NumArgs                   int64
NumDigitsArgs             int64
PathLen                   int64
NumLettersArgs            int64
NumLettersPath            int64
NumSpecialCharsPath       int64
MaxByteValReq           float64
class                  category
Content_present           int64
dtype: object


In [4]:
# Fill in missing values with the mean of the column for numeric columns
for column in allTraffic.select_dtypes(include=['int', 'float']):
    allTraffic[column].fillna(allTraffic[column].mean(), inplace=True)
    
# Fill in missing values with the mode of the column for categorical columns
for column in allTraffic.select_dtypes(include=['object', 'category']):
    allTraffic[column].fillna(allTraffic[column].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  allTraffic[column].fillna(allTraffic[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  allTraffic[column].fillna(allTraffic[column].mode()[0], inplace=True)


## Oversampling 
### SMOTE
Have same number of normal and anamalous points

In [4]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separate features and target
X = allTraffic.drop(columns=['class'])  # Replace 'class' with the actual name if different
y = allTraffic['class']

# Encode categorical variables
categorical_features = ['URL', 'Cookie', 'Payload']
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Apply SMOTE on the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Decode the categorical variables
for col in categorical_features:
    X_resampled[col] = label_encoders[col].inverse_transform(X_resampled[col])
    # X_test[col] = label_encoders[col].inverse_transform(X_test[col])
print("Original training target distribution:\n", y.value_counts())
print("Resampled training target distribution:\n", y_resampled.value_counts())


Original training target distribution:
 class
0    72000
1    24668
Name: count, dtype: int64
Resampled training target distribution:
 class
0    72000
1    72000
Name: count, dtype: int64


In [5]:
X_resampled

Unnamed: 0,Method,URL,Cookie,ContentLen,Payload,ReqLen,ArgLen,NumArgs,NumDigitsArgs,PathLen,NumLettersArgs,NumLettersPath,NumSpecialCharsPath,MaxByteValReq,Content_present
0,0,http://localhost:8080/tienda1/index.jsp,1F767F17239C9B670A39E9B10C3825F4,103.397266,,39,0,0,0,39,0,27,7,120.000000,0
1,0,http://localhost:8080/tienda1/publico/anadir.jsp,81761ACA043B0E6014CA42A4BCD06AB5,103.397266,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,116,35,5,7,48,24,35,8,117.000000,0
2,1,http://localhost:8080/tienda1/publico/anadir.jsp,933185092E0B668B90676E0A2B0767AF,68.000000,id=3&nombre=Vino+Rioja&precio=100&cantidad=55&...,116,35,5,7,48,24,35,8,117.000000,1
3,0,http://localhost:8080/tienda1/publico/autentic...,8FA18BA82C5336D03D3A8AFA3E68CBB0,103.397266,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,115,32,5,3,52,28,39,8,119.000000,0
4,1,http://localhost:8080/tienda1/publico/autentic...,7104E6C68A6BCF1423DAE990CE49FEE2,63.000000,modo=entrar&login=choong&pwd=d1se3ci%F3n&remem...,115,32,5,3,52,28,39,8,119.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143995,,http://localhost:8080/tienda1/publico/anadir.jsp,83CB0043BE8361E50FD42EB2430D418F,87.619160,id=2&nombre=Ques%2Bo+Manchego&precio=39&cantid...,154,73,5,11,48,48,35,8,117.000000,0
143996,,http://localhost:8080/tienda1/publico/pagar.js...,0B8E65A97A59643A5682654450BC886A,124.181689,modoA=entrar&login=villella&pwd=5e04ayo&rememb...,115,39,4,8,47,29,34,8,117.639117,0
143997,0,http://localhost:8080/tienda1/miembros/editar.jsp,DD7A96AAFD7BD4FE1478CFCA1E345BCF,103.397266,modo=registro&login=falicov&password=no3otr46&...,321,176,13,35,49,125,36,8,121.000000,0
143998,1,http://localhost:8080/tienda1/publico/registro...,6E5A3E649BB401F1F580EB31D4B34E6F,259.295256,modo=registro&login=dimitrio&password=a30t0na8...,309,162,13,37,50,114,37,8,121.704744,1


In [6]:
# Save the resampled data to a new CSV file
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

resampled_data.to_csv('combinedData_SMOTE_Oversampling.csv', index=False)

### ADASYN

In [7]:
from imblearn.over_sampling import ADASYN

# Apply ADASYN on the training data
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Decode the categorical variables
for col in categorical_features:
    X_resampled[col] = label_encoders[col].inverse_transform(X_resampled[col])
    # X_test[col] = label_encoders[col].inverse_transform(X_test[col])

print("Original training target distribution:\n", y.value_counts())
print("Resampled training target distribution:\n", y_resampled.value_counts())


Original training target distribution:
 class
0    72000
1    24668
Name: count, dtype: int64
Resampled training target distribution:
 class
0    72000
1    71646
Name: count, dtype: int64


In [8]:
# Save the resampled data to a new CSV file
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

resampled_data.to_csv('combinedData_ADASYN_Oversampling.csv', index=False)


## Undersampling
### Random Undersampling

In [9]:
from imblearn.under_sampling import RandomUnderSampler

# Random Undersampling
random_undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = random_undersampler.fit_resample(X, y)

# Decode the categorical variables
for col in categorical_features:
    X_resampled[col] = label_encoders[col].inverse_transform(X_resampled[col])
    # X_test[col] = label_encoders[col].inverse_transform(X_test[col])
    
print("Original training target distribution:\n", y.value_counts())
print("Resampled training target distribution:\n", y_resampled.value_counts())

Original training target distribution:
 class
0    72000
1    24668
Name: count, dtype: int64
Resampled training target distribution:
 class
0    24668
1    24668
Name: count, dtype: int64


In [10]:
# Save the resampled data to a new CSV file
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

resampled_data.to_csv('combinedData_RandomUndersampling.csv', index=False)

### NearMiss

In [11]:
from imblearn.under_sampling import NearMiss

# NearMiss Undersampling
nearmiss_undersampler = NearMiss(version=1)
X_resampled, y_resampled = nearmiss_undersampler.fit_resample(X, y)

# Decode the categorical variables
for col in categorical_features:
    X_resampled[col] = label_encoders[col].inverse_transform(X_resampled[col])
    # X_test[col] = label_encoders[col].inverse_transform(X_test[col])
    
print("Original training target distribution:\n", y.value_counts())
print("Resampled training target distribution:\n", y_resampled.value_counts())

Original training target distribution:
 class
0    72000
1    24668
Name: count, dtype: int64
Resampled training target distribution:
 class
0    24668
1    24668
Name: count, dtype: int64


In [12]:
# Save the resampled data to a new CSV file
resampled_data = pd.concat([X_resampled, y_resampled], axis=1)

resampled_data.to_csv('Data Augmentation/combinedData_NearMissUndersampling.csv', index=False)

## Ensemble Methods
### Random Forests

In [4]:
features = ['Method', 'ReqLen', 'ArgLen', 'NumArgs', 'NumDigitsArgs', 
            'PathLen', 'NumLettersArgs', 'NumLettersPath', 'NumSpecialCharsPath', 
            'MaxByteValReq', 'Content_present']

X = allTraffic[features]
y = allTraffic['class']


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)


In [5]:
oversampled_data = pd.read_csv('Data Augmentation/combinedData_SMOTE_Oversampling.csv')
oversampled_data['Content_present'] = oversampled_data['ContentLen'].notnull().astype(int)
X_train = oversampled_data.drop(columns=['class'])
X_train = X_train[features]
y_train = oversampled_data['class']

In [6]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [11]:
# Predict the target on the test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Metrics for the Balanced Random Forest Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion Matrix
confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10800
           1       0.98      0.98      0.98      3701

    accuracy                           0.99     14501
   macro avg       0.99      0.99      0.99     14501
weighted avg       0.99      0.99      0.99     14501

Accuracy: 0.9908282187435349
Precision: 0.9816414686825053
Recall: 0.9824371791407728
F1 Score: 0.9820391627278866


array([[10732,    68],
       [   65,  3636]])

In [8]:
# REmove nan values 
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [1]:
# SVM
from sklearn.svm import SVC
svm_model = SVC(random_state=42)
svm_model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [12]:
# Print 5 random samples from the test data and their predictions
import random
random.seed(42)
random_indices = random.sample(range(0, len(X_test)), 5)
for i in random_indices:
    print('Sample:', X_test.iloc[i])
    print("Actual:", y_test.iloc[i], "Predicted:", y_pred[i])
    

Sample: Method                   1.0
ReqLen                 304.0
ArgLen                 158.0
NumArgs                 13.0
NumDigitsArgs           45.0
PathLen                 50.0
NumLettersArgs         100.0
NumLettersPath          37.0
NumSpecialCharsPath      8.0
MaxByteValReq          122.0
Content_present          1.0
Name: 53331, dtype: float64
Actual: 0 Predicted: 0
Sample: Method                   0.0
ReqLen                  44.0
ArgLen                   0.0
NumArgs                  0.0
NumDigitsArgs            0.0
PathLen                 44.0
NumLettersArgs           0.0
NumLettersPath          30.0
NumSpecialCharsPath      8.0
MaxByteValReq          116.0
Content_present          0.0
Name: 65299, dtype: float64
Actual: 0 Predicted: 0
Sample: Method                   1.0
ReqLen                 116.0
ArgLen                  35.0
NumArgs                  5.0
NumDigitsArgs            7.0
PathLen                 48.0
NumLettersArgs          24.0
NumLettersPath          35.0
NumS

### Balanced Random Forests

In [40]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Train a Balanced Random Forest Classifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
brf.fit(X_train, y_train)


  warn(
  warn(
  warn(


In [41]:
# Predict the target on the test data
y_pred = brf.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Metrics for the Balanced Random Forest Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion Matrix
confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10800
           1       0.98      0.98      0.98      3701

    accuracy                           0.99     14501
   macro avg       0.99      0.99      0.99     14501
weighted avg       0.99      0.99      0.99     14501

Accuracy: 0.9908282187435349
Precision: 0.9800861141011841
Recall: 0.9840583626047015
F1 Score: 0.9820682216529594


array([[10726,    74],
       [   59,  3642]])

### Easy Ensemble

In [15]:
from imblearn.ensemble import EasyEnsembleClassifier

# Train an Easy Ensemble Classifier
eec = EasyEnsembleClassifier(n_estimators=10, random_state=42)

eec.fit(X_train, y_train)

In [16]:
# Predict the target on the test data
y_pred = eec.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Metrics for the Balanced Random Forest Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion Matrix
confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.92      0.74      0.82     10800
           1       0.52      0.82      0.64      3701

    accuracy                           0.76     14501
   macro avg       0.72      0.78      0.73     14501
weighted avg       0.82      0.76      0.78     14501

Accuracy: 0.761602648093235
Precision: 0.5209118957833391
Recall: 0.8211294244798703
F1 Score: 0.63744100681699


array([[8005, 2795],
       [ 662, 3039]])

### RUSBoost

In [17]:
from imblearn.ensemble import RUSBoostClassifier

# Initialize the RUSBoost Classifier
rusboost = RUSBoostClassifier(n_estimators=50, random_state=42)
rusboost.fit(X_train, y_train)


In [18]:
# Predict the target on the test data
y_pred = rusboost.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Metrics for the Balanced Random Forest Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion Matrix
confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.97      0.74      0.84     10800
           1       0.55      0.93      0.69      3701

    accuracy                           0.79     14501
   macro avg       0.76      0.84      0.77     14501
weighted avg       0.86      0.79      0.80     14501

Accuracy: 0.7903592855665127
Precision: 0.5528715405535114
Recall: 0.9338016752229127
F1 Score: 0.6945337620578779


array([[8005, 2795],
       [ 245, 3456]])

### Balanced Bagging

In [19]:
from imblearn.ensemble import BalancedBaggingClassifier

# Initialize the Balanced Bagging Classifier
bbc = BalancedBaggingClassifier(n_estimators=50, random_state=42)

bbc.fit(X_train, y_train)

In [28]:
# Predict the target on the test data
y_pred = bbc.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Metrics for the Balanced Random Forest Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion Matrix
confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.99      0.96      0.97     10800
           1       0.90      0.96      0.93      3701

    accuracy                           0.96     14501
   macro avg       0.94      0.96      0.95     14501
weighted avg       0.96      0.96      0.96     14501

Accuracy: 0.9607613268050479
Precision: 0.8956543708943911
Recall: 0.9578492299378546
F1 Score: 0.9257083170126649


array([[10387,   413],
       [  156,  3545]])

### XGBoost with Scale_Pos_Weight

In [26]:
# List of categorical columns
categorical_cols = ['Method', 'Content_present']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to categorical columns
for col in categorical_cols:
    X_train[col] = label_encoder.fit_transform(X_train[col].astype(str))
    X_test[col] = label_encoder.transform(X_test[col].astype(str))  # Use the same encoder for test data

X_train.head()

Unnamed: 0,Method,ReqLen,ArgLen,NumArgs,NumDigitsArgs,PathLen,NumLettersArgs,NumLettersPath,NumSpecialCharsPath,MaxByteValReq,Content_present
8063,0,56,0,0,0,56,0,43,8,117.0,0
11282,0,287,141,13,35,50,98,37,8,122.0,0
2227,0,44,0,0,0,44,0,30,8,116.0,0
13728,0,60,3,1,1,48,1,35,8,117.0,0
24697,0,115,34,5,6,48,24,35,8,117.0,0


In [27]:
from xgboost import XGBClassifier

# Calculate class weight
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# Initialize XGBoost with adjusted class weight
xgb = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
xgb.fit(X_train, y_train)



In [29]:
# Predict the target on the test data
y_pred = xgb.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Metrics for the Balanced Random Forest Classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# Confusion Matrix
confusion_matrix(y_test, y_pred)


              precision    recall  f1-score   support

           0       0.99      0.93      0.96     10800
           1       0.82      0.96      0.88      3701

    accuracy                           0.94     14501
   macro avg       0.90      0.95      0.92     14501
weighted avg       0.94      0.94      0.94     14501

Accuracy: 0.9355216881594373
Precision: 0.8161865569272977
Recall: 0.9646041610375574
F1 Score: 0.8842105263157894


array([[9996,  804],
       [ 131, 3570]])