In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('CloudWatch_Traffic_Web_Attack.csv')

# Data Cleaning

# Handle missing values
data.fillna(method='ffill', inplace=True)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Filter irrelevant columns
data.drop(columns=['rule_names', 'observation_name', 'source.meta', 'source.name', 'detection_types'], inplace=True)

# Timestamp Conversion
data['creation_time'] = pd.to_datetime(data['creation_time'])
data['end_time'] = pd.to_datetime(data['end_time'])
data['time'] = pd.to_datetime(data['time'])

# Extract additional time-based features
data['creation_day_of_week'] = data['creation_time'].dt.dayofweek
data['creation_hour'] = data['creation_time'].dt.hour
data['end_day_of_week'] = data['end_time'].dt.dayofweek
data['end_hour'] = data['end_time'].dt.hour

# Feature Extraction
features = ['bytes_in', 'bytes_out', 'protocol', 'dst_port', 'response.code', 'creation_day_of_week', 'creation_hour', 'end_day_of_week', 'end_hour']
X = data[features]

# Data Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print the first few rows of the preprocessed data
print(pd.DataFrame(X_scaled, columns=features).head())


# Data Collection and preprocessing

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('CloudWatch_Traffic_Web_Attack.csv')

# Handle missing values
data.fillna(method='ffill', inplace=True)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Filter irrelevant columns
data.drop(columns=['rule_names', 'observation_name', 'source.meta', 'source.name', 'detection_types'], inplace=True)


In [3]:
# Timestamp Conversion
data['creation_time'] = pd.to_datetime(data['creation_time'])
data['end_time'] = pd.to_datetime(data['end_time'])
data['time'] = pd.to_datetime(data['time'])

# Extract additional time-based features
data['creation_day_of_week'] = data['creation_time'].dt.dayofweek
data['creation_hour'] = data['creation_time'].dt.hour
data['end_day_of_week'] = data['end_time'].dt.dayofweek
data['end_hour'] = data['end_time'].dt.hour

In [5]:
# Feature Extraction
features = ['bytes_in', 'bytes_out', 'dst_port', 'response.code', 'creation_day_of_week', 'creation_hour', 'end_day_of_week', 'end_hour']  # Removed 'protocol'
X = data[features]

# Data Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(pd.DataFrame(X_scaled, columns=features).head())

   bytes_in  bytes_out  dst_port  response.code  creation_day_of_week  \
0 -0.288219  -0.281223       0.0            0.0             -1.965215   
1 -0.282108  -0.260804       0.0            0.0             -1.965215   
2 -0.282689  -0.279344       0.0            0.0             -1.965215   
3 -0.282197  -0.276161       0.0            0.0             -1.965215   
4 -0.287996  -0.277678       0.0            0.0             -1.965215   

   creation_hour  end_day_of_week  end_hour  
0       1.766389         -2.18062  1.918236  
1       1.766389         -2.18062  1.918236  
2       1.766389         -2.18062  1.918236  
3       1.766389         -2.18062  1.918236  
4       1.766389         -2.18062  1.918236  


In [6]:
#Derived Features
data['packet_size'] = data['bytes_in'] + data['bytes_out']

# Time intervals between packets (assuming data is sorted by creation_time)
data.sort_values(by='creation_time', inplace=True)
data['time_interval'] = data['creation_time'].diff().dt.total_seconds().fillna(0)

# Aggregated statistics over fixed intervals (e.g., 1 minute)
data.set_index('creation_time', inplace=True)
data['bytes_in_mean'] = data['bytes_in'].rolling('1T').mean().fillna(0)
data['bytes_out_mean'] = data['bytes_out'].rolling('1T').mean().fillna(0)
data['bytes_in_max'] = data['bytes_in'].rolling('1T').max().fillna(0)
data['bytes_out_max'] = data['bytes_out'].rolling('1T').max().fillna(0)
data.reset_index(inplace=True)


In [8]:

# Behavioral Features

# Number of connections per source IP (behavioral feature)
data['connections_per_ip'] = data.groupby('src_ip')['src_ip'].transform('count')

# Ratio of incoming to outgoing bytes (behavioral feature)
data['bytes_ratio'] = data['bytes_in'] / (data['bytes_out'] + 1)  # Add 1 to avoid division by zero

# Frequency of response codes (behavioral feature)
data['response_code_freq'] = data.groupby('response.code')['response.code'].transform('count')

# Feature Extraction
features = ['bytes_in', 'bytes_out', 'dst_port', 'response.code',
            'creation_day_of_week', 'creation_hour', 'end_day_of_week', 'end_hour',
            'packet_size', 'time_interval', 'bytes_in_mean', 'bytes_out_mean',
            'bytes_in_max', 'bytes_out_max', 'connections_per_ip', 'bytes_ratio',
            'response_code_freq']
X = data[features]

# Data Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Print the first few rows of the preprocessed data
print(pd.DataFrame(X_scaled, columns=features).head())

   bytes_in  bytes_out  dst_port  response.code  creation_day_of_week  \
0 -0.288219  -0.281223       0.0            0.0             -1.965215   
1 -0.282108  -0.260804       0.0            0.0             -1.965215   
2 -0.282689  -0.279344       0.0            0.0             -1.965215   
3 -0.282197  -0.276161       0.0            0.0             -1.965215   
4 -0.287996  -0.277678       0.0            0.0             -1.965215   

   creation_hour  end_day_of_week  end_hour  packet_size  time_interval  \
0       1.766389         -2.18062  1.918236    -0.287850      -0.104119   
1       1.766389         -2.18062  1.918236    -0.280910      -0.104119   
2       1.766389         -2.18062  1.918236    -0.282531      -0.104119   
3       1.766389         -2.18062  1.918236    -0.281883      -0.104119   
4       1.766389         -2.18062  1.918236    -0.287435      -0.104119   

   bytes_in_mean  bytes_out_mean  bytes_in_max  bytes_out_max  \
0      -0.694129       -0.679348     -0.84894

# Anomaly detection algorithms (e.g., Isolation Forest,
Autoencoders)

In [9]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
# Isolation Forest
iso_forest = IsolationForest(contamination=0.05)
iso_forest.fit(X_scaled)
anomalies_if = iso_forest.predict(X_scaled)
data['anomaly_if'] = anomalies_if

# Autoencoder
def create_autoencoder(input_dim):
    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=input_dim))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(input_dim, activation='sigmoid'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

input_dim = X_scaled.shape[1]
autoencoder = create_autoencoder(input_dim)

# Train the autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, validation_split=0.1, verbose=1)

# Reconstruction error
reconstructions = autoencoder.predict(X_scaled)
reconstruction_error = K.mean(K.square(reconstructions - X_scaled), axis=1)
threshold = K.mean(reconstruction_error) + 2 * K.std(reconstruction_error)
anomalies_ae = (reconstruction_error > threshold).numpy()
data['anomaly_ae'] = anomalies_ae

# Print the first few rows of anomalies
print(data[['anomaly_if', 'anomaly_ae']].head())

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
   anomaly_if  anomaly_ae
0           1       False
1           1       False
2           1       False
3           1       False
4           1       False


# Evaluation and validation of detected anomalies

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

data['true_labels'] = 0
data.loc[50:100, 'true_labels'] = 1

# Evaluate using the created true labels
y_true = data['true_labels']

# Isolation Forest evaluation
print("Isolation Forest Evaluation")
print(confusion_matrix(y_true, data['anomaly_if']))
print(classification_report(y_true, data['anomaly_if']))

# Autoencoder evaluation
print("Autoencoder Evaluation")
print(confusion_matrix(y_true, data['anomaly_ae']))
print(classification_report(y_true, data['anomaly_ae']))

Isolation Forest Evaluation
[[  0   0   0]
 [ 15   0 216]
 [  0   0  51]]
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00       231
           1       0.19      1.00      0.32        51

    accuracy                           0.18       282
   macro avg       0.06      0.33      0.11       282
weighted avg       0.03      0.18      0.06       282

Autoencoder Evaluation
[[226   5]
 [ 51   0]]
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       231
           1       0.00      0.00      0.00        51

    accuracy                           0.80       282
   macro avg       0.41      0.49      0.44       282
weighted avg       0.67      0.80      0.73       282



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
