In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# Load your dataset
file_path = '/kaggle/input/33-dataset/UNSW_2018_IoT_Botnet_Dataset_33.csv'
df = pd.read_csv(file_path)

# Display initial information about the dataset
print("Initial dataset:")
print(df.head())
print("Dataset info:")
print(df.info())

# Separate features and target
X = df.drop('attack', axis=1)  # Drop the target column 'attack' to get features
y = df['attack']  # Target column indicating attack (1) or normal (0)

# Identify categorical features
categorical_features = ['flgs', 'proto', 'state', 'subcategory ']  # Add other categorical features as needed

# Encode categorical features
label_encoder = LabelEncoder()
for feature in categorical_features:
    if feature in X.columns:
        X[feature] = label_encoder.fit_transform(X[feature].astype(str))

# Handle missing values
X = X.fillna(0)  # Fill missing values with 0 or appropriate values

# Convert all remaining data to numeric (if necessary)
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Perform oversampling
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target into a single DataFrame
df_resampled = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['attack'])], axis=1)

# Save the resampled dataset to a new CSV file
output_path = '/kaggle/working/resample.csv'
df_resampled.to_csv(output_path, index=False)

print("Resampled dataset saved to:", output_path)


  df = pd.read_csv(file_path)


Initial dataset:
    pkSeqID       stime flgs proto            saddr  sport          daddr  \
0  32000001  1528087048    e   udp  192.168.100.149  21451  192.168.100.6   
1  32000002  1528087048    e   udp  192.168.100.149  21452  192.168.100.6   
2  32000003  1528087048    e   udp  192.168.100.149  21453  192.168.100.6   
3  32000004  1528087048    e   udp  192.168.100.149  21454  192.168.100.6   
4  32000005  1528087048    e   udp  192.168.100.149  21455  192.168.100.6   

  dport  pkts  bytes  ... spkts  dpkts  sbytes  dbytes     rate    srate  \
0    80     2    120  ...     2      0     120       0  0.32448  0.32448   
1    80     2    120  ...     2      0     120       0  0.32448  0.32448   
2    80     2    120  ...     2      0     120       0  0.32448  0.32448   
3    80     2    120  ...     2      0     120       0  0.32448  0.32448   
4    80     2    120  ...     2      0     120       0  0.32448  0.32448   

   drate  attack  category  subcategory   
0    0.0       1    

In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('/kaggle/working/resample.csv')

# Select features and target
X = df[[
    'pkts',        # Total count of packets in transaction
    'bytes',       # Total number of bytes in transaction
    'dur',         # Total duration of transaction
    'mean',        # Average duration of aggregated records
    'stddev',      # Standard deviation of aggregated records
    'rate',        # Total packets per second in transaction
    'srate',       # Source-to-destination packets per second
    'drate',       # Destination-to-source packets per second
    'category',    # Traffic category
    'subcategory '  # Traffic subcategory
]]
y = df['attack']

# Initialize LabelEncoders for categorical features
label_encoders = {}
categorical_columns = ['category', 'subcategory ']

# Apply Label Encoding
for column in categorical_columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column].astype(str))  # Convert to string for consistent encoding
    label_encoders[column] = le

# Convert to numpy array
X = X.values

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column].astype(str))  # Convert to string for consistent encoding
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column].astype(str))  # Convert to string for consistent encoding


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [26]:
from pyod.models.iforest import IForest
iforest_clf = IForest()
iforest_clf.fit(X_train)

IForest(behaviour='old', bootstrap=False, contamination=0.1, max_features=1.0,
    max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
    verbose=0)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

iforest_pred = iforest_clf.predict(X_test)
print(classification_report(y_test, iforest_pred, digits=2))

              precision    recall  f1-score   support

           0       0.44      0.80      0.57    200045
           1       0.00      0.00      0.00    199945

    accuracy                           0.40    399990
   macro avg       0.22      0.40      0.29    399990
weighted avg       0.22      0.40      0.29    399990



In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Initialize the neural network
model = Sequential()

# Input layer and first hidden layer
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))

# Additional hidden layers
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [36]:
model.fit(X_train, y_train, epochs=5, batch_size=4096, validation_split=0.1)

Epoch 1/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 2.0382e-11 - val_accuracy: 1.0000 - val_loss: 1.9322e-11
Epoch 2/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 1.9296e-11 - val_accuracy: 1.0000 - val_loss: 1.8385e-11
Epoch 3/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 1.8382e-11 - val_accuracy: 1.0000 - val_loss: 1.7499e-11
Epoch 4/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 1.0000 - loss: 1.7470e-11 - val_accuracy: 1.0000 - val_loss: 1.6660e-11
Epoch 5/5
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 1.0000 - loss: 1.6673e-11 - val_accuracy: 1.0000 - val_loss: 1.5871e-11


<keras.src.callbacks.history.History at 0x7cebd97df100>