In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load the dataset
df = pd.read_csv("dataset.csv")

# Select relevant columns for training
selected_columns = [
    "Active Mean", "Total Fwd Packets", "tot_dur", "flows", "pktrate", "tx_bytes", "rx_bytes", "tx_kbps", "rx_kbps", "tot_kbps"
]
X = df[selected_columns]
y = df["label"]

# Split BEFORE scaling so the scaler below is fitted on training rows only;
# fitting it on the full frame would leak the held-out rows' min/max into
# the preprocessing. stratify=y keeps the class ratio equal in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Normalize the feature data: learn min/max from the training split, then
# apply the same mapping to the test split (test values may land slightly
# outside [0, 1], which is expected).
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Reshape the data for LSTM (samples, timesteps, features); every row is
# fed to the network as a sequence of length 1.
n_features = X_train_scaled.shape[1]
X_train = X_train_scaled.reshape((-1, 1, n_features))
X_test = X_test_scaled.reshape((-1, 1, n_features))

# Build LSTM model: three stacked LSTM layers (only the last one collapses
# the time axis) followed by a small dense head.
model = Sequential([
    LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True, activation='relu'),
    Dropout(0.3),
    LSTM(64, return_sequences=True, activation='relu'),
    Dropout(0.3),
    LSTM(32, return_sequences=False, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Assuming binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; the test split is only monitored here (val_* metrics)
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_test, y_test))

# Save the model
model.save("lstm_model.h5")

# Load the model for predictions (round-trips the file that was just written)
loaded_model = tf.keras.models.load_model("lstm_model.h5")

# Make a prediction for a new sample
def make_prediction(input_data):
    """Scale one raw sample and score it with the loaded LSTM model.

    Args:
        input_data: Sequence of raw feature values, one per entry of
            ``selected_columns`` and in the same order.

    Returns:
        A ``(features, score)`` tuple: ``features`` maps each name in
        ``selected_columns`` to the unscaled value that was passed in, and
        ``score`` is element ``[0][0]`` of ``loaded_model.predict``'s output.

    Raises:
        ValueError: If ``input_data`` does not contain exactly one value per
            selected column.
    """
    expected = len(selected_columns)
    if len(input_data) != expected:
        raise ValueError(
            f"Expected {expected} feature values ({selected_columns}), got {len(input_data)}"
        )

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list triggers sklearn's "X does not have
    # valid feature names" warning and skips its column-name check.
    sample = pd.DataFrame([input_data], columns=selected_columns)
    input_scaled = scaler.transform(sample)

    # (samples, timesteps, features) with a single sample and a single step
    input_reshaped = input_scaled.reshape((1, 1, expected))
    prediction = loaded_model.predict(input_reshaped)
    return dict(zip(selected_columns, input_data)), prediction[0][0]

# Example usage
new_sample = [100, 716000000, 1.01e11, 3, 451, 143928631, 3917, 0, 0.0, 0.0]
columns_with_values, result = make_prediction(new_sample)
print("Input Columns and Values:")
for column, value in columns_with_values.items():
    print(f"{column}: {value}")
# The network ends in a single sigmoid unit, so `result` is a score in
# [0, 1] rather than a class. Show the score and threshold it at 0.5 to
# obtain the actual 0/1 label.
print("Predicted Probability:", result)
print("Predicted Label:", int(result >= 0.5))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Load the dataset
df = pd.read_csv("dataset.csv")

# Select relevant columns for training
selected_columns = [
    "Active Mean", "Total Fwd Packets", "tot_dur", "flows", "pktrate", "tx_bytes", "rx_bytes", "tx_kbps", "rx_kbps", "tot_kbps"
]
X = df[selected_columns]
y = df["label"]

# Split BEFORE scaling so the scaler below is fitted on training rows only;
# fitting it on the full frame would leak the held-out rows' min/max into
# the preprocessing. stratify=y keeps the class ratio equal in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Normalize the feature data: learn min/max from the training split, then
# apply the same mapping to the test split (test values may land slightly
# outside [0, 1], which is expected).
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Reshape the data for LSTM (samples, timesteps, features); every row is
# fed to the network as a sequence of length 1.
n_features = X_train_scaled.shape[1]
X_train = X_train_scaled.reshape((-1, 1, n_features))
X_test = X_test_scaled.reshape((-1, 1, n_features))

# Build LSTM model with improvements: same stack as the baseline, plus an
# L2 penalty on the input kernels of every LSTM layer.
model = Sequential([
    LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    LSTM(64, return_sequences=True, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    LSTM(32, return_sequences=False, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Assuming binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Learning rate scheduler: cut the LR by 10x after 5 epochs without a
# val_loss improvement, never going below 1e-6.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=1e-6)

# Train the model with validation data and the learning rate scheduler callback.
# NOTE(review): the test split doubles as validation data here, so the LR
# schedule is steered by it; carve out a separate validation split if an
# unbiased test score is needed.
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[lr_scheduler])

# Save the model
model.save("improved_lstm_model.h5")

# Load the model for predictions (round-trips the file that was just written)
loaded_model = tf.keras.models.load_model("improved_lstm_model.h5")

# Make a prediction for a new sample
def make_prediction(input_data):
    """Scale one raw sample and score it with the loaded LSTM model.

    Args:
        input_data: Sequence of raw feature values, one per entry of
            ``selected_columns`` and in the same order.

    Returns:
        A ``(features, score)`` tuple: ``features`` maps each name in
        ``selected_columns`` to the unscaled value that was passed in, and
        ``score`` is element ``[0][0]`` of ``loaded_model.predict``'s output.

    Raises:
        ValueError: If ``input_data`` does not contain exactly one value per
            selected column.
    """
    expected = len(selected_columns)
    if len(input_data) != expected:
        raise ValueError(
            f"Expected {expected} feature values ({selected_columns}), got {len(input_data)}"
        )

    # The scaler was fitted on a DataFrame, so hand it a DataFrame with the
    # same column names; a bare list triggers sklearn's "X does not have
    # valid feature names" warning and skips its column-name check.
    sample = pd.DataFrame([input_data], columns=selected_columns)
    input_scaled = scaler.transform(sample)

    # (samples, timesteps, features) with a single sample and a single step
    input_reshaped = input_scaled.reshape((1, 1, expected))
    prediction = loaded_model.predict(input_reshaped)
    return dict(zip(selected_columns, input_data)), prediction[0][0]

# Example usage
new_sample = [100, 716000000, 1.01e11, 3, 451, 143928631, 3917, 0, 0.0, 0.0]
columns_with_values, result = make_prediction(new_sample)

# Output the results
print("Input Columns and Values:")
for column, value in columns_with_values.items():
    print(f"{column}: {value}")
# The network ends in a single sigmoid unit, so `result` is a score in
# [0, 1] rather than a class. Show the score and threshold it at 0.5 to
# obtain the actual 0/1 label.
print("Predicted Probability:", result)
print("Predicted Label:", int(result >= 0.5))


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

# Load dataset
df = pd.read_csv('dataset.csv')

# Verify column names (to ensure no trailing spaces or typos)
print("Column names in the dataset:")
print(df.columns.tolist())

# Drop redundant/irrelevant columns. 'Destination IP ' and
# 'Destination Port ' are spelled with a trailing space on purpose: that is
# how they appear in the CSV header.
columns_to_drop = ['Source IP', 'Source Port', 'src', 'dst', 'Destination IP ', 
                   'Destination Port ', 'tx_kbps', 'rx_kbps', 'tot_kbps', 'port_no']
df = df.drop(columns=columns_to_drop)

# Encode categorical 'Protocol' column (first category dropped as baseline)
encoder = OneHotEncoder(drop='first')
protocol_encoded = encoder.fit_transform(df[['Protocol']])
# Reuse df's index for the encoded frame: pd.concat(axis=1) aligns on index
# labels, so a fresh RangeIndex would silently misalign rows (and introduce
# NaNs) if df ever stops carrying the default 0..n-1 index.
df_protocol = pd.DataFrame(
    protocol_encoded.toarray(),
    columns=encoder.get_feature_names_out(['Protocol']),
    index=df.index,
)
df = pd.concat([df.drop('Protocol', axis=1), df_protocol], axis=1)

# Split data into features and target
X = df.drop('label', axis=1)
y = df['label']

# Split into train/test sets (stratified due to class imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Train Random Forest (handles class imbalance)
model = RandomForestClassifier(
    class_weight='balanced',  # Adjust for imbalanced classes
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance (optional)
print("Feature Importances:")
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
print(feature_importances.sort_values(ascending=False))

Column names in the dataset:
['Source IP', 'Source Port', 'src', 'dst', 'Destination IP ', 'Destination Port ', 'Active Mean', 'Total Fwd Packets', 'tot_dur', 'flows', 'FIN Flag Count', 'pktperflow', 'byteperflow', 'pktrate', 'Pairflow', 'Protocol', 'port_no', 'tx_bytes', 'rx_bytes', 'tx_kbps', 'rx_kbps', 'tot_kbps', 'label']
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12712
           1       1.00      1.00      1.00      8157

    accuracy                           1.00     20869
   macro avg       1.00      1.00      1.00     20869
weighted avg       1.00      1.00      1.00     20869

Confusion Matrix:
[[12705     7]
 [   10  8147]]
Feature Importances:
byteperflow          0.288964
pktperflow           0.171412
pktrate              0.153174
FIN Flag Count       0.119160
tot_dur              0.085780
Active Mean          0.073299
Protocol_TCP         0.024460
Protocol_UDP         0.022774
Total Fwd Pa