In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler


def preprocess_data_from_csv(file_path):
    """Load hex-encoded rows from a CSV file and scale them to the [0, 1] range.

    Only the first CSV column is used. Each cell is expected to be a string of
    whitespace-separated hexadecimal tokens, optionally containing the literal
    markers ``<head>``, ``<pkt>`` and ``</s>``, which are stripped out. For
    every row with at least 20 values, the values at positions 12..19 are
    removed (the caller's log message describes these as the src/dst IPs --
    presumably the IPv4 address bytes; confirm against the data format).
    Rows are then right-padded with zeros to a common width and min-max
    scaled column by column.

    Args:
        file_path: Path of the CSV file to read (no header row).

    Returns:
        A ``(processed_data, scaler)`` tuple: the scaled 2-D array with one
        row per CSV row, and the fitted ``MinMaxScaler`` that produced it.

    Raises:
        ValueError: If a token is not valid hexadecimal.
    """
    # Read the path supplied by the caller. A hard-coded file name used to be
    # read here, which silently ignored ``file_path``.
    data = pd.read_csv(file_path, header=None, on_bad_lines='warn')

    markers = ('<head>', '<pkt>', '</s>')
    processed_data = []
    max_length = 0

    for row in data[0]:
        for marker in markers:
            row = row.replace(marker, '')
        # str.split() with no argument already discards surrounding whitespace.
        int_values = [int(token, 16) for token in row.split()]

        # NOTE(review): the padding width is taken from the row length
        # *before* positions 12..19 are dropped, matching the previous
        # behaviour. This can leave up to 8 constant-zero trailing columns.
        # It is kept as-is because the model input dimension and any saved
        # model/scaler depend on this width -- confirm before tightening.
        max_length = max(max_length, len(int_values))

        if len(int_values) >= 20:
            del int_values[12:20]
        processed_data.append(int_values)

    # Right-pad every row with zeros so the rows form a rectangular array.
    processed_data = np.array(
        [values + [0] * (max_length - len(values)) for values in processed_data]
    )

    scaler = MinMaxScaler()
    processed_data = scaler.fit_transform(processed_data)

    return processed_data, scaler

def build_autoencoder(input_dim):
    """Build and compile a fully connected autoencoder.

    The network maps ``input_dim`` features through ReLU layers of width
    64 -> 32 -> 16 -> 32 -> 64 and reconstructs ``input_dim`` outputs with a
    sigmoid layer. It is compiled with the Adam optimizer and MSE loss.

    Args:
        input_dim: Number of features per input sample.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=(input_dim,))

    # Encoder (64, 32), bottleneck (16), then the mirrored decoder (32, 64).
    hidden = inputs
    for units in (64, 32, 16, 32, 64):
        hidden = Dense(units, activation='relu')(hidden)

    # Sigmoid keeps the reconstruction inside [0, 1].
    reconstruction = Dense(input_dim, activation='sigmoid')(hidden)

    model = Model(inputs=inputs, outputs=reconstruction)
    model.compile(optimizer='adam', loss='mse')
    return model

def train_autoencoder(autoencoder, data, *, epochs=10, batch_size=64,
                      validation_split=0.2, patience=10):
    """Fit the autoencoder to reconstruct ``data``, with early stopping.

    The same array is used as both input and target. A fraction of the data
    is held out for validation, and ``val_loss`` is monitored by an
    ``EarlyStopping`` callback with ``restore_best_weights=True``.

    Args:
        autoencoder: Compiled Keras model to train (modified in place).
        data: 2-D array of training samples.
        epochs: Maximum number of training epochs.
        batch_size: Number of samples per gradient update.
        validation_split: Fraction of ``data`` held out for validation.
        patience: Number of epochs without ``val_loss`` improvement after
            which training stops.

    Returns:
        The ``History`` object returned by ``autoencoder.fit``.
    """
    # NOTE: with the defaults (patience == epochs == 10) early stopping cannot
    # actually trigger, because at most ``epochs - 1`` non-improving epochs
    # can follow the first one. Pass a smaller ``patience`` (or more
    # ``epochs``) to make it effective. The defaults are kept so existing
    # callers behave exactly as before.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        mode='min',
        restore_best_weights=True,
    )
    history = autoencoder.fit(
        data, data,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_split=validation_split,
        callbacks=[early_stopping],
    )
    return history

def save_autoencoder(autoencoder, filename='newest_model.keras'):
    """Write the model to ``filename`` by calling the model's ``save`` method.

    Args:
        autoencoder: Model object exposing ``save(path)``.
        filename: Destination path for the saved model.
    """
    destination = filename
    autoencoder.save(destination)

def load_autoencoder(filename='newest_model.keras'):
    """Load a saved Keras model from ``filename`` and return it.

    Args:
        filename: Path of the saved model file.

    Returns:
        The object returned by ``tf.keras.models.load_model``.
    """
    restored_model = tf.keras.models.load_model(filename)
    return restored_model

# Script entry point: preprocess the CSV, build the autoencoder sized to the
# data, train it, and save the trained model to disk.
if __name__ == "__main__":
    csv_file_path = 'smaller_update.csv'

    # Returns the scaled feature matrix and the fitted MinMaxScaler.
    # NOTE(review): the scaler is not saved anywhere in this block; code that
    # later scores new data will presumably need the same scaling -- confirm.
    processed_data, scaler = preprocess_data_from_csv(csv_file_path)

    # The number of feature columns determines the model's input/output width.
    input_dim = processed_data.shape[1]  
    print("Input dimension after removing src/dst IPs:", input_dim)
    
    autoencoder = build_autoencoder(input_dim)

    # Trains the model in place; `history` holds the result of `fit`.
    history = train_autoencoder(autoencoder, processed_data)

    save_autoencoder(autoencoder, filename='newest_model.keras')
    print("Finished Training! and Saved")

    
    


Input dimension after removing src/dst IPs: 104
Epoch 1/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 611us/step - loss: 0.0191 - val_loss: 0.0168
Epoch 2/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 575us/step - loss: 0.0172 - val_loss: 0.0166
Epoch 3/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 560us/step - loss: 0.0171 - val_loss: 0.0166
Epoch 4/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 561us/step - loss: 0.0170 - val_loss: 0.0166
Epoch 5/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 561us/step - loss: 0.0169 - val_loss: 0.0165
Epoch 6/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 564us/step - loss: 0.0168 - val_loss: 0.0163
Epoch 7/10
[1m250000/250000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 565us/step - loss: 0.0167 - val_loss: 0.0163
Epoch 8/10
[1m250000/250000[0m [32m━━━━