In [8]:
from kafka import KafkaConsumer
from tensorflow import keras
from joblib import load
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import FeatureHasher
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif as mi
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import make_scorer, f1_score
import river
from river import compose
from river import evaluate
from river import metrics
from river import preprocessing
from river import stream

In [2]:
consumer = KafkaConsumer(
    'ml-raw-dns',
    bootstrap_servers="kafka:9092",
    auto_offset_reset='earliest',
    enable_auto_commit=False
)

In [50]:
# Load your initial Keras model
static_model = load('best_model.joblib')

In [51]:
# Clone the static model
dynamic_model = static_model

In [52]:
# Set the desired window size
window_size = 1000

In [53]:
# Lists to store metrics for plotting
timestamps = []
static_model_metrics = []
dynamic_model_metrics = []

In [54]:
# Metric to evaluate the models
metric = metrics.Accuracy()

In [55]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
import numpy as np

In [61]:
# Word embedding
sld_vectorizer_path = 'sld_count_vectorizer.joblib'
sld_svd_path = 'sld_svd_model.joblib'
lw_vectorizer_path = 'lw_count_vectorizer.joblib'
lw_svd_path = 'lw_svd_model.joblib'

In [62]:
sld_vectorizer = load(sld_vectorizer_path)
sld_svd = load(sld_svd_path)
lw_vectorizer = load(lw_vectorizer_path)
lw_svd = load(lw_svd_path)

In [63]:
# Define a function to transform new text data using the loaded models
def transform_sld(new_text_series):
    # Transform the new text data using the loaded CountVectorizer
    count_matrix = sld_vectorizer.transform(new_text_series)
    
    # Reduce the dimensionality using the loaded TruncatedSVD
    embedded_data = sld_svd.transform(count_matrix)
    
    return embedded_data

In [64]:
# Define a function to transform new text data using the loaded models
def transform_lw(new_text_series):
    # Transform the new text data using the loaded CountVectorizer
    count_matrix = lw_vectorizer.transform(new_text_series)
    
    # Reduce the dimensionality using the loaded TruncatedSVD
    embedded_data = lw_svd.transform(count_matrix)
    
    return embedded_data

In [65]:
def convert_to_seconds(timestamp):
    # Strip any quotation marks and other non-numeric characters before splitting
    clean_timestamp = timestamp.strip('"')
    minutes, seconds = clean_timestamp.split(':')
    return int(minutes) * 60 + float(seconds)

In [72]:
def preprocess_data(data):
    
    # Decode byte string and split by comma
    decoded_data = data.decode('utf-8').strip().split(',')
    
    # Remove additional characters from the 'Target Attack' field
    target_attack = decoded_data[15].strip().strip('\\n"')
    
    # Apply the convert_to_seconds function to the 'timestamp' field
    timestamp_seconds = convert_to_seconds(decoded_data[0].strip('"'))
    
    # Convert to appropriate data types
    data_dict = {
        'timestamp': int(timestamp_seconds),
        'FQDN_count': int(decoded_data[1]),
        'subdomain_length': int(decoded_data[2]),
        'upper': int(decoded_data[3]),
        'lower': int(decoded_data[4]),
        'numeric': int(decoded_data[5]),
        'entropy': float(decoded_data[6]),
        'special': int(decoded_data[7]),
        'labels': int(decoded_data[8]),
        'labels_max': int(decoded_data[9]),
        'labels_average': float(decoded_data[10]),
        'longest_word': decoded_data[11],
        'sld': decoded_data[12],
        'len': int(decoded_data[13]),
        'subdomain': int(decoded_data[14]),
        'Target Attack': int(target_attack)
    }
    
    df = pd.DataFrame([data_dict])
    
    embedd_sld = transform_sld(df['sld'])
    embedd_lw = transform_lw(df['longest_word'])
    
    sld_embeddings_df = pd.DataFrame(embedd_sld, columns=[f'sld_emb_{i}' for i in range(embedd_sld.shape[1])])
    longest_word_embeddings_df = pd.DataFrame(embedd_lw, columns=[f'lw_emb_{i}' for i in range(embedd_lw.shape[1])])
    
    df.reset_index(drop=True, inplace=True)
    sld_embeddings_df.reset_index(drop=True, inplace=True)
    longest_word_embeddings_df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, sld_embeddings_df, longest_word_embeddings_df], axis=1)
    


    # Select features based on mutual information
    selected_features = ['FQDN_count', 'subdomain_length', 'lower', 'numeric', 'entropy',
       'special', 'labels', 'labels_max', 'labels_average', 'sld_emb_0',
       'sld_emb_1', 'sld_emb_2', 'sld_emb_3', 'sld_emb_4', 'lw_emb_0',
       'lw_emb_1', 'lw_emb_2', 'lw_emb_3', 'lw_emb_4']

    return df[selected_features], df['Target Attack']

In [76]:
# Process stream in windows of 1,000 observations
for i, message in enumerate(consumer):
    # Deserialize the message
    data = message.value
    print(data)
    
    print('\n')
    
    # Process your data (implement the preprocess_data function)
    X, y = preprocess_data(data)
    
    #print(X)
    #print('\n')
    #print(y)
    
    # Make predictions and update models
    y_pred_static = static_model.predict(X)
    y_pred_dynamic = dynamic_model.predict(X)
    
    y_true_scalar = y.iloc[0]  # or y.item() if y is a Series with a single value
    y_pred_static_scalar = y_pred_static[0]  # or y_pred_static.item()
    y_pred_dynamic_scalar = y_pred_dynamic[0]  # or y_pred_dynamic.item()

    # Update metrics
    metric.update(y_true_scalar, y_pred_static_scalar)
    metric.update(y_true_scalar, y_pred_dynamic_scalar)
    
    # Evaluate models after each window
    if (i + 1) % window_size == 0:
        # Record the timestamp or window index
        timestamps.append(i // window_size)
        
        # Evaluate and record the static model's performance
        static_model_metric = metric.get()
        static_model_metrics.append(static_model_metric)
        
        # Evaluate and record the dynamic model's performance
        dynamic_model_metric = metric.get()
        dynamic_model_metrics.append(dynamic_model_metric)
        
        # Decide whether to retrain based on some criterion
        if should_retrain(dynamic_model_metric, static_model_metric):
            # Retrain your dynamic model here
            pass
        
        # Reset the metric after each window
        metric = metrics.Accuracy()

b'"37:30.9,27,10,0,10,11,2.57041707,6,6,7,3.666666667,2,192,14,1,1\\n"'


b'"44:20.4,6,0,0,5,0,1.5,1,2,3,2.5,spd,spd,4,0,0\\n"'


b'"20:12.3,14,0,0,13,0,2.781301471,1,2,10,6.5,triple,triplebyte,11,0,0\\n"'


b'"00:04.2,24,7,0,10,8,2.054028744,6,6,7,3.166666667,4,224,11,1,1\\n"'


b'"13:55.5,27,10,0,10,11,2.57041707,6,6,7,3.666666667,2,192,14,1,1\\n"'


b'"22:14.3,10,0,0,9,0,2.154135417,1,2,5,4.5,pan,pante,6,0,0\\n"'


b'"01:51.0,16,0,0,15,0,2.954252048,1,2,12,7.5,apex,pornstarapex,13,0,0\\n"'


b'"35:16.5,13,3,0,11,0,2.314246535,2,3,6,3.666666667,leon,meleon,10,1,0\\n"'


b'"22:04.7,27,10,0,10,11,2.57041707,6,6,7,3.666666667,2,192,14,1,1\\n"'


b'"06:45.5,32,0,32,0,0,2.674525829,0,1,32,32,L,FHEPFCELEHFCEPFFFACACACACACACABL,33,0,1\\n"'


b'"07:44.7,20,3,0,18,0,3.077819531,2,3,12,6,mafia,mafiaenshevn,16,1,0\\n"'


b'"20:51.7,27,10,0,10,11,2.767194749,6,6,7,3.666666667,2,192,14,1,0\\n"'


b'"02:10.9,26,9,0,10,10,2.742337624,6,6,7,3.5,2,192,13,1,1\\n"'


b'"13:17.9,26,9,0,10,10,2.742337624

NameError: name 'should_retrain' is not defined