In [None]:
import time

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from src.analytics_module.config import Config
from src.analytics_module.LLM_analytics import LLM_AnalyticsClass
from tqdm import tqdm

In [None]:
# Project configuration object; cells below read `data_file_path` and
# `random_seed` from it.
config = Config()

# Prepare data

In [None]:
# Load the dataset from the parquet file named in the config, then preview
# the first rows.
df = pd.read_parquet(config.data_file_path)
df.head()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Distribution of the "Class" column, one colour per class value.
fig = px.histogram(data_frame=df, x="Class", color="Class")
fig.show()

In [None]:
# Show the same figure again through the "png" renderer (static image).
fig.show(renderer="png")

In [None]:
# Separate the target column ("Class") from the remaining feature columns.
Y_http = df["Class"]
X_http = df.drop(columns=["Class"])

# 80% of the rows go to the training split and the remainder to the test
# split; the shuffle is seeded from the project config.
X_train_http, X_test_http, Y_train_http, Y_test_http = train_test_split(
    X_http,
    Y_http,
    train_size=0.8,
    random_state=config.random_seed,
)

In [None]:
# Report how many rows ended up in each split.
print("Train samples:", len(Y_train_http))
print("Test samples:", len(Y_test_http))

# Test LLMs

In [None]:
# Instantiate the project's LLM analyzer with its default arguments. Cells
# below call `analyzer.pred(...)` on values from the "HTTP" column.
analyzer = LLM_AnalyticsClass()

In [None]:
%%time

test_pred = analyzer.pred(X_train_http["HTTP"][76466])
test_pred

In [None]:
# Replace the shuffled index inherited from the split with a fresh 0..n-1
# index (the old index is dropped), so integer labels match row positions.
X_test_http = X_test_http.reset_index(drop=True)
Y_test_http = Y_test_http.reset_index(drop=True)

In [None]:
# Run the analyzer over every test request. For each call, keep:
#   Y_pred        - element [0] of the result (used below as the predicted label)
#   try_per_http  - element [1] of the result
#   pred_time     - seconds spent inside analyzer.pred()
Y_pred = []
try_per_http = []
pred_time = []
# Iterate over the column values directly rather than looking rows up by
# integer label, so this cell does not depend on the index having been reset
# to 0..n-1 by an earlier cell.
for http_request in tqdm(X_test_http["HTTP"]):
    # perf_counter() is monotonic and high-resolution; time.time() is a wall
    # clock that can jump when the system time is adjusted.
    started = time.perf_counter()
    prediction = analyzer.pred(http_request)
    # Stop the clock before the bookkeeping so only the pred() call is timed.
    pred_time.append(time.perf_counter() - started)
    Y_pred.append(prediction[0])
    try_per_http.append(prediction[1])

In [None]:
# Mean of the per-request values collected in `try_per_http`.
avg_tries = np.mean(try_per_http)
print(avg_tries)

# F1 score of the predictions against the held-out labels (sklearn defaults).
test_f1 = f1_score(Y_test_http, Y_pred)
print(test_f1)

In [None]:
# Heatmap of the confusion matrix on the test split.
# sklearn's confusion_matrix puts the true labels on rows and the predicted
# labels on columns, with classes in sorted order. px.imshow draws columns
# along x and rows along y, so x is "Predicted" and y is "Actual".
# NOTE(review): the tick order assumes the negative class sorts first (e.g.
# 0/1 labels, consistent with f1_score's default pos_label=1) - confirm
# against the dataset's "Class" encoding.
px.imshow(
    confusion_matrix(Y_test_http, Y_pred),
    text_auto=True,
    labels=dict(x="Predicted values", y="Actual values"),
    x=["Negative", "Positive"],
    y=["Negative", "Positive"],
)