In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/LLM4Sec/Week3/apache_logs_labelled.csv')

# Clean label
df['label'] = df['label'].str.lower().str.strip()
df['binary_label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)

print(df['binary_label'].value_counts())
print(df.shape)
df.head(10)


binary_label
0    29778
1     7915
Name: count, dtype: int64
(37693, 12)


Unnamed: 0,ip,datetime,gmt,request,status,size,referer,browser,country,detected,label,binary_label
0,114.125.221.132,01-Jul 2019 10:54:15,+0700],GET /bkd_baru/assets/images/scan_sertifikat/D0...,200.0,12133,http://universitas.com/bkd_baru/assets/images/...,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,BAHAYA,anomalous,1
1,114.125.221.132,01-Jul 2019 10:54:23,+0700],GET /bkd_baru/assets/images/scan_sertifikat/D0...,200.0,15491,http://universitas.com/bkd_baru/assets/images/...,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,BAHAYA,anomalous,1
2,114.125.221.132,01-Jul 2019 10:54:42,+0700],POST /bkd_baru/assets/images/scan_sertifikat/D...,200.0,16305,http://universitas.com/bkd_baru/assets/images/...,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,BAHAYA,anomalous,1
3,114.125.221.132,01-Jul 2019 10:55:08,+0700],GET /bkd_baru/uwubahon.php HTTP/1.1,404.0,1130,-,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,BAHAYA,anomalous,1
4,114.125.221.132,01-Jul 2019 10:55:28,+0700],GET /bkd_baru/awubahon.php HTTP/1.1,200.0,1735,-,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,BAHAYA,anomalous,1
5,114.125.221.132,01-Jul 2019 10:55:38,+0700],POST /bkd_baru/awubahon.php HTTP/1.1,200.0,12421,http://universitas.com/bkd_baru/awubahon.php,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,BAHAYA,anomalous,1
6,114.125.207.5,01-Jul 2019 10:55:57,+0700],GET / HTTP/1.1,302.0,-,-,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,AMAN,normal,0
7,114.125.207.5,01-Jul 2019 10:55:57,+0700],GET /bkd_baru HTTP/1.1,301.0,360,-,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,AMAN,normal,0
8,114.125.207.5,01-Jul 2019 10:55:57,+0700],GET /bkd_baru/ HTTP/1.1,200.0,2416,-,Mozilla/5.0 (Linux; Android 5.1.1; SM-J111F Bu...,Indonesia,AMAN,normal,0
9,114.124.140.168,17-Jul 2019 21:11:54,+0700],GET /bkd_baru/bidang_pendidikan/edit/ODM3MA HT...,200.0,31568,http://universitas.com/bkd_baru/bidang_pendidikan,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6...,Indonesia,AMAN,normal,0


In [3]:
import re

def clean_text(s):
    if pd.isna(s): return ""
    s = str(s).lower()
    return re.sub(r"\s+"," ",s).strip()

def mask_log(s):
    if pd.isna(s): return ""
    s = str(s)

    s = re.sub(r"\b\d{1,3}(\.\d{1,3}){3}\b", "IP", s)     # mask IP
    s = re.sub(r"https?://\S+|www\.\S+", "URL", s)        # URLs
    s = re.sub(r"/[\w\-.\/]+", "PATH", s)                 # file paths
    s = re.sub(r"\?.*", "PARAM", s)                       # query params
    s = re.sub(r"\S+@\S+\.\S+", "EMAIL", s)               # emails
    s = re.sub(r"\b\d+\b", "NUM", s)                      # numbers
    return s.lower().strip()


In [4]:
def build_event(row):
    parts = []
    for col in ["request", "referer", "browser"]:
        if col in row and pd.notna(row[col]):
            txt = clean_text(row[col])
            txt = mask_log(txt)
            parts.append(txt)
    return " | ".join(parts)

df["event_text"] = df.apply(build_event, axis=1)
df["event_text"].head()


Unnamed: 0,event_text
0,get pathparam | url | mozillapath (linux; andr...
1,get pathparam | url | mozillapath (linux; andr...
2,post pathparam | url | mozillapath (linux; and...
3,get path httppath | - | mozillapath (linux; an...
4,get path httppath | - | mozillapath (linux; an...


Compute “Signal Score” for each log

In [5]:
def signal_score(text):
    score = 0
    if not isinstance(text, str):
        return 0

    length = len(text.split())
    score += min(length, 50)  # cap

    score += text.count("path") * 2
    score += text.count("url") * 2
    score += text.count("param") * 2
    score += text.count("ip") * 1
    score += text.count("email") * 2

    return score

df["signal_score"] = df["event_text"].apply(signal_score)
df[["event_text","signal_score"]].head()


Unnamed: 0,event_text,signal_score
0,get pathparam | url | mozillapath (linux; andr...,35
1,get pathparam | url | mozillapath (linux; andr...,35
2,post pathparam | url | mozillapath (linux; and...,35
3,get path httppath | - | mozillapath (linux; an...,34
4,get path httppath | - | mozillapath (linux; an...,34


In [6]:
# Balanced sampling that cannot fail or skew labels
normal_df = df[df["binary_label"] == 0]
anom_df   = df[df["binary_label"] == 1]

print("Original counts:", len(normal_df), "normals,", len(anom_df), "anomalies")

limit = min(len(normal_df), len(anom_df), 1000)

df_2k = pd.concat([
    normal_df.sample(limit, random_state=42),
    anom_df.sample(limit, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)

print("Balanced dataset:", df_2k['binary_label'].value_counts())

save_path = "/content/drive/MyDrive/LLM4Sec/Week3/apache_logs_2k_best.csv"
df_2k.to_csv(save_path, index=False)
save_path


Original counts: 29778 normals, 7915 anomalies
Balanced dataset: binary_label
1    1000
0    1000
Name: count, dtype: int64


'/content/drive/MyDrive/LLM4Sec/Week3/apache_logs_2k_best.csv'

In [7]:
save_path = "/content/drive/MyDrive/LLM4Sec/Week3/apache_logs_2k_best.csv"
df_2k.to_csv(save_path, index=False)
save_path


'/content/drive/MyDrive/LLM4Sec/Week3/apache_logs_2k_best.csv'

In [8]:
# Run this cell first (may take 1-3 minutes)
!pip install -q sentence-transformers hmmlearn joblib


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/166.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import os
import re
import math
import pickle
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from hmmlearn.hmm import GaussianHMM
from joblib import dump, load

# Config
CSV_PATH = "/content/drive/MyDrive/LLM4Sec/Week3/apache_logs_2k_best.csv"
SAVE_DIR = "/content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts"
os.makedirs(SAVE_DIR, exist_ok=True)

SEQ_LEN = 10                # events per window
SBERT_MODEL = "all-MiniLM-L6-v2"   # fast & good for Colab
N_NMF = 12
HMM_STATES_RANGE = range(4, 9)  # 4,5,6,7,8
BATCH_SIZE = 128            # SBERT batch size for encode


Load dataset and ensure event_text present

In [10]:
df = pd.read_csv(CSV_PATH)
print("Loaded:", df.shape)
display(df.head())

# If you don't have 'event_text' column, create from request/referer/browser/detected
def clean_text(s):
    if pd.isna(s): return ""
    s = str(s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def mask_log(s):
    if pd.isna(s): return ""
    s = str(s)
    # IPs
    s = re.sub(r"\b\d{1,3}(?:\.\d{1,3}){3}\b", "IP", s)
    # URLs
    s = re.sub(r"https?://\S+|www\.\S+", "URL", s)
    # Paths
    s = re.sub(r"/[\w\-\./]+", "PATH", s)
    # Query params
    s = re.sub(r"\?.*", "PARAM", s)
    # Emails
    s = re.sub(r"\S+@\S+\.\S+", "EMAIL", s)
    # numbers
    s = re.sub(r"\b\d+\b", "NUM", s)
    return s.lower().strip()

if 'event_text' not in df.columns:
    def build_event(row):
        parts = []
        for col in ['request','referer','browser']:
            if col in row and pd.notna(row[col]):
                t = clean_text(row[col])
                t = mask_log(t)
                parts.append(t)
        return " | ".join(parts)
    df['event_text'] = df.apply(build_event, axis=1)
else:
    # ensure masked & cleaned
    df['event_text'] = df['event_text'].astype(str).apply(lambda x: mask_log(clean_text(x)))

print("Event text example:")
print(df['event_text'].iloc[0])


Loaded: (2000, 14)


Unnamed: 0,ip,datetime,gmt,request,status,size,referer,browser,country,detected,label,binary_label,event_text,signal_score
0,66.249.66.71,26-Jul 2019 14:53:56,+0700],GET /bkd/index.php?open=Login HTTP/1.1,302.0,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,,,United States,DICURIGAI,anomalous,1,get pathparam,6
1,180.178.99.174,25-Jul 2019 7:42:21,+0700],GET /bkd_baru/assets/images/scan_penugasan/D02...,206.0,2886999,http://universitas.com/bkd_baru//kinerja_dosen...,Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:67...,Indonesia,AMAN,normal,0,get path httppath | url | mozillapath (windows...,27
2,36.71.237.67,18-Jul 2019 7:55:59,+0700],GET /bkd_baru/dosen/input_dosen/RDAxMjM HTTP/1.1,200.0,38041,http://universitas.com/bkd_baru/dosen/input_do...,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,Indonesia,BAHAYA,anomalous,1,get path httppath | url | mozillapath (windows...,31
3,114.122.79.82,24-Jul 2019 16:49:37,+0700],POST /bkd_baru/login/getlogin HTTP/1.1,303.0,-,http://universitas.com/bkd_baru/login,Mozilla/5.0 (Linux; Android 6.0; Lenovo A6600d...,Indonesia,AMAN,normal,0,post path httppath | url | mozillapath (linux;...,27
4,8.37.233.207,23-Jul 2019 14:32:45,+0700],GET /bkd_baru/assets/js/ace/ace.sidebar-scroll...,304.0,-,http://universitas.com/bkd_baru/bidang_penunjang,Mozilla/5.0 (Linux; U; Android 9; en-US; Redmi...,United States,DICURIGAI,anomalous,1,get path httppath | url | mozillapath (linux; ...,30


Event text example:
get pathparam


Create non-overlapping windows (SEQ_LEN) and mapping

In [12]:
"""
SLIDING WINDOW FIX — ONLY THIS BLOCK SHOULD EXIST FOR WINDOWING
"""

# 1. Shuffle BEFORE windowing
df_shuf = df.sample(frac=1, random_state=42).reset_index(drop=True)

events = df_shuf['event_text'].tolist()
labels = df_shuf['binary_label'].tolist()

SEQ_LEN = 10
windows = []
window_labels = []

# 2. Sliding windows (stride = 1)
for i in range(0, len(events) - SEQ_LEN + 1):
    windows.append(" ||| ".join(events[i:i+SEQ_LEN]))
    window_labels.append(max(labels[i:i+SEQ_LEN]))

print("Sliding windows created:", len(windows))
print("Label distribution:\n", pd.Series(window_labels).value_counts())

# 3. Build windows_df
windows_df = pd.DataFrame({
    "window_text": windows,
    "start_idx": list(range(len(windows))),
    "end_idx": list(range(SEQ_LEN, len(windows) + SEQ_LEN)),
    "binary_label": window_labels
})

windows_df.to_csv(os.path.join(SAVE_DIR, "windows_meta.csv"), index=False)
print("Saved windows_meta.csv (sliding windows)")


Sliding windows created: 1991
Label distribution:
 1    1991
Name: count, dtype: int64
Saved windows_meta.csv (sliding windows)


Load SBERT embedder (model caching happens automatically)

In [25]:
print("Loading SBERT model:", SBERT_MODEL)
embedder = SentenceTransformer(SBERT_MODEL)
print("SBERT loaded.")


Loading SBERT model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SBERT loaded.


Embed window-level texts (batched)

In [50]:
# embed windows (each window is concatenated SEQ_LEN events)
print("Embedding windows:", len(windows))
window_emb = embedder.encode(windows, batch_size=BATCH_SIZE, show_progress_bar=True)
window_emb = np.array(window_emb)
print("window_emb.shape:", window_emb.shape)

# Save window embeddings
np.save(os.path.join(SAVE_DIR, 'window_emb.npy'), window_emb)
print("Saved window_emb.npy")


Embedding windows: 200


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

window_emb.shape: (200, 384)
Saved window_emb.npy


Scale Embeddings for NMF

In [51]:
scaler = MinMaxScaler()
window_emb_scaled = scaler.fit_transform(window_emb)
# Clip to [0,1] in case small numerical drift
window_emb_scaled = np.clip(window_emb_scaled, 0.0, 1.0)
dump(scaler, os.path.join(SAVE_DIR, 'scaler_window_emb.joblib'))
print("Scaled window embeddings and saved scaler.")


Scaled window embeddings and saved scaler.


NMF decomposition (topic/concept extraction)

In [52]:
print("Running NMF with n_components =", N_NMF)
nmf = NMF(n_components=N_NMF, init='nndsvda', random_state=42, max_iter=500)
W = nmf.fit_transform(window_emb_scaled)  # (n_windows, N_NMF)
H = nmf.components_                       # (N_NMF, emb_dim)
print("W.shape:", W.shape, "H.shape:", H.shape)

# Save NMF model + matrices
with open(os.path.join(SAVE_DIR, 'nmf_model.pkl'), 'wb') as f:
    pickle.dump(nmf, f)
np.save(os.path.join(SAVE_DIR, 'W_nmf.npy'), W)
np.save(os.path.join(SAVE_DIR, 'H_nmf.npy'), H)
print("Saved NMF artifacts.")


Running NMF with n_components = 12
W.shape: (200, 12) H.shape: (12, 384)
Saved NMF artifacts.


Standardize W and pick best HMM by log-likelihood

In [53]:
W_std = StandardScaler().fit_transform(W)
dump(StandardScaler(), os.path.join(SAVE_DIR, 'W_standardizer_placeholder.joblib'))  # optional

best_score = -np.inf
best_hmm = None
results = []

for n_states in HMM_STATES_RANGE:
    try:
        hmm = GaussianHMM(n_components=n_states, covariance_type='tied', n_iter=300, random_state=42)
        hmm.fit(W_std)
        score = hmm.score(W_std)
        results.append((n_states, score))
        print(f"HMM states={n_states}, score={score:.2f}")
        if score > best_score:
            best_score = score
            best_hmm = hmm
    except Exception as e:
        print("HMM fitting error for states", n_states, ":", e)

print("HMM results:", results)
if best_hmm is None:
    raise RuntimeError("No HMM model could be fit. Check data / parameters.")
print("Selected HMM states:", best_hmm.n_components, "with score:", best_score)
# Save best HMM
with open(os.path.join(SAVE_DIR, 'best_hmm.pkl'), 'wb') as f:
    pickle.dump(best_hmm, f)
print("Saved best HMM model.")


HMM states=4, score=-2169.13
HMM states=5, score=-2225.13
HMM states=6, score=-2062.74
HMM states=7, score=-1994.74
HMM states=8, score=-2023.14
HMM results: [(4, -2169.132154552464), (5, -2225.1313729340795), (6, -2062.737090312539), (7, -1994.7401818155201), (8, -2023.13650609159)]
Selected HMM states: 7 with score: -1994.7401818155201
Saved best HMM model.


Predict HMM clusters and transition matrix features

In [54]:
clusters = best_hmm.predict(W_std)
windows_df['hmm_cluster'] = clusters
# Optionally store per-window posterior probabilities (gamma)
posteriors = best_hmm.predict_proba(W_std)  # shape (n_windows, n_states)
np.save(os.path.join(SAVE_DIR, 'hmm_posteriors.npy'), posteriors)
np.save(os.path.join(SAVE_DIR, 'hmm_clusters.npy'), clusters)
# Save transition matrix
if hasattr(best_hmm, 'transmat_'):
    np.save(os.path.join(SAVE_DIR, 'hmm_transmat.npy'), best_hmm.transmat_)
print("Saved HMM cluster artifacts.")
windows_df.to_csv(os.path.join(SAVE_DIR, 'windows_meta_with_hmm.csv'), index=False)
windows_df.head()


Saved HMM cluster artifacts.


Unnamed: 0,window_text,start_idx,end_idx,binary_label,hmm_cluster
0,get path httppath | url | mozillapath (windows...,0,10,1,1
1,get path httppath | url | mozillapath (macinto...,10,20,1,4
2,get path httppath | url | mozillapath (windows...,20,30,1,1
3,get path httppath | url | mozillapath (windows...,30,40,1,0
4,get path httppath | url | mozillapath (windows...,40,50,1,1


Per-event SBERT embeddings (for GRU input)

In [55]:
# We will embed per-event and then reshape into (n_windows, SEQ_LEN, emb_dim)
# We used 'events' earlier (trimmed to n_full). events is a list of strings length n_full
print("Total per-event count:", len(events))
# Encode in batches to save memory
def batched_encode(texts, model, batch_size=128):
    out = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        emb = model.encode(batch, show_progress_bar=False)
        out.append(emb)
    return np.vstack(out)

event_emb = batched_encode(df_shuf['event_text'].tolist(), embedder, batch_size=BATCH_SIZE)
print("event_emb.shape:", event_emb.shape)

# Reshape
n_windows = len(windows)
emb_dim = event_emb.shape[1]
X_seq = event_emb.reshape(n_windows, SEQ_LEN, emb_dim)
print("X_seq shape (n_windows, SEQ_LEN, emb_dim):", X_seq.shape)

# Save sequence array (for GRU)
np.save(os.path.join(SAVE_DIR, 'X_seq_events.npy'), X_seq)
print("Saved X_seq_events.npy")


Total per-event count: 2000
event_emb.shape: (2000, 384)
X_seq shape (n_windows, SEQ_LEN, emb_dim): (200, 10, 384)
Saved X_seq_events.npy


In [56]:
final = {
    'X_seq_path': os.path.join(SAVE_DIR, 'X_seq_events.npy'),
    'W_nmf_path': os.path.join(SAVE_DIR, 'W_nmf.npy'),
    'window_emb_path': os.path.join(SAVE_DIR, 'window_emb.npy'),
    'hmm_clusters_path': os.path.join(SAVE_DIR, 'hmm_clusters.npy'),
    'windows_meta_csv': os.path.join(SAVE_DIR, 'windows_meta_with_hmm.csv'),
    'nmf_model': os.path.join(SAVE_DIR, 'nmf_model.pkl'),
    'hmm_model': os.path.join(SAVE_DIR, 'best_hmm.pkl'),
    'sbert_model_name': SBERT_MODEL
}
with open(os.path.join(SAVE_DIR, 'final_feature_manifest.pkl'), 'wb') as f:
    pickle.dump(final, f)

print("Saved final manifest. Artifacts available in:", SAVE_DIR)
print("Example shapes:")
print("X_seq:", X_seq.shape)
print("W_nmf:", W.shape)
print("hmm clusters unique:", np.unique(clusters))


Saved final manifest. Artifacts available in: /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts
Example shapes:
X_seq: (200, 10, 384)
W_nmf: (200, 12)
hmm clusters unique: [0 1 2 3 4 5 6]
