In [1]:
# Colab-ready GRU training notebook (script style, run cell-by-cell in Colab)
# - Fixed attention block (no slicing errors)
# - Loads feature engineering artifacts from Drive
# - Verifies shapes and stops if misaligned
# - Builds BiGRU+Attention model, trains with callbacks and LR scheduler
# - Saves model and artifacts to GRUmodel directory
#
# NOTE: original uploaded GRU script (for reference) is at:
# /mnt/data/gru_webserver (3).py
# The path above is included for traceability only.

# %%
# CELL 1: Mount Google Drive and install dependencies
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:


# Install packages if not present
!pip install -q sentence-transformers hmmlearn joblib

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/166.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# %%
# CELL 2: Imports & configuration
import os, pickle, time
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [4]:
# Paths - update if needed
FEAT_DIR = "/content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts"
GRU_SAVE_DIR = os.path.join(FEAT_DIR, "GRUmodel")
os.makedirs(GRU_SAVE_DIR, exist_ok=True)
MANIFEST_PATH = os.path.join(FEAT_DIR, "final_feature_manifest.pkl")

print("Feature artifacts dir:", FEAT_DIR)
print("GRU save dir:", GRU_SAVE_DIR)

Feature artifacts dir: /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts
GRU save dir: /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts/GRUmodel


In [5]:

# %%
# CELL 3: Load manifest and artifacts (with sanity checks)
assert os.path.exists(MANIFEST_PATH), f"Manifest not found: {MANIFEST_PATH}"
with open(MANIFEST_PATH, 'rb') as f:
    manifest = pickle.load(f)
print("Loaded manifest keys:", manifest.keys())

# Helper to load npy with error message
def load_npy(p):
    assert os.path.exists(p), f"Missing file: {p}"
    return np.load(p, allow_pickle=True)

# Load arrays
X_seq = load_npy(manifest['X_seq_path'])           # (n_windows, SEQ_LEN, emb_dim)
W_nmf = load_npy(manifest['W_nmf_path'])           # (n_windows, n_nmf)
window_emb = load_npy(manifest['window_emb_path']) # (n_windows, emb_dim)

# HMM clusters might be saved as npy
hmm_clusters = load_npy(manifest['hmm_clusters_path']) if 'hmm_clusters_path' in manifest else None
windows_meta = pd.read_csv(manifest['windows_meta_csv'])

print('X_seq', X_seq.shape)
print('W_nmf', W_nmf.shape)
print('window_emb', window_emb.shape)
print('hmm_clusters', None if hmm_clusters is None else hmm_clusters.shape)
print('windows_meta', windows_meta.shape)


Loaded manifest keys: dict_keys(['X_seq_path', 'W_nmf_path', 'window_emb_path', 'hmm_clusters_path', 'windows_meta_csv', 'nmf_model', 'hmm_model', 'sbert_model_name'])
X_seq (1991, 10, 384)
W_nmf (1991, 12)
window_emb (1991, 384)
hmm_clusters (1991,)
windows_meta (1991, 5)


In [8]:

# CELL 4: Build auxiliary feature matrix (scale + one-hot HMM)
# Standardize NMF components
scaler_aux = StandardScaler()
W_nmf_std = scaler_aux.fit_transform(W_nmf)

# HMM clusters -> one-hot
if hmm_clusters is not None:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # Changed sparse=False to sparse_output=False
    clusters_reshaped = hmm_clusters.reshape(-1,1)
    hmm_ohe = ohe.fit_transform(clusters_reshaped)
    aux = np.hstack([W_nmf_std, hmm_ohe])
else:
    ohe = None
    aux = W_nmf_std

print('aux shape:', aux.shape)

# Save scalers/encoders for inference
with open(os.path.join(GRU_SAVE_DIR, 'scaler_aux.pkl'), 'wb') as f:
    pickle.dump(scaler_aux, f)
if ohe is not None:
    with open(os.path.join(GRU_SAVE_DIR, 'hmm_ohe.pkl'), 'wb') as f:
        pickle.dump(ohe, f)


aux shape: (1991, 20)


In [9]:



# %%
# CELL 5: Build labels (from windows_meta) and final checks
y = windows_meta['binary_label'].astype(int).values
n = X_seq.shape[0]
assert aux.shape[0] == n, f"Mismatch aux rows {aux.shape[0]} vs X_seq {n}"
assert len(y) == n, f"Mismatch y {len(y)} vs X_seq {n}"

print('Final shapes:')
print('X_seq', X_seq.shape)
print('aux', aux.shape)
print('y', y.shape)

# Train/test split
X_train, X_test, aux_train, aux_test, y_train, y_test = train_test_split(
    X_seq, aux, y, test_size=0.25, stratify=y, random_state=42
)
print('Train shapes:', X_train.shape, aux_train.shape, y_train.shape)
print('Test shapes :', X_test.shape, aux_test.shape, y_test.shape)


Final shapes:
X_seq (1991, 10, 384)
aux (1991, 20)
y (1991,)
Train shapes: (1493, 10, 384) (1493, 20) (1493,)
Test shapes : (498, 10, 384) (498, 20) (498,)


In [13]:

# CELL 6: Build model (fixed attention, no slicing)
from tensorflow.keras import Input, Model

def build_model(seq_len, emb_dim, aux_dim,
                rnn_units_1=128, rnn_units_2=64,
                dropout_rate=0.3, dense_units=64, lr=1e-3):

    seq_input = Input(shape=(seq_len, emb_dim), name='seq_input')
    # stacked BiGRU
    x = layers.Bidirectional(layers.GRU(rnn_units_1, return_sequences=True))(seq_input)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.Bidirectional(layers.GRU(rnn_units_2, return_sequences=False))(x)
    x = layers.Dropout(dropout_rate)(x)

    # Attention: query from last GRU output -> attend over sequence embeddings
    # Fix: Project the query to the same dimension as the sequence embeddings (emb_dim)
    query = layers.Dense(emb_dim, activation='tanh')(x) # Changed rnn_units_2 * 2 to emb_dim
    query = layers.Reshape((1, emb_dim))(query) # Changed rnn_units_2 * 2 to emb_dim
    # Use seq_input as value (you can also use intermediate sequence outputs)
    attn_out = layers.Attention()([query, seq_input])
    attn_out = layers.Flatten()(attn_out)

    x = layers.Concatenate()([x, attn_out])

    # aux branch
    aux_input = Input(shape=(aux_dim,), name='aux_input')
    a = layers.BatchNormalization()(aux_input)
    a = layers.Dense(dense_units//2, activation='relu')(a)
    a = layers.Dropout(dropout_rate)(a)

    # combine
    z = layers.Concatenate()([x, a])
    z = layers.Dense(dense_units, activation='relu')(z)
    z = layers.Dropout(dropout_rate)(z)
    out = layers.Dense(1, activation='sigmoid', name='out')(z)

    model = Model(inputs=[seq_input, aux_input], outputs=out)
    model.compile(optimizer=Adam(learning_rate=lr), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# instantiate model
SEQ_LEN = X_seq.shape[1]
EMB_DIM = X_seq.shape[2]
AUX_DIM = aux.shape[1]
model = build_model(seq_len=SEQ_LEN, emb_dim=EMB_DIM, aux_dim=AUX_DIM,
                    rnn_units_1=128, rnn_units_2=64, dropout_rate=0.3, dense_units=64, lr=1e-3)

model.summary()


In [14]:

# %%
# CELL 7: Callbacks and training
save_model_path = os.path.join(GRU_SAVE_DIR, 'gru_best.keras')
ckpt = ModelCheckpoint(save_model_path, monitor='val_loss', save_best_only=True, verbose=1)
es = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

# Optional class weights
from sklearn.utils import class_weight
cw_vals = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(cw_vals))
print('class_weights:', class_weights)

history = model.fit(
    {'seq_input': X_train, 'aux_input': aux_train},
    y_train,
    validation_data=({'seq_input': X_test, 'aux_input': aux_test}, y_test),
    epochs=40,
    batch_size=32,
    callbacks=[ckpt, es, rlr],
    class_weight=class_weights
)


class_weights: {0: np.float64(0.999330655957162), 1: np.float64(1.0006702412868633)}
Epoch 1/40
[1m44/47[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 12ms/step - accuracy: 0.4947 - loss: 0.6983
Epoch 1: val_loss improved from inf to 0.68641, saving model to /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts/GRUmodel/gru_best.keras
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 27ms/step - accuracy: 0.4975 - loss: 0.6980 - val_accuracy: 0.5522 - val_loss: 0.6864 - learning_rate: 0.0010
Epoch 2/40
[1m45/47[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.5442 - loss: 0.6873
Epoch 2: val_loss improved from 0.68641 to 0.68405, saving model to /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts/GRUmodel/gru_best.keras
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.5434 - loss: 0.6876 - val_accuracy: 0.5542 - val_loss: 0.6841 - learning_rate: 0.0010
Epoch 3/40
[1m45/47[0m [32m

In [15]:

# %%
# CELL 8: Evaluate and threshold tuning
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

best_model = tf.keras.models.load_model(save_model_path)
probs = best_model.predict({'seq_input': X_test, 'aux_input': aux_test}).ravel()

# search threshold maximizing F1
best_t, best_f1 = 0.5, 0
for t in np.linspace(0.3, 0.7, 41):
    f1 = f1_score(y_test, (probs > t).astype(int))
    if f1 > best_f1:
        best_f1 = f1
        best_t = t
print('best threshold', best_t, 'best f1', best_f1)

y_pred = (probs > best_t).astype(int)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))
print('AUC:', roc_auc_score(y_test, probs))
print('\nClassification report:\n', classification_report(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))



[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
best threshold 0.45999999999999996 best f1 0.8464730290456431
Accuracy: 0.8514056224899599
F1: 0.8464730290456431
AUC: 0.8934694601699972

Classification report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.86       249
           1       0.88      0.82      0.85       249

    accuracy                           0.85       498
   macro avg       0.85      0.85      0.85       498
weighted avg       0.85      0.85      0.85       498

Confusion matrix:
 [[220  29]
 [ 45 204]]


In [16]:

# %%
# CELL 9: Save ARTIFACTS (model, tokenizer meta, wrapper)
# Save Keras model (already saved by checkpoint). Save tokenizer_meta
tokenizer_meta = {
    'type': 'embedding',
    'embedder': manifest.get('sbert_model_name', 'all-MiniLM-L6-v2'),
    'seq_len': SEQ_LEN,
    'emb_dim': EMB_DIM
}
with open(os.path.join(GRU_SAVE_DIR, 'tokenizer_meta.pkl'), 'wb') as f:
    pickle.dump(tokenizer_meta, f)

# Save model wrapper (simple dict with model path and metadata)
wrapper = {
    'model_path': save_model_path,
    'tokenizer_meta_path': os.path.join(GRU_SAVE_DIR, 'tokenizer_meta.pkl'),
    'saved_at': time.ctime()
}
with open(os.path.join(GRU_SAVE_DIR, 'gru_model_wrapper.pkl'), 'wb') as f:
    pickle.dump(wrapper, f)

print('Saved artifacts to', GRU_SAVE_DIR)


Saved artifacts to /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts/GRUmodel


In [17]:

# %%
# CELL 10: Explainability table (top NMF comps + KB matches if present)
# Ensure kb_sources are available: try to load windows_meta_with_kb if present
kb_cols = [c for c in windows_meta.columns if c.startswith('kb_top_')]
print('KB columns present:', kb_cols)

# Compute all-window predictions
all_prob = best_model.predict({'seq_input': X_seq, 'aux_input': aux}).ravel()
all_pred = (all_prob > best_t).astype(int)

# Load W_nmf (already loaded). Build explain rows for test indices
_, test_idx = train_test_split(np.arange(X_seq.shape[0]), test_size=0.25, stratify=y, random_state=42)
rows = []
for i in test_idx:
    nmf_row = W_nmf[i]
    top_idx = np.argsort(nmf_row)[::-1][:3]
    row = {
        'window_id': int(i),
        'start_idx': int(windows_meta['start_idx'].iloc[i]),
        'label_true': int(windows_meta['binary_label'].iloc[i]),
        'pred_prob': float(all_prob[i]),
        'pred_label': int(all_pred[i]),
        'top_nmf': ';'.join([f'comp{t}:{nmf_row[t]:.3f}' for t in top_idx])
    }
    for col in kb_cols:
        row[col] = windows_meta[col].iloc[i] if col in windows_meta.columns else ''
    rows.append(row)

explain_df = pd.DataFrame(rows)
explain_df.to_csv(os.path.join(GRU_SAVE_DIR, 'gru_predictions_explain.csv'), index=False)
print('Saved explain csv to', os.path.join(GRU_SAVE_DIR, 'gru_predictions_explain.csv'))

# %%
# END OF NOTEBOOK
print('Done. Run cells sequentially in Colab.')


KB columns present: []
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Saved explain csv to /content/drive/MyDrive/LLM4Sec/Week3/feature_eng_artifacts/GRUmodel/gru_predictions_explain.csv
Done. Run cells sequentially in Colab.
