# Typing Profiling: CyberLab Human Model → Cowrie Scoring

This notebook builds a **human interaction model** from CyberLab, then applies it to Cowrie honeypot sessions to estimate how **human-like** each session’s command timing appears.

We present three levels of inference:

1. **Simple PDF Baseline (Quantile Rule)**
   Scores sessions under the CyberLab human probability density function (PDF) and labels them using **CyberLab-derived log-likelihood quantiles**.

2. **Primary: Human-Likeness Tail Test (Mahalanobis χ²)**
   Uses the global CyberLab Gaussian and converts each session's Mahalanobis distance from the human mean into a χ² **tail probability** `p_human_tail` (smaller ⇒ less human-like).

3. **Secondary: Human-vs-Background Posterior (Gated Mixture)**
   Adds a background (bot-like) density model (GMM) and computes a **posterior** `p(human | x)` while still keeping CyberLab as the human anchor.

Outputs are written to `./output/` as compact, reusable CSVs, alongside an `.npz` archive for the human Gaussian model.


In [2]:
import os
import glob
import numpy as np
import pandas as pd

from pathlib import Path
from scipy.stats import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# Output directory: created up front (with any missing parents) so later cells can write into it.
OUT_DIR = Path("output")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Inputs
CYBERLAB_SESS_PATH = "output/cyberlab_clustering_features.csv"
COWRIE_LINES_PATH  = "../../fi_fs/data/processed/Cowrie_Merged_Geo_Enriched_lines.csv"

# Outputs (all placed under OUT_DIR)
HUMAN_MODEL_NPZ    = OUT_DIR / "human_gaussian_densities.npz"
COWRIE_DENSITY_CSV = OUT_DIR / "cowrie_session_density_features.csv"
COWRIE_SCORED_CSV  = OUT_DIR / "cowrie_session_typing_scored.csv"
COWRIE_A_CSV       = OUT_DIR / "cowrie_session_typing_humanlikeness_tailtest.csv"
COWRIE_B_CSV       = OUT_DIR / "cowrie_session_typing_posterior_gatedmixture.csv"

# Echo the configured paths, one "label value" line each, so the run is self-describing.
print("Paths:")
print(
    *(
        f"{label} {location}"
        for label, location in (
            ("  CYBERLAB_SESS_PATH:", CYBERLAB_SESS_PATH),
            ("  COWRIE_LINES_PATH :", COWRIE_LINES_PATH),
            ("  HUMAN_MODEL_NPZ   :", HUMAN_MODEL_NPZ),
            ("  COWRIE_DENSITY_CSV:", COWRIE_DENSITY_CSV),
            ("  COWRIE_SCORED_CSV :", COWRIE_SCORED_CSV),
            ("  COWRIE_A_CSV      :", COWRIE_A_CSV),
            ("  COWRIE_B_CSV      :", COWRIE_B_CSV),
        )
    ),
    sep="\n",
)


Paths:
  CYBERLAB_SESS_PATH: output/cyberlab_clustering_features.csv
  COWRIE_LINES_PATH : ../../fi_fs/data/processed/Cowrie_Merged_Geo_Enriched_lines.csv
  HUMAN_MODEL_NPZ   : output/human_gaussian_densities.npz
  COWRIE_DENSITY_CSV: output/cowrie_session_density_features.csv
  COWRIE_SCORED_CSV : output/cowrie_session_typing_scored.csv
  COWRIE_A_CSV      : output/cowrie_session_typing_humanlikeness_tailtest.csv
  COWRIE_B_CSV      : output/cowrie_session_typing_posterior_gatedmixture.csv


In [4]:
def tukey_upper_fence_pos(x: pd.Series) -> float:
    """Upper Tukey fence (Q3 + 1.5 * IQR) of the strictly positive, finite values in ``x``.

    Non-positive, NaN and infinite entries are discarded before the quartiles
    are taken.

    Returns
    -------
    float
        The fence value, or ``np.inf`` (i.e. "no upper limit") when fewer than
        four usable values remain or when the interquartile range is not
        positive.
    """
    usable = x[(x > 0) & np.isfinite(x)]

    # Too few points for meaningful quartiles.
    if len(usable) < 4:
        return np.inf

    lower_quartile, upper_quartile = np.percentile(usable, [25, 75])
    spread = upper_quartile - lower_quartile

    # A degenerate spread would put the fence at Q3 itself; disable it instead.
    if spread <= 0:
        return np.inf
    return upper_quartile + 1.5 * spread

def fit_gaussian(df: pd.DataFrame, cols, reg=1e-6, min_samples=5):
    """Fit a single multivariate Gaussian to the selected columns of ``df``.

    Rows containing a missing value in any selected column are dropped first.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the feature columns.
    cols : str or list-like of column labels
        Columns to model. A single column name given as a plain string is
        treated as a one-column selection.
    reg : float, default 1e-6
        Ridge term added to the diagonal of the sample covariance.
    min_samples : int, default 5
        Minimum number of complete rows required.

    Returns
    -------
    mu : np.ndarray, shape (d,)
        Column means.
    Sigma : np.ndarray, shape (d, d)
        Sample covariance plus ``reg * I``.

    Raises
    ------
    ValueError
        If fewer than ``min_samples`` complete rows are available, or fewer
        than two (the sample covariance is undefined for a single row).
    """
    # A bare string would select a Series and yield a 1-D array; wrap it so
    # the selection is always a 2-D (n, d) block.
    selection = [cols] if isinstance(cols, str) else cols
    Z = df[selection].dropna().to_numpy(dtype=float)
    n, d = Z.shape
    if n < min_samples:
        raise ValueError(f"Need >= {min_samples} samples, got {n}.")
    if n < 2:
        # np.cov would otherwise return NaN and silently poison every score.
        raise ValueError(f"Need >= 2 samples to estimate a covariance, got {n}.")
    mu = Z.mean(axis=0)
    # np.cov collapses to a 0-d array when d == 1; atleast_2d restores (1, 1).
    Sigma = np.atleast_2d(np.cov(Z, rowvar=False)) + reg * np.eye(d)
    return mu, Sigma

def mvn_logpdf(Z, mu, Sigma):
    """Log-density of the Gaussian N(mu, Sigma) evaluated at each row of ``Z``.

    Parameters
    ----------
    Z : array-like
        Points to score; promoted to 2-D with ``np.atleast_2d``, so a single
        d-vector is treated as one row.
    mu : array-like
        Mean vector.
    Sigma : array-like
        Covariance matrix.

    Returns
    -------
    np.ndarray
        One log-density per row of ``Z``.

    Notes
    -----
    The covariance is Cholesky-factorised. If that fails, one retry is made
    with ``1e-5`` added to the diagonal; a second failure propagates as
    ``np.linalg.LinAlgError``.
    """
    points = np.atleast_2d(Z)
    mean = np.asarray(mu)
    cov = np.asarray(Sigma)
    dim = points.shape[1]

    try:
        chol = np.linalg.cholesky(cov)
    except np.linalg.LinAlgError:
        # Not positive definite: retry once with a small diagonal jitter.
        chol = np.linalg.cholesky(cov + 1e-5 * np.eye(dim))

    # Solving chol @ y = (z - mu) whitens the residuals; their squared norm
    # is the squared Mahalanobis distance of each point.
    whitened = np.linalg.solve(chol, (points - mean).T)
    sq_mahalanobis = np.sum(whitened**2, axis=0)

    # log|Sigma| = 2 * sum(log(diag(chol)))
    log_det = 2.0 * np.sum(np.log(np.diag(chol)))
    log_norm = 0.5 * (dim * np.log(2.0 * np.pi) + log_det)
    return -log_norm - 0.5 * sq_mahalanobis

def logsumexp(a, axis=1, keepdims=True):
    """Numerically stable ``log(sum(exp(a)))`` along ``axis``.

    Parameters
    ----------
    a : array-like
        Log-domain values.
    axis : int, default 1
        Axis to reduce over.
    keepdims : bool, default True
        If True, the reduced axis is kept with length one. If False, the
        result is passed through ``squeeze()``, which drops *every* length-one
        axis.

    Returns
    -------
    np.ndarray
        The reduced log-sum. A slice whose entries are all ``-inf`` yields
        ``-inf``, and a slice containing ``+inf`` yields ``+inf``.
    """
    a = np.asarray(a)
    m = np.max(a, axis=axis, keepdims=True)
    # Subtracting a non-finite maximum would compute inf - inf = NaN, so shift
    # by 0 for those slices and let exp/log produce the limit value directly.
    shift = np.where(np.isfinite(m), m, 0.0)
    with np.errstate(divide="ignore"):  # log(0) = -inf is the intended result
        out = np.log(np.sum(np.exp(a - shift), axis=axis, keepdims=True)) + shift
    return out if keepdims else out.squeeze()

def three_way_label(values, thr_hi, thr_lo, hi="human_like", mid="uncertain", lo="non_human_like"):
    """Bucket scores into three labels, where a higher score means more human-like.

    Parameters
    ----------
    values : array-like supporting ``>=`` comparison
        Scores to label.
    thr_hi, thr_lo : float
        Cut-offs: ``values >= thr_hi`` gets ``hi``; otherwise
        ``values >= thr_lo`` gets ``mid``; everything else gets ``lo``.
    hi, mid, lo : str
        Label text for the three buckets.

    Returns
    -------
    np.ndarray
        Array of labels. Values that fail both comparisons (including NaN)
        receive ``lo``.
    """
    clears_upper = values >= thr_hi
    clears_lower = values >= thr_lo
    # Resolve the lower two buckets first, then overlay the top bucket.
    below_upper = np.where(clears_lower, mid, lo)
    return np.where(clears_upper, hi, below_upper)

def top_bottom(df, score_col, n=3, min_pairs=None):
    """Return the ``n`` highest- and ``n`` lowest-scoring rows, stacked into one frame.

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows; left unmodified.
    score_col : str
        Column to rank by (higher = more human-like).
    n : int, default 3
        Number of rows to take from each end.
    min_pairs : int or None, default None
        When given, only rows with ``n_pairs >= int(min_pairs)`` are ranked.

    Returns
    -------
    pd.DataFrame
        Highest-scoring rows first (``rank_group == "most_human_like"``),
        then lowest-scoring rows (``"most_non_human_like"``), with
        ``rank_group`` as the first column and a fresh integer index.
    """
    candidates = df.copy()
    if min_pairs is not None:
        enough_pairs = candidates["n_pairs"] >= int(min_pairs)
        candidates = candidates[enough_pairs].copy()

    groups = []
    for group_name, ascending in (("most_human_like", False), ("most_non_human_like", True)):
        extreme = candidates.sort_values(score_col, ascending=ascending).head(n).copy()
        extreme.insert(0, "rank_group", group_name)
        groups.append(extreme)
    return pd.concat(groups, ignore_index=True)
