# Sarcasm Detector — Colab/WSL Ready (BiLSTM+MaxPool)
**One-click** notebook to load data (TFDS or CSV), train, evaluate (ROC‑AUC, best‑F1 threshold), and save artifacts (vocab, meta, model).  
> Works in **Colab** (GPU/CPU) and **local WSL** (CPU). If Colab GPU quota is unavailable, it will still run on CPU.

## 1) Setup & environment check

In [None]:
import os, sys, random, numpy as np, tensorflow as tf
# One seed shared by the Python, NumPy and TensorFlow random generators.
SEED=42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

# True when the google.colab module is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules
print("Python :", sys.version)
print("TF     :", tf.__version__, "(Colab:", IN_COLAB, ")")

# On Colab: make sure tensorflow-text is importable, installing the 2.19.* release only if the import fails.
if IN_COLAB:
    # Light install (keeps Colab runtime defaults; uncomment if you need exact versions)
    # !pip -q install "tensorflow==2.19.*" "tensorflow-text==2.19.*" scikit-learn pandas matplotlib pyyaml emoji
    try:
        import tensorflow_text as text
    except Exception as e:
        # Any import-time failure (not only a missing package) triggers the install and a retry.
        print("Installing tensorflow-text for Colab...")
        !pip -q install "tensorflow-text==2.19.*"
        import tensorflow_text as text
else:
    # Local/WSL guidance: nothing is installed here, the command is only printed.
    print("Local/WSL detected. If TF 2.19 has dependency issues, use 2.18:")
    print("  python3 -m pip install 'tensorflow==2.18.0' 'tensorflow-text==2.18.0' scikit-learn pandas matplotlib pyyaml emoji")

## 2) Imports

In [None]:
import re, json, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
# Report whether tensorflow-text imports; a failure is printed and the cell continues.
try:
    import tensorflow_text as text  # noqa: F401
    print("TF-Text OK")
except Exception as e:
    print("TF-Text not available:", e)

## 3) Tokenizer with custom standardization (keep emoji, `!` and `?`)

In [None]:
def custom_standardize(s):
    """Lowercase text and blank out every character that is not kept.

    Kept characters: ``a-z``, ``0-9``, ``!``, ``?``, whitespace, and every
    non-ASCII code point (U+0080 to U+10FFFF), which is what lets emoji
    survive. Everything else is replaced by a single space.

    Args:
        s: string tensor to standardize.

    Returns:
        The standardized string tensor.
    """
    s = tf.strings.lower(s)
    # The pattern is compiled by RE2, which has no ``\uXXXX`` escape; code
    # points must be written as ``\x{...}``. The upper bound is U+10FFFF
    # because most emoji live above U+FFFF and would otherwise be stripped.
    s = tf.strings.regex_replace(s, r"[^a-z0-9!\?\s\x{80}-\x{10FFFF}]", " ")
    return s

def build_vectorizer(max_tokens=30000, seq_len=64, vocab=None):
    """Create an integer-output ``TextVectorization`` layer.

    Args:
        max_tokens: value passed as the layer's ``max_tokens``.
        seq_len: value passed as the layer's ``output_sequence_length``.
        vocab: optional vocabulary; when given it is installed with
            ``set_vocabulary`` before the layer is returned.

    Returns:
        The configured layer, standardizing with ``custom_standardize``.
    """
    vectorizer = layers.TextVectorization(
        standardize=custom_standardize,
        output_mode="int",
        output_sequence_length=seq_len,
        max_tokens=max_tokens,
    )
    if vocab is None:
        return vectorizer
    vectorizer.set_vocabulary(vocab)
    return vectorizer

## 4) Load data — TFDS *sarcasm* if available, else a CSV (Google Drive on Colab, `./data/raw/sarcasm.csv` locally)

In [None]:
# Try TFDS first; on any failure fall back to a CSV with 'text' and 'label' columns.
USE_TFDS = True
df = None
if USE_TFDS:
    try:
        import tensorflow_datasets as tfds
        ds_tr, ds_te = tfds.load("sarcasm", split=["train","test"], as_supervised=True)
        # tfds.as_numpy already yields NumPy values (bytes text, integer label),
        # so there is no .numpy() to call; calling it raised AttributeError and
        # silently forced the CSV fallback every time.
        tr = [(x.decode("utf-8"), int(y)) for x, y in tfds.as_numpy(ds_tr)]
        te = [(x.decode("utf-8"), int(y)) for x, y in tfds.as_numpy(ds_te)]
        # Both TFDS splits are merged; the notebook re-splits the data itself later.
        df = pd.DataFrame(tr + te, columns=["text","label"])
        print("Loaded TFDS 'sarcasm' →", df.shape)
    except Exception as e:
        # Deliberately broad: a missing package, an unknown dataset name or a
        # download error all lead to the same CSV fallback.
        print("TFDS 'sarcasm' not available:", e)
        USE_TFDS = False

if not USE_TFDS:
    # If you are on Colab, mount Drive and point to your CSV (must have 'text','label' columns)
    if "google.colab" in sys.modules:
        from google.colab import drive
        drive.mount('/content/drive', force_remount=True)
        PATH = "/content/drive/MyDrive/sarcasm-detector/data/raw/sarcasm.csv"  # change if needed
    else:
        PATH = "./data/raw/sarcasm.csv"  # change if running locally

    df = pd.read_csv(PATH)
    # Raise instead of assert so the check still runs under `python -O`.
    if not set(df.columns) >= {"text","label"}:
        raise ValueError("CSV must contain 'text' and 'label' columns")
    print("Loaded CSV →", df.shape, "from", PATH)

df.head()

## 5) Split train/val/test + class weights

In [None]:
# Hold out 20% of the rows (stratified by label), then cut that hold-out in
# half for validation and test: 80/10/10 overall.
X_train, X_tmp, y_train, y_tmp = train_test_split(
    df["text"].astype(str).values,
    df["label"].astype(int).values,
    test_size=0.2,
    random_state=SEED,
    stratify=df["label"],
)
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=0.5,
    random_state=SEED,
    stratify=y_tmp,
)

# Class weights are derived from the training labels only.
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = dict(zip((int(c) for c in classes), (float(w) for w in cw)))
print("Class weight:", class_weight)

## 6) Vectorize + simple handcrafted features

In [None]:
# Vocabulary cap and padded sequence length; reused by the model and saved in meta.json.
max_tokens = 30000
seq_len = 64

# The vocabulary is learned from the training texts only.
vec = build_vectorizer(max_tokens=max_tokens, seq_len=seq_len)
vec.adapt(tf.constant(X_train))

# Emoticons and emoji that set the emoji flag in ``basic_features``.
EMOJI_RE = re.compile(r":\)|:\(|😂|😒|😑|😏|🙄")


def basic_features(text):
    """Compute the three handcrafted features for a single string.

    Args:
        text: the raw (un-standardized) input string.

    Returns:
        A float32 array ``[punct_burst, caps_ratio, emoji_flag]`` where
        ``punct_burst`` is 1 if the text contains ``!!!`` or ``???``,
        ``caps_ratio`` is the share of uppercase characters, and
        ``emoji_flag`` is 1 if ``EMOJI_RE`` matches anywhere.
    """
    # max(1, ...) keeps the ratio defined (0.0) for an empty string.
    caps_ratio = sum(c.isupper() for c in text) / max(1, len(text))
    punct_burst = int(("!!!" in text) or ("???" in text))
    emoji_flag = int(bool(EMOJI_RE.search(text)))
    return np.array([punct_burst, caps_ratio, emoji_flag], dtype="float32")


def batch_features(texts):
    """Stack ``basic_features`` for every string in ``texts``.

    Args:
        texts: iterable of strings.

    Returns:
        A float32 array of shape ``(len(texts), 3)``. An empty input yields
        shape ``(0, 3)`` instead of the error ``np.stack`` raises on an
        empty list.
    """
    rows = [basic_features(t) for t in texts]
    if not rows:
        return np.zeros((0, 3), dtype="float32")
    return np.stack(rows, axis=0)

# Token ids for each split, converted to NumPy arrays.
tok_tr, tok_va, tok_te = (
    vec(tf.constant(split)).numpy() for split in (X_train, X_val, X_test)
)
# Handcrafted features for the same splits, in the same order.
f_tr, f_va, f_te = (
    batch_features(split) for split in (X_train, X_val, X_test)
)

tok_tr.shape, f_tr.shape

## 7) Model — BiLSTM + GlobalMaxPool (robust & simple)

In [None]:
# Token branch: embedding -> bidirectional LSTM over the whole sequence -> max over the sequence axis.
# Index 0 is treated as padding by the embedding (mask_zero=True).
text_in = keras.Input(shape=(seq_len,), dtype=tf.int32, name="tok")
x = layers.Embedding(max_tokens, 128, mask_zero=True)(text_in)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.GlobalMaxPool1D()(x)

# Feature branch: the 3 handcrafted features are concatenated with the pooled text vector.
feat_in = keras.Input(shape=(3,), dtype="float32", name="feat")
h = layers.Concatenate()([x, feat_in])
h = layers.Dense(64, activation="relu")(h)
h = layers.Dropout(0.3)(h)
# Single sigmoid unit: one probability per example.
out = layers.Dense(1, activation="sigmoid")(h)

# Inputs are named "tok" and "feat"; later cells feed them as a dict with those keys.
model = keras.Model([text_in, feat_in], out)
model.compile(optimizer=keras.optimizers.Adam(1e-3),
              loss="binary_crossentropy",
              metrics=[keras.metrics.BinaryAccuracy(name="acc"), keras.metrics.AUC(name="auc")])
model.summary()

## 8) Train with callbacks

In [None]:
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
callbacks = [
    keras.callbacks.EarlyStopping(
        restore_best_weights=True,
        patience=3,
        monitor="val_loss",
    ),
]

hist = model.fit(
    x={"tok": tok_tr, "feat": f_tr},
    y=y_train,
    validation_data=({"tok": tok_va, "feat": f_va}, y_val),
    batch_size=128,
    epochs=15,
    callbacks=callbacks,
    class_weight=class_weight,
    verbose=1,
)

## 9) Evaluate (ROC‑AUC, best‑F1 threshold, report)

In [None]:
def eval_split(model, tok, feats, y_true, name="VAL", thr=None):
    """Print ROC-AUC, a classification report and a confusion matrix for one split.

    Args:
        model: model whose ``predict`` accepts ``{"tok": ..., "feat": ...}``.
        tok: array fed to the ``"tok"`` input.
        feats: array fed to the ``"feat"`` input.
        y_true: ground-truth binary labels for the split.
        name: label shown in the printed header.
        thr: decision threshold to apply. When ``None`` (the default) the
            threshold with the highest F1 on this split is searched for and
            used, which is only appropriate for a tuning split.

    Returns:
        The threshold that was applied, as a float.
    """
    prob = model.predict({"tok": tok, "feat": feats}, verbose=0).ravel()
    if thr is None:
        prec, rec, cand = precision_recall_curve(y_true, prob)
        # Small epsilon avoids 0/0 where precision and recall are both zero.
        f1s = 2 * prec * rec / (prec + rec + 1e-9)
        # prec/rec have one more entry than the thresholds; the last one is dropped.
        thr_opt = float(cand[np.argmax(f1s[:-1])]) if len(cand) > 0 else 0.5
        thr_label = "Best-F1 thr"
    else:
        thr_opt = float(thr)
        thr_label = "Fixed thr"
    y_pred = (prob >= thr_opt).astype(int)
    print(f"\n[{name}] ROC-AUC: {roc_auc_score(y_true, prob):.4f} | {thr_label}: {thr_opt:.3f}")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion:\n", confusion_matrix(y_true, y_pred))
    return thr_opt

# Tune the threshold on validation, then apply that same threshold to test so
# the test report is not optimistically tuned on the test labels.
thr_val = eval_split(model, tok_va, f_va, y_val, "VAL")
_ = eval_split(model, tok_te, f_te, y_test, "TEST", thr=thr_val)

## 10) Save artifacts (vocab, meta, model)

In [None]:
BASE = "/content/drive/MyDrive/sarcasm-detector" if IN_COLAB else "./sarcasm-detector"
# When the TFDS path succeeded, Drive was never mounted; without this the
# makedirs below would create a plain folder on the ephemeral Colab disk and
# the artifacts would be lost with the runtime.
if IN_COLAB and not os.path.isdir("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount("/content/drive")
PROC = f"{BASE}/data/processed"
os.makedirs(f"{PROC}/models", exist_ok=True)

# vocab: one token per line, newline-joined with no trailing newline.
# Explicit UTF-8 because the vocabulary can contain emoji and other non-ASCII tokens.
vocab = vec.get_vocabulary()
with open(f"{PROC}/vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(vocab))

# meta: what is needed to rebuild the vectorizer and apply the tuned threshold.
meta = {"max_tokens": int(max_tokens), "seq_len": int(seq_len),
        "features":["punct_burst","caps_ratio","emoji_flag"],
        "thr_val": float(thr_val)}
with open(f"{PROC}/meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

# model
model.save(f"{PROC}/models/bilstm_maxpool.keras")
print("Saved →", f"{PROC}")

## 11) Inference helper

In [None]:
def rebuild_vec(vocab, seq_len):
    """Recreate a ``TextVectorization`` layer from a saved vocabulary.

    Args:
        vocab: list of tokens; its length is used as ``max_tokens``.
        seq_len: value passed as ``output_sequence_length``.

    Returns:
        An integer-output layer standardizing with ``custom_standardize``,
        with ``vocab`` installed.
    """
    vectorizer = layers.TextVectorization(
        standardize=custom_standardize,
        output_sequence_length=seq_len,
        output_mode="int",
        max_tokens=len(vocab),
    )
    vectorizer.set_vocabulary(vocab)
    return vectorizer

# Compiled once rather than on every features_batch call.
_INFERENCE_EMOJI_RE = re.compile(r":\)|:\(|😂|😒|😑|😏|🙄")


def _inference_features(text):
    """Return ``[punct_burst, caps_ratio, emoji_flag]`` for one string as float32."""
    # max(1, ...) keeps the ratio defined (0.0) for an empty string.
    caps_ratio = sum(c.isupper() for c in text) / max(1, len(text))
    punct_burst = int(("!!!" in text) or ("???" in text))
    emoji_flag = int(bool(_INFERENCE_EMOJI_RE.search(text)))
    return np.array([punct_burst, caps_ratio, emoji_flag], dtype="float32")


def features_batch(texts):
    """Compute the handcrafted inference features for a batch of strings.

    Args:
        texts: iterable of strings.

    Returns:
        A float32 array of shape ``(len(texts), 3)`` whose columns are
        ``punct_burst``, ``caps_ratio`` and ``emoji_flag``. An empty input
        yields shape ``(0, 3)`` instead of the error ``np.stack`` raises on
        an empty list.
    """
    rows = [_inference_features(t) for t in texts]
    if not rows:
        return np.zeros((0, 3), dtype="float32")
    return np.stack(rows, axis=0)

def load_artifacts(base_dir):
    """Load the saved model, vocabulary and metadata from ``base_dir``.

    Reads ``data/processed/models/bilstm_maxpool.keras``,
    ``data/processed/vocab.txt`` and ``data/processed/meta.json`` below
    ``base_dir``.

    Args:
        base_dir: root directory the artifacts were saved under.

    Returns:
        A ``(model, vectorizer, meta)`` tuple; the vectorizer is rebuilt with
        ``rebuild_vec`` from the vocabulary and ``meta["seq_len"]``.
    """
    PROC = f"{base_dir}/data/processed"
    model = keras.models.load_model(f"{PROC}/models/bilstm_maxpool.keras")
    # Context managers close the files; UTF-8 because tokens may be non-ASCII.
    with open(f"{PROC}/vocab.txt", encoding="utf-8") as f:
        content = f.read()
    # Tolerate one trailing newline (e.g. added by an editor).
    if content.endswith("\n"):
        content = content[:-1]
    # Split on "\n" only: str.splitlines() also breaks on characters such as
    # U+0085 or U+2028, which would cut a token containing them in two.
    vocab = content.split("\n")
    with open(f"{PROC}/meta.json", encoding="utf-8") as f:
        meta = json.load(f)
    vec2 = rebuild_vec(vocab, meta["seq_len"])
    return model, vec2, meta

# Reload from disk so inference uses the saved artifacts rather than the in-memory training objects.
BASE = "/content/drive/MyDrive/sarcasm-detector" if IN_COLAB else "./sarcasm-detector"
loaded_model, vec2, meta = load_artifacts(BASE)

def predict_texts(texts, thr=None):
    """Score texts with the loaded model and threshold the probabilities.

    Args:
        texts: a sequence of strings, or a single string (treated as a
            one-element batch).
        thr: decision threshold; when ``None`` the saved ``meta["thr_val"]``
            is used, falling back to 0.5 if it is absent.

    Returns:
        ``(prob, pred)``: a 1-D array of probabilities and a 1-D int array
        that is 1 where ``prob >= thr``. Both are empty for an empty input.
    """
    if thr is None:
        thr = meta.get("thr_val", 0.5)
    # A bare string would otherwise be iterated character by character when
    # the handcrafted features are computed.
    if isinstance(texts, str):
        texts = [texts]
    # An empty batch cannot be turned into a string tensor or run through
    # predict, so return empty results directly.
    if len(texts) == 0:
        return np.zeros(0, dtype="float32"), np.zeros(0, dtype=int)
    tok = vec2(tf.constant(texts)).numpy()
    feats = features_batch(texts)
    prob = loaded_model.predict({"tok": tok, "feat": feats}, verbose=0).ravel()
    pred = (prob >= thr).astype(int)
    return prob, pred

# Two sample sentences run through the reloaded pipeline.
samples = [
    "Great, another mandatory meeting at 7am. Perfect.",
    "Terima kasih atas bantuannya.",
]
probs, preds = predict_texts(samples)
for s, pr, p in zip(samples, probs, preds):
    # Only the first 70 characters of each sample are echoed.
    print(f"{s[:70]} → pred={p} prob={pr:.3f}")