In [41]:
import os, json, random
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# ---- config

In [2]:
SEED = 42
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

DATA_PATH = "./data/sarcasm.json"
VOCAB_SIZE = 20000 
MAX_LEN = 32
EMBED_DIM = 16
BATCH_SIZE = 128
EPOCHS = 6 

os.makedirs("figs", exist_ok=True)

# ---- load data

In [4]:
import os, pathlib
print("CWD =", os.getcwd())
print("Ada folder data?", pathlib.Path("data").exists())
print("Ada file sarcasm.json?", pathlib.Path("data/sarcasm.json").exists())

CWD = /home/milzon/projects/sarcasm-embeddings
Ada folder data? True
Ada file sarcasm.json? False


In [5]:
from pathlib import Path
import urllib.request, json, os

URL = "https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json"
DEST = Path("data/sarcasm.json")
DEST.parent.mkdir(parents=True, exist_ok=True)

# download if missing
if not DEST.exists():
    print(f"Downloading to {DEST} ...")
    try:
        urllib.request.urlretrieve(URL, DEST.as_posix())
    except Exception as e:
        # fallback simple
        import requests
        r = requests.get(URL, timeout=60)
        r.raise_for_status()
        DEST.write_bytes(r.content)

# verify
print("File size (bytes):", os.path.getsize(DEST))
with open(DEST, "r") as f:
    ds = json.load(f)
print("Records:", len(ds))


File size (bytes): 5643545
Records: 26709


In [6]:
with open(DATA_PATH, "r") as f:
    ds = json.load(f)

texts = [d["headline"] for d in ds]
labels = np.array([d["is_sarcastic"] for d in ds], dtype=np.int32)

# ---- split train/val

In [7]:
N = len(texts)
idx = np.arange(N); np.random.shuffle(idx)
cut = int(0.9 * N)
train_idx, val_idx = idx[:cut], idx[cut:]

train_texts = tf.data.Dataset.from_tensor_slices([texts[i] for i in train_idx])
train_labels = tf.data.Dataset.from_tensor_slices(labels[train_idx])
val_texts   = tf.data.Dataset.from_tensor_slices([texts[i] for i in val_idx])
val_labels  = tf.data.Dataset.from_tensor_slices(labels[val_idx])

2025-09-09 15:38:49.930521: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


# ---- vectorizer

In [8]:
vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=MAX_LEN
)
vectorizer.adapt(train_texts.batch(1024))

2025-09-09 15:39:17.949529: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# ---- datasets

In [9]:
train_ds = tf.data.Dataset.zip((train_texts.map(vectorizer), train_labels))\
            .shuffle(2048).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_ds   = tf.data.Dataset.zip((val_texts.map(vectorizer), val_labels))\
            .batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# ---- model (ringan aja dulu)

In [None]:
def build_model():
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(MAX_LEN,), dtype="int32"),
        tf.keras.layers.Embedding(VOCAB_SIZE, EMBED_DIM),              # learned embedding
        tf.keras.layers.Conv1D(128, 5, activation="relu"),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

model = build_model()
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss="binary_crossentropy", metrics=["accuracy"])

# ---- train

In [12]:
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, verbose=1)

Epoch 1/6
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.7177 - loss: 0.5272 - val_accuracy: 0.8476 - val_loss: 0.3589
Epoch 2/6
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.8908 - loss: 0.2698 - val_accuracy: 0.8525 - val_loss: 0.3524
Epoch 3/6
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.9452 - loss: 0.1500 - val_accuracy: 0.8484 - val_loss: 0.4234
Epoch 4/6
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.9750 - loss: 0.0750 - val_accuracy: 0.8401 - val_loss: 0.5243
Epoch 5/6
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step - accuracy: 0.9891 - loss: 0.0371 - val_accuracy: 0.8401 - val_loss: 0.6562
Epoch 6/6
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.9944 - loss: 0.0194 - val_accuracy: 0.8304 - val_loss: 0.7698


# ---- plot

In [20]:
fig, ax = plt.subplots(1,2, figsize=(10,4))
ax[0].plot(history.history["accuracy"]); ax[0].plot(history.history["val_accuracy"])
ax[0].set_title("Accuracy"); ax[0].legend(["train","val"])
ax[1].plot(history.history["loss"]); ax[1].plot(history.history["val_loss"])
ax[1].set_title("Loss"); ax[1].legend(["train","val"])
plt.tight_layout(); plt.savefig("figs/training_curves.png"); plt.close()

# ---- Word embedding projection

In [43]:
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from pathlib import Path
import tensorflow as tf

In [61]:
Path("figs").mkdir(exist_ok=True)

# 1) Encoder = semua layer sebelum Dense(1)
encoder = tf.keras.Sequential(model.layers[:-1])
_ = encoder(tf.zeros((1, MAX_LEN), dtype=tf.int32))  # build sekali

# 2) Kumpulkan vektor token dari val_ds (SUDAH vectorized)
MAX_SAMPLES = 2000
X_vec = np.stack([x.numpy() for x, _ in val_ds.unbatch().take(MAX_SAMPLES)]).astype(np.int32)
y_vec = np.array([int(y.numpy()) for _, y in val_ds.unbatch().take(MAX_SAMPLES)])

# 3) Ekstrak fitur (keluaran Dense(64)) lalu PCA
Z = encoder.predict(X_vec, batch_size=512, verbose=0)   # shape (M, 64)

coords = PCA(n_components=2, random_state=42).fit_transform(Z)
neg = y_vec == 0; pos = ~neg

plt.figure(figsize=(8,6))
plt.scatter(coords[neg,0], coords[neg,1], s=10, alpha=0.6, label="Not Sarcasm")
plt.scatter(coords[pos,0], coords[pos,1], s=10, alpha=0.6, label="Sarcasm")
plt.legend(); plt.title("Sentence Embeddings Projection (PCA)")
plt.tight_layout(); plt.savefig("figs/sentence_embedding_pca.png", dpi=220); plt.close()
print("Saved: figs/sentence_embedding_pca.png")

Saved: figs/sentence_embedding_pca.png


- Export WORD embeddings → projector/word/{vectors.tsv,metadata.tsv}

In [62]:
from pathlib import Path
import numpy as np
import tensorflow as tf

Path("projector/sent").mkdir(parents=True, exist_ok=True)

# Encoder: semua layer sebelum Dense(1)
encoder = tf.keras.Sequential(model.layers[:-1])
_ = encoder(tf.zeros((1, MAX_LEN), dtype=tf.int32))  # build

# Kumpulkan tepat sekali dari val_ds (yang SUDAH vectorized)
MAX_SAMPLES = 2000
X_list, y_list = [], []
for x, y in val_ds.unbatch().take(MAX_SAMPLES):
    X_list.append(x.numpy())
    y_list.append(int(y.numpy()))

X = np.stack(X_list).astype(np.int32)
y = np.array(y_list, dtype=np.int32)

# Hitung embedding kalimat
Z = encoder.predict(X, batch_size=512, verbose=0)  # shape: (M, D)
M = Z.shape[0]                                     # jumlah baris yang akan disimpan

# Pastikan metadata dipotong ke M baris
y = y[:M]

# Tulis vectors.tsv dan metadata.tsv (dengan newline terakhir)
np.savetxt("projector/sent/vectors.tsv", Z, delimiter="\t")
with open("projector/sent/metadata.tsv", "w", encoding="utf-8") as f:
    for i in range(M):
        f.write(f"{y[i]}\n")   # pastikan newline di akhir setiap baris


- Export SENTENCE embeddings → projector/sent/{vectors.tsv,metadata.tsv}
- Kita ambil keluaran Dense(64) (layer sebelum output).
- Gunakan dataset yang sudah di-vectorize (int32). Jangan panggil vectorizer() lagi.

# word projection
- supaya bisa search kata kata di projector

In [65]:
from pathlib import Path
import numpy as np, tensorflow as tf

Path("projector/word").mkdir(parents=True, exist_ok=True)

# Ambil embedding & vocab
embedding_layer = next(l for l in model.layers if isinstance(l, tf.keras.layers.Embedding))
E = embedding_layer.get_weights()[0]
vocab = vectorizer.get_vocabulary()
vocab_eff = min(len(vocab), E.shape[0])

START = 2                                   # skip '' dan [UNK]
N = min(5000, max(0, vocab_eff - START))
idx = np.arange(START, START+N)

# Simpan vektor
np.savetxt("projector/word/vectors.tsv", E[idx], delimiter="\t")

# Simpan metadata **2 kolom + header** -> baris = N + 1
with open("projector/word/metadata.tsv", "w", encoding="utf-8") as f:
    f.write("token\trank\n")                # header WAJIB utk multi-kolom
    for r, i in enumerate(idx):
        tok = vocab[i].replace("\t"," ").replace("\n"," ")
        f.write(f"{tok}\t{r}\n")

# Verifikasi jumlah baris
vec_rows = sum(1 for _ in open("projector/word/vectors.tsv", "r", encoding="utf-8"))
meta_rows = sum(1 for _ in open("projector/word/metadata.tsv", "r", encoding="utf-8")) - 1  # minus header
print("vectors:", vec_rows, "metadata(data rows):", meta_rows)  # harus sama


vectors: 5000 metadata(data rows): 5000


- Hapus semua projector

In [64]:
rm -rf projector


- menyimpan teks validasi mentah (mis. val_texts dari tahap split awal), kbisa bikin metadata 2 kolom:

In [49]:
with open("projector/sent/metadata.tsv", "w", encoding="utf-8") as f:
    for lbl, txt in zip(y_vec, val_texts_list[:len(y_vec)]):
        txt = str(txt).replace("\t"," ").replace("\n"," ")
        f.write(f"{lbl}\t{txt}\n")

- eval & sample predictions
- ambil seluruh val untuk Report

# Cara menayangkan di web
* Buka https://projector.tensorflow.org/

- Klik Load → unggah berkas vectors.tsv dan metadata.tsv.
- Di panel kanan, pilih PCA atau UMAP untuk eksplorasi.
- Color by: label (untuk proyeksi kalimat/sentence).
- Label by: token (untuk proyeksi kata/word).

In [39]:
val_X = np.asarray(list(val_texts.map(vectorizer).as_numpy_iterator()))
val_y = labels[val_idx]
pred  = (model.predict(val_X, verbose=0).ravel() > 0.5).astype(int)

print(classification_report(val_y, pred, digits=3))
print(confusion_matrix(val_y, pred))
# ---- eval & sample predictions
# ambil seluruh val untuk Report

              precision    recall  f1-score   support

           0      0.855     0.839     0.847      1492
           1      0.801     0.819     0.810      1179

    accuracy                          0.830      2671
   macro avg      0.828     0.829     0.828      2671
weighted avg      0.831     0.830     0.831      2671

[[1252  240]
 [ 213  966]]


# simpan contoh

In [15]:
samples = [
    "Great, another delay…",
    "Thank you so much 🙏",
    "Yeah right, that meeting was super useful",
    "I absolutely love waiting in traffic for hours"
]
sample_vec = vectorizer(tf.constant(samples))
probs = model.predict(sample_vec, verbose=0).ravel()
for s, p in zip(samples, probs):
    print(f"{s} -> Sarcasm p={p:.2f}")
#!/usr/bin/env python3

Great, another delay… -> Sarcasm p=0.93
Thank you so much 🙏 -> Sarcasm p=0.21
Yeah right, that meeting was super useful -> Sarcasm p=0.09
I absolutely love waiting in traffic for hours -> Sarcasm p=0.12


# simpan artefak

In [17]:
from pathlib import Path
Path("artifacts").mkdir(exist_ok=True)
Path("figs").mkdir(exist_ok=True)

model.save("artifacts/sarcasm_embed_model.keras")

with open("figs/sample_preds.txt","w") as f:
    for s, p in zip(samples, probs):
        f.write(f"{s}\tSarcasm p={p:.2f}\n")
        
        