DATA PREPROCESSING

In [None]:
!pip -q install datasets==3.0.1 pandas==2.2.2 scikit-learn==1.5.2

from datasets import load_dataset, DatasetDict
import pandas as pd
import numpy as np
import json, os, random
from sklearn.model_selection import train_test_split

# Reproducibility: seed Python's and NumPy's global RNGs with the same value.
SEED = 42
random.seed(SEED); np.random.seed(SEED)

# Load the "simplified" configuration of GoEmotions.
# Two dataset IDs are noted as valid:
#  - "go_emotions" (HF canonical)
#  - "google-research-datasets/go_emotions" (mirror)
# The namespaced ID is the one used here.
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

# Peek at the loaded dataset object and the first training example.
ds, ds["train"][0]


README.md: 0.00B [00:00, ?B/s]

KeyboardInterrupt: 

In [None]:
# Emotion vocabulary exposed by the dataset (27 emotions + 'neutral').
label_names = ds["train"].features["labels"].feature.names
name_set = set(label_names)

# Hand-picked grouping of emotion names into three coarse sentiment buckets.
positive = {
    "admiration", "amusement", "approval", "caring", "desire", "excitement",
    "gratitude", "joy", "love", "optimism", "pride", "relief",
}
negative = {
    "anger", "annoyance", "disappointment", "disapproval", "disgust", "embarrassment",
    "fear", "grief", "nervousness", "remorse", "sadness", "confusion",
}
# These four are treated as neutral for sentiment purposes.
neutral = {"neutral", "realization", "curiosity", "surprise"}

# Every name used in a bucket must be a real label of the dataset.
assert all(bucket <= name_set for bucket in (positive, negative, neutral))

def multi_emotions_to_sentiment(label_ids):
    """Collapse a sample's emotion ids into one sentiment class.

    Args:
        label_ids: iterable of integer indices into ``label_names``.

    Returns:
        'positive', 'negative' or 'neutral' when exactly one bucket contains
        the largest number of the sample's distinct emotions; ``None`` when no
        emotion falls in any bucket or when the top count is shared (such rows
        are meant to be dropped to avoid noisy supervision).
    """
    present = {label_names[i] for i in label_ids}
    counts = {
        "positive": len(present & positive),
        "negative": len(present & negative),
        "neutral": len(present & neutral),
    }
    top = max(counts.values())
    if top == 0:
        return None
    # A tie for first place is ambiguous, so it is rejected rather than resolved.
    winners = [bucket for bucket, hits in counts.items() if hits == top]
    if len(winners) != 1:
        return None
    return winners[0]


In [None]:
# "simplified" has splits already; we’ll merge then do a fresh stratified 80/10/10 split.
def as_df(spl):
    """Return the split ``spl`` as a pandas DataFrame by calling its ``to_pandas()``.

    For the "simplified" config the frame is expected to carry ``text`` (str),
    ``labels`` (list of int) and ``id`` columns.
    """
    frame = spl.to_pandas()
    return frame

# Stack the three published splits into one frame with a fresh 0..n-1 index.
split_frames = [as_df(ds[split_name]) for split_name in ("train", "validation", "test")]
df_all = pd.concat(split_frames, ignore_index=True)

# Basic cleaning
def clean_text(s: str) -> str:
    """Return ``s`` with surrounding whitespace removed; falsy input (None, '') gives ''."""
    if not s:
        return ""
    return s.strip()

# Normalise the text column, then keep only rows whose text has at least 3 characters.
texts_clean = df_all["text"].astype(str).apply(clean_text)
df_all = df_all.assign(text=texts_clean)
df_all = df_all.loc[df_all["text"].str.len() >= 3]

# Map emotion ids to a 3-class sentiment; rows the mapper rejects (None) are dropped.
sentiment = df_all["labels"].apply(multi_emotions_to_sentiment)
df_all = df_all.assign(label=sentiment)
df_all = df_all.dropna(subset=["label"]).reset_index(drop=True)

# Remove repeated (text, label) pairs, keeping the first occurrence.
# NOTE(review): the same text with two different labels survives this step.
df_all = df_all.drop_duplicates(subset=["text", "label"]).reset_index(drop=True)

df_all.head(), df_all.shape


(                                                text labels       id     label
 0  My favourite food is anything I didn't have to...   [27]  eebbqej   neutral
 1  Now if he does off himself, everyone will thin...   [27]  ed00q6i   neutral
 2                     WHY THE FUCK IS BAYLESS ISOING    [2]  eezlygj  negative
 3                        To make her feel threatened   [14]  ed7ypvh  negative
 4                             Dirty Southern Wankers    [3]  ed0bdzj  negative,
 (50276, 4))

In [None]:
# Stratified 80/10/10 split: carve off 20% first, then halve that part into val and test.
text_label_df = df_all[["text", "label"]]
train_df, temp_df = train_test_split(
    text_label_df,
    test_size=0.20,
    stratify=text_label_df["label"],
    random_state=SEED,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=SEED,
)

def show_counts(df, name):
    """Print the number of rows in ``df`` and the absolute count of each ``label`` value.

    Args:
        df: frame with a ``label`` column.
        name: tag printed in front of the size line.
    """
    n_rows = len(df)
    print(f"{name} size = {n_rows}")
    per_class = df["label"].value_counts(normalize=False)
    print(per_class.to_string(), "\n")

# Report the class balance of each split, in train / val / test order.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    show_counts(split_df, split_name)


train size = 40220
label
neutral     15821
positive    15234
negative     9165 

val size = 5028
label
neutral     1978
positive    1905
negative    1145 

test size = 5028
label
neutral     1978
positive    1904
negative    1146 



In [None]:
# Create the output folders for the JSONL data and the label map (no error if present).
for out_dir in ("/content/data", "/content/models/tflite"):
    os.makedirs(out_dir, exist_ok=True)

def to_jsonl(df, path):
    """Write the ``text`` and ``label`` columns of ``df`` to ``path`` as JSON Lines.

    Each row becomes one UTF-8 line of the form
    ``{"text": ..., "label": ...}``; non-ASCII characters are written verbatim.
    Other columns are ignored.

    Args:
        df: frame holding at least ``text`` and ``label`` columns.
        path: destination file; overwritten if it exists.
    """
    with open(path, "w", encoding="utf-8") as f:
        # Walk the two columns directly rather than via iterrows(), which builds
        # a dtype-coerced Series for every row.
        for text, label in zip(df["text"], df["label"]):
            f.write(json.dumps({"text": text, "label": label}, ensure_ascii=False) + "\n")

# Persist the three splits as JSON Lines; later cells read them back from these paths.
to_jsonl(train_df, "/content/data/train.jsonl")
to_jsonl(val_df,   "/content/data/val.jsonl")
to_jsonl(test_df,  "/content/data/test.jsonl")

# Label map for downstream consumers: the list position is the class index.
label_map = {"labels": ["negative","neutral","positive"]}
with open("/content/models/tflite/label_map.json", "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

# Sanity check: line counts of the written files and the label-map file listing.
print("Saved:")
!wc -l /content/data/*.jsonl
!ls -lh /content/models/tflite/label_map.json


Saved:
   5028 /content/data/test.jsonl
  40220 /content/data/train.jsonl
   5028 /content/data/val.jsonl
  50276 total
-rw-r--r-- 1 root root 67 Nov 15 12:01 /content/models/tflite/label_map.json


FINE TUNE TEACHER


In [None]:
# ==== TEACHER FINE-TUNE (TF 2.19, T4) — force native TF weights ====
import os, json, math, random, numpy as np, tensorflow as tf
from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer, create_optimizer
from transformers import TFDistilBertForSequenceClassification, TFBertForSequenceClassification

# ------ config ------
DATA_DIR   = "/content/data"      # folder the train/val/test .jsonl files are read from
SAVE_DIR   = "/content/models"    # root folder for checkpoints and saved models
LABELS     = ["negative","neutral","positive"]  # list position is the integer class id
NUM_LABELS = len(LABELS)
MAX_LEN    = 128                  # default sequence length for make_ds (pad/truncate target)
BATCH_SIZE = 32
EPOCHS     = 3
SEED       = 42

# Seed the TensorFlow, NumPy and Python RNGs with the same value.
tf.random.set_seed(SEED); np.random.seed(SEED); random.seed(SEED)
# Turn on memory growth for every GPU TensorFlow can see.
for d in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(d, True)

# ------- data loaders -------
def _read_jsonl(path):
    """Yield ``(text, class_index)`` for each JSON line in ``path``.

    ``class_index`` is the position of the record's ``label`` in ``LABELS``;
    an unknown label raises ``ValueError``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            yield record["text"], LABELS.index(record["label"])

def _count_lines(p):
    import subprocess, shlex
    return int(subprocess.check_output(shlex.split(f"wc -l {p}")).decode().split()[0])

def make_ds(path, tokenizer, batch_size, shuffle=False, repeat=False, max_len=MAX_LEN):
    """Build a batched ``tf.data`` pipeline from a JSONL file.

    Each record is tokenized on the fly (truncated / padded to ``max_len``) and
    emitted as ``({"input_ids", "attention_mask"}, label)`` with int32 values.

    Args:
        path: JSONL file readable by ``_read_jsonl``.
        tokenizer: callable tokenizer applied to each text.
        batch_size: number of examples per batch.
        shuffle: shuffle with a 4096-element buffer, reshuffled every iteration.
        repeat: repeat the stream indefinitely.
        max_len: fixed token length of every example.

    Returns:
        The batched, prefetched dataset.
    """
    def gen():
        for text, label in _read_jsonl(path):
            enc = tokenizer(text, truncation=True, padding="max_length", max_length=max_len)
            features = {
                "input_ids": np.int32(enc["input_ids"]),
                "attention_mask": np.int32(enc["attention_mask"]),
            }
            yield features, np.int32(label)

    feature_spec = {
        "input_ids": tf.TensorSpec([max_len], tf.int32),
        "attention_mask": tf.TensorSpec([max_len], tf.int32),
    }
    label_spec = tf.TensorSpec([], tf.int32)
    pipeline = tf.data.Dataset.from_generator(gen, output_signature=(feature_spec, label_spec))

    # Order matters: shuffle individual examples, then repeat, then batch.
    if shuffle:
        pipeline = pipeline.shuffle(4096, seed=SEED, reshuffle_each_iteration=True)
    if repeat:
        pipeline = pipeline.repeat()
    return pipeline.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Locations of the three JSONL splits.
train_p = f"{DATA_DIR}/train.jsonl"
val_p = f"{DATA_DIR}/val.jsonl"
test_p = f"{DATA_DIR}/test.jsonl"

# Record counts per split, used to size one pass over the repeating datasets.
n_train = _count_lines(train_p)
n_val = _count_lines(val_p)
n_test = _count_lines(test_p)

steps_per_epoch = math.ceil(n_train / BATCH_SIZE)
val_steps = math.ceil(n_val / BATCH_SIZE)

# ------- try TF DistilBERT first (native TF weights), then TF BERT as fallback -------
def load_teacher_and_tokenizer():
    """Load the teacher classifier and its tokenizer.

    ``distilbert-base-uncased`` is tried first; if anything in that attempt
    raises, the error is printed and ``bert-base-uncased`` is loaded instead
    (an error in the fallback propagates). Both loads pass ``from_pt=False``
    and ``use_safetensors=False``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    try:
        print("Trying TF DistilBERT (native TF weights)…")
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=NUM_LABELS,
            from_pt=False,          # do not convert from PyTorch weights
            use_safetensors=False,  # force tf_model.h5
        )
    except Exception as e1:
        print("DistilBERT load failed →", repr(e1))
        print("Trying TF BERT-base (native TF weights)…")
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=NUM_LABELS,
            from_pt=False,
            use_safetensors=False,
        )
    return model, tokenizer

teacher, tok = load_teacher_and_tokenizer()

# ------- datasets -------
# Train and val repeat forever (training is bounded by step counts); test is a single pass.
train_ds = make_ds(train_p, tok, BATCH_SIZE, shuffle=True, repeat=True)
val_ds = make_ds(val_p, tok, BATCH_SIZE, shuffle=False, repeat=True)
test_ds = make_ds(test_p, tok, BATCH_SIZE, shuffle=False, repeat=False)

# class weights (inverse frequency): weight_c = total / (NUM_LABELS * count_c)
counts = dict.fromkeys(range(NUM_LABELS), 0)
for _text, label_idx in _read_jsonl(train_p):
    counts[label_idx] += 1
tot = sum(counts.values())
class_weight = {cls: tot / (NUM_LABELS * n_cls) for cls, n_cls in counts.items()}
rounded_weights = {k: round(v, 2) for k, v in class_weight.items()}
print("Train counts:", counts, " → class_weight:", rounded_weights)

# ------- compile & train -------
# Total optimizer steps; this also sizes the warmup/decay schedule built below.
num_train_steps = steps_per_epoch * EPOCHS
opt, _ = create_optimizer(init_lr=3e-5,
                          num_warmup_steps=int(0.1*num_train_steps),  # 10% of steps are warmup
                          num_train_steps=num_train_steps)

# The model returns raw logits, hence from_logits=True; labels are integer class ids.
teacher.compile(
    optimizer=opt,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

ckpt_dir = f"{SAVE_DIR}/teacher_ckpt"; os.makedirs(ckpt_dir, exist_ok=True)
# Keep the best weights by validation accuracy and stop once it stops improving.
# ReduceLROnPlateau is deliberately left out: the learning rate is already driven by the
# schedule returned from create_optimizer, which a plateau callback cannot override.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(f"{ckpt_dir}/best",
        save_weights_only=True, save_best_only=True, monitor="val_acc", mode="max"),
    tf.keras.callbacks.EarlyStopping(monitor="val_acc", mode="max", patience=2, restore_best_weights=True),
]

# train_ds / val_ds repeat forever, so each epoch is bounded by explicit step counts.
history = teacher.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_steps=val_steps,
    class_weight=class_weight,
    callbacks=callbacks,  # fix: the list used to be built but never handed to fit()
    verbose=1,
)

# ------- evaluate (acc + macro-F1) -------
# Keras-side loss and accuracy on the single-pass test dataset.
eval_res = teacher.evaluate(test_ds, return_dict=True, verbose=1)
print("\nKeras eval:", eval_res)

# Second pass over the test set to collect per-example predictions for sklearn metrics.
y_true, y_pred = [], []
for x, y in test_ds:
    logits = teacher(x, training=False).logits.numpy()
    y_true.extend(y.numpy().tolist())
    y_pred.extend(np.argmax(logits, axis=-1).tolist())  # predicted class = highest logit
macro_f1 = f1_score(y_true, y_pred, average="macro")
print(f"Macro-F1: {macro_f1:.4f}\n")
print(classification_report(y_true, y_pred, target_names=LABELS, digits=4))

# ------- save for distillation -------
# The distillation cell loads the teacher and tokenizer back from these two folders.
save_teacher = f"{SAVE_DIR}/teacher_saved"; save_tok = f"{SAVE_DIR}/tokenizer_teacher"
teacher.save_pretrained(save_teacher); tok.save_pretrained(save_tok)
print(f"Saved teacher → {save_teacher}\nSaved tokenizer → {save_tok}")


Trying TF DistilBERT (native TF weights)…


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-

Train counts: {0: 9165, 1: 15821, 2: 15234}  → class_weight: {0: 1.46, 1: 0.85, 2: 0.88}
Epoch 1/3
Epoch 2/3
Epoch 3/3

Keras eval: {'loss': 0.6538585424423218, 'acc': 0.7498010993003845}
Macro-F1: 0.7415

              precision    recall  f1-score   support

    negative     0.6349    0.7330    0.6804      1146
     neutral     0.7510    0.6815    0.7146      1978
    positive     0.8283    0.8309    0.8296      1904

    accuracy                         0.7498      5028
   macro avg     0.7381    0.7485    0.7415      5028
weighted avg     0.7538    0.7498    0.7503      5028

Saved teacher → /content/models/teacher_saved
Saved tokenizer → /content/models/tokenizer_teacher


DISTILL TO STUDENT (TINY)

In [None]:
# ==== IMPROVED DISTILLATION → Tiny (TinyBERT 4L) with KD → CE schedule (TF 2.19) ====
import os, json, math, random, subprocess, shlex
import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer
from tqdm.auto import tqdm  # for pretty progress bars

# ---------------- cfg/paths ----------------
DATA_DIR    = "/content/data"
SAVE_DIR    = "/content/models"
TEACHER_DIR = f"{SAVE_DIR}/teacher_saved"          # written by the teacher fine-tune step
TOK_DIR     = f"{SAVE_DIR}/tokenizer_teacher"      # the teacher's tokenizer is reused for the student
STUDENT_CKPT= "huawei-noah/TinyBERT_General_4L_312D"  # the model-loading code falls back to prajjwal1/bert-tiny

LABELS      = ["negative","neutral","positive"]   # list position is the integer class id
NUM_LABELS  = len(LABELS)
MAX_LEN     = 128   # fixed token length (pad/truncate target)
BATCH       = 32
EPOCHS_KD   = 7     # maximum KD epochs (the KD loop may stop earlier)
EPOCHS_CE   = 2     # CE fine-tune epochs
SEED        = 42

# KD hyperparams
TEMP  = 4.0    # softmax temperature applied to teacher and student logits in the soft term
ALPHA = 0.30   # weight on hard CE
BETA  = 0.70   # weight on soft KL
LABEL_SMOOTH = 0.05  # manual label smoothing applied to the one-hot targets

# Seed the TensorFlow, NumPy and Python RNGs; enable memory growth on every visible GPU.
tf.random.set_seed(SEED); np.random.seed(SEED); random.seed(SEED)
for d in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(d, True)

# ---------------- data utils ----------------
# One tokenizer, loaded from the teacher's saved folder, is shared by teacher and student inputs.
tok = AutoTokenizer.from_pretrained(TOK_DIR, use_fast=True)

def read_jsonl(path):
    """Yield ``(text, class_index)`` pairs from the JSON Lines file at ``path``.

    The class index is the position of the record's ``label`` within ``LABELS``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            label_idx = LABELS.index(record["label"])
            yield record["text"], label_idx

def make_ds(path, batch_size, shuffle=False, repeat=False):
    """Build a batched ``tf.data`` pipeline over a JSONL split using the global ``tok``.

    Every text is truncated / padded to ``MAX_LEN`` tokens and emitted as
    ``({"input_ids", "attention_mask"}, label)`` with int32 values.

    Args:
        path: JSONL file readable by ``read_jsonl``.
        batch_size: number of examples per batch.
        shuffle: shuffle with a 4096-element buffer, reshuffled every iteration.
        repeat: repeat the stream indefinitely.

    Returns:
        The batched, prefetched dataset.
    """
    def gen():
        for text, label in read_jsonl(path):
            enc = tok(text, truncation=True, padding="max_length", max_length=MAX_LEN)
            features = {
                "input_ids": np.int32(enc["input_ids"]),
                "attention_mask": np.int32(enc["attention_mask"]),
            }
            yield features, np.int32(label)

    signature = (
        {
            "input_ids": tf.TensorSpec([MAX_LEN], tf.int32),
            "attention_mask": tf.TensorSpec([MAX_LEN], tf.int32),
        },
        tf.TensorSpec([], tf.int32),
    )
    pipeline = tf.data.Dataset.from_generator(gen, output_signature=signature)

    # Order matters: shuffle individual examples, then repeat, then batch.
    if shuffle:
        pipeline = pipeline.shuffle(4096, seed=SEED, reshuffle_each_iteration=True)
    if repeat:
        pipeline = pipeline.repeat()
    return pipeline.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Locations of the three JSONL splits under DATA_DIR.
train_p, val_p, test_p = (
    f"{DATA_DIR}/train.jsonl",
    f"{DATA_DIR}/val.jsonl",
    f"{DATA_DIR}/test.jsonl",
)

def n_lines(p):
    """Return the number of lines in the file at ``p``.

    Implemented in pure Python rather than via ``wc -l`` so that paths with
    spaces work, no external binary is required, and a last line without a
    trailing newline is still counted.

    Args:
        p: path of the file to count.

    Returns:
        int: number of lines (0 for an empty file).
    """
    # Binary mode: only newline bytes matter, so no decoding is needed.
    with open(p, "rb") as fh:
        return sum(1 for _ in fh)

# Record counts per split; steps / valsteps size one pass over the repeating datasets.
ntr, nva, nte = n_lines(train_p), n_lines(val_p), n_lines(test_p)
steps    = math.ceil(ntr / BATCH)
valsteps = math.ceil(nva / BATCH)

# Train and val repeat forever (the loops below use .take(...)); test is a single pass.
train_ds = make_ds(train_p, BATCH, shuffle=True,  repeat=True)
val_ds   = make_ds(val_p,   BATCH, shuffle=False, repeat=True)
test_ds  = make_ds(test_p,  BATCH, shuffle=False, repeat=False)

# ---------------- class weights (optional; inverse frequency) ----------------
# weight_c = total / (NUM_LABELS * count_c). Only printed here; as the message says,
# the weights are not applied in the loss.
cnt = {i: 0 for i in range(NUM_LABELS)}
for _, y in read_jsonl(train_p):
    cnt[y] += 1
tot = sum(cnt.values())
class_weight = {i: tot / (NUM_LABELS * cnt[i]) for i in range(NUM_LABELS)}
print("Class weight (computed, not yet applied in loss):",
      {k: round(v, 2) for k, v in class_weight.items()})

# ---------------- models ----------------
# Teacher: the fine-tuned classifier saved by the previous step; marked non-trainable
# because it only supplies soft targets.
teacher = TFAutoModelForSequenceClassification.from_pretrained(
    TEACHER_DIR, num_labels=NUM_LABELS
)
teacher.trainable = False

# Student: TinyBERT converted from its PyTorch weights (from_pt=True). If loading fails
# for any reason, the error is printed and prajjwal1/bert-tiny is loaded instead.
try:
    student = TFAutoModelForSequenceClassification.from_pretrained(
        STUDENT_CKPT,
        num_labels=NUM_LABELS,
        from_pt=True,
        use_safetensors=False
    )
except Exception as e:
    print("TinyBERT failed, fallback to prajjwal1/bert-tiny →", repr(e))
    student = TFAutoModelForSequenceClassification.from_pretrained(
        "prajjwal1/bert-tiny",
        num_labels=NUM_LABELS,
        from_pt=True,
        use_safetensors=False
    )

# ---------------- losses/optimizers/metrics ----------------
# KL divergence with its default reduction, so calling it yields a scalar loss.
kld = tf.keras.losses.KLDivergence()

# Running accuracies, reset by the training loops at the start of each epoch.
train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name="train_acc")
val_acc   = tf.keras.metrics.SparseCategoricalAccuracy(name="val_acc")

# KD-stage optimizer with a warmup-then-decay learning-rate schedule from create_optimizer
# (the transformers docs describe a linear decay, not a cosine one). The schedule is sized
# for all EPOCHS_KD epochs (10% warmup), even if the KD loop stops early.
opt_kd, _ = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=int(0.1 * steps * EPOCHS_KD),
    num_train_steps=steps * EPOCHS_KD,
)
opt_kd.clipnorm = 1.0

# CE fine-tune optimizer: plain Adam at a constant, smaller learning rate.
opt_ce = tf.keras.optimizers.Adam(1e-5)
opt_ce.clipnorm = 1.0

# ---------------- train/eval steps ----------------
@tf.function
def kd_train_step(x, y):
    """Run one knowledge-distillation update of the student and return the batch loss.

    loss = ALPHA * mean(CE(smoothed one-hot labels, student logits))
         + BETA  * TEMP**2 * KL(softmax(teacher/TEMP) || softmax(student/TEMP))

    Args:
        x: model inputs for the batch.
        y: integer class ids for the batch.

    Returns:
        Scalar loss tensor. Also updates ``train_acc`` from the un-tempered student logits.
    """
    # Manually smoothed one-hot targets for the hard-label term.
    smooth_targets = tf.one_hot(y, NUM_LABELS) * (1.0 - LABEL_SMOOTH) + LABEL_SMOOTH / NUM_LABELS

    with tf.GradientTape() as tape:
        teacher_soft_logits = teacher(x, training=False).logits / TEMP
        student_logits = student(x, training=True).logits

        # Hard-label term on the raw student logits.
        ce_per_example = tf.keras.losses.categorical_crossentropy(
            smooth_targets, student_logits, from_logits=True
        )

        # Soft term on temperature-softened distributions, rescaled by TEMP**2.
        teacher_probs = tf.nn.softmax(teacher_soft_logits, axis=-1)
        student_probs = tf.nn.softmax(student_logits / TEMP, axis=-1)
        distill_term = kld(teacher_probs, student_probs) * (TEMP ** 2)

        loss = ALPHA * tf.reduce_mean(ce_per_example) + BETA * distill_term

    student_vars = student.trainable_variables
    grads = tape.gradient(loss, student_vars)
    opt_kd.apply_gradients(zip(grads, student_vars))
    train_acc.update_state(y, student_logits)
    return loss

@tf.function
def ce_train_step(x, y):
    """Run one cross-entropy-only update of the student and return the batch loss.

    Targets are one-hot labels smoothed by ``LABEL_SMOOTH``; the update uses
    ``opt_ce`` and ``train_acc`` is updated with the student logits.
    """
    smooth_targets = tf.one_hot(y, NUM_LABELS) * (1.0 - LABEL_SMOOTH) + LABEL_SMOOTH / NUM_LABELS

    with tf.GradientTape() as tape:
        student_logits = student(x, training=True).logits
        per_example = tf.keras.losses.categorical_crossentropy(
            smooth_targets, student_logits, from_logits=True
        )
        loss = tf.reduce_mean(per_example)

    student_vars = student.trainable_variables
    grads = tape.gradient(loss, student_vars)
    opt_ce.apply_gradients(zip(grads, student_vars))
    train_acc.update_state(y, student_logits)
    return loss

@tf.function
def ce_val_step(x, y):
    """Evaluate the student on one batch without updating it.

    Returns the mean sparse cross-entropy against the hard labels (no label
    smoothing, no distillation term) and updates ``val_acc``.
    """
    student_logits = student(x, training=False).logits
    per_example = tf.keras.losses.sparse_categorical_crossentropy(
        y, student_logits, from_logits=True
    )
    loss = tf.reduce_mean(per_example)
    val_acc.update_state(y, student_logits)
    return loss

# ---------------- KD stage ----------------
# Early stopping on validation macro-F1: stop after `patience` consecutive epochs without
# improvement; `bad` counts those epochs.
best_f1, patience, bad = -1.0, 2, 0

print("\n===== KD STAGE =====")
for ep in range(EPOCHS_KD):
    train_acc.reset_state()
    val_acc.reset_state()

    t_loss_sum = 0.0
    # TRAIN with progress bar; train_ds repeats forever, so take exactly one epoch of batches.
    train_iter = train_ds.take(steps)
    pbar = tqdm(train_iter, total=steps, desc=f"KD Epoch {ep+1}/{EPOCHS_KD} [train]")
    for batch in pbar:
        x, y = batch
        loss = float(kd_train_step(x, y))
        t_loss_sum += loss
        pbar.set_postfix(
            loss=f"{loss:.4f}",
            acc=f"{train_acc.result():.4f}"
        )

    # VAL loop (loss + acc). The loss here is hard-label CE from ce_val_step, not the
    # KD objective, so it is not directly comparable with the training loss.
    v_loss_sum = 0.0
    val_iter = val_ds.take(valsteps)
    pbar_val = tqdm(val_iter, total=valsteps, desc=f"KD Epoch {ep+1}/{EPOCHS_KD} [val]")
    for batch in pbar_val:
        x, y = batch
        v_loss = float(ce_val_step(x, y))
        v_loss_sum += v_loss
        pbar_val.set_postfix(
            loss=f"{v_loss:.4f}",
            acc=f"{val_acc.result():.4f}"
        )

    # compute validation macro-F1 precisely on full val set (a fresh, non-repeating pass)
    y_true, y_pred = [], []
    for x, y in make_ds(val_p, BATCH, shuffle=False, repeat=False):
        logits = student(x, training=False).logits.numpy()
        y_true.extend(y.numpy().tolist())
        y_pred.extend(np.argmax(logits, axis=-1).tolist())
    val_f1 = f1_score(y_true, y_pred, average="macro")

    # Per-epoch summary; the loss figures are per-batch means.
    print(
        f"KD Epoch {ep+1}/{EPOCHS_KD} | "
        f"train_loss={t_loss_sum/steps:.4f} | train_acc={train_acc.result():.4f} | "
        f"val_loss={v_loss_sum/valsteps:.4f} | val_acc={val_acc.result():.4f} | "
        f"val_f1={val_f1:.4f}"
    )

    # Checkpoint on a new best F1; otherwise count towards early stopping.
    if val_f1 > best_f1:
        best_f1 = val_f1
        bad = 0
        student.save_pretrained(f"{SAVE_DIR}/student_tf_best")
        print(f"  ↳ New best KD F1: {best_f1:.4f} (checkpoint saved)")
    else:
        bad += 1
        print(f"  ↳ No improvement. bad={bad}/{patience}")
        if bad >= patience:
            print("Early stop (KD stage).")
            break

# Reload best KD checkpoint before CE fine-tuning.
# `student` is rebound to a new model object here; ce_train_step is first called after
# this point, so it operates on the reloaded model.
print("\nLoading best KD checkpoint before CE fine-tuning...")
student = TFAutoModelForSequenceClassification.from_pretrained(
    f"{SAVE_DIR}/student_tf_best", num_labels=NUM_LABELS
)

# ---------------- CE fine-tune stage ----------------
# Fixed number of epochs (no early stopping); cross-entropy on smoothed hard labels only.
print("\n===== CE FINE-TUNE STAGE =====")
for ep in range(EPOCHS_CE):
    train_acc.reset_state()
    c_loss_sum = 0.0

    # One epoch worth of batches from the repeating training dataset.
    train_iter = train_ds.take(steps)
    pbar = tqdm(train_iter, total=steps, desc=f"CE Epoch {ep+1}/{EPOCHS_CE} [train]")
    for batch in pbar:
        x, y = batch
        loss = float(ce_train_step(x, y))
        c_loss_sum += loss
        pbar.set_postfix(
            loss=f"{loss:.4f}",
            acc=f"{train_acc.result():.4f}"
        )

    # validation macro-F1 on full val set
    y_true, y_pred = [], []
    for x, y in make_ds(val_p, BATCH, shuffle=False, repeat=False):
        logits = student(x, training=False).logits.numpy()
        y_true.extend(y.numpy().tolist())
        y_pred.extend(np.argmax(logits, axis=-1).tolist())
    val_f1 = f1_score(y_true, y_pred, average="macro")

    print(
        f"CE Epoch {ep+1}/{EPOCHS_CE} | "
        f"train_loss={c_loss_sum/steps:.4f} | train_acc={train_acc.result():.4f} | "
        f"val_f1={val_f1:.4f}"
    )

    # best_f1 carries over from the KD stage, so the checkpoint on disk is overwritten
    # only when this epoch beats the best F1 seen in either stage.
    if val_f1 > best_f1:
        best_f1 = val_f1
        student.save_pretrained(f"{SAVE_DIR}/student_tf_best")
        print(f"  ↳ New best overall F1 after CE: {best_f1:.4f} (checkpoint saved)")

# ---------------- final test eval ----------------
# Evaluate the best checkpoint on disk, which may differ from the in-memory `student`
# if the last CE epoch did not improve validation F1.
print("\n===== FINAL TEST EVAL =====")
best = TFAutoModelForSequenceClassification.from_pretrained(
    f"{SAVE_DIR}/student_tf_best", num_labels=NUM_LABELS
)

# Single pass over the test split, collecting argmax predictions for macro-F1.
y_true, y_pred = [], []
for x, y in test_ds:
    logits = best(x, training=False).logits.numpy()
    y_true.extend(y.numpy().tolist())
    y_pred.extend(np.argmax(logits, axis=-1).tolist())
test_f1 = f1_score(y_true, y_pred, average="macro")
print(f"\nFINAL Student Macro-F1 (best ckpt): {test_f1:.4f}")

# ---------------- save student + tokenizer ----------------
# The quantization cell loads the student and tokenizer from these two folders.
best.save_pretrained(f"{SAVE_DIR}/student_tf")
tok.save_pretrained(f"{SAVE_DIR}/tokenizer_student")
print("Saved student →", f"{SAVE_DIR}/student_tf")
print("Saved tokenizer →", f"{SAVE_DIR}/tokenizer_student")


Class weight (computed, not yet applied in loss): {0: 1.46, 1: 0.85, 2: 0.88}


Some layers from the model checkpoint at /content/models/teacher_saved were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/models/teacher_saved and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['fit_denses.1.weight', 'fit_denses.3.bias', 'fit_denses.2.weight', 'fit_denses.4.weight', 'fit_denses.0.bias', 'fit_denses.1.bias', 'fit_denses.3.weight', 'fit_denses.0.weight', 'fit_denses.2.bias', 'fit_denses.4.bias']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classif


===== KD STAGE =====


KD Epoch 1/7 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

KD Epoch 1/7 [val]:   0%|          | 0/158 [00:00<?, ?it/s]

KD Epoch 1/7 | train_loss=0.9493 | train_acc=0.6581 | val_loss=0.6432 | val_acc=0.7504 | val_f1=0.7437
  ↳ New best KD F1: 0.7437 (checkpoint saved)


KD Epoch 2/7 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

KD Epoch 2/7 [val]:   0%|          | 0/158 [00:00<?, ?it/s]

KD Epoch 2/7 | train_loss=0.4746 | train_acc=0.7547 | val_loss=0.6435 | val_acc=0.7480 | val_f1=0.7405
  ↳ No improvement. bad=1/2


KD Epoch 3/7 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

KD Epoch 3/7 [val]:   0%|          | 0/158 [00:00<?, ?it/s]

KD Epoch 3/7 | train_loss=0.4009 | train_acc=0.7744 | val_loss=0.6366 | val_acc=0.7605 | val_f1=0.7529
  ↳ New best KD F1: 0.7529 (checkpoint saved)


KD Epoch 4/7 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

KD Epoch 4/7 [val]:   0%|          | 0/158 [00:00<?, ?it/s]

KD Epoch 4/7 | train_loss=0.3537 | train_acc=0.7905 | val_loss=0.6675 | val_acc=0.7468 | val_f1=0.7386
  ↳ No improvement. bad=1/2


KD Epoch 5/7 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

KD Epoch 5/7 [val]:   0%|          | 0/158 [00:00<?, ?it/s]

KD Epoch 5/7 | train_loss=0.3244 | train_acc=0.8022 | val_loss=0.6470 | val_acc=0.7559 | val_f1=0.7484
  ↳ No improvement. bad=2/2
Early stop (KD stage).

Loading best KD checkpoint before CE fine-tuning...


Some layers from the model checkpoint at /content/models/student_tf_best were not used when initializing TFBertForSequenceClassification: ['dropout_53']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/models/student_tf_best.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.



===== CE FINE-TUNE STAGE =====


CE Epoch 1/2 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

CE Epoch 1/2 | train_loss=0.5760 | train_acc=0.7948 | val_f1=0.7530
  ↳ New best overall F1 after CE: 0.7530 (checkpoint saved)


CE Epoch 2/2 [train]:   0%|          | 0/1257 [00:00<?, ?it/s]

CE Epoch 2/2 | train_loss=0.5606 | train_acc=0.8026 | val_f1=0.7498

===== FINAL TEST EVAL =====


Some layers from the model checkpoint at /content/models/student_tf_best were not used when initializing TFBertForSequenceClassification: ['dropout_67']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/models/student_tf_best.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.



FINAL Student Macro-F1 (best ckpt): 0.7452
Saved student → /content/models/student_tf
Saved tokenizer → /content/models/tokenizer_student


QUANTIZE TO INT8 TFLITE

In [None]:
# ==== INT8 model with FLOAT32 output (SAFE for Flutter) ====
import os, json, numpy as np, tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

SAVE_DIR = "/content/models"
STU_DIR  = f"{SAVE_DIR}/student_tf"          # student saved by the distillation step
TOK_DIR  = f"{SAVE_DIR}/tokenizer_student"   # tokenizer saved next to the student
OUT_DIR  = f"{SAVE_DIR}/tflite_fpout"        # destination for the .tflite file and label map
MAX_LEN  = 128                               # fixed sequence length of the exported signature

os.makedirs(OUT_DIR, exist_ok=True)

# Load student + tokenizer
model = TFAutoModelForSequenceClassification.from_pretrained(STU_DIR)
tok   = AutoTokenizer.from_pretrained(TOK_DIR)

@tf.function(
    input_signature=[
        tf.TensorSpec([None, MAX_LEN], tf.int32, name="input_ids"),
        tf.TensorSpec([None, MAX_LEN], tf.int32, name="attention_mask"),
    ]
)
def serving(input_ids, attention_mask):
    """Inference entry point used for export.

    Takes int32 ``input_ids`` and ``attention_mask`` of shape ``[batch, MAX_LEN]``
    and returns ``{"logits": ...}`` from the student run in inference mode.
    """
    features = {"input_ids": input_ids, "attention_mask": attention_mask}
    outputs = model(features, training=False)
    return {"logits": outputs.logits}

# Materialise the traced graph for the fixed input signature declared on `serving`.
concrete = serving.get_concrete_function()

# Representative dataset (INT8 calibration)
# Ten short hand-written sentences; rep_ds tokenizes them for the converter.
# NOTE(review): this is a very small calibration sample — consider drawing from the training data.
calib_texts = [
    "I feel okay today.", "I'm very sad.", "Hopeful and calm.",
    "Everything is pointless.", "Nervous about tomorrow.",
    "Not bad, just tired.", "Angry and frustrated.",
    "Feeling nothing.", "Grateful for the help.", "This is overwhelming."
]

def rep_ds():
    """Representative-dataset generator used for INT8 calibration.

    Yields, for every sentence in `calib_texts`, a two-element list
    ``[input_ids, attention_mask]`` of int32 NumPy arrays, tokenized with the
    global `tok` and padded/truncated to MAX_LEN.
    """
    for sentence in calib_texts:
        encoded = tok(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="np",
        )
        ids = encoded["input_ids"].astype(np.int32)
        mask = encoded["attention_mask"].astype(np.int32)
        # Order matters: ids first, mask second, as in the serving signature.
        yield [ids, mask]

# TFLite conversion: default optimizations plus a representative dataset,
# so the converter can calibrate from the sentences yielded by `rep_ds`.
converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete])
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = rep_ds

# Allow int8 kernels but also fall back to the regular builtins where needed.
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
    tf.lite.OpsSet.TFLITE_BUILTINS,
]

# inference_input_type is deliberately left unset: the inputs are int32 token
# ids and int32 is not an accepted value for that option.

# Output stays float32 (explicit here; it is also the default), so the
# consumer reads ordinary float logits.
converter.inference_output_type = tf.float32

tflite_bytes = converter.convert()

# Write the flatbuffer to disk.
out_path = f"{OUT_DIR}/student_quant_fpout.tflite"
with open(out_path, "wb") as f:
    f.write(tflite_bytes)

# Save label map: list position corresponds to the logit index.
label_map = {"labels": ["negative", "neutral", "positive"]}
with open(f"{OUT_DIR}/label_map.json", "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

print("Saved:", out_path)
!ls -lh {OUT_DIR}


Some layers from the model checkpoint at /content/models/student_tf were not used when initializing TFBertForSequenceClassification: ['dropout_81']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/models/student_tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Saved: /content/models/tflite_fpout/student_quant_fpout.tflite
total 14M
-rw-r--r-- 1 root root  67 Nov 15 13:11 label_map.json
-rw-r--r-- 1 root root 14M Nov 15 13:11 student_quant_fpout.tflite


In [None]:
from google.colab import files
# Opens the Colab upload dialog and stores the result in `uploaded`.
# NOTE(review): the cells visible after this one read the model, tokenizer and
# label map from fixed /content/models/... paths and never reference
# `uploaded` - confirm whether this upload step is still needed.
uploaded = files.upload()   # upload your tflite + tokenizer + label_map.json


In [None]:
# ==== Inference: send text -> get sentiment from student_quant_fpout.tflite ====
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer

# Artifacts produced by the INT8 export cell and the tokenizer saved in training.
TFLITE_PATH    = "/content/models/tflite_fpout/student_quant_fpout.tflite"
TOKENIZER_PATH = "/content/models/tokenizer_student"
MAX_LEN        = 128   # must match the sequence length the model was exported with
LABELS         = ["negative", "neutral", "positive"]   # position == logit index

# 1) load tokenizer
tok = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# 2) load interpreter
interpreter = tf.lite.Interpreter(model_path=TFLITE_PATH, num_threads=4)
interpreter.allocate_tensors()

# Tensor metadata (name, index, shape, dtype) used by predict_sentiment below.
input_details  = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print("Input details:", input_details)
print("Output details:", output_details)

def softmax(logits):
    """Numerically stable softmax over the last axis.

    Args:
        logits: array-like of raw scores; converted to float32.

    Returns:
        float32 NumPy array of the same shape whose last axis sums to 1.
    """
    shifted = np.asarray(logits, dtype=np.float32)
    # Subtracting the row maximum keeps exp() from overflowing.
    shifted = shifted - shifted.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)

def predict_sentiment(text: str):
    """Classify one text with the loaded TFLite interpreter.

    Tokenizes `text` to MAX_LEN, feeds it to the global `interpreter`,
    prints the predicted label and class probabilities, and returns them.

    Args:
        text: the sentence to classify.

    Returns:
        dict with keys "text", "label", "logits" (list of 3 floats) and
        "probabilities" (dict keyed by "negative"/"neutral"/"positive").
    """
    def _input_detail(tensor_name, fallback_pos):
        # The converter does not guarantee the order of model inputs (another
        # export in this notebook lists attention_mask before input_ids), so
        # resolve each input by name and only fall back to its position.
        for detail in input_details:
            if tensor_name in detail["name"]:
                return detail
        return input_details[fallback_pos]

    ids_detail  = _input_detail("input_ids", 0)
    mask_detail = _input_detail("attention_mask", 1)

    # 1) tokenize
    enc = tok(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="np",
    )

    # cast to whatever the model expects (usually int32)
    input_ids = enc["input_ids"].astype(ids_detail["dtype"])
    mask      = enc["attention_mask"].astype(mask_detail["dtype"])

    # 2) set tensors
    interpreter.set_tensor(ids_detail["index"], input_ids)
    interpreter.set_tensor(mask_detail["index"], mask)

    # 3) run
    interpreter.invoke()

    # 4) get logits (float32, because fpout); [0] drops the batch axis
    logits = interpreter.get_tensor(output_details[0]["index"])[0]  # shape (3,)
    probs  = softmax(logits)                                        # shape (3,)

    pred_id    = int(np.argmax(probs))
    pred_label = LABELS[pred_id]

    print(f"\nText: {text}")
    print(f"Predicted label: {pred_label}")
    print(
        "Probabilities: "
        f"neg={probs[0]*100:.2f}%, "
        f"neu={probs[1]*100:.2f}%, "
        f"pos={probs[2]*100:.2f}%"
    )

    return {
        "text": text,
        "label": pred_label,
        "logits": logits.tolist(),
        "probabilities": {
            "negative": float(probs[0]),
            "neutral":  float(probs[1]),
            "positive": float(probs[2]),
        },
    }


# quick sanity checks
# Each call prints its own summary. The last call is also the cell's final
# expression, so the notebook additionally displays its returned dict.
predict_sentiment("I feel a bit anxious but hopeful about the future.")
predict_sentiment("Everything is terrible and I don't see a way out.")
predict_sentiment("I feel calm, supported, and grateful today.")



Input details: [{'name': 'input_ids', 'index': 0, 'shape': array([  1, 128], dtype=int32), 'shape_signature': array([ -1, 128], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'attention_mask', 'index': 1, 'shape': array([  1, 128], dtype=int32), 'shape_signature': array([ -1, 128], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
Output details: [{'name': 'Identity', 'index': 485, 'shape': array([1, 3], dtype=int32), 'shape_signature': array([-1,  3], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=in

{'text': 'I feel calm, supported, and grateful today.',
 'label': 'neutral',
 'logits': [0.4523107409477234, 1.4159293174743652, -1.5732548236846924],
 'probabilities': {'negative': 0.2664475739002228,
  'neutral': 0.6984028816223145,
  'positive': 0.035149555653333664}}

In [None]:
import os
import shutil
import zipfile

BASE = "/content/models"
OUT_ZIP = "/content/sentiment_model_bundle.zip"

# Paths of the artifacts to ship: quantized model, label map, tokenizer folder.
TFLITE_PATH = f"{BASE}/tflite_fpout/student_quant_fpout.tflite"
LABEL_PATH  = f"{BASE}/tflite_fpout/label_map.json"
TOKENIZER_DIR = f"{BASE}/tokenizer_student"

# Make bundle directory.
# Any previous bundle is removed first so re-running the cell starts clean
# (copytree below would otherwise fail on an existing destination).
BUNDLE_DIR = "/content/model_bundle"
shutil.rmtree(BUNDLE_DIR, ignore_errors=True)
os.makedirs(BUNDLE_DIR, exist_ok=True)

# Copy required files
shutil.copy(TFLITE_PATH, f"{BUNDLE_DIR}/student_quant_fpout.tflite")
shutil.copy(LABEL_PATH,  f"{BUNDLE_DIR}/label_map.json")

# Copy tokenizer folder
shutil.copytree(TOKENIZER_DIR, f"{BUNDLE_DIR}/tokenizer_student")

# Create ZIP
# Every file under BUNDLE_DIR is stored with a path relative to BUNDLE_DIR, so
# the archive unpacks without the /content/model_bundle prefix.
# The walk tuple is unpacked into `filenames` (not `files`) so this cell does
# not rebind the `google.colab.files` module imported by an earlier cell.
with zipfile.ZipFile(OUT_ZIP, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, filenames in os.walk(BUNDLE_DIR):
        for filename in filenames:
            full_path = os.path.join(root, filename)
            rel_path  = os.path.relpath(full_path, BUNDLE_DIR)
            zipf.write(full_path, rel_path)

print("ZIP created at:", OUT_ZIP)

# Show file size
!ls -lh /content/sentiment_model_bundle.zip


ZIP created at: /content/sentiment_model_bundle.zip
-rw-r--r-- 1 root root 11M Nov 15 13:16 /content/sentiment_model_bundle.zip


In [None]:
# ==== FLOAT32 TFLite export (no quantization, faithful to TF) ====
import os, tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Same student model and tokenizer as the quantized export; only the output
# folder differs.
SAVE_DIR = "/content/models"
STU_DIR  = f"{SAVE_DIR}/student_tf"
TOK_DIR  = f"{SAVE_DIR}/tokenizer_student"
OUT_DIR  = f"{SAVE_DIR}/tflite_fp32"
MAX_LEN  = 128   # fixed sequence length of the exported input signature

os.makedirs(OUT_DIR, exist_ok=True)

# load fine-tuned student + tokenizer
model = TFAutoModelForSequenceClassification.from_pretrained(STU_DIR)
tok   = AutoTokenizer.from_pretrained(TOK_DIR)

@tf.function(input_signature=[
    tf.TensorSpec(shape=[None, MAX_LEN], dtype=tf.int32, name="input_ids"),
    tf.TensorSpec(shape=[None, MAX_LEN], dtype=tf.int32, name="attention_mask"),
])
def serving(input_ids, attention_mask):
    """Forward pass traced for the float32 TFLite export.

    Takes int32 `input_ids` and `attention_mask` of shape [batch, MAX_LEN],
    runs the global `model` with training disabled, and returns
    ``{"logits": <classifier logits>}``.
    """
    model_inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
    result = model(model_inputs, training=False)
    return {"logits": result.logits}

# Trace the serving function with its declared signature.
concrete = serving.get_concrete_function()

converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete])
# No optimizations are set on the converter: pure float32 export.
tflite_bytes = converter.convert()

# Write the flatbuffer to disk.
out_path = f"{OUT_DIR}/student_fp32.tflite"
with open(out_path, "wb") as f:
    f.write(tflite_bytes)

print("Saved float32 TFLite model:", out_path)
!ls -lh {OUT_DIR}


Some layers from the model checkpoint at /content/models/student_tf were not used when initializing TFBertForSequenceClassification: ['dropout_81']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/models/student_tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Saved float32 TFLite model: /content/models/tflite_fp32/student_fp32.tflite
total 55M
-rw-r--r-- 1 root root 55M Nov 15 13:26 student_fp32.tflite


In [None]:
# ==== Inference with student_fp32.tflite ====
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer

# Float32 export written by the previous cell, plus the student tokenizer.
TFLITE_PATH    = "/content/models/tflite_fp32/student_fp32.tflite"
TOKENIZER_PATH = "/content/models/tokenizer_student"
MAX_LEN        = 128   # must match the exported sequence length
LABELS         = ["negative","neutral","positive"]   # position == logit index

tok = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

interpreter = tf.lite.Interpreter(model_path=TFLITE_PATH, num_threads=4)
interpreter.allocate_tensors()

# Tensor metadata (name, index, shape, dtype) used by predict_sentiment below.
input_details  = interpreter.get_input_details()
output_details = interpreter.get_output_details()

print("Input details:", input_details)
print("Output details:", output_details)

def softmax(x):
    """Stable softmax along the last axis.

    `x` is converted to float32; the per-row maximum is subtracted before
    exponentiating so large scores do not overflow. Returns a float32 array
    with the same shape as `x`.
    """
    scores = np.asarray(x, dtype=np.float32)
    centered = scores - scores.max(axis=-1, keepdims=True)
    exps = np.exp(centered)
    return exps / exps.sum(axis=-1, keepdims=True)

def predict_sentiment(text: str):
    """Classify one text with the float32 TFLite interpreter.

    Tokenizes `text` to MAX_LEN, runs the global `interpreter`, prints the
    predicted label with class probabilities, and returns the result.

    Args:
        text: the sentence to classify.

    Returns:
        dict with keys "text", "label", "logits" (list of 3 floats) and
        "probabilities" (dict keyed by "negative"/"neutral"/"positive").
    """
    def _input_detail(tensor_name, fallback_pos):
        # Input order in a converted model is not guaranteed (another export
        # in this notebook lists attention_mask first), so look each input up
        # by name and keep the original position only as a fallback.
        for detail in input_details:
            if tensor_name in detail["name"]:
                return detail
        return input_details[fallback_pos]

    ids_detail  = _input_detail("input_ids", 0)
    mask_detail = _input_detail("attention_mask", 1)

    enc = tok(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="np",
    )

    # Cast to the dtypes the interpreter reports for each input.
    input_ids = enc["input_ids"].astype(ids_detail["dtype"])
    mask      = enc["attention_mask"].astype(mask_detail["dtype"])

    interpreter.set_tensor(ids_detail["index"], input_ids)
    interpreter.set_tensor(mask_detail["index"], mask)
    interpreter.invoke()

    # [0] drops the batch axis of the single-example output.
    logits = interpreter.get_tensor(output_details[0]["index"])[0]  # shape (3,)
    probs  = softmax(logits)                                        # shape (3,)

    pred_id    = int(np.argmax(probs))
    pred_label = LABELS[pred_id]

    print(f"\nText: {text}")
    print(f"Predicted label: {pred_label}")
    print(
        "Probabilities: "
        f"neg={probs[0]*100:.2f}%, "
        f"neu={probs[1]*100:.2f}%, "
        f"pos={probs[2]*100:.2f}%"
    )

    return {
        "text": text,
        "label": pred_label,
        "logits": logits.tolist(),
        "probabilities": {
            "negative": float(probs[0]),
            "neutral":  float(probs[1]),
            "positive": float(probs[2]),
        },
    }

# Test with clearly different sentiments (same sentences as the quantized
# model's sanity check, so the two exports can be compared side by side).
# The last call is the cell's final expression, so its returned dict is also
# displayed by the notebook.
predict_sentiment("I feel a bit anxious but hopeful about the future.")
predict_sentiment("Everything is terrible and I don't see a way out.")
predict_sentiment("I feel calm, supported, and grateful today.")


Input details: [{'name': 'input_ids', 'index': 0, 'shape': array([  1, 128], dtype=int32), 'shape_signature': array([ -1, 128], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'attention_mask', 'index': 1, 'shape': array([  1, 128], dtype=int32), 'shape_signature': array([ -1, 128], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
Output details: [{'name': 'Identity', 'index': 483, 'shape': array([1, 3], dtype=int32), 'shape_signature': array([-1,  3], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=in

{'text': 'I feel calm, supported, and grateful today.',
 'label': 'positive',
 'logits': [-1.7378828525543213, -1.027045726776123, 2.3247697353363037],
 'probabilities': {'negative': 0.016349487006664276,
  'neutral': 0.0332825668156147,
  'positive': 0.9503679275512695}}

DISTILBERT


In [None]:
# ==== CELL 1: Load GoEmotions (Simplified) ====
from datasets import load_dataset

# Loads the "simplified" GoEmotions config. Each row has `text`, `labels` and
# `id`; `labels` is a list of emotion label ids (multi-label), not a ready-made
# sentiment class. The reduction to negative / neutral / positive happens in a
# later cell.
ds = load_dataset("google-research-datasets/go_emotions", "simplified")

print(ds)
print("Train size:", len(ds["train"]))
print("Validation size:", len(ds["validation"]))
print("Test size:", len(ds["test"]))


simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})
Train size: 43410
Validation size: 5426
Test size: 5427


In [None]:
import json, os
# NOTE(review): DATA_DIR is not assigned in this cell; within the cells shown
# here it is only assigned at the top of the fine-tuning cell further down.
# Confirm an earlier cell defines it, otherwise this fails on a fresh kernel.
os.makedirs(DATA_DIR, exist_ok=True)

# Label order of the GoEmotions "simplified" config: index i in a row's
# `labels` list refers to _GOEMOTIONS_LABELS[i] (27 emotions + "neutral").
_GOEMOTIONS_LABELS = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral",
]

# Emotion -> sentiment grouping, identical to the grouping used by the first
# preprocessing cell of this notebook.
_NEGATIVE_NAMES = {"anger", "annoyance", "disappointment", "disapproval",
                   "disgust", "embarrassment", "fear", "grief", "nervousness",
                   "remorse", "sadness", "confusion"}
_NEUTRAL_NAMES = {"neutral", "realization", "curiosity", "surprise"}
_POSITIVE_NAMES = {"admiration", "amusement", "approval", "caring", "desire",
                   "excitement", "gratitude", "joy", "love", "optimism",
                   "pride", "relief"}


def _label_ids(names):
    """Return the set of label indices whose emotion name is in `names`."""
    return {i for i, name in enumerate(_GOEMOTIONS_LABELS) if name in names}


# Index sets are derived from names instead of hand-written ranges: the old
# ranges (0-7 / 8-10 / 11-17) did not follow the label order, e.g. they put
# admiration (0) in NEG and let ids 18-27 (love, sadness, ...) fall through
# to "neutral".
NEG = _label_ids(_NEGATIVE_NAMES)
NEU = _label_ids(_NEUTRAL_NAMES)
POS = _label_ids(_POSITIVE_NAMES)

# Every label id must belong to exactly one group.
assert NEG | NEU | POS == set(range(len(_GOEMOTIONS_LABELS)))
assert not (NEG & NEU or NEG & POS or NEU & POS)

def map_emotions_to_sentiment(label_list):
    """Map multi-label emotion list → single sentiment label.

    Args:
        label_list: iterable of int emotion ids.

    Returns:
        "negative" if any id is a negative emotion, otherwise "positive" if
        any id is a positive emotion, otherwise "neutral" (this includes an
        empty list). Negative therefore wins over positive on mixed rows.
    """
    label_set = set(label_list)

    if label_set & NEG:
        return "negative"
    if label_set & POS:
        return "positive"
    return "neutral"

def save_jsonl(dataset_split, filepath):
    """Write one split as JSON Lines.

    Each row of `dataset_split` becomes one line of the form
    ``{"text": <stripped text>, "label": <sentiment>}``, where the sentiment
    comes from map_emotions_to_sentiment applied to the row's `labels` list.

    Args:
        dataset_split: iterable of rows with "text" and "labels" entries.
        filepath: destination path; overwritten if it exists.
    """
    with open(filepath, "w", encoding="utf-8") as out:
        for example in dataset_split:
            record = {
                "text": example["text"].strip(),
                "label": map_emotions_to_sentiment(example["labels"]),
            }
            out.write(json.dumps(record) + "\n")

# save 3 files: one JSONL per split; the validation split is stored as val.jsonl
save_jsonl(ds["train"],      f"{DATA_DIR}/train.jsonl")
save_jsonl(ds["validation"], f"{DATA_DIR}/val.jsonl")
save_jsonl(ds["test"],       f"{DATA_DIR}/test.jsonl")

print("Saved:")
!ls -lh {DATA_DIR}


Saved:
total 5.4M
-rw-r--r-- 1 root root 545K Nov 24 11:58 test.jsonl
-rw-r--r-- 1 root root 4.3M Nov 24 11:58 train.jsonl
-rw-r--r-- 1 root root 547K Nov 24 11:58 val.jsonl


In [None]:
# ================== DIRECT FINE-TUNING — DISTILBERT (FAST TRAINING) ==================
import os, json, numpy as np, tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, create_optimizer
from sklearn.metrics import f1_score
from tqdm.auto import tqdm

# Input JSONL files and output folder for checkpoints.
DATA_DIR = "/content/data"
SAVE_DIR = "/content/models_distilbert"
os.makedirs(SAVE_DIR, exist_ok=True)

MODEL_NAME = "distilbert-base-uncased"
NUM_LABELS = 3
MAX_LEN = 128   # every example is padded/truncated to this many tokens
BATCH = 32
EPOCHS = 3  # Fast fine-tuning

# Position in this list is the integer class id used for training.
LABELS = ["negative","neutral","positive"]

# ---------- Load Tokenizer ----------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# ---------- Load JSONL ----------
def read_jsonl(path):
    """Lazily yield ``(text, class_id)`` pairs from a JSONL file.

    Each line must be a JSON object with "text" and "label" keys; the label
    string is converted to its position in the global LABELS list.
    """
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            text = record["text"]
            class_id = LABELS.index(record["label"])
            yield text, class_id

def make_ds(path, batch_size, shuffle=False):
    """Build a batched tf.data pipeline from a JSONL file.

    Args:
        path: JSONL file readable by read_jsonl.
        batch_size: number of examples per batch (last batch may be smaller).
        shuffle: when True, shuffle examples with a 4096-element buffer.

    Returns:
        A tf.data.Dataset yielding ``(features, label)`` batches, where
        `features` holds int32 "input_ids" and "attention_mask" of length
        MAX_LEN and `label` is an int32 scalar per example.
    """
    feature_spec = {
        "input_ids": tf.TensorSpec([MAX_LEN], tf.int32),
        "attention_mask": tf.TensorSpec([MAX_LEN], tf.int32),
    }
    label_spec = tf.TensorSpec([], tf.int32)

    def _encoded_examples():
        # Tokenize one example at a time, always padded to MAX_LEN.
        for text, class_id in read_jsonl(path):
            encoded = tokenizer(
                text,
                truncation=True,
                padding="max_length",
                max_length=MAX_LEN,
            )
            features = {
                "input_ids": np.int32(encoded["input_ids"]),
                "attention_mask": np.int32(encoded["attention_mask"]),
            }
            yield features, np.int32(class_id)

    dataset = tf.data.Dataset.from_generator(
        _encoded_examples,
        output_signature=(feature_spec, label_spec),
    )
    if shuffle:
        dataset = dataset.shuffle(4096)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Only the training pipeline is shuffled; validation and test keep file order.
train_ds = make_ds(f"{DATA_DIR}/train.jsonl", BATCH, shuffle=True)
val_ds   = make_ds(f"{DATA_DIR}/val.jsonl",   BATCH)
test_ds  = make_ds(f"{DATA_DIR}/test.jsonl",  BATCH)

# ---------- Load DistilBERT Base (FIX: from_pt + disable safetensors) ----------
# The base checkpoint is loaded from its PyTorch weights with a fresh
# NUM_LABELS-way classification head.
student = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    from_pt=True,           # <- load from PyTorch weights
    use_safetensors=False   # <- avoid safe_open error
)

# ---------- Optimizer ----------
# The schedule length is derived from the training file itself instead of a
# hard-coded dataset size: train.jsonl holds one example per line.
with open(f"{DATA_DIR}/train.jsonl", "r", encoding="utf-8") as f:
    num_train_examples = sum(1 for _line in f)

# Ceil division: make_ds keeps the final partial batch, so an epoch has
# ceil(n / BATCH) optimizer steps, not floor(n / BATCH).
steps_per_epoch = (num_train_examples + BATCH - 1) // BATCH

# Learning-rate schedule with warmup over the first 10% of all training steps.
opt, _ = create_optimizer(
    init_lr=3e-5,
    num_train_steps=steps_per_epoch * EPOCHS,
    num_warmup_steps=int(0.1 * steps_per_epoch * EPOCHS),
)
opt.clipnorm = 1.0

# ---------- Training Loop ----------
# Custom eager training loop: one pass over train_ds per epoch, followed by a
# macro-F1 evaluation on val_ds. The checkpoint is written only when the
# validation score improves, so best_distilbert always holds the best epoch.
best_f1 = -1.0   # below any possible F1, so the first epoch is always saved

print("\n===== DIRECT FINE-TUNING DISTILBERT =====")
for ep in range(EPOCHS):
    print(f"\nEpoch {ep+1}/{EPOCHS}")

    # Training
    student.trainable = True
    pbar = tqdm(train_ds, desc="Training")
    for batch in pbar:
        x, y = batch
        with tf.GradientTape() as tape:
            logits = student(x, training=True).logits
            # Mean cross-entropy over the batch, computed from raw logits.
            loss = tf.reduce_mean(
                tf.keras.losses.sparse_categorical_crossentropy(
                    y, logits, from_logits=True
                )
            )
        grads = tape.gradient(loss, student.trainable_variables)
        opt.apply_gradients(zip(grads, student.trainable_variables))
        pbar.set_postfix(loss=float(loss))

    # Validation F1
    y_true, y_pred = [], []
    for x, y in val_ds:
        logits = student(x, training=False).logits.numpy()
        y_true.extend(y.numpy())
        y_pred.extend(np.argmax(logits, axis=-1))
    val_f1 = f1_score(y_true, y_pred, average="macro")

    print(f"Val Macro-F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        # Model and tokenizer are saved into the same folder.
        student.save_pretrained(f"{SAVE_DIR}/best_distilbert")
        tokenizer.save_pretrained(f"{SAVE_DIR}/best_distilbert")
        print("✓ Saved new best model")

# ---------- Test Evaluation ----------
# Reload the best checkpoint from disk (not the in-memory model from the last
# epoch) and score it once on the held-out test split.
best = TFAutoModelForSequenceClassification.from_pretrained(
    f"{SAVE_DIR}/best_distilbert",
    num_labels=NUM_LABELS
)

y_true, y_pred = [], []
for x, y in test_ds:
    logits = best(x, training=False).logits.numpy()
    y_true.extend(y.numpy())
    y_pred.extend(np.argmax(logits, axis=-1))

# Macro average: each of the three classes contributes equally.
test_f1 = f1_score(y_true, y_pred, average="macro")
print("\nFINAL TEST MACRO-F1:", test_f1)


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'cla


===== DIRECT FINE-TUNING DISTILBERT =====

Epoch 1/3


Training: 0it [00:00, ?it/s]



Val Macro-F1: 0.7245
✓ Saved new best model

Epoch 2/3


Training: 0it [00:00, ?it/s]

Val Macro-F1: 0.7329
✓ Saved new best model

Epoch 3/3


Training: 0it [00:00, ?it/s]

Val Macro-F1: 0.7283


Some layers from the model checkpoint at /content/models_distilbert/best_distilbert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/models_distilbert/best_distilbert and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



FINAL TEST MACRO-F1: 0.7233541071954143


In [None]:
# Imported here so the cell also runs on a fresh kernel: the fine-tuning cell
# only imports the Auto* classes, not TFDistilBertForSequenceClassification.
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification

# Folder where the training loop saved both the best model and its tokenizer.
BEST_DIR = "/content/models_distilbert/best_distilbert"

print("Loading trained DistilBERT…")

# Load the TensorFlow checkpoint directly (no PyTorch conversion).
student = TFDistilBertForSequenceClassification.from_pretrained(
    BEST_DIR,
    num_labels=3,
    from_pt=False
)

tok = AutoTokenizer.from_pretrained(
    BEST_DIR,
    use_fast=True
)


Loading trained DistilBERT…


Some layers from the model checkpoint at /content/models_distilbert/best_distilbert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/models_distilbert/best_distilbert and are newly initialized: ['dropout_99']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoTokenizer

# Dedicated tokenizer folder used by the export and inference cells below.
# NOTE(review): `os` is not imported in this cell; it relies on an earlier
# cell having imported it.
TOK_OUT = "/content/models_distilbert/tokenizer_distilbert"
os.makedirs(TOK_OUT, exist_ok=True)

# NOTE(review): this saves the stock "distilbert-base-uncased" tokenizer from
# the hub rather than the copy the training loop stored in best_distilbert -
# confirm the two are meant to be interchangeable.
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tok.save_pretrained(TOK_OUT)

print("Tokenizer saved to:", TOK_OUT)


Tokenizer saved to: /content/models_distilbert/tokenizer_distilbert


In [None]:
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer

# Smoke test: confirm the saved model folder and the separate tokenizer folder
# both load. The results rebind the notebook-level `model` and `tok`.
MODEL_DIR = "/content/models_distilbert/best_distilbert"
TOK_DIR   = "/content/models_distilbert/tokenizer_distilbert"

print("Loading trained DistilBERT…")
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tok   = AutoTokenizer.from_pretrained(TOK_DIR)

print("Loaded successfully!")


Loading trained DistilBERT…


Some layers from the model checkpoint at /content/models_distilbert/best_distilbert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/models_distilbert/best_distilbert and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded successfully!


In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import json, os

# Inputs (fine-tuned model, tokenizer) and output folder for the TFLite export.
MODEL_DIR = "/content/models_distilbert/best_distilbert"
TOKENIZER_DIR = "/content/models_distilbert/tokenizer_distilbert"
OUT_DIR = "/content/models_distilbert/tflite_fp32"
os.makedirs(OUT_DIR, exist_ok=True)

print("Loading trained DistilBERT (TF only)…")

# local_files_only=True: load strictly from disk, never from the hub.
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_DIR,
    num_labels=3,
    local_files_only=True,
    from_pt=False
)

tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_DIR,
    use_fast=True,
    local_files_only=True
)

# Fixed sequence length of the exported input signature.
MAX_LEN = 128

@tf.function(input_signature=[
    tf.TensorSpec(shape=[None, MAX_LEN], dtype=tf.int32, name="input_ids"),
    tf.TensorSpec(shape=[None, MAX_LEN], dtype=tf.int32, name="attention_mask"),
])
def serving(input_ids, attention_mask):
    """Forward pass traced for the DistilBERT TFLite export.

    Both arguments are int32 tensors of shape [batch, MAX_LEN]. Runs the
    global `model` with training disabled and returns a dict whose "logits"
    entry holds the classifier logits.
    """
    model_inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
    logits = model(model_inputs, training=False).logits
    return {"logits": logits}

# Trace the serving function with its declared signature.
concrete_func = serving.get_concrete_function()

# Dynamic range quantization: no representative dataset is supplied.
converter = tf.lite.TFLiteConverter.from_concrete_functions(
    [concrete_func],
    trackable_obj=model,   # gets rid of the deprecation warning
)

# This will quantize weights where possible but keep
# input = INT32, output = FLOAT32 as defined by the graph
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# inference_input_type / inference_output_type / supported_ops are left unset
# on purpose; the converter takes the types from the function signature.

print("Converting to TFLite…")
tflite_model = converter.convert()

# NOTE(review): the folder and file are named "fp32" although weight
# quantization is enabled above. The name is left as is because the inference
# cell loads this exact path.
tflite_path = f"{OUT_DIR}/distilbert_fp32.tflite"
with open(tflite_path, "wb") as f:
    f.write(tflite_model)

# Label map: list position corresponds to the logit index.
label_map = {"labels": ["negative", "neutral", "positive"]}
with open(f"{OUT_DIR}/label_map.json", "w") as f:
    json.dump(label_map, f, indent=2)

print("✅ Saved TFLite model to:", tflite_path)
print("✅ Saved label_map.json")
print("OUT_DIR contents:")
!ls -lh {OUT_DIR}


Loading trained DistilBERT (TF only)…


Some layers from the model checkpoint at /content/models_distilbert/best_distilbert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/models_distilbert/best_distilbert and are newly initialized: ['dropout_219']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Converting to TFLite…




✅ Saved TFLite model to: /content/models_distilbert/tflite_fp32/distilbert_fp32.tflite
✅ Saved label_map.json
OUT_DIR contents:
total 65M
-rw-r--r-- 1 root root 65M Nov 24 13:24 distilbert_fp32.tflite
-rw-r--r-- 1 root root  67 Nov 24 13:24 label_map.json


In [None]:
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer

# Artifacts written by the DistilBERT export and tokenizer cells.
TFLITE_PATH = "/content/models_distilbert/tflite_fp32/distilbert_fp32.tflite"
TOKENIZER_DIR = "/content/models_distilbert/tokenizer_distilbert"

# Load TFLite model
interpreter = tf.lite.Interpreter(model_path=TFLITE_PATH)
interpreter.allocate_tensors()

# Tensor metadata (name, index, shape, dtype); printed so the input order of
# this particular export can be inspected.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
print("Inputs:", input_details)
print("Outputs:", output_details)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    TOKENIZER_DIR,
    use_fast=True
)

MAX_LEN = 128   # must match the exported sequence length
LABELS = ["negative", "neutral", "positive"]   # position == logit index

def run_tflite(text):
    """Classify one text with the DistilBERT TFLite interpreter.

    Args:
        text: the sentence to classify.

    Returns:
        ``(label, probs)`` where `label` is the entry of LABELS with the
        highest probability and `probs` is a NumPy array of the three class
        probabilities.
    """
    def _input_detail(tensor_name, fallback_pos):
        # Resolve inputs by name instead of trusting a position copied from a
        # printout: the order can change between conversions. The previous
        # hard-coded positions are kept as a fallback.
        for detail in input_details:
            if tensor_name in detail["name"]:
                return detail
        return input_details[fallback_pos]

    mask_detail = _input_detail("attention_mask", 0)
    ids_detail  = _input_detail("input_ids", 1)

    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

    # Wrap in a list to add the batch axis: shape (1, MAX_LEN).
    input_ids = np.array([enc["input_ids"]], dtype=np.int32)
    attention_mask = np.array([enc["attention_mask"]], dtype=np.int32)

    interpreter.set_tensor(mask_detail["index"], attention_mask)
    interpreter.set_tensor(ids_detail["index"], input_ids)
    interpreter.invoke()

    # [0] drops the batch axis of the single-example output.
    logits = interpreter.get_tensor(output_details[0]["index"])[0]
    probs = tf.nn.softmax(logits).numpy()
    pred_id = int(np.argmax(probs))
    return LABELS[pred_id], probs


# Smoke test on one clearly positive, one clearly negative and one
# neutral-sounding sentence; prints the label and raw probability vector.
for txt in [
    "I am very happy today!",
    "This is the worst thing ever.",
    "It is okay, nothing special."
]:
    label, probs = run_tflite(txt)
    print(f"\nText: {txt}")
    print("Pred:", label)
    print("Probs:", probs)


Inputs: [{'name': 'serving_default_attention_mask:0', 'index': 0, 'shape': array([  1, 128], dtype=int32), 'shape_signature': array([ -1, 128], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}, {'name': 'serving_default_input_ids:0', 'index': 1, 'shape': array([  1, 128], dtype=int32), 'shape_signature': array([ -1, 128], dtype=int32), 'dtype': <class 'numpy.int32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
Outputs: [{'name': 'StatefulPartitionedCall:0', 'index': 707, 'shape': array([1, 3], dtype=int32), 'shape_signature': array([-1,  3], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=floa