# 04 – Embeddings + TF–IDF + Meta Scaling

In [None]:
!pip install transformers scikit-learn

In [None]:
import os
import joblib
import numpy as np
import torch

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [None]:
# ---- input locations -------------------------------------------------------
# Directory holding the pickled train / val / test splits.
PREPRO_DIR = "/kaggle/input/01-data-loading-preprocessing"
TRAIN_PKL, VAL_PKL, TEST_PKL = (
    f"{PREPRO_DIR}/{split_name}_df.pkl"
    for split_name in ("train", "val", "test")
)

# Directory passed to `from_pretrained` for both the model and the tokenizer.
MODEL_DIR = "/kaggle/input/03-distilbert-final/distilbert_model"

# ---- output locations ------------------------------------------------------
OUTPUT_XY = "/kaggle/working/Xy_data.pkl"              # assembled feature arrays
OUTPUT_VECT = "/kaggle/working/tfidf_vect_refit.pkl"   # fitted TF-IDF vectorizer
OUTPUT_SCAL = "/kaggle/working/scaler_refit.pkl"       # fitted meta-feature scaler
OUTPUT_PROBS = "/kaggle/working/test_probs.pkl"        # test-set class probabilities

In [None]:
# Read the three pickled splits back from disk (train, validation, test).
# NOTE(review): the objects are used like pandas DataFrames below — confirm.
train_df, val_df, test_df = (
    joblib.load(pkl_path) for pkl_path in (TRAIN_PKL, VAL_PKL, TEST_PKL)
)

# Raw texts as plain Python lists (the tokenizer / vectorizer inputs) ...
texts_tr = train_df.text.tolist()
texts_val = val_df.text.tolist()
texts_te = test_df.text.tolist()

# ... and the matching label arrays.
labels_tr = train_df.label.values
labels_val = val_df.label.values
labels_te = test_df.label.values

In [None]:
# Pick the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and classifier are both restored from the same directory.
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_DIR)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)  # nn.Module.to moves parameters in place and returns the module
# Switch to inference behaviour (e.g. dropout disabled) before extracting features.
model.eval()

# helper: extract CLS embeddings + predicted probs
def extract(texts, batch_size=32, max_length=256):
    """Run the classifier over ``texts`` and collect per-text features.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sliceable sequence of strings (e.g. a list). It is processed
            in consecutive chunks of ``batch_size`` items.
        batch_size: Number of texts tokenized and forwarded per step.
        max_length: Token length every text is padded / truncated to.

    Returns:
        A tuple ``(embs, probs)`` of NumPy arrays with one row per input
        text. ``embs`` holds the last hidden layer's vector at token
        position 0 (the CLS position); ``probs`` holds the softmax over the
        classifier logits. For empty ``texts`` both arrays have zero rows.
    """
    if len(texts) == 0:
        # Without this guard the loop below never runs and np.vstack([])
        # raises "need at least one array to concatenate". Return correctly
        # shaped empty arrays instead so callers can still stack them.
        cfg = model.config
        # NOTE(review): float32 assumes the model runs in default precision.
        return (
            np.empty((0, cfg.hidden_size), dtype=np.float32),
            np.empty((0, cfg.num_labels), dtype=np.float32),
        )

    embs, probs = [], []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start: start + batch_size]
        # Every chunk is padded to the full max_length, so all batches share
        # the same sequence dimension.
        enc = tokenizer(
            chunk,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        ).to(device)
        # No gradients are needed for feature extraction.
        with torch.no_grad():
            out = model(**enc, output_hidden_states=True)
            # Last layer, first token position -> one vector per text.
            embs.append(out.hidden_states[-1][:, 0, :].cpu().numpy())
            # Normalise logits over the class axis.
            probs.append(torch.softmax(out.logits, dim=-1).cpu().numpy())
    return np.vstack(embs), np.vstack(probs)

In [None]:
# 1) Run the extractor once per split, in train -> val -> test order.
(emb_tr, prob_tr), (emb_val, prob_val), (emb_te, prob_te) = (
    extract(split_texts) for split_texts in (texts_tr, texts_val, texts_te)
)

# Only the test-set probabilities are written out on their own
# (kept separately for Hybrid 06).
joblib.dump(prob_te, OUTPUT_PROBS)
print("Saved DistilBERT test-set probs to", OUTPUT_PROBS)

In [None]:
# 2) TF–IDF: vocabulary and IDF weights are learned from the training texts
#    only; validation and test texts are merely projected onto them.
vect = TfidfVectorizer(ngram_range=(1, 3), stop_words="english", max_features=15000)

# Densify and downcast to float32 so the blocks can be hstack-ed later.
Xtf_tr = vect.fit_transform(texts_tr).toarray().astype(np.float32)
Xtf_val, Xtf_te = (
    vect.transform(docs).toarray().astype(np.float32)
    for docs in (texts_val, texts_te)
)

# Keep the fitted vectorizer so the same transform can be re-applied later.
joblib.dump(vect, OUTPUT_VECT)
print("Saved TF–IDF vectorizer to", OUTPUT_VECT)

In [None]:
# 3) Meta-feature scaling
# Columns of hand-crafted features that get standardised.
meta_cols = [
    "num_words", "num_exclaims", "num_questions",
    "vader_compound", "num_adjectives", "afinn_score",
]

# Raw (unscaled) meta-feature matrices, one per split.
Mtr, Mval, Mte = (
    frame[meta_cols].values for frame in (train_df, val_df, test_df)
)

# Mean / variance are estimated on the training split only and then applied
# unchanged to all three splits.
ms = StandardScaler()
ms.fit(Mtr)
Ms_tr = ms.transform(Mtr)
Ms_val = ms.transform(Mval)
Ms_te = ms.transform(Mte)

# Keep the fitted scaler so the same transform can be re-applied later.
joblib.dump(ms, OUTPUT_SCAL)
print("Saved meta‐feature scaler to", OUTPUT_SCAL)

In [None]:
# 4) Encode source
# Learn the source -> integer mapping on the training split only and reuse it
# for val/test. Factorizing each split on its own numbers the categories by
# order of first appearance *within that split*, so the same source could end
# up with different codes in train, val and test.
src_codes_tr, src_categories = train_df.source.factorize()
src_tr  = src_codes_tr.reshape(-1, 1)
# get_indexer yields -1 for values that are not among the training
# categories — the same sentinel factorize uses for missing values.
src_val = src_categories.get_indexer(val_df.source).reshape(-1, 1)
src_te  = src_categories.get_indexer(test_df.source).reshape(-1, 1)

In [None]:
# 5) Assemble final feature matrices
# Column layout, identical for every split:
#   [ embedding | class probabilities | TF-IDF | source code | scaled meta ]
X_train = np.concatenate([emb_tr, prob_tr, Xtf_tr, src_tr, Ms_tr], axis=1)
X_val = np.concatenate([emb_val, prob_val, Xtf_val, src_val, Ms_val], axis=1)
X_test = np.concatenate([emb_te, prob_te, Xtf_te, src_te, Ms_te], axis=1)

# Targets are the label arrays as loaded, untouched.
y_train = labels_tr
y_val = labels_val
y_test = labels_te

In [None]:
# 6) Persist everything
# All six arrays go into one pickle as a single tuple; consumers must unpack
# them in this exact order.
xy_bundle = (X_train, y_train, X_val, y_val, X_test, y_test)
joblib.dump(xy_bundle, OUTPUT_XY)
print("Saved feature arrays to", OUTPUT_XY)