# PubMed RCT - Embeddings Model (GloVe)

This notebook builds a classification model using pre-trained GloVe word embeddings.

**Architecture:**
- TextVectorization layer
- Pre-trained GloVe embeddings (100d, frozen)
- GlobalAveragePooling1D
- Dense(64, ReLU) + Dropout(0.5)
- Dense(5, softmax)

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix

np.random.seed(42)
tf.random.set_seed(42)
%matplotlib inline

## Load Data

In [None]:
def load_pubmed_data(filepath):
    """Load and preprocess PubMed RCT data from a text file.

    The file is read as a sequence of abstracts. A line starting with
    ``###`` opens a new abstract, a whitespace-only line closes it, and
    every other line is expected to have the form ``LABEL<TAB>sentence``.
    Lines that do not split into exactly two tab-separated fields are
    skipped, but they still count towards ``line_number``/``total_lines``.

    An abstract is emitted exactly once, when it is closed by a blank line,
    by the next ``###`` header, or by the end of the file. This means a file
    without a trailing blank line keeps its last abstract, and repeated
    blank lines do not duplicate samples.

    Args:
        filepath: Path to the UTF-8 encoded text file.

    Returns:
        A list of dicts with keys:
            target: the label field, unchanged.
            text: the sentence, lower-cased.
            line_number: zero-based position of the line in its abstract.
            total_lines: index of the last line in the abstract
                (number of lines minus one).
    """
    samples = []
    pending = []  # lines of the abstract currently being read

    def flush():
        # Emit the buffered abstract (if any) and clear the buffer so the
        # same lines can never be emitted twice.
        last_index = len(pending) - 1
        for position, entry in enumerate(pending):
            parts = entry.split("\t")
            if len(parts) == 2:
                samples.append({
                    "target": parts[0],
                    "text": parts[1].lower(),
                    "line_number": position,
                    "total_lines": last_index
                })
        pending.clear()

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("###") or line.isspace():
                # Both a header and a blank line end the current abstract.
                flush()
            else:
                # splitlines() drops the trailing newline.
                pending.extend(line.splitlines())

    # The file may end without a closing blank line.
    flush()

    return samples

In [None]:
DATA_DIR = "../data/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"
# The order of CLASS_NAMES defines the class index used everywhere below
# (one-hot columns, model outputs, report rows, confusion-matrix ticks).
CLASS_NAMES = ["BACKGROUND", "OBJECTIVE", "METHODS", "RESULTS", "CONCLUSIONS"]
MAX_LENGTH = 55

train_df = pd.DataFrame(load_pubmed_data(os.path.join(DATA_DIR, "train.txt")))
val_df = pd.DataFrame(load_pubmed_data(os.path.join(DATA_DIR, "dev.txt")))
test_df = pd.DataFrame(load_pubmed_data(os.path.join(DATA_DIR, "test.txt")))

train_sentences = train_df["text"].to_numpy()
val_sentences = val_df["text"].to_numpy()
test_sentences = test_df["text"].to_numpy()

# One-hot encode labels.
# With the default categories="auto" the encoder sorts the labels
# alphabetically, which does not match CLASS_NAMES and would mislabel every
# per-class result that indexes CLASS_NAMES by class id. Passing the
# categories explicitly pins column i to CLASS_NAMES[i]; a label outside
# CLASS_NAMES makes fit/transform raise instead of being silently encoded.
encoder = OneHotEncoder(categories=[CLASS_NAMES], sparse_output=False)
train_labels = encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
val_labels = encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
test_labels = encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))

assert list(encoder.categories_[0]) == CLASS_NAMES, "label columns must follow CLASS_NAMES order"

print(f"Train: {len(train_sentences)} | Val: {len(val_sentences)} | Test: {len(test_sentences)}")
print(f"Classes: {encoder.categories_[0]}")

## Text Vectorization

In [None]:
max_tokens = 68000

# Collect the vectorizer settings in one place, then build the layer.
vectorizer_config = dict(
    max_tokens=max_tokens,
    output_sequence_length=MAX_LENGTH,
)
text_vectorizer = layers.TextVectorization(**vectorizer_config)

# Learn the vocabulary from the training sentences only.
text_vectorizer.adapt(train_sentences)

vocab = text_vectorizer.get_vocabulary()
print(f"Vocabulary size: {len(vocab)}")

## Load GloVe Embeddings

Download GloVe from https://nlp.stanford.edu/data/glove.6B.zip, unzip it, and place `glove.6B.100d.txt` in the `data/` folder one level above this notebook (the code reads `../data/glove.6B.100d.txt`).

In [None]:
embedding_dim = 100
glove_path = "../data/glove.6B.100d.txt"

# token -> float32 vector; left empty when the GloVe file is not available
embeddings_index = {}
if not os.path.exists(glove_path):
    print(f"GloVe file not found at {glove_path}. Using random embeddings.")
else:
    with open(glove_path, encoding="utf-8") as glove_file:
        for raw_line in glove_file:
            # First whitespace-separated field is the token, the rest are its coefficients.
            fields = raw_line.split()
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype="float32")
    print(f"Loaded {len(embeddings_index)} word vectors from GloVe")

In [None]:
# Build embedding matrix matching our vocabulary.
# Row i holds the vector for vocab[i].
if embeddings_index:
    # Words without a GloVe vector keep an all-zero row.
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
else:
    # No GloVe vectors were loaded. An all-zero matrix would map every
    # sentence to the same zero vector, so fall back to small random
    # vectors as the loading step announces. The generator is seeded so
    # the fallback is reproducible.
    fallback_rng = np.random.default_rng(42)
    embedding_matrix = fallback_rng.uniform(-0.05, 0.05, size=(len(vocab), embedding_dim))
    # Index 0 is the padding entry of the TextVectorization vocabulary;
    # keep it at zero so padding does not add noise.
    embedding_matrix[0] = 0.0

found = 0

for i, word in enumerate(vocab):
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[i] = vec
        found += 1

print(f"Words found in GloVe: {found}/{len(vocab)} ({found/len(vocab)*100:.1f}%)")

## Build the Model

In [None]:
inputs = layers.Input(shape=[], dtype="string")

# Raw strings -> token ids -> frozen embedding lookup.
x = text_vectorizer(inputs)
glove_embedding = layers.Embedding(
    input_dim=len(vocab),
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,  # freeze pre-trained embeddings
)
x = glove_embedding(x)

# Pool over the sequence axis, then a small dense head with dropout.
head_layers = (
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation="relu"),
    layers.Dropout(0.5),
)
for head_layer in head_layers:
    x = head_layer(x)

# One softmax unit per class.
outputs = layers.Dense(len(CLASS_NAMES), activation="softmax")(x)

model = Model(inputs, outputs, name="glove_model")
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

## Train

In [None]:
# Training hyperparameters, named so they are easy to find and tune.
EPOCHS = 10
BATCH_SIZE = 32

history = model.fit(
    x=train_sentences,
    y=train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_sentences, val_labels),
)

## Training Curves

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# One panel per tracked metric: (axis, key in history.history, panel title).
panels = [
    (ax1, "loss", "Loss"),
    (ax2, "accuracy", "Accuracy"),
]
for ax, metric, title in panels:
    ax.plot(history.history[metric], label="Train")
    ax.plot(history.history[f"val_{metric}"], label="Validation")
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.legend()

plt.tight_layout()
plt.show()

## Evaluation on Test Set

In [None]:
test_loss, test_acc = model.evaluate(test_sentences, test_labels, verbose=0)
print(f"Test loss: {test_loss:.4f}")
print(f"Test accuracy: {test_acc:.4f}")

# Predictions: class index = position of the largest softmax output.
preds = np.argmax(model.predict(test_sentences, verbose=0), axis=1)
true = np.argmax(test_labels, axis=1)

# Class index i is column i of the one-hot labels, i.e. encoder.categories_[0][i].
# Take the display names from the encoder so each report row is attributed to
# the class it actually measures, whatever order the encoder uses.
report_names = [str(name) for name in encoder.categories_[0]]
# Explicit labels keep one row per class even if a class never appears in
# `true` or `preds`.
report_labels = list(range(len(report_names)))

print()
print(classification_report(true, preds, labels=report_labels, target_names=report_names))

In [None]:
# Confusion matrix
# Row/column i corresponds to class index i, which is column i of the one-hot
# labels, i.e. encoder.categories_[0][i]. Tick labels come from the encoder so
# they always match the indices in `true` and `preds`.
cm_names = [str(name) for name in encoder.categories_[0]]
# Explicit labels keep the matrix n_classes x n_classes even if a class is
# absent from both `true` and `preds`, so it always lines up with the ticks.
cm = confusion_matrix(true, preds, labels=list(range(len(cm_names))))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=cm_names, yticklabels=cm_names)
plt.title("Confusion Matrix - Embeddings Model")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.tight_layout()
plt.show()

## Save Results

In [None]:
# Make sure the output folder exists before writing into it.
os.makedirs("../results", exist_ok=True)

# Cast to built-in floats so the metrics serialise cleanly to JSON.
results = dict(
    model_name="Embeddings Model (GloVe)",
    test_accuracy=float(test_acc),
    test_loss=float(test_loss),
)
with open("../results/embeddings_results.json", "w") as out_file:
    out_file.write(json.dumps(results, indent=2))

print(f"Results saved. Test accuracy = {test_acc*100:.2f}%")