In [1]:
import tensorflow_datasets as tfds
from transformers import pipeline
from sklearn.metrics import accuracy_score

# ============================================
# 1. LOAD TFDS DATASET
# ============================================
print("Loading IMDB dataset (50 samples for quick testing)...")

# Request only the first 50 records of the "test" split, with file
# shuffling explicitly turned off.
ds_data = tfds.load("imdb_reviews", shuffle_files=False, split="test[:50]")

# Parallel lists: texts[i] is the review body, true_labels[i] its integer label.
texts, true_labels = [], []

for example in ds_data:
    # The "text" entry is decoded from UTF-8 bytes into a Python str;
    # the "label" entry is converted to a plain int.
    texts.append(example["text"].numpy().decode("utf-8"))
    true_labels.append(int(example["label"].numpy()))

# ============================================
# 2. LOAD MODELS (AUTOMATICALLY USE PYTORCH)
# ============================================

# Both classifiers are built through the same "sentiment-analysis" pipeline
# task; only the checkpoint name differs.
print("\nLoading Model A: DistilBERT...")
classifier_a = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

print("Loading Model B: BERT-IMDB...")
classifier_b = pipeline("sentiment-analysis", model="textattack/bert-base-uncased-imdb")

# ============================================
# 3. GET PREDICTIONS
# ============================================

def get_predictions(classifier, data):
    """Classify *data* and return one binary label (0 or 1) per result.

    ``classifier`` is called once as
    ``classifier(data, truncation=True, max_length=512)`` and must return an
    iterable of mappings that each carry a string under the ``"label"`` key.

    A result maps to 0 when its upper-cased label contains ``"NEG"`` or
    ``"LABEL_0"``; every other label maps to 1.
    """
    negative_markers = ("NEG", "LABEL_0")
    outputs = classifier(data, truncation=True, max_length=512)

    # Label names are compared case-insensitively by upper-casing them first.
    return [
        0 if any(marker in out["label"].upper() for marker in negative_markers) else 1
        for out in outputs
    ]

print("\nRunning predictions...")
# Score the same texts with model A first, then model B.
preds_a, preds_b = (
    get_predictions(model, texts) for model in (classifier_a, classifier_b)
)

# ============================================
# 4. ACCURACY SCORES
# ============================================

# Fraction of predictions that match the ground-truth labels, per model.
acc_a = accuracy_score(true_labels, preds_a)
acc_b = accuracy_score(true_labels, preds_b)

print("\n==================== RESULTS ====================")
print(f"DistilBERT Accuracy: {acc_a:.2f}")
print(f"BERT-IMDB Accuracy:  {acc_b:.2f}")

# Pick the winner and a reason that matches the actual outcome. A tie still
# resolves to DistilBERT, so `best` takes the same values as before, but the
# explanation no longer claims the chosen model "underperforms".
if acc_b > acc_a:
    best = "BERT-IMDB"
    reason = "BERT-IMDB is fine-tuned specifically on IMDB movie reviews → higher accuracy."
elif acc_a > acc_b:
    best = "DistilBERT"
    reason = "DistilBERT scored higher on this sample and is also lighter & faster."
else:
    best = "DistilBERT"
    reason = "Both models reached the same accuracy; DistilBERT is lighter & faster, so it is preferred."

print(f"\nBest Model = {best}")
print(f"Reason: {reason}")


  from .autonotebook import tqdm as notebook_tqdm


Loading IMDB dataset (50 samples for quick testing)...

Loading Model A: DistilBERT...




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


Loading Model B: BERT-IMDB...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu



Running predictions...

DistilBERT Accuracy: 0.78
BERT-IMDB Accuracy:  0.82

Best Model = BERT-IMDB
Reason: BERT-IMDB is fine-tuned specifically on IMDB movie reviews → higher accuracy.
