In [1]:
!pip -q install datasets evaluate transformers[sentencepiece]
!pip -q install --upgrade fsspec
!pip -q install --upgrade datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.5.1 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine ==

In [4]:
# Self-contained on purpose: this cell executes before the cell that otherwise
# defines `raw_dataset`, so without the load below a fresh kernel raises NameError.
# The later load_dataset call for the same dataset is served from the local cache.
from datasets import load_dataset

raw_dataset = load_dataset("stanfordnlp/snli")
print(raw_dataset)


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
})


In [5]:
# 🔹 SNLI (Stanford Natural Language Inference)

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import numpy as np
from functools import partial

# Fetch every split of SNLI (test / validation / train) as a DatasetDict and
# print its summary (features and row counts per split).
raw_dataset = load_dataset(path="stanfordnlp/snli")
print(raw_dataset)

def filter_labels(example):
    """Predicate for ``Dataset.filter``: keep an example unless its label is -1.

    Args:
        example: mapping with a "label" entry.

    Returns:
        True when the example should be kept, False when its label is the
        -1 "invalid" marker.
    """
    label = example["label"]
    return not (label == -1)

# Apply the label filter to every split of the DatasetDict, dropping the rows
# whose label is -1.
filtered_dataset = raw_dataset.filter(function=filter_labels)

# Checkpoint name shared by the tokenizer here and the model loaded later on.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

def tokenize_function(example, tokenizer, sentence1_key, sentence2_key=None):
    """Tokenize one text column, or a pair of text columns, with truncation on.

    Args:
        example: mapping from column name to text (or to a list of texts when
            used with ``map(..., batched=True)``).
        tokenizer: callable invoked as ``tokenizer(text[, text_pair], truncation=True)``.
        sentence1_key: column holding the first (or only) sentence.
        sentence2_key: optional column holding the second sentence; when None
            only the first column is tokenized.

    Returns:
        Whatever ``tokenizer`` returns for the selected column(s).
    """
    # Collect the positional text arguments first, then make a single tokenizer call.
    texts = [example[sentence1_key]]
    if sentence2_key is not None:
        texts.append(example[sentence2_key])
    return tokenizer(*texts, truncation=True)

# Tokenize each (premise, hypothesis) pair in batches, then rename the target
# column from "label" to "labels".
tokenized_datasets = filtered_dataset.map(
    partial(
        tokenize_function,
        tokenizer=tokenizer,
        sentence1_key="premise",
        sentence2_key="hypothesis",
    ),
    batched=True,
).rename_column("label", "labels")

# unique_labels = set(tokenized_datasets["train"]["labels"])
# print(f"Unique labels: {unique_labels}")
# print(f"Number of unique labels: {len(unique_labels)}")

# for split in tokenized_datasets:
#     unique = set(tokenized_datasets[split]["labels"])
#     print(f"{split} split - Unique labels: {unique} (Total: {len(unique)})")


# Pads every batch to the length of its longest sequence and returns TF tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

BATCH_SIZE = 8  # examples per batch for both pipelines built in this cell
MODEL_INPUT_COLUMNS = ["attention_mask", "input_ids", "token_type_ids"]


def _split_to_tf_dataset(split_name, shuffle):
    """Build the batched tf.data pipeline for one split of ``tokenized_datasets``.

    Args:
        split_name: key of the split in ``tokenized_datasets`` (e.g. "train").
        shuffle: whether ``to_tf_dataset`` should shuffle the split.

    Returns:
        The dataset produced by ``to_tf_dataset`` with the model input columns
        as features and "labels" as the label column.
    """
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=MODEL_INPUT_COLUMNS,
        label_cols=["labels"],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )


tf_train_dataset = _split_to_tf_dataset("train", shuffle=True)

# The dataset is already batched, so take() counts batches, not samples:
# 5000 batches x 8 examples = 40,000 training examples.
small_train_dataset = tf_train_dataset.take(5000)

tf_validation_dataset = _split_to_tf_dataset("validation", shuffle=False)

# 1000 batches x 8 examples = 8,000 validation examples.
small_val_dataset = tf_validation_dataset.take(1000)

# print(f'TF Train Dataset: {tf_train_dataset}')



DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
})


Map:   0%|          | 0/9842 [00:00<?, ? examples/s]

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [7]:
# Set the global git identity (author e-mail and name) on this machine.
# NOTE(review): presumably required because the Hub push further down commits
# through a local git clone — confirm. The personal e-mail address is hard-coded;
# consider supplying it from the environment before sharing this notebook.
! git config --global user.email "mouhamadibrahim42@gmail.com"
! git config --global user.name "Mhammad2022Ibrahim"

In [None]:
from transformers import TFAutoModelForSequenceClassification, create_optimizer, PushToHubCallback
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import tensorflow as tf
# from data_snli import checkpoint, tf_train_dataset, tf_validation_dataset

import os

# The Hub token is read from the environment so the secret never lives in the
# notebook and the cell no longer depends on a variable left over in the kernel.
# When HUB_TOKEN is unset this is None, and PushToHubCallback then falls back to
# the token cached by `huggingface-cli login`.
HUB_TOKEN = os.getenv("HUB_TOKEN")

# Pre-trained encoder with a newly initialised classification head
# (num_labels=3 -> three logits per example).
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Kept for reference only: the tf.data pipelines are already batched, so this
# value is not passed to fit().
batch_size = 8
num_epochs = 3
# len() of the batched dataset is its number of batches, i.e. optimizer steps per epoch.
# Use tf_train_dataset here (and in fit() below) to train on the full split instead.
num_train_steps = len(small_train_dataset) * num_epochs
# The first 10% of all steps are used for learning-rate warmup.
num_warmup_steps = int(0.1 * num_train_steps)

# Hugging Face optimizer with learning-rate scheduling and weight decay.
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    weight_decay_rate=0.01,
)

# from_logits=True: the loss is fed the model's raw logits.
loss = SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# Pushes the model from the local output directory to the Hub repository during training.
push_to_hub_callback = PushToHubCallback(
    output_dir="./snli-bert-base-uncased",
    hub_model_id="Mhammad2023/snli-bert-base-uncased",
    hub_token=HUB_TOKEN,
)

print("Training the model...")
# batch_size is deliberately not passed: Keras expects it to be left unset when
# the input is a tf.data.Dataset, which already yields batches.
model.fit(
    small_train_dataset,
    validation_data=small_val_dataset,
    epochs=num_epochs,
    callbacks=[push_to_hub_callback],
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/snli-bert-base-uncased is already a clone of https://huggingface.co/Mhammad2023/snli-bert-base-uncased. Make sure you pull the latest changes with `repo.git_pull()`.


Training the model...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7e97704e6cd0>

In [9]:
# Run inference on the validation subset and keep only the logits array
# (one row of class scores per example).
preds = model.predict(x=small_val_dataset)["logits"]



In [10]:
# Predicted class id = column index of the largest logit in each row.
class_preds = preds.argmax(axis=1)
# Sanity check: logits are (num_examples, num_classes), class ids are (num_examples,).
print(preds.shape, class_preds.shape)

(8000, 3) (8000,)


In [12]:
!pip -q install evaluate

In [17]:
# Feature-only pipeline over the test split: no label_cols, since it is used
# purely for prediction.
tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=8,  # same batch size as the train/validation pipelines
    shuffle=False,  # keep row order so predictions line up with the test labels
    columns=["attention_mask", "input_ids", "token_type_ids"],
    collate_fn=data_collator,
)

In [19]:
# Imported here because this cell runs before the cell that otherwise imports
# `evaluate`; without it a fresh kernel raises NameError on evaluate.load.
import evaluate

# Test-set logits -> predicted class ids (argmax over the class axis).
# Note: this overwrites the validation `preds` / `class_preds` from earlier cells.
preds = model.predict(tf_test_dataset)["logits"]
class_preds = np.argmax(preds, axis=1)

# tf_test_dataset is unshuffled and derived from filtered_dataset["test"], so the
# predictions are in the same order as these reference labels.
accuracy_metric = evaluate.load("accuracy")
accuracy_metric.compute(predictions=class_preds, references=filtered_dataset["test"]["label"])



{'accuracy': 0.8784609120521173}

In [20]:
import evaluate

# F1 on the test split. `class_preds` must still hold the test-set predictions
# computed in the accuracy cell.
f1_metric = evaluate.load("f1")

f1_result = f1_metric.compute(
    references=filtered_dataset["test"]["label"],
    predictions=class_preds,
    average="weighted",  # multi-class averaging mode; "macro" and "micro" are the alternatives
)

print(f1_result)

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'f1': 0.8783964921834587}
