In [4]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import os

# Load data: the training CSV and the dev CSV (the dev file is what later cells use as the test set).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/data-files/en_train.csv',
        '/kaggle/input/data-files/en_dev.csv',
    )
)

In [5]:
# BERT Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_texts(texts, max_length=128):
    """Tokenize a collection of texts into TensorFlow tensors.

    Args:
        texts: The texts to encode. Accepts a pandas Series (or any object
            exposing ``tolist()``), a plain iterable of strings such as a
            list, or a single string (treated as a one-element batch).
        max_length: Upper bound on the tokenized length; longer inputs are
            truncated to this many tokens.

    Returns:
        The tokenizer output created with ``return_tensors='tf'``. Because
        ``padding=True`` is used, sequences are padded to the longest sequence
        in this call rather than to ``max_length``.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        text_list = [texts]
    elif hasattr(texts, 'tolist'):
        # pandas Series / numpy arrays.
        text_list = texts.tolist()
    else:
        text_list = list(texts)
    return tokenizer(text_list, padding=True, truncation=True, max_length=max_length, return_tensors='tf')

# Split training data into train and validation sets.
# The texts and BOTH label columns go through a single train_test_split call, so each
# row keeps its own binary and multiclass label by construction. (Splitting the
# multiclass column in a separate call only stays aligned as long as random_state
# and test_size are kept identical in both calls.)
(
    train_texts, val_texts,
    train_binary_labels, val_binary_labels,
    train_multi_labels, val_multi_labels,
) = train_test_split(
    train_df['text'], train_df['binary'], train_df['multiclass'],
    test_size=0.2, random_state=42
)

# Tokenize each split; test_df['text'] is encoded as-is (no split).
train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_df['text'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

I0000 00:00:1746842187.156666      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [6]:
# Label mapping
binary_label_map = {'Not Hope': 0, 'Hope': 1}
multi_label_map = {'Not Hope': 0, 'Generalized Hope': 1, 'Realistic Hope': 2, 'Unrealistic Hope': 3, 'Sarcasm': 4}


def _encode_labels(labels, label_map, description):
    """Convert string labels to integer class ids.

    Args:
        labels: pandas Series of raw label strings.
        label_map: dict from label string to integer class id.
        description: Human-readable name of the column/split, used in the error message.

    Returns:
        An int64 Series with the same index as ``labels``.

    Raises:
        ValueError: If any label is missing from ``label_map``. ``Series.map``
            would otherwise turn such labels into NaN without any warning.
    """
    encoded = labels.map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(f"Unmapped labels in {description}: {unknown}")
    return encoded.astype('int64')


y_train_binary = _encode_labels(train_binary_labels, binary_label_map, 'train binary')
y_val_binary = _encode_labels(val_binary_labels, binary_label_map, 'validation binary')
y_test_binary = _encode_labels(test_df['binary'], binary_label_map, 'test binary')
y_train_multi = _encode_labels(train_multi_labels, multi_label_map, 'train multiclass')
y_val_multi = _encode_labels(val_multi_labels, multi_label_map, 'validation multiclass')
y_test_multi = _encode_labels(test_df['multiclass'], multi_label_map, 'test multiclass')

# Binary Model: bert-base-uncased with a 2-way classification head.
# The loss takes raw logits (from_logits=True) and integer class ids (sparse labels).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5)

model_binary = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
model_binary.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Checkpoint callback: with save_best_only=True and mode='max', the model is written
# to the path below only when val_accuracy improves.
checkpoint_binary = tf.keras.callbacks.ModelCheckpoint(
    filepath='/kaggle/working/bert_binary_model',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_format='tf',
)

# The model is fed [input_ids, attention_mask] for each split.
binary_train_inputs = [train_encodings[key] for key in ('input_ids', 'attention_mask')]
binary_val_inputs = [val_encodings[key] for key in ('input_ids', 'attention_mask')]

model_binary.fit(
    binary_train_inputs,
    y_train_binary,
    validation_data=(binary_val_inputs, y_val_binary),
    batch_size=8,
    epochs=3,
    callbacks=[checkpoint_binary],
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b148e3c2e50>

In [8]:
# Evaluate on test set.
# NOTE: predictions come from the in-memory model_binary as left by fit(), not from
# the checkpoint written by the ModelCheckpoint callback.
binary_test_inputs = [test_encodings[key] for key in ('input_ids', 'attention_mask')]
binary_pred = model_binary.predict(binary_test_inputs)

# Predicted class = index of the largest logit in each row.
binary_pred_labels = tf.argmax(binary_pred.logits, axis=1)
binary_acc = accuracy_score(y_test_binary, binary_pred_labels)

# Weighted-average and macro-average precision / recall / F1 for the binary model.
# The fourth returned value (support) is not used.
binary_w_prec, binary_w_rec, binary_w_f1, _ = precision_recall_fscore_support(
    y_test_binary,
    binary_pred_labels,
    average='weighted',
)
binary_m_prec, binary_m_rec, binary_m_f1, _ = precision_recall_fscore_support(
    y_test_binary,
    binary_pred_labels,
    average='macro',
)




In [9]:
# Multiclass Model: bert-base-uncased with a 5-way classification head.
model_multi = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Give this model its own optimizer and loss instead of reusing the instances that
# were compiled into (and already used to train) model_binary. A reused Adam instance
# would carry over its step counter and keep the binary model's per-variable state
# alive, so the two training runs would not be independent.
# Hyperparameters match the binary model's setup.
optimizer_multi = tf.keras.optimizers.legacy.Adam(learning_rate=2e-5)
loss_multi = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_multi.compile(optimizer=optimizer_multi, loss=loss_multi, metrics=['accuracy'])

# Checkpoint callback: with save_best_only=True and mode='max', the model is written
# to the path below only when val_accuracy improves.
checkpoint_multi = tf.keras.callbacks.ModelCheckpoint(
    filepath='/kaggle/working/bert_multi_model',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_format='tf',
)

# Same tokenized inputs as the binary model; only the targets differ.
multi_train_inputs = [train_encodings[key] for key in ('input_ids', 'attention_mask')]
multi_val_inputs = [val_encodings[key] for key in ('input_ids', 'attention_mask')]

model_multi.fit(
    multi_train_inputs,
    y_train_multi,
    validation_data=(multi_val_inputs, y_val_multi),
    batch_size=8,
    epochs=3,
    callbacks=[checkpoint_multi],
    verbose=1,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b12e45a6750>

In [10]:
# Evaluate on test set.
# NOTE: predictions come from the in-memory model_multi as left by fit(), not from
# the checkpoint written by the ModelCheckpoint callback.
multi_test_inputs = [test_encodings[key] for key in ('input_ids', 'attention_mask')]
multi_pred = model_multi.predict(multi_test_inputs)

# Predicted class = index of the largest logit in each row.
multi_pred_labels = tf.argmax(multi_pred.logits, axis=1)
multi_acc = accuracy_score(y_test_multi, multi_pred_labels)

# Weighted-average and macro-average precision / recall / F1 for the multiclass model.
# The fourth returned value (support) is not used.
multi_w_prec, multi_w_rec, multi_w_f1, _ = precision_recall_fscore_support(
    y_test_multi,
    multi_pred_labels,
    average='weighted',
)
multi_m_prec, multi_m_rec, multi_m_f1, _ = precision_recall_fscore_support(
    y_test_multi,
    multi_pred_labels,
    average='macro',
)



In [11]:
# Save tokenizer files into the working directory; the returned file paths are the cell output.
tokenizer_dir = '/kaggle/working/bert_tokenizer'
tokenizer.save_pretrained(tokenizer_dir)

('/kaggle/working/bert_tokenizer/tokenizer_config.json',
 '/kaggle/working/bert_tokenizer/special_tokens_map.json',
 '/kaggle/working/bert_tokenizer/vocab.txt',
 '/kaggle/working/bert_tokenizer/added_tokens.json')

In [12]:
# Print results: one line per metric, binary model first, then multiclass.
# All values are formatted to four decimal places.
report_lines = [
    f"Binary Accuracy: {binary_acc:.4f}",
    f"Binary Weighted Precision: {binary_w_prec:.4f}",
    f"Binary Weighted Recall: {binary_w_rec:.4f}",
    f"Binary Weighted F1: {binary_w_f1:.4f}",
    f"Binary Macro Precision: {binary_m_prec:.4f}",
    f"Binary Macro Recall: {binary_m_rec:.4f}",
    f"Binary Macro F1: {binary_m_f1:.4f}",
    f"Multiclass Accuracy: {multi_acc:.4f}",
    f"Multiclass Weighted Precision: {multi_w_prec:.4f}",
    f"Multiclass Weighted Recall: {multi_w_rec:.4f}",
    f"Multiclass Weighted F1: {multi_w_f1:.4f}",
    f"Multiclass Macro Precision: {multi_m_prec:.4f}",
    f"Multiclass Macro Recall: {multi_m_rec:.4f}",
    f"Multiclass Macro F1: {multi_m_f1:.4f}",
]
print("\n".join(report_lines))

Binary Accuracy: 0.8365
Binary Weighted Precision: 0.8420
Binary Weighted Recall: 0.8365
Binary Weighted F1: 0.8365
Binary Macro Precision: 0.8393
Binary Macro Recall: 0.8391
Binary Macro F1: 0.8365
Multiclass Accuracy: 0.7487
Multiclass Weighted Precision: 0.7755
Multiclass Weighted Recall: 0.7487
Multiclass Weighted F1: 0.7517
Multiclass Macro Precision: 0.7136
Multiclass Macro Recall: 0.7530
Multiclass Macro F1: 0.7190
