# Importing Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, AdamWeightDecay
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
# Load and Preprocess Data
# Read the spreadsheet and discard every row that has a missing value in any
# column. Building `data` in one expression (no in-place mutation) means
# re-running this cell always rebuilds the cleaned frame from the file.
data = pd.read_excel('/kaggle/input/urdu-dataset-nlp/Proposed Dataset.xlsx').dropna()
data.head()

Unnamed: 0,sentence,target,sents
0,اس کا تھمب نیل ویگن پورن جیسا لگتا ہے۔,0,اس کا تھمب نیل ویگن پورن جیسا لگتا
1,اس کے بال بہت اچھے ہیں اوہ ما خدا مجھے اس پر و...,0,اس بال بہت اوہ ما خدا مجھے اس واقعی فخر کاش می...
2,یہ ویڈیو کتنی پرانی ہے کیونکہ میں جانتا ہوں کہ...,0,ویڈیو کتنی پرانی کیونکہ میں جانتا آپ کسی بیکن ...
3,اگر وہ مکمل ویگن ہو رہی ہے تو وہ جانتی ہے کہ و...,0,اگر وہ مکمل ویگن وہ جانتی وہ ایسی مصنوعات استع...
4,میرے پسندیدہ سبزی خور کھانے میں سے کچھ سوادج پ...,0,میرے پسندیدہ سبزی خور کھانے میں سے کچھ سوادج پ...


In [3]:
# The 'sents' column provides the model inputs and 'target' the labels;
# both columns are converted to plain Python lists.
texts, labels = (data[column].tolist() for column in ('sents', 'target'))

In [4]:
# Two-stage split: first hold out 20% of all samples as the test set, then
# take 20% of the remainder as the validation set (about 64/16/20
# train/val/test overall). The fixed random_state makes both splits repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


def tokenize_texts(texts, labels, max_length=128):
    """Tokenize `texts` and pair the encodings with `labels` in a tf.data.Dataset.

    Uses the module-level `tokenizer`.

    Args:
        texts: Iterable of samples; every element is passed through `str()`
            before tokenization, so non-string values are accepted.
        labels: Labels, one per element of `texts`.
        max_length: Maximum sequence length handed to the tokenizer; with
            `truncation=True`, longer inputs are cut to this length.

    Returns:
        An unbatched `tf.data.Dataset` whose elements are
        `(encodings_dict, label)` pairs.
    """
    as_strings = list(map(str, texts))

    # NOTE(review): padding=True should pad to the longest sequence of *this*
    # call (per the tokenizer docs), so separately tokenized splits may end up
    # with different sequence lengths — confirm if a fixed length is needed.
    encoded = tokenizer(
        as_strings,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )

    features = dict(encoded)
    return tf.data.Dataset.from_tensor_slices((features, labels))

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [6]:
# Only the training split is shuffled (buffer spans the whole split);
# validation and test keep their original order. All splits use batches of 8.
train_dataset = (
    tokenize_texts(train_texts, train_labels)
    .shuffle(len(train_texts))
    .batch(8)
)
val_dataset, test_dataset = (
    tokenize_texts(split_texts, split_labels).batch(8)
    for split_texts, split_labels in (
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

In [7]:
# Multilingual BERT with a two-class sequence-classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=2,
)

# Adam with weight decay and a small learning rate for fine-tuning.
optimizer = AdamWeightDecay(
    learning_rate=1e-5,
    weight_decay_rate=0.01,
)

# from_logits=True: the loss is computed from raw logits, so no softmax is
# applied to the model output beforehand.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']

model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Train one epoch per fit() call so the weights can be written out with
# save_pretrained() after every epoch.
NUM_EPOCHS = 10  # single source of truth for the loop bound and the log text
training_history = []  # one Keras history dict (loss / metric values) per epoch

for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1} of {NUM_EPOCHS}")
    epoch_result = model.fit(train_dataset, validation_data=val_dataset, epochs=1)
    # Keep the per-epoch metrics instead of discarding fit()'s return value,
    # so the training curve can be inspected after the loop.
    training_history.append(epoch_result.history)

    # Save the model manually after each epoch.
    # Directory names use the zero-based loop index (model_epoch_00 is the
    # checkpoint written after "Epoch 1").
    checkpoint_dir = f'model_epoch_{epoch:02d}'
    model.save_pretrained(checkpoint_dir)
    print(f"Model saved at {checkpoint_dir}")

Epoch 1 of 10
Model saved at model_epoch_00
Epoch 2 of 10
Model saved at model_epoch_01
Epoch 3 of 10
Model saved at model_epoch_02
Epoch 4 of 10
Model saved at model_epoch_03
Epoch 5 of 10
Model saved at model_epoch_04
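Epoch 6 of 10
Model saved at model_epoch_05
Epoch 7 of 10
Model saved at model_epoch_06
Epoch 8 of 10
Model saved at model_epoch_07
Epoch 9 of 10
Model saved at model_epoch_08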
Epoch 10 of 10
Model saved at model_epoch_09


In [15]:
# Evaluate on the held-out test split. evaluate() returns the loss followed by
# the compiled metrics (accuracy), so index 0 is the loss and index 1 the
# accuracy. These are *test* figures, hence the "Test" labels below.
test_results = model.evaluate(test_dataset)
print(f"Test Loss: {test_results[0]}")
print(f"Test Accuracy: {test_results[1]}")

Validation Loss: 0.6393029093742371
Validation Accuracy: 0.778035581111908


In [16]:
# Collect ground-truth labels and predicted classes over the test split, then
# compute accuracy / precision / recall / F1 with scikit-learn.
y_true = []
y_pred = []
# The loop variables are named batch_inputs / batch_labels so they do not
# overwrite the notebook-level `labels` list used by the split cell.
for batch_inputs, batch_labels in test_dataset:
    # Call the model directly on each batch instead of model.predict():
    # predict() is meant for whole datasets and adds per-call overhead
    # (and progress-bar output) when invoked inside a loop.
    logits = model(batch_inputs, training=False).logits
    # Predicted class = index of the largest logit for each sample.
    y_pred.extend(tf.argmax(logits, axis=1).numpy())
    y_true.extend(batch_labels.numpy())

# average='binary' reports the scores for the positive class (label 1).
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.7780355761794276
Precision: 0.8102543798871622
Recall: 0.7741630414223567
F1 Score: 0.7917976495623155


In [17]:
# Write the fine-tuned model to 'Bert_model.keras' in the working directory.
model.save("Bert_model.keras")

