In [None]:
pip install transformers farasapy datasets scikit-learn arabert

In [None]:
# Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

# Seed the Python, NumPy and TensorFlow RNGs in one call for reproducibility
tf.keras.utils.set_random_seed(42)

# Load the dataset and drop rows that are missing either the text or its label
raw_df = pd.read_csv("/kaggle/input/dataset/ghada.csv")  # Replace with your file name
labelled_df = raw_df.dropna(subset=['text', 'label'])

# Fail fast on labels outside the mapping: `.map` would silently turn them into
# NaN and the integer cast below would then raise a cryptic conversion error.
label_map = {'Negative': 0, 'Positive': 1}
unexpected_labels = (
    labelled_df.loc[~labelled_df['label'].isin(list(label_map)), 'label'].unique().tolist()
)
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values {unexpected_labels}; expected one of {list(label_map)}"
    )

# Build a new frame instead of mutating in place so the step is safe to re-run
df = labelled_df.assign(label=labelled_df['label'].map(label_map).astype(int))

# Train-test split (stratified so both splits keep the class proportions)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)
# Pretrained checkpoint shared by the tokenizer and the classifier
model_name = "aubmindlab/bert-base-arabertv02"

# One output logit per sentiment class (Negative / Positive)
NUM_CLASSES = 2
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize(texts, max_len=128):
    """Encode texts with the module-level `tokenizer` into fixed-length tensors.

    Args:
        texts: A single string or any iterable of strings (pandas Series,
            list, NumPy array, ...).
        max_len: Sequence length; every example is truncated and padded to
            exactly this many tokens.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of TensorFlow constants, one
        row per input text and ``max_len`` columns.
    """
    # A bare string would otherwise be split into characters by `list(...)`
    if isinstance(texts, str):
        texts = [texts]
    # `list(...)` accepts any iterable, not only objects exposing `.tolist()`
    batch = list(texts)
    encodings = tokenizer(batch, truncation=True, padding="max_length", max_length=max_len)
    return tf.constant(encodings['input_ids']), tf.constant(encodings['attention_mask'])

# Tokenize train and test data
train_input_ids, train_attention_mask = tokenize(train_texts)
test_input_ids, test_attention_mask = tokenize(test_texts)
train_labels = tf.constant(train_labels.values)
test_labels = tf.constant(test_labels.values)

# Balanced class weights to counter label imbalance during training.
# The weights come back in the order of `classes`, so key the dict by the
# actual class id rather than by position (the two only coincide when the
# labels are exactly 0..n-1).
train_label_array = train_labels.numpy()
classes = np.unique(train_label_array)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_label_array)
class_weights = {int(cls): float(weight) for cls, weight in zip(classes, weights)}

# Training hyper-parameters, gathered in one place so they are easy to tune
LEARNING_RATE = 2e-5
MAX_EPOCHS = 5
BATCH_SIZE = 32
VALIDATION_FRACTION = 0.2
EARLY_STOPPING_PATIENCE = 2

# NOTE(review): with TF >= 2.16 `tf.keras` may resolve to Keras 3 while the
# transformers TF models are built on legacy Keras; if `compile` rejects the
# optimizer, try TF_USE_LEGACY_KERAS=1 before importing TensorFlow — confirm
# against the target environment.

# Compile the model: the classifier head emits raw logits, hence from_logits=True
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Stop once validation loss has not improved for EARLY_STOPPING_PATIENCE epochs
# and roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)

# Train the model; part of the training data is held out for validation
train_inputs = [train_input_ids, train_attention_mask]
history = model.fit(
    train_inputs,
    train_labels,
    validation_split=VALIDATION_FRACTION,
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    class_weight=class_weights,
    callbacks=[early_stopping],
)

# Run inference on the held-out test split and keep the highest-scoring class
test_inputs = [test_input_ids, test_attention_mask]
preds = model.predict(test_inputs)
predicted_labels = tf.argmax(preds.logits, axis=1)

In [None]:
# Classification metrics on the test split
report = classification_report(test_labels, predicted_labels)
print("\nClassification Report:\n", report)
for caption, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(caption, scorer(test_labels, predicted_labels))

# Training curves: one panel per metric, train vs. validation
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
panels = (
    ('Accuracy', 'accuracy', 'val_accuracy'),
    ('Loss', 'loss', 'val_loss'),
)
for ax, (title, train_key, val_key) in zip(axs, panels):
    ax.plot(history.history[train_key], label='Train')
    ax.plot(history.history[val_key], label='Val')
    ax.set_title(title)
    ax.legend()
plt.show()

In [None]:
# Persist the model in Hugging Face format together with its tokenizer, so the
# directory can be reloaded with `from_pretrained` for inference. A plain Keras
# `model.save` would not include the tokenizer files.
save_dir = "/kaggle/working/my_arabert_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [2]:
import tensorflow as tf
import sys

# Record the runtime versions this notebook was executed with
for caption, version in (
    ("TensorFlow version:", tf.__version__),
    ("Python version:", sys.version),
):
    print(caption, version)

TensorFlow version: 2.17.1
Python version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
