Imports & Dataset Loading

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the dataset you scraped
df = pd.read_csv("wadhefa_dataset.csv")

# Drop unused column
df = df[['description', 'category']]
print("Dataset Preview:")
print(df.head())

print("\nCategory counts:")
print(df['category'].value_counts())


Dataset Preview:
                                         description category
0  تعلن شركة تقنية معلومات عن توفر شاغر وظيفي بمس...       IT
1  '- تصميم الشبكات (LAN, WAN, WLAN)\n- ادارة اجه...       IT
2  '- مطلوب رسام اوتوكاد للرسم وقراءة المخططات وح...       IT
3  الوصف الوظيفي:\n- تطوير وبناء تطبيقات موبايل م...       IT
4  '- مطلوب مبرمج لديه خبرة في في Oracle APEX للع...       IT

Category counts:
category
Marketing      99
IT             98
Finance        97
Engineering    95
Education      95
Healthcare     93
Name: count, dtype: int64


Encode Labels

In [2]:
# Create label mapping
label_list = df['category'].unique().tolist()
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Add numeric labels
df['label'] = df['category'].map(label_to_id)

print("Label Mapping:", label_to_id)


Label Mapping: {'IT': 0, 'Marketing': 1, 'Finance': 2, 'Engineering': 3, 'Healthcare': 4, 'Education': 5}


Train-Test Split

In [3]:
# Split the dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['description'].tolist(),
    df['label'].tolist(),
    test_size=0.1,
    random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Testing samples: {len(test_texts)}")


Training samples: 519
Testing samples: 58


Tokenization

In [4]:
# Use AraBERT tokenizer
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


Dataset Class

In [5]:
class JobDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = JobDataset(train_encodings, train_labels)
test_dataset = JobDataset(test_encodings, test_labels)


Model Setup

In [6]:
# Load pre-trained AraBERT with classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list)
)

print(model.config)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}



Training Arguments & Trainer

In [7]:
training_args = TrainingArguments(
    output_dir='./job_results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_dir='./job_logs',
    eval_strategy="steps",
    eval_steps=50,
    save_steps=500,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

print("Starting training...")
trainer.train()
print("Training finished!")


Starting training...




Step,Training Loss,Validation Loss
50,No log,1.11683
100,No log,0.866959
150,No log,0.662744
200,No log,0.619623
250,No log,0.665401
300,No log,0.696986


Training finished!


Save Model

In [8]:
# Save the trained classifier
model.save_pretrained('./job_classifier')
tokenizer.save_pretrained('./job_classifier')
print("Job classifier model saved successfully!")


Job classifier model saved successfully!


Inference on New Job Posts

In [9]:
from transformers import pipeline

# Load the trained pipeline
classifier = pipeline('text-classification',
                      model='./job_classifier',
                      tokenizer='./job_classifier',
                      function_to_apply='softmax')

# Example job descriptions
test_jobs = [
    "مطلوب مطور بايثون للعمل على تطبيقات ويب",
    "إدارة الحملات التسويقية عبر وسائل التواصل الاجتماعي",
    "وظيفة في قسم المحاسبة لمتابعة التقارير المالية"
]

print("\nPredictions:")
for job in test_jobs:
    result = classifier(job)[0]
    pred_label = id_to_label[int(result['label'].split('_')[-1])]
    print(f"Job: {job}")
    print(f"--> Predicted: {pred_label} (Confidence: {result['score']:.4f})\n")


Device set to use cpu



Predictions:
Job: مطلوب مطور بايثون للعمل على تطبيقات ويب
--> Predicted: IT (Confidence: 0.9579)

Job: إدارة الحملات التسويقية عبر وسائل التواصل الاجتماعي
--> Predicted: Marketing (Confidence: 0.9305)

Job: وظيفة في قسم المحاسبة لمتابعة التقارير المالية
--> Predicted: Finance (Confidence: 0.9437)

