In [1]:
# Upgrade to latest compatible versions
!pip install --upgrade transformers accelerate --quiet
print("Upgraded to latest versions!")

Upgraded to latest versions!


In [2]:
# Install compatible versions to avoid errors
!pip install torch transformers==4.36.0 accelerate==0.24.1 pandas scikit-learn tqdm --quiet
print("Compatible versions installed successfully!")

Compatible versions installed successfully!


  error: subprocess-exited-with-error
  
  × Preparing metadata (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [23 lines of output]
      Checking for Rust toolchain....
      Rust not found, installing into a temporary directory
      Python reports SOABI: cp313-win_amd64
      Computed rustc target triple: x86_64-pc-windows-msvc
      Installation directory: C:\Users\maanx\AppData\Local\puccinialin\puccinialin\Cache
      Rustup already downloaded
      Installing rust to C:\Users\maanx\AppData\Local\puccinialin\puccinialin\Cache\rustup
      warn: It looks like you have an existing rustup settings file at:
      warn: C:\Users\maanx\.rustup\settings.toml
      warn: Rustup will install the default toolchain as specified in the settings file,
      warn: instead of the one inferred from the default host triple.
      warn: installing msvc toolchain without its prerequisites
      info: profile set to 'minimal'
      info: default host triple is x86_64-pc-windows-ms

In [3]:
# Check installed versions
import transformers
import accelerate
import torch

print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"Torch version: {torch.__version__}")

Transformers version: 4.56.1
Accelerate version: 1.10.1
Torch version: 2.8.0+cpu


In [4]:
!pip install transformers torch pandas numpy scikit-learn tqdm



Cell 1

In [5]:
# Data handling and manipulation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Deep Learning & NLP
import torch
from transformers import (AutoTokenizer, 
                          AutoModelForSequenceClassification, 
                          TrainingArguments, 
                          Trainer,
                          pipeline)

# Progress bar
from tqdm.notebook import tqdm

# Suppress optional warnings to keep the output clean (optional)
import warnings
warnings.filterwarnings('ignore')

Cell 2

In [6]:
# 1. Load your dataset from the CSV file
df = pd.read_csv('fake_cvs.csv')

# 2. Let's see what the data looks like
print("Dataset Preview:")
print(df.head())
print(f"\nDataset Shape: {df.shape}") # Shows (number_of_rows, number_of_columns)

# 3. Check the distribution of labels (categories)
print("\nLabel Counts:")
print(df['label'].value_counts())

# 4. Map text labels to numbers (e.g., "IT" -> 0, "Marketing" -> 1)
label_list = df['label'].unique().tolist()
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

print(f"\nLabel Mapping: {label_to_id}")

# Apply the mapping to the DataFrame
df['label'] = df['label'].map(label_to_id)

Dataset Preview:
                                                text label
0  مطور خلفي كبير يمتلك خبرة 6 سنوات في بايثون وج...    IT
1  مطور شامل Stack يجيد JavaScript و React.js و N...    IT
2  مهندس DevOps لديه خبرة في خدمات السحابة AWS وا...    IT
3  مطور واجهات أمامية متخصص في Vue.js وأطر عمل CS...    IT
4  عالم بيانات ماهر في Python و R ومكتبات التعلم ...    IT

Dataset Shape: (454, 2)

Label Counts:
label
IT             88
Marketing      80
Finance        76
Engineering    76
Healthcare     72
Education      62
Name: count, dtype: int64

Label Mapping: {'IT': 0, 'Marketing': 1, 'Finance': 2, 'Engineering': 3, 'Healthcare': 4, 'Education': 5}


Cell 3

In [7]:
# 1. Split the data (90% for training, 10% for testing)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), 
    df['label'].tolist(), 
    test_size=0.1, 
    random_state=42 # random_state ensures you get the same split every time
)

print(f"Number of training examples: {len(train_texts)}")
print(f"Number of testing examples: {len(test_texts)}")

# 2. Load the AraBERT Tokenizer
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Tokenize the texts (Convert text to numbers AraBERT understands)
print("\nTokenizing training texts...")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)

print("Tokenizing testing texts...")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

print("Tokenization complete!")

Number of training examples: 408
Number of testing examples: 46

Tokenizing training texts...
Tokenizing testing texts...
Tokenization complete!


Cell 4

In [8]:
# Create a custom Dataset class
class CVDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Convert tokenized inputs to tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset objects
train_dataset = CVDataset(train_encodings, train_labels)
test_dataset = CVDataset(test_encodings, test_labels)

Cell 5

In [9]:
# Load the pre-trained model and adjust it for our number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list)
)

# Print model architecture (optional)
print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}



Cell 6

In [10]:
# Define training parameters
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save model checkpoints
    num_train_epochs=10,             # Number of full passes through the data
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Learning rate warmup
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for logging
    logging_steps=10,                # Log every 10 steps
    eval_strategy="steps",           # Evaluate during training (NEW PARAMETER NAME)
    eval_steps=50,                   # Evaluate every 50 steps
    save_steps=500,                  # Save checkpoint every 500 steps
    load_best_model_at_end=True,     # Load the best model at the end
)

Cell 7

In [11]:
# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Start training!
print("Starting training...")
trainer.train()
print("Training finished!")

Starting training...


Step,Training Loss,Validation Loss
50,1.7359,1.717639
100,1.4911,1.39397
150,0.7681,0.628416
200,0.1619,0.217348
250,0.0561,0.199178
300,0.0117,0.370876
350,0.007,0.239726
400,0.0042,0.277427
450,0.0031,0.088522
500,0.0126,0.077836


Training finished!


Cell 8

In [12]:
# Save the model and tokenizer
model.save_pretrained('./my_cv_classifier')
tokenizer.save_pretrained('./my_cv_classifier')
print("Model and tokenizer saved successfully!")

Model and tokenizer saved successfully!


cell 8.1

In [13]:
import re
from PyPDF2 import PdfReader

def pdf_to_text(pdf_path: str) -> str:
    """
    Extract raw text from a PDF file.
    """
    text = ""
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text.strip()

def extract_description(cv_text: str) -> str:
    """
    Extract description/summary/profile section from CV text.
    """
    headers = [
        "summary", "profile", "professional summary", "career objective",
        "objective", "about me", "personal profile", "description",
        "ملخص", "الملف الشخصي", "الهدف الوظيفي"  # Arabic headers
    ]
    
    lower_text = cv_text.lower()
    for header in headers:
        if header in lower_text:
            start = lower_text.index(header)
            section = cv_text[start:]
            stop_match = re.search(r"(experience|education|skills|projects|خبرة|التعليم|المهارات|المشاريع)", section, re.IGNORECASE)
            if stop_match:
                section = section[:stop_match.start()]
            return section.strip()
    
    # fallback: first paragraph
    first_para = cv_text.split("\n\n")[0]
    if len(first_para.split()) > 10:
        return first_para.strip()
    return cv_text

def extract_description_from_pdf(pdf_path: str) -> str:
    """
    Reads a PDF and extracts only the description section.
    """
    text = pdf_to_text(pdf_path)
    return extract_description(text)


cell 8.2

In [14]:
from transformers import pipeline

# Load trained pipeline
classifier = pipeline('text-classification', 
                      model='./my_cv_classifier', 
                      tokenizer='./my_cv_classifier', 
                      function_to_apply='softmax')


Device set to use cpu


In [15]:
# Example: classify a PDF CV
pdf_path = "sample_cv.pdf"  # change to your CV file
desc = extract_description_from_pdf(pdf_path)

print("Extracted description:\n", desc, "\n")

result = classifier(desc)[0]
predicted_label = id_to_label[int(result['label'].split('_')[-1])]
print(f"Predicted Category: {predicted_label} (Confidence: {result['score']:.4f})")


Extracted description:
 CAREER OBJECTIVE
Fresh IT graduate with a strong interest in Data Science and a solid foundation in machine learning, data analysis, and
programming. Proﬁcient in Python and key ML tools. Eager to grow in a data-driven role and apply analytical thinking to
real-world problems. 

Predicted Category: IT (Confidence: 0.9983)


Cell 9

In [16]:
# Method 1: Using the pipeline (Easiest)
classifier = pipeline('text-classification', 
                      model=model, 
                      tokenizer=tokenizer, 
                      function_to_apply='softmax')

# Test with new examples
test_texts = [
    "خبرة في تحليل البيانات واستخدام SQL و Python لاستخراج النتائج المهمة",  # Arabic IT
    "Experience in data analysis and using SQL and Python",                 # English IT
    "إدارة حملات التسويق على وسائل التواصل الاجتماعي",                      # Arabic Marketing
    "Managing social media marketing campaigns",                            # English Marketing
]

print("Making predictions:\n")
for text in test_texts:
    result = classifier(text)[0]
    predicted_label = id_to_label[int(result['label'].split('_')[-1])] # Convert output to label name
    print(f"Text: {text}")
    print(f"--> Predicted: {predicted_label} (Confidence: {result['score']:.4f})\n")

Device set to use cpu


Making predictions:

Text: خبرة في تحليل البيانات واستخدام SQL و Python لاستخراج النتائج المهمة
--> Predicted: IT (Confidence: 0.9976)

Text: Experience in data analysis and using SQL and Python
--> Predicted: IT (Confidence: 0.9980)

Text: إدارة حملات التسويق على وسائل التواصل الاجتماعي
--> Predicted: Marketing (Confidence: 0.9976)

Text: Managing social media marketing campaigns
--> Predicted: Marketing (Confidence: 0.9974)



matching function

In [17]:
from utils_matcher import match_cv_to_jobs

recommended_jobs = match_cv_to_jobs("IT", jobs_csv="job_posts_part/classified_wadhefa_dataset.csv", top_n=5)
print(recommended_jobs[["description", "url", "predicted_category"]])


                                         description  \
0  - تحديد مواصفات الحلول البديلة والاضافات والتح...   
1  - Manage and optimize the company’s ecommerce ...   

                                                 url predicted_category  
0  https://www.wadhefa.com/details/job/86996/search/                 IT  
1  https://www.wadhefa.com/details/job/86792/search/                 IT  


In [18]:
import pandas as pd

df = pd.read_csv("job_posts_part/classified_wadhefa_dataset.csv")

print("Shape:", df.shape)
print("Unique predicted categories:", df["predicted_category"].unique())


Shape: (3, 4)
Unique predicted categories: ['IT' 'Marketing']
