## Opening

### Modules init

In [1]:
# System/env config
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

parent_dir = Path.cwd().resolve().parent
sys.path.append(str(parent_dir))
print('Current dir for import:', parent_dir)

from src.config import Config
config = Config()
print('Config initialized')


import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import load_dataset

# Modules for data 
import re
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from datasets import Dataset
from datasets import load_from_disk
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)

import evaluate
import torch




Current dir for import: C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting


  from .autonotebook import tqdm as notebook_tqdm


Config initialized


In [2]:
# Temp. Used for fast init and testing
with open(os.path.join(os.getcwd(), 'id2label.json'), "r", encoding="utf-8") as f:
    id2label = json.load(f)
with open(os.path.join(os.getcwd(), 'label2id.json'), "r", encoding="utf-8") as f:
    label2id = json.load(f)    

### Download ETL Data

In [2]:
df = pd.read_parquet(config.get('cleaned_parquet'))

In [3]:
df

Unnamed: 0,category,job_posting
6,Automotive,£500 Bonus on Attandance during Black Friday a...
10,Manufacturing,"$2, 500 POTENTIAL RETENTION BONUS! WHAT'S NEW ..."
17,Automotive,£500 Bonus on Attandance during Black Friday a...
18,Automotive,£500 Bonus on Attandance during Black Friday a...
20,Government,"AS9102 First article & ANSI Y14.5M 1982 "" Basi..."
...,...,...
1760381,Sales,Structured training and development programmes...
1760386,Management,Be responsible for managing impact to people f...
1760387,Operations,"At Liberty Mutual, technology isn't just a par..."
1760388,Science,€30k per annum. 12 month contract. Dublin. We ...


### Preprocess cleaned data For BERT - numbers and labels

For BERT we need to cleand data from uneccessary numbers, and encode labels

In [4]:
def clean_for_bert(text: str) -> str:
    text = str(text)
    text = re.sub(r"<.*?>", " ", text)                     # HTML
    text = re.sub(r"http\S+|www\.\S+", " ", text)          # URL
    text = re.sub(r"\S+@\S+", " ", text)                   # email
    text = re.sub(r"\+?\d[\d\-\(\) ]{7,}\d", " ", text)    # phones
    text = re.sub(r"\s+", " ", text).strip()
    return text

assert {"job_posting", "category"}.issubset(df.columns), df.columns

df["job_posting"] = df["job_posting"].apply(clean_for_bert)


labels = sorted(df["category"].unique())
label2id = {lbl:i for i,lbl in enumerate(labels)}
id2label = {i:lbl for lbl,i in label2id.items()}
df["category"] = df["category"].map(label2id).astype(int)

with open("label2id.json", "w") as f: json.dump(label2id, f)
with open("id2label.json", "w") as f: json.dump(id2label, f)

df = df.rename(columns={'job_posting':'text', 'category': 'labels'})
df.to_parquet('../data/02_cleaned/bert_train_data.parquet')

## Tokenization and Training

### Get Data

In [15]:
with open(os.path.join(os.getcwd(), 'id2label.json'), "r", encoding="utf-8") as f:
    id2label = json.load(f)
with open(os.path.join(os.getcwd(), 'label2id.json'), "r", encoding="utf-8") as f:
    label2id = json.load(f)    

In [6]:
df = pd.read_parquet('../data/02_cleaned/bert_train_data.parquet')

In [20]:
df

Unnamed: 0,text,labels
6,£500 Bonus on Attandance during Black Friday a...,2
10,"$2, 500 POTENTIAL RETENTION BONUS! WHAT'S NEW ...",16
17,£500 Bonus on Attandance during Black Friday a...,2
18,£500 Bonus on Attandance during Black Friday a...,2
20,"AS9102 First article & ANSI Y14.5M 1982 "" Basi...",9
...,...,...
1760381,Structured training and development programmes...,21
1760386,Be responsible for managing impact to people f...,15
1760387,"At Liberty Mutual, technology isn't just a par...",18
1760388,€30k per annum. 12 month contract. Dublin. We ...,22


### Split

In [10]:
train, test_df = train_test_split(df[["text", "labels"]], test_size=0.05, random_state=42, stratify=df["labels"])

train_df, val_df = train_test_split(
    train[["text", "labels"]], test_size=0.1, random_state=42, stratify=train["labels"]
)

### Tokenization

In [None]:
model_name = "distilbert/distilbert-base-uncased"  
max_length = 512       # tokens in sequence             
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length,
        padding=False
    )

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df,   preserve_index=False)

train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(tokenize,   batched=True, remove_columns=["text"])


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 541403/541403 [07:19<00:00, 1232.30 examples/s]
Map: 100%|██████████| 60156/60156 [00:50<00:00, 1180.56 examples/s]


In [1]:
train_ds.save_to_disk("../data/tokenized/train")
val_ds.save_to_disk("../data/tokenized/val")

NameError: name 'train_ds' is not defined

### Load Tokenized Data

Important that data was tokenized without padding for sequences, we should set Collator later

In [14]:
model_name = "distilbert/distilbert-base-uncased"  
max_length = 512       # tokens in sequence             
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)
with open(os.path.join(os.getcwd(), 'id2label.json'), "r", encoding="utf-8") as f:
    id2label = json.load(f)
with open(os.path.join(os.getcwd(), 'label2id.json'), "r", encoding="utf-8") as f:
    label2id = json.load(f)    

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
train_ds = load_from_disk("../data/tokenized/train")
val_ds   = load_from_disk("../data/tokenized/val")

In [5]:
train_ds.set_format(type="torch")
val_ds.set_format(type="torch")

### Arguments and preobject parametrs

In [6]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
os.cpu_count()

16

In [8]:
import multiprocessing
multiprocessing.cpu_count()

16

In [None]:
training_args = TrainingArguments(
    output_dir=config.get('models_dir'),
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=200,
    per_device_train_batch_size=16,     
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,    
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    fp16=True,                         
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none",
    optim="adamw_torch_fused",  
    dataloader_num_workers=8, 
)

### Training

In [16]:
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device count:", torch.cuda.device_count())
    print("Current device index:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(0))
    print("CUDA version (from toolkit):", torch.version.cuda)
else:
    print("No CUDA detected by PyTorch")

Torch version: 2.8.0+cu126
CUDA available: True
CUDA device count: 1
Current device index: 0
Device name: NVIDIA GeForce RTX 3050 Laptop GPU
CUDA version (from toolkit): 12.6


In [11]:
print(torch.cuda.is_available())  

True


In [None]:

# datasets expects 'labels'
# train_ds = train_ds.rename_column("label", "labels")
# val_ds   = val_ds.rename_column("label", "labels")



num_labels = 25

config_train = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config_train)

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=y_pred, references=y_true)["accuracy"],
        "macro_f1": metric_f1.compute(predictions=y_pred, references=y_true, average="macro")["f1"]
    }



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

#last_checkpoint = config.get('model') / 'checkpoint-32000"
#trainer.train(resume_from_checkpoint=last_checkpoint)

eval_metrics = trainer.evaluate()
print(eval_metrics) 

trainer.save_model("bert_jobcls/best_model")
tokenizer.save_pretrained("bert_jobcls/best_model")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Macro F1
1000,2.1621,2.068564,0.457311,0.264657
2000,1.7332,1.691451,0.53456,0.42832
3000,1.5885,1.523974,0.564881,0.470517
4000,1.4202,1.406068,0.595369,0.531457
5000,1.3639,1.339099,0.60506,0.556096
6000,1.2732,1.294651,0.617012,0.567973
7000,1.2493,1.242742,0.630428,0.58466
8000,1.2316,1.203903,0.636545,0.603474
9000,1.2195,1.196508,0.639238,0.601018
10000,1.1926,1.165419,0.64469,0.610899


### Class Test

In [3]:
from src.model import ClassifierModel 

In [5]:
Path.cwd()

WindowsPath('c:/Users/Мариан/Desktop/Jupyter Notes/Projects/Trainee_iFortex/Git/job_posting/notebooks')

In [7]:
finetuned_dir = "../models/checkpoint-33000"

clf = ClassifierModel(finetuned_dir, finetuned=True, max_length=256)

text = "Senior Python developer needed in Amsterdam, experience with NLP required."
pred = clf.predict(text, return_probas=True)
print(pred)

{'label': 'IT', 'score': 0.86962890625, 'probas': {'Accounting': 0.0030994415283203125, 'Administrative': 0.0008625984191894531, 'Automotive': 0.00019359588623046875, 'Banking': 0.0011854171752929688, 'Construction': 0.0017900466918945312, 'Consulting': 0.0108795166015625, 'Education': 0.0007853507995605469, 'Engineering': 0.09716796875, 'Finance': 0.002796173095703125, 'Government': 0.00012373924255371094, 'Healthcare': 9.053945541381836e-05, 'Hospitality': 1.9252300262451172e-05, 'IT': 0.86962890625, 'Insurance': 0.001094818115234375, 'Legal': 0.000850677490234375, 'Management': 0.0002815723419189453, 'Manufacturing': 0.0005865097045898438, 'Marketing': 0.0009908676147460938, 'Operations': 0.0027217864990234375, 'Procurement': 0.00010138750076293945, 'Retail': 6.091594696044922e-05, 'Sales': 0.00040650367736816406, 'Science': 0.0016508102416992188, 'Telecommunications': 0.0024089813232421875, 'Transportation': 0.0001932382583618164}}
