## Intro

### Load Modules and configs

In [1]:
# System/env config
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

parent_dir = Path.cwd().resolve().parent
sys.path.append(str(parent_dir))
print('Current dir for import:', parent_dir)

from src.config import Config
config = Config()
print('Config initialized')


import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import load_dataset

# Modules for data 
import re
import json
import numpy as np
import pandas as pd
from typing import Any
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from datasets import Dataset, DatasetDict
from datasets import load_from_disk, load_dataset
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback, AutoModelForTokenClassification, DataCollatorForTokenClassification
)

import evaluate
import torch

import sklearn_crfsuite
from sklearn_crfsuite import CRF
from seqeval.metrics import f1_score as f1_span, precision_score as p_span, recall_score as r_span, classification_report
from seqeval.scheme import IOB2

Current dir for import: C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting


  from .autonotebook import tqdm as notebook_tqdm


Config initialized


### Dataset load

In [20]:


ds = load_dataset("jjzha/skillspan", cache_dir=config['raw_dir'])

In [21]:
ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'tokens', 'tags_skill', 'tags_knowledge', 'source'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['idx', 'tokens', 'tags_skill', 'tags_knowledge', 'source'],
        num_rows: 3174
    })
    test: Dataset({
        features: ['idx', 'tokens', 'tags_skill', 'tags_knowledge', 'source'],
        num_rows: 3569
    })
})

### Extract

In [None]:
df_train        = ds['train'].select_columns(['tokens', 'tags_skill']).to_pandas()
df_validation   = ds['validation'].select_columns(['tokens', 'tags_skill']).to_pandas()
df_test         = ds['test'].select_columns(['tokens', 'tags_skill']).to_pandas()

In [23]:
df_train

Unnamed: 0,tokens,tags_skill
0,"[Senior, QA, Engineer, (, m/f/d, ), <ORGANIZAT...","[O, O, O, O, O, O, O]"
1,"[<ADDRESS>, <ADDRESS>, <ADDRESS>, <ADDRESS>, <...","[O, O, O, O, O]"
2,"[Date, posted:, 2021-07-14]","[O, O, O]"
3,"[Likes:, 0, Dislikes:, 0, Love:, 0]","[O, O, O, O, O, O]"
4,"[Job, description:]","[O, O]"
...,...,...
4795,"[Furthermore, we, expect, you, to, be, able, t...","[O, O, O, O, O, O, O, O, B, I, I, I, I, I, I, ..."
4796,"[You, are, structured, and, proactive, and, yo...","[O, O, B, O, B, O, O, O, O, B, O, O, O, O, O, ..."
4797,"[You, are, a, holistic, and, fact, based, prag...","[O, O, O, B, O, B, I, B, B, I, O, O, O, O, O, ..."
4798,"[Last, but, not, least, you, both, have, the, ...","[O, O, O, O, O, O, O, O, B, I, I, I, I, I, I, ..."


In [24]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tokens      4800 non-null   object
 1   tags_skill  4800 non-null   object
dtypes: object(2)
memory usage: 75.1+ KB


Protoryping output CoNLL

In [25]:
# for tokens, tags in zip(df_train["tokens"], df_train["tags_skill"]):
#     for t, y in zip(tokens, tags):
#         print(f"{t}\t{y}")
#     print()

### Convert to Conll

In [26]:
# -*- coding: utf-8 -*-
# Prepare CoNLL files using only the 'tags_skill' column.
# All comments are in English.
def normalize_bio_tags(tags, label="SKILL"):
    """Convert bare BIO like ['O','B','I',...] into typed BIO like ['O','B-SKILL','I-SKILL',...]."""
    out = []
    for t in tags:
        if t == "O":
            out.append("O")
        elif t == "B":
            out.append(f"B-{label}")
        elif t == "I":
            out.append(f"I-{label}")
        else:
            # already typed or unexpected; keep as is
            out.append(t)
    return out

def validate_bio_sequence(tags):
    """
    Quick BIO validator: 'I-X' must follow 'B-X' or 'I-X' of the same type.
    Returns True if valid.
    """
    prev_type = None
    prev_tag = "O"
    for t in tags:
        if t == "O":
            prev_tag, prev_type = "O", None
            continue
        m = re.match(r"([BI])-(.+)", t)
        if not m:
            return False
        bi, lab = m.groups()
        if bi == "B":
            prev_tag, prev_type = "B", lab
        else:  # I
            if prev_tag == "O" or prev_type != lab:
                return False
            prev_tag = "I"
    return True

def write_conll_from_df(df: pd.DataFrame, tokens_col="tokens", tags_col="tags_skill", out_path: Path = Path("train.conll")):
    """
    Write a classic CoNLL file with two columns: token<TAB>label.
    Assumes each row has a list of tokens and a same-length list of BIO tags.
    """
    skipped = 0
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            tokens = list(row[tokens_col])
            tags   = list(row[tags_col])
            assert len(tokens) == len(tags), "Tokens and tags length mismatch"
            if not validate_bio_sequence(tags):
                print(f"Invalid BIO sequence: {tags}\n\t\t\tLine: {tokens}")
                print('This line will be skipped')
                skipped += 1
                continue
            for t, y in zip(tokens, tags):
                f.write(f"{t}\t{y}\n")
            f.write("\n")
    print(f'Number of skipped rows: {skipped}')

# === Usage example ===
# Suppose you already loaded df_train, df_dev, df_test with columns: 
#   ["idx", "tokens", "tags_skill", "tags_knowledge", "source"]

for df in (df_train, df_validation, df_test):
    df["tags_skill"] = df["tags_skill"].apply(lambda lst: normalize_bio_tags(lst, label="SKILL"))

out_dir = config['validated_dir']
write_conll_from_df(df_train,      out_path=out_dir / "train.conll"     )
write_conll_from_df(df_validation, out_path=out_dir / "validation.conll")
write_conll_from_df(df_test,       out_path=out_dir / "test.conll"      )

print("Done. CoNLL files saved to:", out_dir.resolve())



Number of skipped rows: 0
Number of skipped rows: 0
Invalid BIO sequence: ['O', 'O', 'O', 'O', 'O', 'O', 'I-SKILL', 'I-SKILL', 'I-SKILL']
			Line: ['Experience', 'with', 'agile', 'approaches', 'to', 'software', 'testing', 'and', 'development']
This line will be skipped
Number of skipped rows: 1
Done. CoNLL files saved to: C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting\data\03_validated


### Read Conll

In [14]:
def read_conll(path: Path) -> tuple[list[list[str]], list[list[str]]]:
    """
    Read a two-column CoNLL file (token<TAB>tag>), sentences separated by blank lines.
    Returns: tokens_per_sent, tags_per_sent as lists of lists.
    """
    s_tokens, s_tags = [], []
    tokens, tags = [], []
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if tokens:
                    s_tokens.append(tokens)
                    s_tags.append(tags)
                    tokens, tags = [], []
                continue
            parts = line.split("\t")
            if len(parts) != 2:
                # Skip malformed lines safely
                continue
            tok, lab = parts
            tokens.append(tok)
            tags.append(lab)
    if tokens:
        s_tokens.append(tokens)
        s_tags.append(tags)
    return s_tokens, s_tags



X_tokens_train, y_train = read_conll(config['train_connl'])
X_tokens_val,   y_val   = read_conll(config['validation_connl'])
X_tokens_test,  y_test  = read_conll(config['test_connl'])


print(f"#train sents: {len(X_tokens_train)}  #val: {len(X_tokens_val)}  #test: {len(X_tokens_test)}")


#train sents: 4800  #val: 3174  #test: 3568


## CRF Modeling

### Building features

In [29]:
def build_skill_gazetteers(tokens_per_sent: list[list[str]], tags_per_sent: list[list[str]]) -> dict[str, set[str]]:
    """
    Build simple gazetteers from training data:
      - skill_unigrams: lowercased tokens appearing inside any SKILL span
      - skill_bigrams:  lowercased bigrams inside SKILL spans
    """
    skill_unigrams = set()
    skill_bigrams  = set()
    for toks, labs in zip(tokens_per_sent, tags_per_sent):
        # collect indices of tokens inside SKILL spans (B-SKILL / I-SKILL)
        inside = [i for i, t in enumerate(labs) if t.startswith("B-") or t.startswith("I-")]
        for i in inside:
            skill_unigrams.add(toks[i].lower())
        # bigrams (consecutive tokens both inside a span)
        for i in range(len(toks) - 1):
            if (labs[i].startswith(("B-","I-"))) and (labs[i+1].startswith(("B-","I-"))):
                skill_bigrams.add((toks[i].lower(), toks[i+1].lower()))
    return {"skill_unigrams": skill_unigrams, "skill_bigrams": skill_bigrams}

gazetteers = build_skill_gazetteers(X_tokens_train, y_train)
len(gazetteers["skill_unigrams"]), len(gazetteers["skill_bigrams"])

(2184, 5944)

In [30]:
def char_ngrams(token: str, n: int) -> list[str]:
    token = token
    return [token[i:i+n] for i in range(len(token) - n + 1)] if len(token) >= n else []

def top_char_ngrams(tokens_per_sent: list[list[str]], top_k: int = 400) -> dict[str, set[str]]:
    """
    Compute most frequent char 2-grams and 3-grams from TRAIN tokens only and keep top_k for each size.
    This caps feature explosion while still giving CRF helpful subword signals.
    """
    c2 = Counter()
    c3 = Counter()
    for sent in tokens_per_sent:
        for tok in sent:
            c2.update(char_ngrams(tok, 2))
            c3.update(char_ngrams(tok, 3))
    top2 = set([ng for ng, _ in c2.most_common(top_k)])
    top3 = set([ng for ng, _ in c3.most_common(top_k)])
    return {"char2": top2, "char3": top3}

char_ngram_vocab = top_char_ngrams(X_tokens_train, top_k=400)
len(char_ngram_vocab["char2"]), len(char_ngram_vocab["char3"])

(400, 400)

In [31]:
PREF_SIZES = (2, 3, 4)
SUFF_SIZES = (2, 3, 4)

def word_shape(token: str) -> str:
    """
    Map token to a coarse 'shape' (e.g., 'Xx', 'xxx', 'd-dd', 'xxx-xx', 'Xx.' etc.).
    Helps generalize across casing/digits/punct.
    """
    shape = []
    for ch in token:
        if ch.isupper():
            shape.append('X')
        elif ch.islower():
            shape.append('x')
        elif ch.isdigit():
            shape.append('d')
        elif ch in "-_/\\.":
            shape.append(ch)
        else:
            shape.append('p')  # other punct
    # collapse runs like XXX -> X, xxx -> x to reduce sparsity
    collapsed = []
    for ch in shape:
        if not collapsed or collapsed[-1] != ch:
            collapsed.append(ch)
    return ''.join(collapsed)

def token_features(sent: list[str], i: int,
                   gaz: dict[str, set[str]],
                   char_vocab: dict[str, set[str]]) -> dict[str, Any]:
    """
    Build a feature dict for token sent[i].
    Includes:
      - bias, word lowercase, shape, isupper/istitle/isdigit/has_digit/has_hyphen
      - prefixes/suffixes, limited char 2/3-grams (only if in top lists)
      - simple gazetteers (unigram + adjacent bigrams), plus +/-1 and +/-2 window features
    """
    token = sent[i]
    lower = token.lower()
    feats = {
        'bias': 1.0,
        'word.lower': lower,
        'word.shape': word_shape(token),
        'word.isupper': token.isupper(),
        'word.istitle': token.istitle(),
        'word.isdigit': token.isdigit(),
        'word.has_digit': any(ch.isdigit() for ch in token),
        'word.has_hyphen': '-' in token,
        'word.has_dot': '.' in token,
        'word.has_slash': '/' in token or '\\' in token,
        'gaz.in_skill_unigram': (lower in gaz['skill_unigrams']),
    }
    # prefixes / suffixes
    for n in PREF_SIZES:
        feats[f'pref{n}'] = lower[:n] if len(lower) >= n else lower
    for n in SUFF_SIZES:
        feats[f'suff{n}'] = lower[-n:] if len(lower) >= n else lower

    # limited char 2/3-grams (only those that are in top vocab to control dimensionality)
    for ng in char_ngrams(token, 2):
        if ng in char_vocab['char2']:
            feats[f'char2={ng}'] = True
    for ng in char_ngrams(token, 3):
        if ng in char_vocab['char3']:
            feats[f'char3={ng}'] = True

    # context features (+/- 1, +/- 2)
    def add_ctx(j: int, tag: str):
        if 0 <= j < len(sent):
            w = sent[j]
            lw = w.lower()
            feats[f'{tag}.lower'] = lw
            feats[f'{tag}.shape'] = word_shape(w)
            feats[f'{tag}.istitle'] = w.istitle()
            feats[f'{tag}.isupper'] = w.isupper()

    add_ctx(i-1, '-1')
    add_ctx(i-2, '-2')
    add_ctx(i+1, '+1')
    add_ctx(i+2, '+2')

    # gazetteer bigrams with neighbors (prev+cur, cur+next)
    if i-1 >= 0:
        feats['gaz.prev_cur_in_skill_bigram'] = (sent[i-1].lower(), lower) in gaz['skill_bigrams']
    if i+1 < len(sent):
        feats['gaz.cur_next_in_skill_bigram'] = (lower, sent[i+1].lower()) in gaz['skill_bigrams']

    return feats

def sent2features(sent: list[str],
                  gaz: dict[str, set[str]],
                  char_vocab: dict[str, set[str]]) -> list[dict[str, Any]]:
    return [token_features(sent, i, gaz, char_vocab) for i in range(len(sent))]

In [32]:
def to_crf_Xy(tokens_per_sent: list[list[str]],
              tags_per_sent: list[list[str]],
              gaz: dict[str, set[str]],
              char_vocab: dict[str, set[str]]):
    X = [sent2features(s, gaz, char_vocab) for s in tokens_per_sent]
    y = [list(tags) for tags in tags_per_sent]
    return X, y

X_train, y_train_ = to_crf_Xy(X_tokens_train, y_train, gazetteers, char_ngram_vocab)
X_val_,  y_val_   = to_crf_Xy(X_tokens_val,   y_val,   gazetteers, char_ngram_vocab)
X_test_, y_test_  = to_crf_Xy(X_tokens_test,  y_test,  gazetteers, char_ngram_vocab)

# Quick sanity check
print(len(X_train), len(y_train_), len(X_val_), len(y_val_))
print(len(X_train[0]), len(y_train_[0]), X_train[0][0])

4800 4800 3174 3174
7 7 {'bias': 1.0, 'word.lower': 'senior', 'word.shape': 'Xx', 'word.isupper': False, 'word.istitle': True, 'word.isdigit': False, 'word.has_digit': False, 'word.has_hyphen': False, 'word.has_dot': False, 'word.has_slash': False, 'gaz.in_skill_unigram': True, 'pref2': 'se', 'pref3': 'sen', 'pref4': 'seni', 'suff2': 'or', 'suff3': 'ior', 'suff4': 'nior', 'char2=Se': True, 'char2=en': True, 'char2=ni': True, 'char2=io': True, 'char2=or': True, '+1.lower': 'qa', '+1.shape': 'X', '+1.istitle': False, '+1.isupper': True, '+2.lower': 'engineer', '+2.shape': 'Xx', '+2.istitle': True, '+2.isupper': False, 'gaz.cur_next_in_skill_bigram': False}


### Train CRF pand evaluate

In [33]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,               # L1
    c2=0.1,               # L2
    max_iterations=200,
    all_possible_transitions=True
)

crf.fit(X_train, y_train_)
print("CRF trained.")

CRF trained.


In [34]:
def eval_seqeval(y_true, y_pred, title: str = "Eval"):
    print(f"=== {title} (span-level) ===")
    p = p_span(y_true, y_pred, scheme=IOB2)
    r = r_span(y_true, y_pred, scheme=IOB2)
    f = f1_span(y_true, y_pred, scheme=IOB2)
    print(f"Precision: {p:.4f}  Recall: {r:.4f}  F1: {f:.4f}")
    print(classification_report(y_true, y_pred, scheme=IOB2, digits=4))

y_val_pred  = crf.predict(X_val_)
y_test_pred = crf.predict(X_test_)

eval_seqeval(y_val_,  y_val_,  title="Val [oracle sanity]")     # sanity: should be 1.0
eval_seqeval(y_val_,  y_val_pred,  title="Val")
eval_seqeval(y_test_, y_test_pred, title="Test")

=== Val [oracle sanity] (span-level) ===
Precision: 1.0000  Recall: 1.0000  F1: 1.0000
              precision    recall  f1-score   support

       SKILL     1.0000    1.0000    1.0000      1070

   micro avg     1.0000    1.0000    1.0000      1070
   macro avg     1.0000    1.0000    1.0000      1070
weighted avg     1.0000    1.0000    1.0000      1070

=== Val (span-level) ===
Precision: 0.3375  Recall: 0.1000  F1: 0.1543
              precision    recall  f1-score   support

       SKILL     0.3375    0.1000    0.1543      1070

   micro avg     0.3375    0.1000    0.1543      1070
   macro avg     0.3375    0.1000    0.1543      1070
weighted avg     0.3375    0.1000    0.1543      1070

=== Test (span-level) ===
Precision: 0.3607  Recall: 0.1009  F1: 0.1577
              precision    recall  f1-score   support

       SKILL     0.3607    0.1009    0.1577      1090

   micro avg     0.3607    0.1009    0.1577      1090
   macro avg     0.3607    0.1009    0.1577      1090
weight

## BERT-MLP

### Read and prepare data

In [15]:
train_path = config['train_connl']
val_path   = config['validation_connl']
test_path  = config['test_connl']

X_train_tok, y_train_bio = read_conll(train_path)
X_val_tok,   y_val_bio   = read_conll(val_path)
X_test_tok,  y_test_bio  = read_conll(test_path)

In [16]:
print(len(X_train_tok), len(X_val_tok), len(X_test_tok))

4800 3174 3568


In [17]:
label_list = sorted({lab for seq in (y_train_bio + y_val_bio + y_test_bio) for lab in seq})
assert all(lab == "O" or lab.startswith(("B-","I-")) for lab in label_list), f"Non-BIO labels: {label_list}"


In [18]:
label_list = ["O"] + [l for l in label_list if l != "O"]
label2id = {l:i for i,l in enumerate(label_list)}
id2label = {i:l for l,i in label2id.items()}
label_list, label2id, id2label

(['O', 'B-SKILL', 'I-SKILL'],
 {'O': 0, 'B-SKILL': 1, 'I-SKILL': 2},
 {0: 'O', 1: 'B-SKILL', 2: 'I-SKILL'})

In [39]:
def to_hf(tokens: list[list[str]], labels: list[list[str]]) -> Dataset:
    ids = [[label2id[t] for t in seq] for seq in labels]
    return Dataset.from_dict({"tokens": tokens, "labels": ids})

In [40]:
ds = DatasetDict({
    "train": to_hf(X_train_tok, y_train_bio),
    "validation": to_hf(X_val_tok, y_val_bio),
    "test": to_hf(X_test_tok, y_test_bio),
})
ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3174
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 3568
    })
})

In [41]:
model_name = "bert-base-cased"  # casing helps for NER
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize_and_align(batch: dict[str, Any], label_all_tokens: bool = False) -> dict[str, Any]:
    enc = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False,
        max_length=256,
    )
    all_labels = []
    for i in range(len(batch["tokens"])):
        word_ids = enc.word_ids(batch_index=i)
        labels_i = batch["labels"][i]
        aligned = []
        prev_wid = None
        for wid in word_ids:
            if wid is None:
                aligned.append(-100)
            elif wid != prev_wid:
                aligned.append(labels_i[wid])
            else:
                aligned.append(labels_i[wid] if label_all_tokens else -100)
            prev_wid = wid
        all_labels.append(aligned)
    enc["labels"] = all_labels
    return enc

encoded = DatasetDict({
    "train": ds["train"].map(tokenize_and_align, batched=True, remove_columns=ds["train"].column_names),
    "validation": ds["validation"].map(tokenize_and_align, batched=True, remove_columns=ds["validation"].column_names),
    "test": ds["test"].map(tokenize_and_align, batched=True, remove_columns=ds["test"].column_names),
})
encoded

Map: 100%|██████████| 4800/4800 [00:00<00:00, 13290.57 examples/s]
Map: 100%|██████████| 3174/3174 [00:00<00:00, 18293.64 examples/s]
Map: 100%|██████████| 3568/3568 [00:00<00:00, 19127.08 examples/s]


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3174
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3568
    })
})

### Arguments and inference

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
).to(device)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load("seqeval")

def align_for_metrics(predictions: np.ndarray, labels: np.ndarray) -> tuple[list[list[str]], list[list[str]]]:
    preds = np.argmax(predictions, axis=2)
    y_true, y_pred = [], []
    for p_seq, l_seq in zip(preds, labels):
        true_tags, pred_tags = [], []
        for p, l in zip(p_seq, l_seq):
            if l == -100:
                continue
            true_tags.append(id2label[int(l)])
            pred_tags.append(id2label[int(p)])
        y_true.append(true_tags)
        y_pred.append(pred_tags)
    return y_true, y_pred

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    y_true, y_pred = align_for_metrics(logits, labels)
    return {
        "precision": p_span(y_true, y_pred, scheme=IOB2),
        "recall":    r_span(y_true, y_pred, scheme=IOB2),
        "f1":        f1_span(y_true, y_pred, scheme=IOB2),
    }

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
out_dir = str(config['models_dir'] / "bert-skill-ner" )

args = TrainingArguments(
    output_dir=out_dir,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=100,
    report_to="none",
    fp16=torch.cuda.is_available(),
    seed=42,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.0984,0.238525,0.430918,0.416822,0.423753
1000,0.023,0.279275,0.464822,0.549533,0.50364
1500,0.0099,0.315194,0.482173,0.530841,0.505338


TrainOutput(global_step=1500, training_loss=0.09613342535495759, metrics={'train_runtime': 386.0806, 'train_samples_per_second': 62.163, 'train_steps_per_second': 3.885, 'total_flos': 1428260985022464.0, 'train_loss': 0.09613342535495759, 'epoch': 5.0})

### Evaluate

In [46]:


# Detailed per-entity report
def classification_report_on(split: str):
    preds = trainer.predict(encoded[split])
    y_true, y_pred = align_for_metrics(preds.predictions, preds.label_ids)
    print(f"\n=== {split.upper()} (span-level, IOB2) ===")
    print(classification_report(y_true, y_pred, scheme=IOB2, digits=4))

classification_report_on("validation")
classification_report_on("test")


=== VALIDATION (span-level, IOB2) ===
              precision    recall  f1-score   support

       SKILL     0.4822    0.5308    0.5053      1070

   micro avg     0.4822    0.5308    0.5053      1070
   macro avg     0.4822    0.5308    0.5053      1070
weighted avg     0.4822    0.5308    0.5053      1070




=== TEST (span-level, IOB2) ===
              precision    recall  f1-score   support

       SKILL     0.4896    0.4734    0.4813      1090

   micro avg     0.4896    0.4734    0.4813      1090
   macro avg     0.4896    0.4734    0.4813      1090
weighted avg     0.4896    0.4734    0.4813      1090



In [45]:
from typing import NamedTuple, Set

class Span(NamedTuple):
    label: str
    start: int
    end: int  # inclusive

def bio_to_spans(tags: list[str]) -> list[Span]:
    spans, start, lab = [], None, None
    for i, t in enumerate(tags):
        if t == "O":
            if lab is not None:
                spans.append(Span(lab, start, i-1))
                lab, start = None, None
            continue
        bi, typ = t.split("-", 1)
        if bi == "B":
            if lab is not None:
                spans.append(Span(lab, start, i-1))
            lab, start = typ, i
        elif bi == "I":
            pass
    if lab is not None:
        spans.append(Span(lab, start, len(tags)-1))
    return spans

def compare_spans(true_spans: list[Span], pred_spans: list[Span]) -> tuple[set[Span], set[Span], set[Span]]:
    true_set, pred_set = set(true_spans), set(pred_spans)
    return true_set & pred_set, pred_set - true_set, true_set - pred_set

def show_errors(split="validation", max_examples=8):
    # We need original tokens for display (read them again from your CoNLL-based python lists)
    orig_tokens = {"validation": X_val_tok, "test": X_test_tok, "train": X_train_tok}[split]
    preds = trainer.predict(encoded[split])
    y_true, y_pred = align_for_metrics(preds.predictions, preds.label_ids)
    shown = 0
    for i, (tseq, pseq) in enumerate(zip(y_true, y_pred)):
        tp, fp, fn = compare_spans(bio_to_spans(tseq), bio_to_spans(pseq))
        if not fp and not fn:
            continue
        print(f"\n--- {split.upper()} example #{i} ---")
        print("TOKENS:", " ".join(orig_tokens[i]))
        print("TRUE:  ", " ".join(tseq))
        print("PRED:  ", " ".join(pseq))
        print("TP:", sorted(tp))
        print("FP:", sorted(fp))
        print("FN:", sorted(fn))
        shown += 1
        if shown >= max_examples:
            break

show_errors("validation", max_examples=8)


--- VALIDATION example #20 ---
TOKENS: We are in growth mode and looking for high calibre deeply experienced hands-on DevOps engineers who seek to be recognized as platform technology leaders growing our practice .
TRUE:   O O O O O O O O B-SKILL I-SKILL O B-SKILL B-SKILL O O O O O O O O O O O O O O O
PRED:   O O O O O O O O O O O O B-SKILL O O O O O O O O O O O O O O O
TP: [Span(label='SKILL', start=12, end=12)]
FP: []
FN: [Span(label='SKILL', start=8, end=9), Span(label='SKILL', start=11, end=11)]

--- VALIDATION example #21 ---
TOKENS: You will work as part of our Agile hybrid DevOps teams to help develop/configure new tools roll out environments and automate processes using a variety of tools and techniques .
TRUE:   O O O O O O O O O O O O O B-SKILL I-SKILL I-SKILL B-SKILL I-SKILL I-SKILL O B-SKILL I-SKILL O O O O O O O O
PRED:   O O O O O O O O O O O O O B-SKILL I-SKILL I-SKILL I-SKILL I-SKILL I-SKILL O B-SKILL I-SKILL O I-SKILL I-SKILL I-SKILL I-SKILL I-SKILL I-SKILL O
TP: [Spa

### Class check

In [2]:
from src.skills_model import SkillExtractor
from pathlib import Path

In [None]:


train_path = Path("../data/03_validated/train.conll")
val_path   = Path("../data/03_validated/validation.conll")
test_path  = Path("../data/03_validated/test.conll")
se = SkillExtractor(model_name="bert-base-cased", max_length=256)
se.prepare_from_conll(train_path, val_path, test_path)

out_dir = Path("../models/bert-skill-ner/test")
se.fit(output_dir=out_dir, early_stopping_patience=3)
se.evaluate("validation")
se.evaluate("test")

save_dir = out_dir / "final"
se.save(save_dir)

se2 = SkillExtractor.load(save_dir)

text = "Senior Python Developer with FastAPI, Docker and a bit of Kubernetes; strong SQL."
skills = se2.predict(text)               
print(skills)

skills_all = se2.predict(text, unique=False)
print(skills_all)

Map: 100%|██████████| 4800/4800 [00:00<00:00, 14140.73 examples/s]
Map: 100%|██████████| 3174/3174 [00:00<00:00, 17422.77 examples/s]
Map: 100%|██████████| 3568/3568 [00:00<00:00, 20250.14 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.0975,0.238221,0.454635,0.430841,0.442418
1000,0.0228,0.273246,0.464006,0.554206,0.505111
1500,0.0096,0.300179,0.489398,0.539252,0.513117



=== VALIDATION (span-level, IOB2) ===
              precision    recall  f1-score   support

       SKILL     0.4894    0.5393    0.5131      1070

   micro avg     0.4894    0.5393    0.5131      1070
   macro avg     0.4894    0.5393    0.5131      1070
weighted avg     0.4894    0.5393    0.5131      1070




=== TEST (span-level, IOB2) ===
              precision    recall  f1-score   support

       SKILL     0.4874    0.4954    0.4914      1090

   micro avg     0.4874    0.4954    0.4914      1090
   macro avg     0.4874    0.4954    0.4914      1090
weighted avg     0.4874    0.4954    0.4914      1090

[]
[]


In [81]:
text = "You will work as part of our Agile hybrid DevOps teams to help develop/configure new tools roll out environments and automate processes using a variety of tools and techniques"
skills = se2.predict(text)               
skills


['develop/configure new tools roll out environments', 'automate processes']

In [77]:
manualy_trained = Path("../models/bert-skill-ner/checkpoint-1500")

In [None]:

skills_all = se2.predict(text, unique=False)
print(skills_all)

In [78]:
model = AutoModelForTokenClassification.from_pretrained(manualy_trained)
tokenizer = AutoTokenizer.from_pretrained(manualy_trained)

In [None]:
from transformers import pipeline

# 
# task = "text-classification"
# model_or_dir = "save_dir"                
# clf = pipeline(task=task, model=model, device_map="auto", tokenizer = tokenizer) 

# texts = ["I loved this film!", "Service was terrible.", 'Python']
# preds_clf = clf(texts, truncation=True, top_k=1) 
# print("Classification:", preds_clf)

# 
ner = pipeline(task="token-classification", model=model, aggregation_strategy="simple", device_map="auto",  tokenizer = tokenizer)
ner_text = ["Full Stack Software Engineer – Java/JavaScript", ' Hello', 'Python Developer' , 'You will work as part of our Agile hybrid DevOps teams to help develop/configure new tools roll out environments and automate processes using a variety of tools and techniques'] 
preds_ner = ner(ner_text)
print("NER:", preds_ner)
preds_ner

# # 
# clf_all = pipeline(task="text-classification", model=model_or_dir, device_map="auto", return_all_scores=True)
# print("All scores:", clf_all("The product is fine, but shipping was slow."))

# 
# quick = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device_map="auto")
# print("Quick:", quick("What a fantastic day!"))

Device set to use cuda:0


NER: [[], [], [], [{'entity_group': 'SKILL', 'score': np.float32(0.9988899), 'word': 'develop / configure new tools roll out environments', 'start': 63, 'end': 112}, {'entity_group': 'SKILL', 'score': np.float32(0.98706126), 'word': 'auto', 'start': 117, 'end': 121}, {'entity_group': 'SKILL', 'score': np.float32(0.9881214), 'word': '##mate processes', 'start': 121, 'end': 135}, {'entity_group': 'SKILL', 'score': np.float32(0.7099903), 'word': 'a variety of tools and techniques', 'start': 142, 'end': 175}]]


[[],
 [],
 [],
 [{'entity_group': 'SKILL',
   'score': np.float32(0.9988899),
   'word': 'develop / configure new tools roll out environments',
   'start': 63,
   'end': 112},
  {'entity_group': 'SKILL',
   'score': np.float32(0.98706126),
   'word': 'auto',
   'start': 117,
   'end': 121},
  {'entity_group': 'SKILL',
   'score': np.float32(0.9881214),
   'word': '##mate processes',
   'start': 121,
   'end': 135},
  {'entity_group': 'SKILL',
   'score': np.float32(0.7099903),
   'word': 'a variety of tools and techniques',
   'start': 142,
   'end': 175}]]

: 