### INIT

In [20]:
# System/env config
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

parent_dir = Path.cwd().resolve().parent
sys.path.append(str(parent_dir))
print('Current dir for import:', parent_dir)

from src.config import Config
config = Config()
print('Config initialized')


import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import load_dataset

# Modules for data 
import re
import json
import numpy as np
import pandas as pd
from typing import Any
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from datasets import Dataset
from datasets import load_from_disk
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)

import evaluate
import torch

import sklearn_crfsuite
from sklearn_crfsuite import CRF
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from seqeval.scheme import IOB2

Current dir for import: C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting
Config initialized


In [2]:
from datasets import load_dataset

ds = load_dataset("jjzha/skillspan", cache_dir=config['raw_dir'])

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['idx', 'tokens', 'tags_skill', 'tags_knowledge', 'source'],
        num_rows: 4800
    })
    validation: Dataset({
        features: ['idx', 'tokens', 'tags_skill', 'tags_knowledge', 'source'],
        num_rows: 3174
    })
    test: Dataset({
        features: ['idx', 'tokens', 'tags_skill', 'tags_knowledge', 'source'],
        num_rows: 3569
    })
})

### Extract

In [4]:
df_train = ds['train'].select_columns(['tokens', 'tags_skill']).to_pandas()
df_validation = ds['validation'].select_columns(['tokens', 'tags_skill']).to_pandas()
df_test = ds['test'].select_columns(['tokens', 'tags_skill']).to_pandas()

In [5]:
df_train

Unnamed: 0,tokens,tags_skill
0,"[Senior, QA, Engineer, (, m/f/d, ), <ORGANIZAT...","[O, O, O, O, O, O, O]"
1,"[<ADDRESS>, <ADDRESS>, <ADDRESS>, <ADDRESS>, <...","[O, O, O, O, O]"
2,"[Date, posted:, 2021-07-14]","[O, O, O]"
3,"[Likes:, 0, Dislikes:, 0, Love:, 0]","[O, O, O, O, O, O]"
4,"[Job, description:]","[O, O]"
...,...,...
4795,"[Furthermore, we, expect, you, to, be, able, t...","[O, O, O, O, O, O, O, O, B, I, I, I, I, I, I, ..."
4796,"[You, are, structured, and, proactive, and, yo...","[O, O, B, O, B, O, O, O, O, B, O, O, O, O, O, ..."
4797,"[You, are, a, holistic, and, fact, based, prag...","[O, O, O, B, O, B, I, B, B, I, O, O, O, O, O, ..."
4798,"[Last, but, not, least, you, both, have, the, ...","[O, O, O, O, O, O, O, O, B, I, I, I, I, I, I, ..."


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4800 entries, 0 to 4799
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tokens      4800 non-null   object
 1   tags_skill  4800 non-null   object
dtypes: object(2)
memory usage: 75.1+ KB


Protoryping output CoNLL

In [7]:
# for tokens, tags in zip(df_train["tokens"], df_train["tags_skill"]):
#     for t, y in zip(tokens, tags):
#         print(f"{t}\t{y}")
#     print()

In [8]:
# -*- coding: utf-8 -*-
# Prepare CoNLL files using only the 'tags_skill' column.
# All comments are in English.
def normalize_bio_tags(tags, label="SKILL"):
    """Convert bare BIO like ['O','B','I',...] into typed BIO like ['O','B-SKILL','I-SKILL',...]."""
    out = []
    for t in tags:
        if t == "O":
            out.append("O")
        elif t == "B":
            out.append(f"B-{label}")
        elif t == "I":
            out.append(f"I-{label}")
        else:
            # already typed or unexpected; keep as is
            out.append(t)
    return out

def validate_bio_sequence(tags):
    """
    Quick BIO validator: 'I-X' must follow 'B-X' or 'I-X' of the same type.
    Returns True if valid.
    """
    prev_type = None
    prev_tag = "O"
    for t in tags:
        if t == "O":
            prev_tag, prev_type = "O", None
            continue
        m = re.match(r"([BI])-(.+)", t)
        if not m:
            return False
        bi, lab = m.groups()
        if bi == "B":
            prev_tag, prev_type = "B", lab
        else:  # I
            if prev_tag == "O" or prev_type != lab:
                return False
            prev_tag = "I"
    return True

def write_conll_from_df(df: pd.DataFrame, tokens_col="tokens", tags_col="tags_skill", out_path: Path = Path("train.conll")):
    """
    Write a classic CoNLL file with two columns: token<TAB>label.
    Assumes each row has a list of tokens and a same-length list of BIO tags.
    """
    skipped = 0
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            tokens = list(row[tokens_col])
            tags   = list(row[tags_col])
            assert len(tokens) == len(tags), "Tokens and tags length mismatch"
            if not validate_bio_sequence(tags):
                print(f"Invalid BIO sequence: {tags}\n\t\t\tLine: {tokens}")
                print('This line will be skipped')
                skipped += 1
                continue
            for t, y in zip(tokens, tags):
                f.write(f"{t}\t{y}\n")
            f.write("\n")
    print(f'Number of skipped rows: {skipped}')

# === Usage example ===
# Suppose you already loaded df_train, df_dev, df_test with columns: 
#   ["idx", "tokens", "tags_skill", "tags_knowledge", "source"]

for df in (df_train, df_validation, df_test):
    df["tags_skill"] = df["tags_skill"].apply(lambda lst: normalize_bio_tags(lst, label="SKILL"))

out_dir = config['validated_dir']
write_conll_from_df(df_train,      out_path=out_dir / "train.conll"     )
write_conll_from_df(df_validation, out_path=out_dir / "validation.conll")
write_conll_from_df(df_test,       out_path=out_dir / "test.conll"      )

print("Done. CoNLL files saved to:", out_dir.resolve())



Number of skipped rows: 0
Number of skipped rows: 0
Invalid BIO sequence: ['O', 'O', 'O', 'O', 'O', 'O', 'I-SKILL', 'I-SKILL', 'I-SKILL']
			Line: ['Experience', 'with', 'agile', 'approaches', 'to', 'software', 'testing', 'and', 'development']
This line will be skipped
Number of skipped rows: 1
Done. CoNLL files saved to: C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting\data\03_validated


In [9]:
# # Single-file pipeline: CoNLL -> CRF baseline -> BERT token classification (BIO/IOB)
# # Requires: pip install sklearn-crfsuite seqeval transformers datasets torch joblib

# import os, re, json, random, sys
# from pathlib import Path
# from typing import List, Tuple, Dict, Any
# import joblib

# # ---------- CoNLL reader ----------
# def read_conll(path: Path) -> Tuple[List[List[str]], List[List[str]]]:
#     """Read two-column CoNLL (token[TAB]label), blank line separates sentences.
#     Returns tokens_list, labels_list where tokens_list[i] aligns to labels_list[i]."""
#     tokens_list, labels_list = [], []
#     cur_tokens, cur_labels = [], []
#     with path.open(encoding="utf-8") as f:
#         for line in f:
#             line = line.rstrip("\n")
#             if not line:
#                 if cur_tokens:
#                     tokens_list.append(cur_tokens); labels_list.append(cur_labels)
#                     cur_tokens, cur_labels = [], []
#                 continue
#             parts = line.split("\t")
#             if len(parts) != 2:
#                 raise ValueError(f"Expected 2 columns token<TAB>label, got: {line}")
#             tok, lab = parts
#             cur_tokens.append(tok); cur_labels.append(lab)
#     if cur_tokens:
#         tokens_list.append(cur_tokens); labels_list.append(cur_labels)
#     return tokens_list, labels_list

# # ---------- Simple BIO features for CRF ----------
# def word2features(sent: List[str], i: int) -> Dict[str, Any]:
#     """Hand-crafted token features for CRF."""
#     w = sent[i]
#     prevw = sent[i-1] if i > 0 else "__BOS__"
#     nextw = sent[i+1] if i < len(sent)-1 else "__EOS__"
#     feats = {
#         "bias": 1.0,
#         "w.lower": w.lower(),
#         "w.isupper": w.isupper(),
#         "w.istitle": w.istitle(),
#         "w.isdigit": w.isdigit(),
#         "suffix3": w[-3:],
#         "suffix2": w[-2:],
#         "prefix2": w[:2],
#         "prev.lower": prevw.lower(),
#         "prev.istitle": prevw.istitle() if prevw not in ("__BOS__","__EOS__") else False,
#         "prev.isupper": prevw.isupper() if prevw not in ("__BOS__","__EOS__") else False,
#         "next.lower": nextw.lower(),
#         "next.istitle": nextw.istitle() if nextw not in ("__BOS__","__EOS__") else False,
#         "next.isupper": nextw.isupper() if nextw not in ("__BOS__","__EOS__") else False,
#         "BOS": i == 0,
#         "EOS": i == len(sent)-1,
#     }
#     return feats

# def sent2features(sent: List[str]) -> List[Dict[str, Any]]:
#     return [word2features(sent, i) for i in range(len(sent))]

# # ---------- Span-F1 via seqeval ----------
# from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

# def print_seqeval_report(y_true: List[List[str]], y_pred: List[List[str]], title: str):
#     print(f"\n=== {title} ===")
#     print("Precision:", round(precision_score(y_true, y_pred), 4))
#     print("Recall   :", round(recall_score(y_true, y_pred), 4))
#     print("F1       :", round(f1_score(y_true, y_pred), 4))
#     print(classification_report(y_true, y_pred, digits=4))

# # ---------- CRF baseline ----------
# def run_crf(train_sents, train_labels, dev_sents, dev_labels, test_sents, test_labels, model_path: Path):
#     from sklearn_crfsuite import CRF
#     X_train = [sent2features(s) for s in train_sents]
#     X_dev   = [sent2features(s) for s in dev_sents]
#     X_test  = [sent2features(s) for s in test_sents]
#     y_train = train_labels
#     y_dev   = dev_labels
#     y_test  = test_labels

#     crf = CRF(algorithm="lbfgs", c1=0.1, c2=0.1, max_iterations=200, all_possible_transitions=True)
#     crf.fit(X_train, y_train)
#     y_dev_pred  = crf.predict(X_dev)
#     y_test_pred = crf.predict(X_test)

#     print_seqeval_report(y_dev, y_dev_pred, "CRF DEV")
#     print_seqeval_report(y_test, y_test_pred, "CRF TEST")

#     model_path.parent.mkdir(parents=True, exist_ok=True)
#     joblib.dump(crf, model_path)
#     print(f"CRF model saved to: {model_path.resolve()}")

# # ---------- BERT token-classification ----------
# def run_bert(
#     train_sents, train_labels, dev_sents, dev_labels, test_sents, test_labels,
#     out_dir: Path, bert_ckpt: str = "bert-base-cased", epochs: int = 3, batch_size: int = 16, lr: float = 5e-5
# ):
#     from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
#     import torch
#     from datasets import Dataset, DatasetDict

#     # Collect label set preserving BIO strings
#     label_list = sorted({lab for seq in (train_labels + dev_labels + test_labels) for lab in seq})
#     if any(lab not in {"O"} and not re.match(r"^[BI]-", lab) for lab in label_list):
#         raise ValueError(f"Labels look non-BIO: {label_list}")

#     label2id = {lab: i for i, lab in enumerate(label_list)}
#     id2label = {i: lab for lab, i in label2id.items()}

#     # Build HF datasets
#     def to_rows(sents: List[List[str]], labels: List[List[str]]) -> Dict[str, List[Any]]:
#         return {"tokens": sents, "ner_tags": [[label2id[l] for l in seq] for seq in labels]}

#     d_train = Dataset.from_dict(to_rows(train_sents, train_labels))
#     d_dev   = Dataset.from_dict(to_rows(dev_sents,   dev_labels))
#     d_test  = Dataset.from_dict(to_rows(test_sents,  test_labels))
#     ds = DatasetDict({"train": d_train, "validation": d_dev, "test": d_test})

#     tokenizer = AutoTokenizer.from_pretrained(bert_ckpt, use_fast=True)

#     # Align labels to wordpieces: keep label for first subword, set -100 for others
#     def tokenize_and_align(example: Dict[str, Any]) -> Dict[str, Any]:
#         toks = example["tokens"]
#         labs = example["ner_tags"]
#         enc = tokenizer(toks, is_split_into_words=True, truncation=True)
#         word_ids = enc.word_ids()
#         aligned = []
#         prev_word = None
#         for idx, wid in enumerate(word_ids):
#             if wid is None:
#                 aligned.append(-100)
#             elif wid != prev_word:
#                 aligned.append(labs[wid])
#             else:
#                 aligned.append(-100)
#             prev_word = wid
#         enc["labels"] = aligned
#         return enc

#     ds_tok = ds.map(tokenize_and_align, batched=False)

#     model = AutoModelForTokenClassification.from_pretrained(
#         bert_ckpt,
#         num_labels=len(label_list),
#         id2label=id2label,
#         label2id=label2id
#     )

#     data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

#     def compute_metrics(p):
#         preds = p.predictions
#         if isinstance(preds, tuple):
#             preds = preds[0]
#         preds = preds.argmax(-1)
#         labels = p.label_ids
#         # Unpack to BIO strings, skipping -100
#         true_str, pred_str = [], []
#         for y_true, y_pred in zip(labels, preds):
#             y_true_seq, y_pred_seq = [], []
#             for t, p_ in zip(y_true, y_pred):
#                 if t == -100:
#                     continue
#                 y_true_seq.append(id2label[int(t)])
#                 y_pred_seq.append(id2label[int(p_)])
#             true_str.append(y_true_seq)
#             pred_str.append(y_pred_seq)
#         return {
#             "precision": precision_score(true_str, pred_str),
#             "recall":    recall_score(true_str, pred_str),
#             "f1":        f1_score(true_str, pred_str),
#         }

#     out_dir.mkdir(parents=True, exist_ok=True)
#     args = TrainingArguments(
#         output_dir=str(out_dir),
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         learning_rate=lr,
#         per_device_train_batch_size=batch_size,
#         per_device_eval_batch_size=batch_size,
#         num_train_epochs=epochs,
#         weight_decay=0.01,
#         logging_steps=50,
#         load_best_model_at_end=True,
#         metric_for_best_model="f1",
#         report_to="none"
#     )

#     trainer = Trainer(
#         model=model,
#         args=args,
#         train_dataset=ds_tok["train"],
#         eval_dataset=ds_tok["validation"],
#         tokenizer=tokenizer,
#         data_collator=data_collator,
#         compute_metrics=compute_metrics
#     )

#     trainer.train()
#     print("\n=== BERT DEV ===")
#     dev_metrics = trainer.evaluate()
#     print({k: round(float(v), 4) for k, v in dev_metrics.items() if k in ("precision","recall","f1")})

#     print("\n=== BERT TEST ===")
#     test_metrics = trainer.evaluate(ds_tok["test"])
#     print({k: round(float(v), 4) for k, v in test_metrics.items() if k in ("precision","recall","f1")})

#     save_path = out_dir / "final"
#     trainer.save_model(str(save_path))
#     tokenizer.save_pretrained(str(save_path))
#     with (out_dir / "label_mapping.json").open("w", encoding="utf-8") as f:
#         json.dump({"label_list": label_list, "label2id": label2id}, f, ensure_ascii=False, indent=2)
#     print(f"BERT model saved to: {save_path.resolve()}")

# # ---------- Entry point ----------
# if __name__ == "__main__":
#     # Configure paths and model
#     data_dir = Path(os.environ.get("CONLL_DIR", "conll_only_skill"))  # folder with train.conll, validation.conll, test.conll
#     train_path = data_dir / "train.conll"
#     dev_path   = data_dir / "validation.conll"
#     test_path  = data_dir / "test.conll"
#     bert_ckpt  = os.environ.get("BERT_CKPT", "bert-base-cased")
#     out_dir    = Path(os.environ.get("BERT_OUT", "bert_skill_ner"))
#     random.seed(13)

#     train_sents, train_labels = read_conll(train_path)
#     dev_sents,   dev_labels   = read_conll(dev_path)
#     test_sents,  test_labels  = read_conll(test_path)

#     # Quick sanity checks
#     assert all(len(s) == len(l) for s, l in zip(train_sents, train_labels)), "Train tokens/labels len mismatch"
#     assert all(re.match(r"^(O|[BI]-.+)$", lab) for seq in train_labels for lab in seq), "Non-BIO label in train"

#     # CRF baseline
#     run_crf(
#         train_sents, train_labels,
#         dev_sents,   dev_labels,
#         test_sents,  test_labels,
#         model_path=Path("crf_model.joblib")
#     )

#     # BERT fine-tuning
#     run_bert(
#         train_sents, train_labels,
#         dev_sents,   dev_labels,
#         test_sents,  test_labels,
#         out_dir=out_dir,
#         bert_ckpt=bert_ckpt,
#         epochs=3,
#         batch_size=16,
#         lr=5e-5
#     )


In [10]:
# === 1) Data loading (CoNLL -> lists of sentences) ===
def read_conll(path: Path) -> tuple[list[list[str]], list[list[str]]]:
    """
    Read a two-column CoNLL file (token<TAB>tag>), sentences separated by blank lines.
    Returns: tokens_per_sent, tags_per_sent as lists of lists.
    """
    s_tokens, s_tags = [], []
    tokens, tags = [], []
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if tokens:
                    s_tokens.append(tokens)
                    s_tags.append(tags)
                    tokens, tags = [], []
                continue
            parts = line.split("\t")
            if len(parts) != 2:
                # Skip malformed lines safely
                continue
            tok, lab = parts
            tokens.append(tok)
            tags.append(lab)
    if tokens:
        s_tokens.append(tokens)
        s_tags.append(tags)
    return s_tokens, s_tags



X_tokens_train, y_train = read_conll(config['train_connl'])
X_tokens_val,   y_val   = read_conll(config['validation_connl'])
X_tokens_test,  y_test  = read_conll(config['test_connl'])


print(f"#train sents: {len(X_tokens_train)}  #val: {len(X_tokens_val)}  #test: {len(X_tokens_test)}")


#train sents: 4800  #val: 3174  #test: 3568


In [None]:
def build_skill_gazetteers(tokens_per_sent: list[list[str]], tags_per_sent: list[list[str]]) -> dict[str, set[str]]:
    """
    Build simple gazetteers from training data:
      - skill_unigrams: lowercased tokens appearing inside any SKILL span
      - skill_bigrams:  lowercased bigrams inside SKILL spans
    """
    skill_unigrams = set()
    skill_bigrams  = set()
    for toks, labs in zip(tokens_per_sent, tags_per_sent):
        # collect indices of tokens inside SKILL spans (B-SKILL / I-SKILL)
        inside = [i for i, t in enumerate(labs) if t.startswith("B-") or t.startswith("I-")]
        for i in inside:
            skill_unigrams.add(toks[i].lower())
        # bigrams (consecutive tokens both inside a span)
        for i in range(len(toks) - 1):
            if (labs[i].startswith(("B-","I-"))) and (labs[i+1].startswith(("B-","I-"))):
                skill_bigrams.add((toks[i].lower(), toks[i+1].lower()))
    return {"skill_unigrams": skill_unigrams, "skill_bigrams": skill_bigrams}

gazetteers = build_skill_gazetteers(X_tokens_train, y_train)
len(gazetteers["skill_unigrams"]), len(gazetteers["skill_bigrams"])

(2184, 5944)

In [None]:
def char_ngrams(token: str, n: int) -> list[str]:
    token = token
    return [token[i:i+n] for i in range(len(token) - n + 1)] if len(token) >= n else []

def top_char_ngrams(tokens_per_sent: list[list[str]], top_k: int = 400) -> dict[str, set[str]]:
    """
    Compute most frequent char 2-grams and 3-grams from TRAIN tokens only and keep top_k for each size.
    This caps feature explosion while still giving CRF helpful subword signals.
    """
    c2 = Counter()
    c3 = Counter()
    for sent in tokens_per_sent:
        for tok in sent:
            c2.update(char_ngrams(tok, 2))
            c3.update(char_ngrams(tok, 3))
    top2 = set([ng for ng, _ in c2.most_common(top_k)])
    top3 = set([ng for ng, _ in c3.most_common(top_k)])
    return {"char2": top2, "char3": top3}

char_ngram_vocab = top_char_ngrams(X_tokens_train, top_k=400)
len(char_ngram_vocab["char2"]), len(char_ngram_vocab["char3"])

(400, 400)

In [23]:
PREF_SIZES = (2, 3, 4)
SUFF_SIZES = (2, 3, 4)

def word_shape(token: str) -> str:
    """
    Map token to a coarse 'shape' (e.g., 'Xx', 'xxx', 'd-dd', 'xxx-xx', 'Xx.' etc.).
    Helps generalize across casing/digits/punct.
    """
    shape = []
    for ch in token:
        if ch.isupper():
            shape.append('X')
        elif ch.islower():
            shape.append('x')
        elif ch.isdigit():
            shape.append('d')
        elif ch in "-_/\\.":
            shape.append(ch)
        else:
            shape.append('p')  # other punct
    # collapse runs like XXX -> X, xxx -> x to reduce sparsity
    collapsed = []
    for ch in shape:
        if not collapsed or collapsed[-1] != ch:
            collapsed.append(ch)
    return ''.join(collapsed)

def token_features(sent: list[str], i: int,
                   gaz: dict[str, set[str]],
                   char_vocab: dict[str, set[str]]) -> dict[str, Any]:
    """
    Build a feature dict for token sent[i].
    Includes:
      - bias, word lowercase, shape, isupper/istitle/isdigit/has_digit/has_hyphen
      - prefixes/suffixes, limited char 2/3-grams (only if in top lists)
      - simple gazetteers (unigram + adjacent bigrams), plus +/-1 and +/-2 window features
    """
    token = sent[i]
    lower = token.lower()
    feats = {
        'bias': 1.0,
        'word.lower': lower,
        'word.shape': word_shape(token),
        'word.isupper': token.isupper(),
        'word.istitle': token.istitle(),
        'word.isdigit': token.isdigit(),
        'word.has_digit': any(ch.isdigit() for ch in token),
        'word.has_hyphen': '-' in token,
        'word.has_dot': '.' in token,
        'word.has_slash': '/' in token or '\\' in token,
        'gaz.in_skill_unigram': (lower in gaz['skill_unigrams']),
    }
    # prefixes / suffixes
    for n in PREF_SIZES:
        feats[f'pref{n}'] = lower[:n] if len(lower) >= n else lower
    for n in SUFF_SIZES:
        feats[f'suff{n}'] = lower[-n:] if len(lower) >= n else lower

    # limited char 2/3-grams (only those that are in top vocab to control dimensionality)
    for ng in char_ngrams(token, 2):
        if ng in char_vocab['char2']:
            feats[f'char2={ng}'] = True
    for ng in char_ngrams(token, 3):
        if ng in char_vocab['char3']:
            feats[f'char3={ng}'] = True

    # context features (+/- 1, +/- 2)
    def add_ctx(j: int, tag: str):
        if 0 <= j < len(sent):
            w = sent[j]
            lw = w.lower()
            feats[f'{tag}.lower'] = lw
            feats[f'{tag}.shape'] = word_shape(w)
            feats[f'{tag}.istitle'] = w.istitle()
            feats[f'{tag}.isupper'] = w.isupper()

    add_ctx(i-1, '-1')
    add_ctx(i-2, '-2')
    add_ctx(i+1, '+1')
    add_ctx(i+2, '+2')

    # gazetteer bigrams with neighbors (prev+cur, cur+next)
    if i-1 >= 0:
        feats['gaz.prev_cur_in_skill_bigram'] = (sent[i-1].lower(), lower) in gaz['skill_bigrams']
    if i+1 < len(sent):
        feats['gaz.cur_next_in_skill_bigram'] = (lower, sent[i+1].lower()) in gaz['skill_bigrams']

    return feats

def sent2features(sent: list[str],
                  gaz: dict[str, set[str]],
                  char_vocab: dict[str, set[str]]) -> list[dict[str, Any]]:
    return [token_features(sent, i, gaz, char_vocab) for i in range(len(sent))]

In [24]:
def to_crf_Xy(tokens_per_sent: list[list[str]],
              tags_per_sent: list[list[str]],
              gaz: dict[str, set[str]],
              char_vocab: dict[str, set[str]]):
    X = [sent2features(s, gaz, char_vocab) for s in tokens_per_sent]
    y = [list(tags) for tags in tags_per_sent]
    return X, y

X_train, y_train_ = to_crf_Xy(X_tokens_train, y_train, gazetteers, char_ngram_vocab)
X_val_,  y_val_   = to_crf_Xy(X_tokens_val,   y_val,   gazetteers, char_ngram_vocab)
X_test_, y_test_  = to_crf_Xy(X_tokens_test,  y_test,  gazetteers, char_ngram_vocab)

# Quick sanity check
print(len(X_train), len(y_train_), len(X_val_), len(y_val_))
print(len(X_train[0]), len(y_train_[0]), X_train[0][0])

4800 4800 3174 3174
7 7 {'bias': 1.0, 'word.lower': 'senior', 'word.shape': 'Xx', 'word.isupper': False, 'word.istitle': True, 'word.isdigit': False, 'word.has_digit': False, 'word.has_hyphen': False, 'word.has_dot': False, 'word.has_slash': False, 'gaz.in_skill_unigram': True, 'pref2': 'se', 'pref3': 'sen', 'pref4': 'seni', 'suff2': 'or', 'suff3': 'ior', 'suff4': 'nior', 'char2=Se': True, 'char2=en': True, 'char2=ni': True, 'char2=io': True, 'char2=or': True, '+1.lower': 'qa', '+1.shape': 'X', '+1.istitle': False, '+1.isupper': True, '+2.lower': 'engineer', '+2.shape': 'Xx', '+2.istitle': True, '+2.isupper': False, 'gaz.cur_next_in_skill_bigram': False}


In [26]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,               # L1
    c2=0.1,               # L2
    max_iterations=200,
    all_possible_transitions=True
)

crf.fit(X_train, y_train_)
print("CRF trained.")

CRF trained.


In [27]:
def eval_seqeval(y_true, y_pred, title: str = "Eval"):
    print(f"=== {title} (span-level) ===")
    p = precision_score(y_true, y_pred, scheme=IOB2)
    r = recall_score(y_true, y_pred, scheme=IOB2)
    f = f1_score(y_true, y_pred, scheme=IOB2)
    print(f"Precision: {p:.4f}  Recall: {r:.4f}  F1: {f:.4f}")
    print(classification_report(y_true, y_pred, scheme=IOB2, digits=4))

y_val_pred  = crf.predict(X_val_)
y_test_pred = crf.predict(X_test_)

eval_seqeval(y_val_,  y_val_,  title="Val [oracle sanity]")     # sanity: should be 1.0
eval_seqeval(y_val_,  y_val_pred,  title="Val")
eval_seqeval(y_test_, y_test_pred, title="Test")

=== Val [oracle sanity] (span-level) ===
Precision: 1.0000  Recall: 1.0000  F1: 1.0000
              precision    recall  f1-score   support

       SKILL     1.0000    1.0000    1.0000      1070

   micro avg     1.0000    1.0000    1.0000      1070
   macro avg     1.0000    1.0000    1.0000      1070
weighted avg     1.0000    1.0000    1.0000      1070

=== Val (span-level) ===
Precision: 0.3375  Recall: 0.1000  F1: 0.1543
              precision    recall  f1-score   support

       SKILL     0.3375    0.1000    0.1543      1070

   micro avg     0.3375    0.1000    0.1543      1070
   macro avg     0.3375    0.1000    0.1543      1070
weighted avg     0.3375    0.1000    0.1543      1070

=== Test (span-level) ===
Precision: 0.3607  Recall: 0.1009  F1: 0.1577
              precision    recall  f1-score   support

       SKILL     0.3607    0.1009    0.1577      1090

   micro avg     0.3607    0.1009    0.1577      1090
   macro avg     0.3607    0.1009    0.1577      1090
weight