In [1]:
from sklearn_crfsuite import CRF, metrics
from pathlib import Path
import sys
from seqeval.metrics import f1_score, classification_report
import joblib
import pycrfsuite
import random


# Put the parent of the current working directory on the import path
# (presumably so the project-local `config` / `utils` imports below resolve).
# sys.path holds plain strings, so the membership test must use the string
# form: a Path object never compares equal to a str, which made the old check
# always true and inserted a duplicate entry on every re-run of this cell.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from config import settings
from utils.mex4 import read_conll_stream, add_context_inplace

### Approach

- **Data Preparation:** Created a CoNLL-formatted dataset in which the `Title` column of the CSV file was used as the reference/citation string. Each text file was scanned for occurrences of the title, and the matches were annotated in BIO format (B, I, O).
- **Training:** The dataset was then loaded with a streaming reader (to avoid excessive RAM usage), and feature engineering was applied (e.g. contextual feature augmentation).

In [2]:
# --- Streaming train/dev split -------------------------------------------
# NOTE: despite its name, `random_seed` is a seeded random.Random generator
# (seed 42), not a seed value; it drives the per-sentence split below.
random_seed = random.Random(42)

trainer = pycrfsuite.Trainer(verbose=True)

# Held-out sentences are collected in memory; every other sentence is handed
# straight to the trainer as it is read.
X_test, y_test = [], []

conll_stream = read_conll_stream(Path("../data/mex4/train_weak.conll"))
for X_sent, y_sent in conll_stream:
    # add ±1 context on the fly
    add_context_inplace(X_sent, left=1, right=1)
    # One random draw per sentence: a value below 0.1 sends it to the dev set.
    held_out = random_seed.random() < 0.1
    if not held_out:
        trainer.append(X_sent, y_sent)
        continue
    X_test.append(X_sent)
    y_test.append(y_sent)

# --- Training --------------------------------------------------------------
crf_params = {
    "c1": 0.2,  # L1
    "c2": 0.1,  # L2
    "max_iterations": 200,
    "feature.possible_transitions": True,
}
trainer.set_params(crf_params)

# NOTE(review): the file is written by pycrfsuite's trainer and reopened with
# pycrfsuite.Tagger below, so the ".joblib" suffix does not reflect its format.
model_path = Path("../outputs/mex4_crf_model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
model_file = str(model_path)
trainer.train(model_file)
print(f"\nSaved model to: {model_path}")

# --- Evaluation on the held-out sentences ------------------------------------
tagger = pycrfsuite.Tagger()
tagger.open(model_file)

y_pred = []
for held_out_sent in X_test:
    y_pred.append(tagger.tag(held_out_sent))

dev_f1 = f1_score(y_test, y_pred)
print(f"\nDev F1: {dev_f1:.4f}")
print(classification_report(y_test, y_pred, digits=3))

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 2222755
Seconds required: 57.867

L-BFGS optimization
c1: 0.200000
c2: 0.100000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 7854684.365188
Feature norm: 1.000000
Error norm: 20506467.759086
Active features: 2222688
Line search trials: 1
Line search step: 0.000000
Seconds required for this iteration: 50.496

***** Iteration #2 *****
Loss: 3914387.148764
Feature norm: 1.260109
Error norm: 9906104.251392
Active features: 2222349
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 26.259

***** Iteration #3 *****
Loss: 2130238.851885
Feature norm: 1.505398
Error norm: 4766570.092941
Active features: 1115898
Line search trials: 1
Line search step: 1.000

### Summary

The CRF model achieved near-perfect detection of citations/datasets, with slightly higher recall than precision (catching almost all relevant spans with minimal false positives).

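However, these scores were computed on a held-out split of the weakly labelled training file, so the model still needs to be validated against gold-standard samples as a sanity check.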