In [None]:
import os
import numpy as np
from glob import glob
from datasets import Dataset, Sequence, ClassLabel
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, pipeline
from seqeval.metrics import classification_report

# -------------------------
# Step 1: Label Definitions
# -------------------------
labels_list = ["O", "B-SKILL", "I-SKILL"]
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for label, i in label2id.items()}

# -------------------------
# Step 2: Read Multi-file CoNLL
# -------------------------
def read_multiple_conll_files(file_list):
    all_tokens, all_labels = [], []
    for file_path in file_list:
        tokens, labels = [], []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    if tokens:
                        all_tokens.append(tokens)
                        all_labels.append(labels)
                        tokens, labels = [], []
                else:
                    parts = line.split()
                    if len(parts) >= 2:
                        token, tag = parts[0], parts[1]
                        tokens.append(token)
                        labels.append(tag)
        if tokens:
            all_tokens.append(tokens)
            all_labels.append(labels)
    return {"tokens": all_tokens, "ner_tags": all_labels}

# -------------------------
# Step 3: Load Data from Files
# -------------------------
train_files = glob("data/train/*.txt")
val_files = glob("data/val/*.txt")

train_data = read_multiple_conll_files(train_files)
val_data = read_multiple_conll_files(val_files)

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# -------------------------
# Step 4: Map tags to IDs
# -------------------------
train_dataset = train_dataset.map(lambda x: {"ner_tags": [label2id[tag] for tag in x["ner_tags"]]})
val_dataset = val_dataset.map(lambda x: {"ner_tags": [label2id[tag] for tag in x["ner_tags"]]})

# -------------------------
# Step 5: Define Features
# -------------------------
features = {
    "tokens": train_dataset.features["tokens"],
    "ner_tags": Sequence(ClassLabel(names=labels_list))
}
train_dataset = train_dataset.cast(features)
val_dataset = val_dataset.cast(features)

# -------------------------
# Step 6: Tokenization & Label Alignment
# -------------------------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            current_label = example["ner_tags"][word_idx]
            label_name = labels_list[current_label]
            labels.append(label2id["I-SKILL"] if label_name.startswith("B") else current_label)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=False)

# -------------------------
# Step 7: Load Model
# -------------------------
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id
)

# -------------------------
# Step 8: Metrics
# -------------------------
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_labels = [[labels_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[labels_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "f1": report["micro avg"]["f1-score"]
    }

# -------------------------
# Step 9: Trainer Setup
# -------------------------
args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# -------------------------
# Step 10: Train
# -------------------------
trainer.train()


In [None]:
# Load inference pipeline
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

text = "We are hiring a Python developer with experience in Docker and machine learning."

result = nlp(text)

# Reconstruct full skills from B-SKILL/I-SKILL tokens
skills = []
current_skill = []

for ent in result:
    if ent["entity"] == "B-SKILL":
        if current_skill:
            skills.append(" ".join(current_skill))
            current_skill = []
        current_skill = [ent["word"]]
    elif ent["entity"] == "I-SKILL":
        current_skill.append(ent["word"])
    else:
        if current_skill:
            skills.append(" ".join(current_skill))
            current_skill = []

if current_skill:
    skills.append(" ".join(current_skill))

print("Extracted Skills:", skills)


### PREPROCESSING
- DETECTING SAME SKILL SEPTERATED BY SPACE (PRESENT IN SAME LINE AND REPLACING IT)

In [23]:
def find_malformed_lines(filepath):
    print(f"\n🔍 Checking file: {filepath}\n")
    with open(filepath, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            stripped = line.strip()
            if stripped == "":
                continue
            parts = stripped.split()
            if len(parts) > 2:
                print(f"[Line {i}] ❌ Malformed: {stripped} ({len(parts)} parts)")

# Example usage
find_malformed_lines("../data/val/test.txt")



🔍 Checking file: ../data/val/test.txt

[Line 2292] ❌ Malformed: New Relic B-SKILL (3 parts)


In [25]:
def fix_malformed_lines(input_path, output_path):
    fixed_lines = []
    
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                fixed_lines.append("")  # Keep sentence breaks
                continue

            parts = line.split()
            if len(parts) == 2:
                fixed_lines.append(line)  # Proper line
            elif len(parts) > 2:
                # Assume last token is the label, rest is the entity
                *tokens, label = parts
                if label.startswith("B-") or label.startswith("I-"):
                    fixed_lines.append(f"{tokens[0]} {label}")  # First word = B-label
                    for token in tokens[1:]:
                        fixed_lines.append(f"{token} I{label[1:]}")
                else:
                    print(f"⚠️ Unknown label format: {line}")
            else:
                print(f"⚠️ Skipping invalid line: {line}")
    
    # Write output
    with open(output_path, "w", encoding="utf-8") as f_out:
        for line in fixed_lines:
            f_out.write(line + "\n")

    print(f"✅ Fixed file saved to: {output_path}")

# Example usage
fix_malformed_lines("../data/val/test.txt", "../data/val/test.txt")


✅ Fixed file saved to: ../data/val/test.txt


In [4]:
import os
import numpy as np
from glob import glob
from datasets import Dataset, Sequence, ClassLabel,Features,Value
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report

# -------------------------
# Step 1: Label Definitions
# -------------------------
labels_list = ["O", "B-SKILL", "I-SKILL"]
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for label, i in label2id.items()}

# -------------------------
# Step 2: Read Multi-file CoNLL
# -------------------------
def read_multiple_conll_files(file_list):
    all_tokens, all_labels = [], []
    for file_path in file_list:
        tokens, labels = [], []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    if tokens and labels:
                        all_tokens.append(tokens)
                        all_labels.append(labels)
                        tokens, labels = [], []
                else:
                    parts = line.split()
                    if len(parts) >= 2:
                        token, tag = parts[0], parts[1]
                        tokens.append(token)
                        labels.append(tag)
        if tokens and labels:
            all_tokens.append(tokens)
            all_labels.append(labels)
    return {"tokens": all_tokens, "ner_tags": all_labels}

# -------------------------
# Step 3: Load Data from Files
# -------------------------
train_files = glob("../data/train/*.txt")
val_files = glob("../data/val/*.txt")

train_data = read_multiple_conll_files(train_files)
val_data = read_multiple_conll_files(val_files)

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# -------------------------
# Step 4: Map tags to IDs
# -------------------------
train_dataset = train_dataset.map(lambda x: {"ner_tags": [label2id[tag] for tag in x["ner_tags"]]})
val_dataset = val_dataset.map(lambda x: {"ner_tags": [label2id[tag] for tag in x["ner_tags"]]})

# -------------------------
# Step 5: Define Features
# -------------------------
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=labels_list))
})
train_dataset = train_dataset.cast(features)
val_dataset = val_dataset.cast(features)

# -------------------------
# Step 6: Tokenization & Label Alignment
# -------------------------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    labels = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            current_label = example["ner_tags"][word_idx]
            label_name = labels_list[current_label]
            if label_name.startswith("B"):
                labels.append(label2id["I-SKILL"])
            else:
                labels.append(current_label)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=False)

# -------------------------
# Step 7: Load Model
# -------------------------
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id
)

# -------------------------
# Step 8: Metrics
# -------------------------
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[labels_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[labels_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        "precision": report["overall_precision"],
        "recall": report["overall_recall"],
        "f1": report["overall_f1"]
    }

# -------------------------
# Step 9: Trainer Setup
# -------------------------
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # <-- change from "no" to "epoch"
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable WANDB/logging if not using
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


trainer.train()


ModuleNotFoundError: No module named 'datasets'

___

# TRAINING PHASE

In [1]:
import os
import numpy as np
from glob import glob
from datasets import Dataset, Sequence, ClassLabel,Features,Value
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from seqeval.metrics import classification_report
from transformers import DataCollatorForTokenClassification

# -------------------------
# Step 1: Label Definitions
# -------------------------
labels_list = ["O", "B-SKILL", "I-SKILL"]
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for label, i in label2id.items()}

# -------------------------
# Step 2: Read Multi-file CoNLL
# -------------------------
def read_multiple_conll_files(file_list):
    all_tokens, all_labels = [], []
    for file_path in file_list:
        tokens, labels = [], []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    if tokens and labels:
                        all_tokens.append(tokens)
                        all_labels.append(labels)
                        tokens, labels = [], []
                else:
                    parts = line.split()
                    if len(parts) >= 2:
                        token, tag = parts[0], parts[1]
                        tokens.append(token)
                        labels.append(tag)
        if tokens and labels:
            all_tokens.append(tokens)
            all_labels.append(labels)
    return {"tokens": all_tokens, "ner_tags": all_labels}

# -------------------------
# Step 3: Load Data from Files
# -------------------------
train_files = glob("../data/train/*.txt")
val_files = glob("../data/val/*.txt")

train_data = read_multiple_conll_files(train_files)
val_data = read_multiple_conll_files(val_files)

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# -------------------------
# Step 4: Map tags to IDs
# -------------------------
train_dataset = train_dataset.map(lambda x: {"ner_tags": [label2id[tag] for tag in x["ner_tags"]]})
val_dataset = val_dataset.map(lambda x: {"ner_tags": [label2id[tag] for tag in x["ner_tags"]]})

# -------------------------
# Step 5: Define Features
# -------------------------
features = Features({
    "tokens": Sequence(Value("string")),
    "ner_tags": Sequence(ClassLabel(names=labels_list))
})
train_dataset = train_dataset.cast(features)
val_dataset = val_dataset.cast(features)

# -------------------------
# Step 6: Tokenization & Label Alignment
# -------------------------
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=True  # ✅ Add this line
    )
    word_ids = tokenized_inputs.word_ids()
    labels = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["ner_tags"][word_idx])
        else:
            current_label = example["ner_tags"][word_idx]
            label_name = labels_list[current_label]
            if label_name.startswith("B"):
                labels.append(label2id["I-SKILL"])
            else:
                labels.append(current_label)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
val_dataset = val_dataset.map(tokenize_and_align_labels, batched=False)

# -------------------------
# Step 7: Load Model
# -------------------------
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id
)

# -------------------------
# Step 8: Metrics
# -------------------------
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[labels_list[l] for l in label if l != -100] for label in labels]
    true_preds = [[labels_list[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    report = classification_report(true_labels, true_preds, output_dict=True)

    return {
        "precision": report["macro avg"]["precision"],
        "recall": report["macro avg"]["recall"],
        "f1": report["macro avg"]["f1-score"]
    }


# -------------------------
# Step 9: Trainer Setup
# -------------------------
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # <-- change from "no" to "epoch"
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable WANDB/logging if not using
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


trainer.train()
# Save model and tokenizer
save_dir = "./ner_model"

trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


Map:   0%|          | 0/287 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/287 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/108 [00:00<?, ?it/s]



{'loss': 0.6442, 'grad_norm': 3.0621626377105713, 'learning_rate': 4.5370370370370374e-05, 'epoch': 0.28}
{'loss': 0.3175, 'grad_norm': 3.0876271724700928, 'learning_rate': 4.074074074074074e-05, 'epoch': 0.56}
{'loss': 0.3463, 'grad_norm': 1.9984157085418701, 'learning_rate': 3.611111111111111e-05, 'epoch': 0.83}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.301613986492157, 'eval_precision': 0.7548387096774194, 'eval_recall': 0.6381818181818182, 'eval_f1': 0.6916256157635469, 'eval_runtime': 8.4118, 'eval_samples_per_second': 1.783, 'eval_steps_per_second': 0.238, 'epoch': 1.0}




{'loss': 0.2143, 'grad_norm': 0.6205812692642212, 'learning_rate': 3.148148148148148e-05, 'epoch': 1.11}
{'loss': 0.2582, 'grad_norm': 3.0448570251464844, 'learning_rate': 2.6851851851851855e-05, 'epoch': 1.39}
{'loss': 0.2294, 'grad_norm': 1.6237772703170776, 'learning_rate': 2.2222222222222223e-05, 'epoch': 1.67}
{'loss': 0.2165, 'grad_norm': 0.7305532693862915, 'learning_rate': 1.7592592592592595e-05, 'epoch': 1.94}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.2774098515510559, 'eval_precision': 0.728952772073922, 'eval_recall': 0.6454545454545455, 'eval_f1': 0.6846673095467696, 'eval_runtime': 8.1776, 'eval_samples_per_second': 1.834, 'eval_steps_per_second': 0.245, 'epoch': 2.0}




{'loss': 0.1705, 'grad_norm': 2.279723882675171, 'learning_rate': 1.2962962962962962e-05, 'epoch': 2.22}
{'loss': 0.1779, 'grad_norm': 1.3308075666427612, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}
{'loss': 0.161, 'grad_norm': 1.6121634244918823, 'learning_rate': 3.7037037037037037e-06, 'epoch': 2.78}


  0%|          | 0/2 [00:00<?, ?it/s]

{'eval_loss': 0.3401213586330414, 'eval_precision': 0.7850467289719626, 'eval_recall': 0.610909090909091, 'eval_f1': 0.6871165644171779, 'eval_runtime': 9.5508, 'eval_samples_per_second': 1.571, 'eval_steps_per_second': 0.209, 'epoch': 3.0}
{'train_runtime': 1951.0848, 'train_samples_per_second': 0.441, 'train_steps_per_second': 0.055, 'train_loss': 0.2666846403369197, 'epoch': 3.0}


('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

___

# LOAD MODEL AND TOKEN

In [1]:
from transformers import BertTokenizerFast, BertForTokenClassification
import torch
import numpy as np

# Load model and tokenizer
model = BertForTokenClassification.from_pretrained("./ner_model")
tokenizer = BertTokenizerFast.from_pretrained("./ner_model")
model.eval()  # Set to eval mode


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [3]:
labels_list = ["O", "B-SKILL", "I-SKILL"]

def predict_tokens(tokens):
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    word_ids = inputs.word_ids()[0]

    final_preds = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        label_id = predictions[idx]
        final_preds.append((tokens[word_idx], labels_list[label_id]))
        previous_word_idx = word_idx
    return final_preds

# Example usage
tokens = ["We", "are", "looking", "for", "a", "Python", "developer", "."]
print(predict_tokens(tokens))


TypeError: 'NoneType' object is not iterable

In [4]:
import torch

def predict_tokens(tokens):
    # Tokenize without converting to tensor yet
    encoding = tokenizer(tokens, is_split_into_words=True, truncation=True, return_offsets_mapping=True)
    word_ids = encoding.word_ids()

    # Now get torch tensor input
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    final_preds = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        label_id = predictions[idx]
        final_preds.append((tokens[word_idx], labels_list[label_id]))
        previous_word_idx = word_idx
    return final_preds


In [5]:
import torch

labels_list = ["O", "B-SKILL", "I-SKILL"]

def predict_tokens(tokens):
    # First, tokenize with return_offsets_mapping to access word_ids
    encoding = tokenizer(tokens, is_split_into_words=True, return_offsets_mapping=True, truncation=True)
    word_ids = encoding.word_ids()

    # Prepare model input
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)

    # Disable gradient calculation
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted label IDs
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    # Align predictions to input tokens
    final_preds = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        label_id = predictions[idx]
        final_preds.append((tokens[word_idx], labels_list[label_id]))
        previous_word_idx = word_idx

    return final_preds

# ✅ Example usage
tokens = ["We", "are", "looking", "for", "a", "Python", "developer", "."]
print(predict_tokens(tokens))


[('We', 'O'), ('are', 'O'), ('looking', 'O'), ('for', 'O'), ('a', 'O'), ('Python', 'B-SKILL'), ('developer', 'I-SKILL'), ('.', 'O')]


In [7]:
def extract_skills(predictions):
    skills = []
    current_skill = []

    for token, label in predictions:
        if label == "B-SKILL":
            if current_skill:
                skills.append(" ".join(current_skill))
                current_skill = []
            current_skill.append(token)
        elif label == "I-SKILL":
            if current_skill:
                current_skill.append(token)
        else:
            if current_skill:
                skills.append(" ".join(current_skill))
                current_skill = []

    # Catch last skill if it ended the sentence
    if current_skill:
        skills.append(" ".join(current_skill))

    return skills


# ✅ Example usage
tokens = ["We", "require", "experience", "in", "Java", "programming", "and", "machine", "learning", "frameworks", "."]

predictions = predict_tokens(tokens)
print("🔍 Raw predictions:", predictions)

skills = extract_skills(predictions)
print("✅ Extracted skills:", skills)


🔍 Raw predictions: [('We', 'O'), ('require', 'O'), ('experience', 'O'), ('in', 'O'), ('Java', 'B-SKILL'), ('programming', 'I-SKILL'), ('and', 'O'), ('machine', 'B-SKILL'), ('learning', 'I-SKILL'), ('frameworks', 'I-SKILL'), ('.', 'O')]
✅ Extracted skills: ['Java programming', 'machine learning frameworks']


In [5]:
def predict_tokens_from_text(text):
    # Tokenize with offset mapping (but don't pass it to the model)
    tokenized = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        return_offsets_mapping=True,
        return_attention_mask=True
    )
    word_ids = tokenized.word_ids(0)  # works correctly

    # Remove offset_mapping before passing to the model
    inputs = {k: v for k, v in tokenized.items() if k != "offset_mapping"}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    result = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or inputs["input_ids"][0][idx] in tokenizer.all_special_ids:
            continue  # Skip [CLS], [SEP], etc.
        if word_idx != previous_word_idx:
            token = tokens[idx]
            label_id = predictions[idx]
            result.append((token, id2label[label_id]))
            previous_word_idx = word_idx

    return result
job_description = """
We are looking for a candidate with experience in Python programming, machine learning, and deep learning frameworks like TensorFlow and PyTorch.
"""

predictions = predict_tokens_from_text(job_description)
print("🔍 Raw Predictions:", predictions)

skills = extract_skills(predictions)
print("✅ Extracted Skills:", skills)


🔍 Raw Predictions: [('We', 'O'), ('are', 'O'), ('looking', 'O'), ('for', 'O'), ('a', 'O'), ('candidate', 'O'), ('with', 'O'), ('experience', 'O'), ('in', 'O'), ('Python', 'B-SKILL'), ('programming', 'I-SKILL'), (',', 'O'), ('machine', 'B-SKILL'), ('learning', 'I-SKILL'), (',', 'O'), ('and', 'O'), ('deep', 'B-SKILL'), ('learning', 'I-SKILL'), ('framework', 'I-SKILL'), ('like', 'O'), ('Ten', 'B-SKILL'), ('and', 'O'), ('P', 'B-SKILL'), ('.', 'O')]
✅ Extracted Skills: ['Python programming', 'machine learning', 'deep learning framework', 'Ten', 'P']


In [6]:
def predict_tokens_from_text(text):
    # Split text into words
    words = text.strip().split()

    # Tokenize with alignment
    tokenized = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True
    )
    word_ids = tokenized.word_ids(0)

    # Run prediction
    with torch.no_grad():
        outputs = model(**tokenized)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)[0].tolist()

    # Map predictions to words (not subwords)
    result = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        word = words[word_idx]
        label_id = predictions[idx]
        result.append((word, id2label[label_id]))
        previous_word_idx = word_idx

    return result


In [7]:
job_description = """
We are looking for a candidate with experience in Python programming, machine learning, and deep learning frameworks like TensorFlow and PyTorch.
"""

predictions = predict_tokens_from_text(job_description)
skills = extract_skills(predictions)

print("🔍 Raw Predictions:", predictions)
print("✅ Extracted Skills:", skills)


🔍 Raw Predictions: [('We', 'O'), ('are', 'O'), ('looking', 'O'), ('for', 'O'), ('a', 'O'), ('candidate', 'O'), ('with', 'O'), ('experience', 'O'), ('in', 'O'), ('Python', 'B-SKILL'), ('programming,', 'I-SKILL'), ('machine', 'B-SKILL'), ('learning,', 'I-SKILL'), ('and', 'O'), ('deep', 'B-SKILL'), ('learning', 'I-SKILL'), ('frameworks', 'I-SKILL'), ('like', 'O'), ('TensorFlow', 'B-SKILL'), ('and', 'O'), ('PyTorch.', 'B-SKILL')]
✅ Extracted Skills: ['Python programming,', 'machine learning,', 'deep learning frameworks', 'TensorFlow', 'PyTorch.']
