In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm
2024-12-04 17:06:10.942840: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-04 17:06:11.118664: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733349971.215900 1585461 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733349971.234640 1585461 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 17:06:11.391951: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [3]:


# Load the dataset
file_path = '/home/ahmedabdullahi/NLP590/NLPJobsFinder/Data/traindataset.csv'  # Replace with your file path
training_data = pd.read_csv(file_path)

# Prepare data for Transformers
def prepare_data_for_transformers(data):
    """
    Converts the dataframe into a format suitable for Transformers.
    Groups tokens and labels by sentence.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    label_to_id = {label: idx for idx, label in enumerate(data['Label'].unique())}
    id_to_label = {idx: label for label, idx in label_to_id.items()}

    for _, row in data.iterrows():
        token, label = row['Token'], row['Label']
        current_sentence.append(token)
        current_labels.append(label_to_id[label])

        # Simulate end of sentence
        if token.endswith('.') or len(current_sentence) >= 10:
            sentences.append(current_sentence)
            labels.append(current_labels)
            current_sentence = []
            current_labels = []

    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels, label_to_id, id_to_label

sentences, labels, label_to_id, id_to_label = prepare_data_for_transformers(training_data)



In [4]:
# Split into train and test datasets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.5, random_state=42
)

In [5]:
# Load pre-trained tokenizer and model
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

def tokenize_and_align_labels(sentences, labels):
    """
    Tokenizes input sentences and aligns labels with tokenized output.
    """
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        padding=True,
        is_split_into_words=True,
        return_tensors="pt"
    )

    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_label = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                aligned_label.append(-100)  # Ignore these tokens
            elif word_id != previous_word_id:  # Only label the first subword
                aligned_label.append(label[word_id])
            else:
                aligned_label.append(-100)  # Ignore subsequent subword tokens
            previous_word_id = word_id
        aligned_labels.append(aligned_label)

    tokenized_inputs["labels"] = torch.tensor(aligned_labels)
    return tokenized_inputs


# Tokenize and align labels (use your existing tokenize_and_align_labels function)
train_inputs = tokenize_and_align_labels(train_sentences, train_labels)
test_inputs = tokenize_and_align_labels(test_sentences, test_labels)

# Prepare Hugging Face datasets
from datasets import Dataset
train_dataset = Dataset.from_dict(train_inputs)
test_dataset = Dataset.from_dict(test_inputs)

# Training Arguments
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Trainer setup
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import evaluate

# Load the seqeval metric
metric = evaluate.load("seqeval")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100) and align predictions with true labels
    true_labels = [
        [id_to_label[label] for label in label_row if label != -100]
        for label_row in labels
    ]
    true_predictions = [
        [id_to_label[pred] for (pred, label) in zip(pred_row, label_row) if label != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]

    # Compute the metrics
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [7]:


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",       # Evaluate more frequently
    eval_steps=500,                    # Evaluation every 500 steps
    learning_rate=1e-5,                # Lower learning rate
    per_device_train_batch_size=8,     # Smaller batch size for finer updates
    per_device_eval_batch_size=8,      # Smaller eval batch size
    num_train_epochs=5,                # More epochs for better learning
    weight_decay=0.01,                 # Regularization
    save_strategy="steps",             # Save best model during training
    save_steps=500,                    # Save every 500 steps
    logging_dir="./logs",
    logging_steps=100,                 # Log progress more frequently
    load_best_model_at_end=True,       # Automatically load best model
    metric_for_best_model="eval_loss", # Monitor eval loss for "best"
    greater_is_better=False            # Lower loss is better
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Function to calculate F1-score, precision, recall

)



  trainer = Trainer(


In [9]:
# Train the model
trainer.train()

                                               
100%|██████████| 70/70 [10:15<00:00,  8.79s/it]

{'train_runtime': 615.5294, 'train_samples_per_second': 0.853, 'train_steps_per_second': 0.114, 'train_loss': 1.002686527797154, 'epoch': 5.0}





TrainOutput(global_step=70, training_loss=1.002686527797154, metrics={'train_runtime': 615.5294, 'train_samples_per_second': 0.853, 'train_steps_per_second': 0.114, 'total_flos': 35235471640950.0, 'train_loss': 1.002686527797154, 'epoch': 5.0})

In [10]:
results = trainer.evaluate()
print(f"Precision: {results['eval_precision']:.2f}")
print(f"Recall: {results['eval_recall']:.2f}")
print(f"F1-score: {results['eval_f1']:.2f}")
print(f"Accuracy: {results['eval_accuracy']:.2f}")


28it [11:33, 24.78s/it]                        

Precision: 0.99
Recall: 0.99
F1-score: 0.99
Accuracy: 0.99





In [13]:

# Save the model and tokenizer
model.save_pretrained("ner_model1")
tokenizer.save_pretrained("ner_model1")
print("Model saved to 'ner_model1/'.")



Model saved to 'ner_model1/'.


In [23]:
# Load the model and use it for inference
ner_pipeline = pipeline("ner", model="ner_model1", tokenizer="ner_model1", aggregation_strategy="simple")

# Test on new input
input_text = "This company is  India Ohio States in remote description: work from home Series of 2 interviews first on soft skills."
predictions = ner_pipeline(input_text)

print("\nPredictions:")
for entity in predictions:
    print(entity)


Predictions:
{'entity_group': 'LOC', 'score': np.float32(0.3942915), 'word': 'India', 'start': 17, 'end': 22}
{'entity_group': 'PER', 'score': np.float32(0.44366944), 'word': 'Ohio', 'start': 23, 'end': 27}
{'entity_group': 'MISC', 'score': np.float32(0.92090696), 'word': 'States', 'start': 28, 'end': 34}
