In [1]:
import optuna
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import pipeline



  from .autonotebook import tqdm as notebook_tqdm
2024-12-04 15:11:31.042613: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-04 15:11:31.200162: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733343091.264558 1544433 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733343091.283296 1544433 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 15:11:31.431891: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [2]:



# Read the training CSV into a DataFrame.
# NOTE: the path below is specific to one machine -- replace it with your file path.
file_path = '/home/ahmedabdullahi/NLP590/NLPJobsFinder/Data/traindataset.csv'
training_data = pd.read_csv(filepath_or_buffer=file_path)

# Prepare data for Transformers
def prepare_data_for_transformers(data, max_sentence_length=10):
    """Group a flat token/label table into pseudo-sentences with integer labels.

    The table carries no sentence boundaries, so a boundary is simulated: the
    current chunk is closed as soon as a token ends with '.' or the chunk has
    reached ``max_sentence_length`` tokens.

    Args:
        data: DataFrame with a 'Token' column and a 'Label' column, one row
            per token, in reading order.
        max_sentence_length: Maximum number of tokens per chunk (default 10,
            the value that was previously hard-coded).

    Returns:
        A 4-tuple ``(sentences, labels, label_to_id, id_to_label)``:
        ``sentences`` is a list of token-string lists, ``labels`` is the
        parallel list of label-id lists, and the two dicts map label values
        to ids and back. Ids follow the order of first appearance in
        ``data['Label']``.

    Raises:
        ValueError: If ``max_sentence_length`` is smaller than 1.
    """
    if max_sentence_length < 1:
        raise ValueError("max_sentence_length must be at least 1")

    label_to_id = {label: idx for idx, label in enumerate(data['Label'].unique())}
    id_to_label = {idx: label for label, idx in label_to_id.items()}

    sentences = []
    labels = []
    current_sentence = []
    current_labels = []

    # Iterating the two columns directly avoids building a Series per row,
    # which is what DataFrame.iterrows() does.
    for token, label in zip(data['Token'], data['Label']):
        # A missing token has no text to tokenize; skip the row instead of
        # crashing on ``float.endswith``.
        if pd.isna(token):
            continue
        # Cells parsed as numbers would otherwise break ``endswith`` here and
        # the tokenizer later on, so normalise every token to ``str``.
        token = str(token)

        current_sentence.append(token)
        current_labels.append(label_to_id[label])

        # Simulate end of sentence
        if token.endswith('.') or len(current_sentence) >= max_sentence_length:
            sentences.append(current_sentence)
            labels.append(current_labels)
            current_sentence = []
            current_labels = []

    # Flush the trailing, unterminated chunk (if any).
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels, label_to_id, id_to_label

# Group the token rows into pseudo-sentences and build the label <-> id maps.
sentences, labels, label_to_id, id_to_label = prepare_data_for_transformers(training_data)

# Split into train and test datasets
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

# Load pre-trained tokenizer and model
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The checkpoint ships with its own classification head and label names, but
# training uses the ids built from this dataset's 'Label' column. Configure the
# model with the dataset's label set so that (a) the head has one output per
# dataset label and (b) the saved model reports the dataset's label names at
# inference time. `ignore_mismatched_sizes=True` lets the head be re-initialised
# when its size differs from the checkpoint's; the encoder weights are kept.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),
    # Plain int/str so the mapping can be serialised with the model config.
    id2label={int(idx): str(label) for idx, label in id_to_label.items()},
    label2id={str(label): int(idx) for label, idx in label_to_id.items()},
    ignore_mismatched_sizes=True,
)

def tokenize_and_align_labels(sentences, labels):
    """Tokenize pre-split sentences and attach subword-aligned label ids.

    Args:
        sentences: List of sentences, each a list of word strings.
        labels: List parallel to ``sentences``; each entry holds one label id
            per word.

    Returns:
        The tokenizer output (padded, truncated, PyTorch tensors) with an extra
        "labels" tensor. Within each sequence only the first subword of a word
        carries that word's label; special/padding positions and the remaining
        subwords are set to -100 so they are ignored.
    """
    encoding = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    def _align(word_ids, word_labels):
        # Map one sequence's word ids onto per-position label ids.
        aligned = []
        last_word = None
        for wid in word_ids:
            starts_new_word = wid is not None and wid != last_word
            aligned.append(word_labels[wid] if starts_new_word else -100)
            last_word = wid
        return aligned

    label_rows = [
        _align(encoding.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(labels)
    ]

    encoding["labels"] = torch.tensor(label_rows)
    return encoding

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Encode each split and attach the subword-aligned label ids.
train_inputs = tokenize_and_align_labels(sentences=train_sentences, labels=train_labels)
test_inputs = tokenize_and_align_labels(sentences=test_sentences, labels=test_labels)

# Wrap the encoded splits as `datasets.Dataset` objects for the Trainer.
train_dataset, test_dataset = (
    Dataset.from_dict(split) for split in (train_inputs, test_inputs)
)

# Collator handed to the Trainer for batching token-classification examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [4]:





# Define Optuna objective function
def objective(trial):
    """Fine-tune a fresh copy of the model with sampled hyperparameters.

    Args:
        trial: The ``optuna`` trial used to sample ``learning_rate``,
            ``batch_size`` and ``num_epochs``.

    Returns:
        The ``eval_loss`` reported by ``trainer.evaluate()`` after training;
        the study minimises this value.

    NOTE(review): ``test_dataset`` is used as the evaluation set here, so the
    hyperparameters are selected on the same data that is later used to
    evaluate the final model. Consider a separate validation split.
    """
    import copy  # local import so this cell does not depend on editing the import cell

    # Define hyperparameter search space
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 3, 10)

    # Define TrainingArguments
    training_args = TrainingArguments(
        # One directory per trial so checkpoints from different trials do not
        # get mixed together in a single folder.
        output_dir=f"./results/trial_{trial.number}",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        # A checkpoint is written every epoch; cap how many are kept on disk.
        save_total_limit=1,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    # Trainer setup
    trainer = Trainer(
        # Train a copy: passing the shared `model` object would make every
        # trial continue from the weights left behind by the previous one, so
        # trials would not be comparable and the global model would already be
        # fine-tuned before the final training run.
        model=copy.deepcopy(model),
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_metrics = trainer.evaluate()
    return eval_metrics["eval_loss"]  # Optuna will minimize this



In [None]:

# Run Optuna optimization
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)  # Number of trials for hyperparameter search

# Print the best hyperparameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Start the final run from the pre-trained checkpoint again. The `model` object
# may have been fine-tuned by the trials above, so reusing it would not be a
# clean "train with the best hyperparameters" run. Reusing `model.config` keeps
# whatever label configuration the model was originally created with;
# `ignore_mismatched_sizes=True` only matters if that configuration's head size
# differs from the checkpoint's.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=model.config,
    ignore_mismatched_sizes=True,
)

# Train final model with best hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["batch_size"],
    per_device_eval_batch_size=best_params["batch_size"],
    num_train_epochs=best_params["num_epochs"],
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)




[I 2024-12-04 15:13:48,342] A new study created in memory with name: no-name-cd7c8df9-2312-4d94-be4e-9bbc6c8807b7
  trainer = Trainer(
 10%|█         | 4/40 [11:01<1:56:35, 194.32s/it]

In [None]:
# Run the final fine-tuning with the `trainer` built from the study's best hyperparameters.
trainer.train()

In [None]:


# Persist the model and its tokenizer side by side in one directory so the
# pair can be reloaded together by path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("ner_model_optuna")
print("Model saved to 'ner_model_optuna/'.")


In [None]:

# Reload the saved model/tokenizer from disk as an NER pipeline.
ner_pipeline = pipeline(
    task="ner",
    model="ner_model_optuna",
    tokenizer="ner_model_optuna",
    aggregation_strategy="simple",
)

# Run the pipeline on a sample sentence.
input_text = "This company is located in United State sepsificaly in Louisville, Kentucky and remote description: Series of 2 interviews first on soft skills."
predictions = ner_pipeline(input_text)

# Show one detected entity per line.
print("\nPredictions:")
for detected in predictions:
    print(detected)