# Datetime and Location extraction using Finetuned Encoder Model

## Install the packages

In [1]:
!pip install --upgrade transformers datasets accelerate
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'



## Load the data

In [2]:
import pandas as pd
from datasets import Dataset, ClassLabel
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForTokenClassification

# Load the dataset
df = pd.read_csv('data/encoder-ner.csv', converters={'tokens': eval, 'ner_tags': eval})
data = Dataset.from_pandas(df)

2024-06-06 07:48:29.598996: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
## Visualize the data

In [12]:
data[4]

{'tokens': ['Satellite',
  'images',
  'of',
  'burn',
  'scars',
  'in',
  'Montana',
  'from',
  'August',
  '14',
  ',',
  '2023',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 0]}

In [13]:
## Format the data for finetuning

In [None]:
# Define unique labels
unique_labels = ['O', 'DATE', 'LOCATION']  # add all your labels here
label_dict = {label: i for i, label in enumerate(unique_labels)}

# Update the dataset with encoded labels
def encode_labels(examples):
    try:
        return {'ner_tags': [label_dict[label] for label in examples['ner_tags']]}
    except:
        return None

data = data.map(encode_labels)

def convert_list(input_list):
    output_list = input_list
    for i in range(2, len(output_list) - 1):
        if output_list[i - 2] == 1 and output_list[i - 1] == 1 and output_list[i] == 0 and output_list[i + 1] == 1:
            output_list[i] = 1
            break
    return output_list



def consolidate_labels(dataset):
    dataset["ner_tags"] = convert_list(dataset["ner_tags"])
        
    return dataset

data = data.map(consolidate_labels)

In [14]:
## Load tokenizer

In [5]:
# Load tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

# Function to tokenize and align labels
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        print(label, word_ids)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            try:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(label[word_idx] if label_all_tokens else -100)
                previous_word_idx = word_idx  # Update previous_word_idx inside the loop
            except Exception as e:  # Use Exception instead of error
                continue
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, batch_size=128)
data_collator = DataCollatorForTokenClassification(tokenizer)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 0] [None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None]
[0, 0, 0, 2, 0, 0, 0, 1, 1, 1, 1, 0] [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, None]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 0] [None, 0, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11, None]
[0, 0, 0, 2, 0, 1, 1, 1, 1, 0] [None, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, None]
[0, 0, 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 0] [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, None]
[1, 1, 1, 1, 0, 0, 0, 0, 2, 0] [None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, None]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 0] [None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, None]
[0, 1, 1, 1, 1, 0, 0, 0, 0, 2, 0, 2, 0] [None, 0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, None]
[0, 0, 0, 2, 0, 0, 0, 1, 1, 1, 1, 0] [None, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, None]
[1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 0] [None, 0, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 10, None]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 0] [None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 

In [6]:
tokenized_datasets[1]

{'tokens': ['Burn',
  'scars',
  'in',
  'Oregon',
  ',',
  'noted',
  'on',
  'April',
  '17',
  ',',
  '2023',
  '.'],
 'ner_tags': [0, 0, 0, 2, 0, 0, 0, 1, 1, 1, 1, 0],
 'input_ids': [0,
  7960,
  26172,
  11,
  4316,
  2156,
  1581,
  15,
  587,
  601,
  2156,
  291,
  1922,
  479,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 0, 2, 0, 0, 0, 1, 1, 1, 1, 1, 0, -100]}

In [15]:
## Create Huggingface Model

In [7]:
from transformers import RobertaForTokenClassification, Trainer, TrainingArguments

model = RobertaForTokenClassification.from_pretrained(
    'roberta-base',
    num_labels=len(unique_labels) # This should match your total number of NER tags
)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
## Define Training Arguments

In [8]:
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch
    learning_rate=2e-4,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # Assuming you have a train split
    eval_dataset=tokenized_datasets,  # Assuming you have a train split
    data_collator = data_collator
)

trainer.train()


Detected kernel version 4.14.343, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss
1,No log,0.223636
2,No log,0.188638
3,No log,0.128659
4,No log,0.086164
5,No log,0.046331
6,No log,0.021326
7,No log,0.013624
8,No log,0.024029
9,No log,0.007152
10,No log,0.005498


TrainOutput(global_step=130, training_loss=0.13096021505502553, metrics={'train_runtime': 138.3168, 'train_samples_per_second': 7.157, 'train_steps_per_second': 0.94, 'total_flos': 8461600380360.0, 'train_loss': 0.13096021505502553, 'epoch': 10.0})

In [17]:
## Save Model

In [18]:
model.save_pretrained('./finetuned_roberta_ner')
tokenizer.save_pretrained('./finetuned_roberta_ner')


('./finetuned_roberta_ner/tokenizer_config.json',
 './finetuned_roberta_ner/special_tokens_map.json',
 './finetuned_roberta_ner/vocab.json',
 './finetuned_roberta_ner/merges.txt',
 './finetuned_roberta_ner/added_tokens.json',
 './finetuned_roberta_ner/tokenizer.json')

In [19]:
## Test model with examples

In [27]:
text = "Crop types in Spain durin 23 April, 2023."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
print(inputs)
for k, v in inputs.items():
    print(k, v)

{'input_ids': tensor([[    0,   230,  6884,  3505,    11,  2809, 17373,   179,   883,   587,
             6,   291,  1922,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
input_ids tensor([[    0,   230,  6884,  3505,    11,  2809, 17373,   179,   883,   587,
             6,   291,  1922,     4,     2]])
attention_mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [28]:
from transformers import pipeline

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)



In [29]:
ner_classes = ner_pipeline(text)
print(ner_classes)

[{'entity': 'LABEL_0', 'score': 0.99981743, 'index': 1, 'word': 'ĠC', 'start': 0, 'end': 1}, {'entity': 'LABEL_0', 'score': 0.99984944, 'index': 2, 'word': 'rop', 'start': 1, 'end': 4}, {'entity': 'LABEL_0', 'score': 0.9998709, 'index': 3, 'word': 'Ġtypes', 'start': 5, 'end': 10}, {'entity': 'LABEL_0', 'score': 0.9998066, 'index': 4, 'word': 'Ġin', 'start': 11, 'end': 13}, {'entity': 'LABEL_2', 'score': 0.9999361, 'index': 5, 'word': 'ĠSpain', 'start': 14, 'end': 19}, {'entity': 'LABEL_0', 'score': 0.9998373, 'index': 6, 'word': 'Ġdur', 'start': 20, 'end': 23}, {'entity': 'LABEL_0', 'score': 0.99967325, 'index': 7, 'word': 'in', 'start': 23, 'end': 25}, {'entity': 'LABEL_1', 'score': 0.9992142, 'index': 8, 'word': 'Ġ23', 'start': 26, 'end': 28}, {'entity': 'LABEL_1', 'score': 0.99935466, 'index': 9, 'word': 'ĠApril', 'start': 29, 'end': 34}, {'entity': 'LABEL_1', 'score': 0.99947006, 'index': 10, 'word': ',', 'start': 34, 'end': 35}, {'entity': 'LABEL_1', 'score': 0.9980672, 'index': 1

In [24]:
text = "Crop types in Spain as of yesterday."
ner_classes = ner_pipeline(text)
print(ner_classes)

[{'entity': 'LABEL_0', 'score': 0.9996712, 'index': 1, 'word': 'ĠC', 'start': 0, 'end': 1}, {'entity': 'LABEL_0', 'score': 0.99229616, 'index': 2, 'word': 'rop', 'start': 1, 'end': 4}, {'entity': 'LABEL_0', 'score': 0.9997447, 'index': 3, 'word': 'Ġtypes', 'start': 5, 'end': 10}, {'entity': 'LABEL_0', 'score': 0.9997352, 'index': 4, 'word': 'Ġin', 'start': 11, 'end': 13}, {'entity': 'LABEL_2', 'score': 0.99991083, 'index': 5, 'word': 'ĠSpain', 'start': 14, 'end': 19}, {'entity': 'LABEL_0', 'score': 0.9928919, 'index': 6, 'word': 'Ġas', 'start': 20, 'end': 22}, {'entity': 'LABEL_0', 'score': 0.87052906, 'index': 7, 'word': 'Ġof', 'start': 23, 'end': 25}, {'entity': 'LABEL_0', 'score': 0.98498046, 'index': 8, 'word': 'Ġyesterday', 'start': 26, 'end': 35}, {'entity': 'LABEL_0', 'score': 0.99229455, 'index': 9, 'word': '.', 'start': 35, 'end': 36}]


In [25]:
ner_classes

[{'entity': 'LABEL_0',
  'score': 0.9996712,
  'index': 1,
  'word': 'ĠC',
  'start': 0,
  'end': 1},
 {'entity': 'LABEL_0',
  'score': 0.99229616,
  'index': 2,
  'word': 'rop',
  'start': 1,
  'end': 4},
 {'entity': 'LABEL_0',
  'score': 0.9997447,
  'index': 3,
  'word': 'Ġtypes',
  'start': 5,
  'end': 10},
 {'entity': 'LABEL_0',
  'score': 0.9997352,
  'index': 4,
  'word': 'Ġin',
  'start': 11,
  'end': 13},
 {'entity': 'LABEL_2',
  'score': 0.99991083,
  'index': 5,
  'word': 'ĠSpain',
  'start': 14,
  'end': 19},
 {'entity': 'LABEL_0',
  'score': 0.9928919,
  'index': 6,
  'word': 'Ġas',
  'start': 20,
  'end': 22},
 {'entity': 'LABEL_0',
  'score': 0.87052906,
  'index': 7,
  'word': 'Ġof',
  'start': 23,
  'end': 25},
 {'entity': 'LABEL_0',
  'score': 0.98498046,
  'index': 8,
  'word': 'Ġyesterday',
  'start': 26,
  'end': 35},
 {'entity': 'LABEL_0',
  'score': 0.99229455,
  'index': 9,
  'word': '.',
  'start': 35,
  'end': 36}]

In [30]:
## Conver NER format to JSON format

def ner_to_dict(ner_result):
    result = dict()
    for item in ner_result:
        if item['entity'] in ["LABEL_1", "LABEL_2"]:
            if item['entity'] in result.keys():
                result[item['entity']] += item['word']
            else:
                result[item['entity']] = ""
                result[item['entity']] += item['word']
    old_keys = list(result.keys())
    
    for key in old_keys:
        new_key = unique_labels[int(key[-1])]
        result[new_key] = result[key]
        del result[key]
    
    for key, val in result.items():
        result[key] = val.replace('Ġ', ' ')[1:]

    return result

print(ner_to_dict(ner_classes))

{'LOCATION': 'Spain', 'DATE': '23 April, 2023'}
