# Datetime and Location extraction using Finetuned Encoder Model

## Install the packages

In [None]:
!pip install --upgrade transformers datasets accelerate
import os
# Restrict this process to CUDA device 0.
# NOTE(review): these variables are presumably only honoured if set before
# CUDA is initialised (i.e. before torch touches the GPU) — keep this cell first.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Ask CUDA to run kernel launches synchronously so that a device-side error is
# reported at the call that caused it. This is a debugging aid and is expected
# to slow training down — consider removing it for real runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

## Load the data

In [None]:
import pandas as pd
from datasets import Dataset, ClassLabel
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForTokenClassification

# Load the dataset
# The 'tokens' and 'ner_tags' columns are converted from their CSV string form
# with `eval` (presumably they are stored as Python list literals — confirm).
# NOTE(review): `eval` executes whatever expression the cell contains. If this
# CSV can ever come from an untrusted source, switch the converters to
# `ast.literal_eval`, which only accepts literals.
df = pd.read_csv('../data/encoder-ner.csv', converters={'tokens': eval, 'ner_tags': eval})
# Wrap the DataFrame in a Hugging Face `Dataset` so `.map` can be used below.
data = Dataset.from_pandas(df)

In [None]:
## Visualize the data

In [None]:
# Display the example at index 4 to sanity-check the loaded columns.
data[4]

In [None]:
## Format the data for finetuning

In [None]:
# Label vocabulary: a label's position in this list is its integer class id.
unique_labels = [
    'O',
    'DATE',
    'LOCATION',
]  # add all your labels here
# Reverse lookup: label name -> integer class id.
label_dict = dict(zip(unique_labels, range(len(unique_labels))))

# Update the dataset with encoded labels
def encode_labels(examples):
    """Map the string NER tags of one example to their integer ids.

    Args:
        examples: Mapping with a 'ner_tags' entry holding an iterable of label
            names; each name is looked up in the module-level `label_dict`.

    Returns:
        ``{'ner_tags': [...]}`` with one integer id per tag, or ``None`` when
        the tags cannot be encoded (no 'ner_tags' entry, a tag missing from
        `label_dict`, or a value that is not an iterable of hashable labels).
    """
    # Only lookup/type failures mean "cannot encode". Anything else (for
    # example KeyboardInterrupt) now propagates instead of being swallowed by
    # a bare `except`.
    try:
        return {'ner_tags': [label_dict[label] for label in examples['ner_tags']]}
    except (KeyError, TypeError):
        return None

# Run encode_labels over the dataset; without `batched=True` it is called once
# per example and its returned dict replaces the 'ner_tags' column value.
data = data.map(encode_labels)

def convert_list(input_list):
    """Fill the first one-token gap inside a run of label id 1.

    Scans for the first index ``i`` whose neighbourhood reads ``1, 1, 0, 1``
    (positions ``i-2`` to ``i+1``) and sets position ``i`` to 1. Only that
    first gap is filled; later matching gaps are left as they are.

    Args:
        input_list: Sequence of integer label ids. It is not modified.

    Returns:
        A new list with the same labels, with at most one 0 turned into 1.
    """
    # Work on a copy: a plain assignment would alias the caller's list and
    # silently mutate it in place.
    output_list = list(input_list)
    for i in range(2, len(output_list) - 1):
        if output_list[i - 2:i + 2] == [1, 1, 0, 1]:
            output_list[i] = 1
            break  # deliberately stop after the first filled gap
    return output_list



def consolidate_labels(dataset):
    """Run `convert_list` over one example's 'ner_tags' and store the result.

    The example mapping is updated in place and also returned, so the function
    can be passed directly to `Dataset.map`.
    """
    merged_tags = convert_list(dataset["ner_tags"])
    dataset["ner_tags"] = merged_tags
    return dataset

# Apply the gap-filling step to every example's 'ner_tags'.
data = data.map(consolidate_labels)

In [None]:
## Load tokenizer

In [None]:
# Load tokenizer
# `add_prefix_space=True` is set because the tokenizer is later called with
# pre-split words (`is_split_into_words=True`).
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

# Function to tokenize and align labels
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align word-level labels to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per example)
            and "ner_tags" (one list of per-word label ids per example).
        label_all_tokens: When True, every sub-token of a word gets the word's
            label. When False, only the first sub-token gets it and the
            remaining sub-tokens get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list per
        example, with exactly one label per produced token. Tokens that belong
        to no word (special tokens) and tokens of words that have no label in
        "ner_tags" are set to -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx >= len(label):
                # Special token, or a word with no label: mark it explicitly
                # so the label list keeps the same length as the token list
                # (the old silent `except` dropped the position instead).
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or every sub-token when requested.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize the whole dataset in batches of 128 examples.
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True, batch_size=128)
# Collator that batches the tokenized examples (inputs and labels) for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Display the tokenized example at index 1 to check the aligned labels.
tokenized_datasets[1]

In [None]:
## Create Huggingface Model

In [None]:
from transformers import RobertaForTokenClassification, Trainer, TrainingArguments

# Start from the pretrained 'roberta-base' checkpoint with a token
# classification head sized to the label vocabulary.
# NOTE(review): no id2label/label2id mapping is passed, so predictions are
# presumably reported under generic names such as "LABEL_1" — the
# post-processing code in this notebook matches on those names.
model = RobertaForTokenClassification.from_pretrained(
    'roberta-base',
    num_labels=len(unique_labels) # This should match your total number of NER tags
)


In [None]:
## Define Training Arguments

In [None]:
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; since this notebook installs the latest version, confirm the
# keyword below is still accepted.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch
    learning_rate=2e-4,
    num_train_epochs=10,
    weight_decay=0.01,
)

# NOTE(review): the same dataset is used for training and evaluation, so the
# evaluation loss does not measure generalisation — consider holding out a
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # Full tokenized dataset (no separate split is created)
    eval_dataset=tokenized_datasets,  # Same data as train_dataset, not a held-out split
    data_collator = data_collator
)

# Run fine-tuning with the arguments above.
trainer.train()


In [None]:
## Save Model

In [None]:
# Write the fine-tuned model and its tokenizer to the same directory so they
# can be reloaded together.
model.save_pretrained('./finetuned_roberta_ner')
tokenizer.save_pretrained('./finetuned_roberta_ner')


In [None]:
## Test model with examples

In [None]:
# Sample query used to inspect the tokenizer output.
# NOTE(review): "durin" looks like a typo; it may be deliberate noisy input —
# confirm before changing it.
text = "Crop types in Spain durin 23 April, 2023."
# Tokenize as PyTorch tensors, truncating to at most 512 tokens.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
print(inputs)
# Print each field of the encoding separately.
for k, v in inputs.items():
    print(k, v)

In [None]:
from transformers import pipeline

# Build a token-classification ("ner") pipeline around the fine-tuned model
# and its tokenizer.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)



In [None]:
# Run the pipeline on the sample text and print the raw per-token predictions.
ner_classes = ner_pipeline(text)
print(ner_classes)

In [None]:
# Second sample: a relative date expression ("yesterday") instead of an
# explicit calendar date.
text = "Crop types in Spain as of yesterday."
ner_classes = ner_pipeline(text)
print(ner_classes)

In [None]:
# Display the most recent pipeline output.
ner_classes

In [None]:
## Convert NER format to JSON format

def ner_to_dict(ner_result):
    """Collapse per-token NER predictions into one text string per label.

    Args:
        ner_result: Iterable of prediction dicts, each with an 'entity' name
            of the form ``"LABEL_<id>"`` and a 'word' holding the token text.

    Returns:
        Dict mapping a label name from the module-level `unique_labels` to the
        concatenated text of all tokens predicted with that label, in order of
        first appearance. Label id 0 (the first entry of `unique_labels`), ids
        outside `unique_labels`, and entity names of any other form are
        ignored. All tokens sharing a label are joined into a single string,
        even when they are not adjacent in the input.
    """
    pieces = {}
    for item in ner_result:
        prefix, _, index_text = item['entity'].rpartition('_')
        if prefix != 'LABEL':
            continue
        # Parse the whole numeric suffix so that label ids >= 10 also work.
        try:
            index = int(index_text)
        except ValueError:
            continue
        if index <= 0 or index >= len(unique_labels):
            continue
        pieces.setdefault(unique_labels[index], []).append(item['word'])

    result = {}
    for name, words in pieces.items():
        # 'Ġ' is the tokenizer's marker for a leading space.
        text = ''.join(words).replace('Ġ', ' ')
        # Drop only a real leading space; never chop a genuine first character.
        result[name] = text[1:] if text.startswith(' ') else text
    return result

# Show the label -> text summary for the most recent pipeline output.
print(ner_to_dict(ner_classes))