# Init Environment

In [None]:
# !pip install en_core_web_sm
# !python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_sm

import sys

sys.path.append('../')
from src.config import *
from src.helper_entity import *

import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from spacy.training.example import Example

import random
import warnings
from tqdm import tqdm
import re

# Domain Specific Vocabulary

In [None]:
# Domain vocabulary: maps each entity label to the surface forms that should be
# tagged with it. `export_entity_csv` flattens this mapping into (Label, Entity) rows.
Entities = {
    'COMPANY': ['eni', 'petronas', 'tpao', 'slb', 'cvx', 'equinor', 'Ecopetrol', 'omv', 'int', 'ongc',
                'bp', 'bsp', 'spic', 'chevron', 'mpcl', 'schlumberger', 'santos', 'woodside'],

    'GEOUNIT': ['USL', 'SCA', 'SLR', 'KSA', 'ING', 'EUR', 'EAG', 'APG', 'CHG'],

    'PRODUCT': ['Agile Reservoir Modeling', 'Avocet',
                'Cameron Supplier Document Management', 'Cameron Supplier Portal', 'Cameron Surface Surveillance', 'ConnectedProduction',
                'DELFI RE', 'DNS Management',
                'Data Delivery Services', 'Data Ingestion', 'Data Integration Framework', 'Data Integrator', 'Data Migration', 'Data Science', 'Dataiku DSS',
                'Delfi Help', 'Delfi Opportunity Assessor', 'Delfi Portal', 'Delfi Production Chemical', 'Developer Portal', 'Delfi',
                'DrillOps', 'DrillPlan', 'Drillbench', 'Drilling Insights', 'Drilling Office', 'Drilling Interpretation',
                'ECLIPSE', 'EXP_PS', 'Edge', 'Engine Ecosystem',
                'Enterprise Data Management Agent', 'Enterprise Data Solution', 'Enterprise Data Workspace', 'Enterprise Developer Portal', 'Enterprise Portal',
                'ExplorePlan', 'eSearch',
                'FDPlan', 'FORGAS', 'Facility Planner', 'Flaresim', 'FluidModeler',
                'GAIA', 'GeoX',
                'INTERSECT', 'InnerLogix', 'Integrated Asset Modeler', 'InterACT',
                # The comma after 'Kinetix' matters: without it Python joins the
                # adjacent literals into the single bogus entry 'KinetixLiveQuest'.
                'Kinetix',
                'LiveQuest',
                'MEPO', 'MERAK', 'Malcom',
                'Nasuni', 'NetApp ANF',
                'OFM', 'OLGA', 'OLGA Online', 'OMNI3D', 'Ocean Framework', 'Ocean Plug-ins', 'Ocean Store', 'Omega', 'On Demand Reservoir Simulation', 'Osprey',
                'PIPEFLO', 'PIPESIM', 'PIPESYS', 'PerformView', 'Petrel', 'Petrel Exploration Geology', 'Petrel RE', 'PetroMod',
                'Petrotechnical Suite', 'ProSource', 'ProcessOps', 'ProdOps', 'Production Data Foundation', 'Provisioning & Decommissioning',
                'RP Planner', 'RTDS', 'Rapid Screening', 'Reservoir Analytics', 'RigHour',
                'Seabed', 'Secure Data Exchange', 'Simulation Cluster Manager', 'Studio', 'Symmetry', 'Spotfire',
                'TGX', 'TDI', 'Techlog',
                'VISAGE', 'VISTA',
                'WELLFLO', 'WMS', 'Wellbarrier', 'WinGLink',
                'ZFS',
                ]}

def export_entity_csv():
    """Flatten ``Entities`` into a two-column table and write it to ``entities.csv``.

    One row is produced per (label, surface form) pair found in the
    module-level ``Entities`` mapping. The columns are ``Label`` and
    ``Entity`` (in that order) and the file is written without the index to
    ``{SRC_FOLDER_PATH}/entities.csv``.

    NOTE(review): the data-preparation cell reads ``entities.csv`` from
    ``DATA_FOLDER_PATH_PROCESSED`` rather than ``SRC_FOLDER_PATH`` -- confirm
    whether the file is meant to be moved by hand or the paths should agree.
    """
    import pandas as pd
    from src.config import SRC_FOLDER_PATH

    # Each row is [label, surface form], in the mapping's own order.
    rows = [
        [label, surface_form]
        for label, surface_forms in Entities.items()
        for surface_form in surface_forms
    ]
    table = pd.DataFrame(rows, columns=['Label', 'Entity'])

    table.to_csv(f'{SRC_FOLDER_PATH}/entities.csv', index=False)

https://github.com/prem2017/new-entity-labelling/blob/master/new_entity_labelling.ipynb

# Prepare Training Data

In [None]:
def get_regex_pattern(text):
    r"""Build a case-insensitive, whole-word regex pattern for a literal phrase.

    Args:
        text: The literal phrase to look for. It is passed through
            ``re.escape`` so characters such as ``.``, ``+`` or ``(`` match
            themselves instead of acting as regex operators.

    Returns:
        A pattern string of the form ``(?i)\b<escaped text>\b``.

    Note:
        ``\b`` only matches next to a word character, so a phrase that starts
        or ends with punctuation will not be anchored the way a "whole word"
        match would suggest.
    """
    return rf"(?i)\b{re.escape(text)}\b"

# export_entity_csv()  # run once to (re)generate entities.csv if it is missing

# Load the consolidated dataset only once per session; re-running the cell
# reuses the frame that is already in memory.
if 'df_consolidated' not in locals():
    data_url = f'{DATA_FOLDER_PATH_PROCESSED}/data_consolidated.xlsx'
    df_consolidated = pd.read_excel(data_url)

# Reproducible sample; capped so a dataset with fewer than 2000 rows does not raise.
df_samples = df_consolidated.sample(n=min(2000, len(df_consolidated)), random_state=42)
df_entities = pd.read_csv(f'{DATA_FOLDER_PATH_PROCESSED}/entities.csv')

# Longest entities first, so that e.g. 'Petrel RE' claims its span before the
# shorter 'Petrel' is tried on the same characters.
entity_list = df_entities['Entity'].tolist()
sorted_entities = sorted(entity_list, key=len, reverse=True)

# Entity -> label lookup built once. `setdefault` keeps the first row for a
# duplicated entity, which is what the previous `.iloc[0]` lookup returned.
entity_labels = {}
for entity, label in zip(df_entities['Entity'], df_entities['Label']):
    entity_labels.setdefault(entity, label)

# Compile every pattern once instead of rebuilding it for every sampled row.
entity_patterns = [
    (re.compile(get_regex_pattern(entity)), entity_labels[entity])
    for entity in sorted_entities
]


def _find_entity_spans(text, patterns):
    """Return non-overlapping ``(start, end, label)`` spans found in ``text``.

    Args:
        text: The string to annotate.
        patterns: Iterable of ``(compiled_pattern, label)`` pairs, ordered by
            priority. A match is dropped when it overlaps a span accepted
            from an earlier pattern.

    Returns:
        The accepted spans sorted by start offset. Overlaps are removed
        because spaCy refuses annotations in which two entities share a token.
    """
    spans = []
    for pattern, label in patterns:
        # finditer: annotate every occurrence, not just the first one.
        for match in pattern.finditer(text):
            start, end = match.span()
            if all(end <= taken_start or start >= taken_end
                   for taken_start, taken_end, _ in spans):
                spans.append((start, end, label))
    return sorted(spans)


# Training data in spaCy's (text, {'entities': [(start, end, label), ...]}) form.
training_data = []
for index, row in tqdm(df_samples.iterrows(), total=len(df_samples), desc="Processing data"):
    text = row['Title_Translated']
    entities = _find_entity_spans(text, entity_patterns)
    training_data.append((text, {'entities': entities}))

# Train Labelling Model

In [None]:
# Start from spaCy's large pretrained English pipeline.
nlp = spacy.load("en_core_web_lg")

# Keep a handle on the NER component; the following cells extend and retrain it.
ner = nlp.get_pipe("ner")

# Baseline: run the model on a sample headline before any training and list
# every entity it reports as "<label> <text>".
text = 'exxonmobil to invest $300 million in india to expand refining capacity'
doc = nlp(text)
print("Entities in '%s'" % text)
for found in doc.ents:
    print(found.label_, found.text)

In [None]:
# Snapshot the NER component's move names before touching its label set.
prev_ents = ner.move_names
print('[Existing Entities] = ', ner.move_names)

# Register the label the model should learn.
# NOTE(review): only PRODUCT is registered here, while the training data takes
# its labels from entities.csv -- confirm whether the other labels in that
# file also need `add_label`.
ner.add_label("PRODUCT")
# ner.add_label("GEOUNIT")

# Report only the move names that appeared as a result of the call above.
new_ents = ner.move_names
added_ents = set(new_ents).difference(prev_ents)

print('\n\n[New Entities] = ', list(added_ents))

In [None]:
# Only the components named here stay enabled during training; every other
# pipe is disabled so `nlp.update` leaves it untouched.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

n_iterations = 10  # number of passes over the training data

# Train the model
with nlp.select_pipes(disable=unaffected_pipes):
    # Continue from the pretrained weights. The optimizer is created once so
    # its internal state carries across iterations instead of being reset on
    # every pass.
    optimizer = nlp.resume_training()
    for iteration in tqdm(range(n_iterations), desc="Training"):
        # Shuffle in place so each pass sees the examples in a new order.
        random.shuffle(training_data)
        losses = {}
        # Batch sizes grow from 4 towards 32 by a factor of 1.001 per batch.
        batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # One Example per (text, annotations) pair in the batch.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, losses=losses, drop=0.0, sgd=optimizer)
        print("Losses", losses)


In [None]:
# Sanity check after training: the sample sentence now mentions a product name,
# and every entity the model reports is printed as "<label> <text>".
text = 'exxonmobil to invest $300 million in india to expand refining capacity using Petrel software'
doc = nlp(text)
print("Entities in '%s'" % text)
for found in doc.ents:
    print(found.label_, found.text)

In [None]:
# Continue training from the current weights rather than re-initialising them.
optimizer = nlp.resume_training()

# Every pipe except the NER is disabled while training.
pipe_exceptions = ["ner"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Defined here so the cell does not depend on a `losses` dict left behind by
# an earlier cell.
losses = {}

# Start the training process with the new label
with nlp.select_pipes(disable=other_pipes):
    for _ in range(3):  # Adjust the number of iterations as needed
        for text, annotations in tqdm(training_data, total=len(training_data), desc="Training model"):
            example = Example.from_dict(nlp.make_doc(text), annotations)
            # The update goes through the pipeline; the optimizer is handed in
            # via `sgd` and has no `update` method of its own to call.
            nlp.update([example], sgd=optimizer, losses=losses, drop=0.3)

# # train NER for 30 iterations
# losses = {}

# for batch in tqdm(minibatch(training_data, size=2), total=len(df_samples)//2, desc="Training model"):
#     for text, annotations in batch:
#         # create Example
#         doc = nlp.make_doc(text)
#         example = Example.from_dict(doc, annotations)
#         # Update the model
#         nlp.update([example], losses=losses, drop=0.3)
    # print("Losses", losses)

In [None]:
# Run the model over every training text and show the ones where it finds
# at least one entity.
for text, _ in training_data:
    doc = nlp(text)
    if not doc.ents:
        continue
    # The text itself, wrapped in ANSI colour codes (bold blue on black).
    print(f'\033[1;34;40m {text}\033[0m')
    # Every entity found, whatever its label, as (text, label) pairs.
    print([(ent.text, ent.label_) for ent in doc.ents])
    # For token-level debugging:
    # print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])