In [1]:
import re
import pickle
from gensim.models import FastText
from tensorflow.keras.models import load_model
from sentence_transformers import SentenceTransformer

import spacy
from spacy.tokens import Span
from spacy import displacy

# Blank English pipeline object (this creates a pipeline, not a Doc).
# NOTE(review): `nlp` is rebound to spacy.load("en_core_web_sm") in a later
# cell, so under top-to-bottom execution this object is superseded before
# any function in this notebook uses it.
nlp = spacy.blank("en")

def load_artifacts():
    """Load every serialized artifact the NER pipeline depends on.

    Returns:
        tuple: ``(one_hot_encoder, NER_model, embedding_model, label_encoder)``,
        in that order: the unpickled encoder from ``one_hot_encoder.pkl``, the
        Keras model loaded from ``../models/NER_tensorflow_3_input_model/``,
        the ``all-MiniLM-L6-v2`` SentenceTransformer, and the unpickled encoder
        from ``label_encoder.pkl``.

    Raises:
        FileNotFoundError: if either pickle file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file.
    # Only load artifacts that come from a trusted training run.
    # The context managers close each file handle deterministically; the
    # previous bare open(...) calls left both handles open.
    with open("../models/one_hot_encoder.pkl", 'rb') as encoder_file:
        one_hot_encoder = pickle.load(encoder_file)
    NER_model = load_model('../models/NER_tensorflow_3_input_model/')
    with open("../models/label_encoder.pkl", 'rb') as label_file:
        label_encoder = pickle.load(label_file)
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    return one_hot_encoder, NER_model, embedding_model, label_encoder

# Load the artifacts once at notebook level; the functions defined in later
# cells read these four names as globals.
one_hot_encoder, NER_model, embedding_model, label_encoder = load_artifacts()

In [2]:
import contractions

def clean_text(text):
    """Normalize raw text into single-space-separated alphanumeric tokens.

    Processing steps, in order:
      1. expand contractions word by word with ``contractions.fix``;
      2. delete every character that is not an ASCII letter, digit or space;
      3. collapse runs of spaces and strip leading/trailing spaces.

    Args:
        text (str): raw input sentence.

    Returns:
        str: the cleaned text; ``""`` when nothing survives cleaning.
    """
    expanded = ' '.join(contractions.fix(word) for word in text.split())
    stripped = re.sub("[^a-zA-Z0-9 ]", "", expanded)
    # A token made only of punctuation (e.g. "-") disappears in the step above
    # and leaves two adjacent spaces. Downstream code mixes str.split() with
    # other tokenizers, so whitespace is normalized here to keep a single,
    # unambiguous separator between tokens.
    return ' '.join(stripped.split())

In [3]:
import spacy

# Rebinds the notebook-level `nlp` (previously a blank pipeline) to the
# pretrained "en_core_web_sm" pipeline; fetch_pos_tag below reads token.tag_
# from documents produced by it.
nlp = spacy.load("en_core_web_sm")

def fetch_pos_tag(text):
    """Return one fine-grained POS tag per whitespace-separated token.

    Args:
        text (str): text whose tokens are defined by ``text.split()``.

    Returns:
        np.ndarray: array of ``token.tag_`` strings reshaped to one column,
        i.e. shape ``(n_tokens, 1)``.
    """
    # Local import: only Span is imported from spacy.tokens at file level.
    from spacy.tokens import Doc

    words = text.split()
    # Build the Doc from the same whitespace tokens that the embedding step
    # uses. Letting spaCy tokenize the raw string may split a single word into
    # several tokens (or emit whitespace tokens), which yields a different
    # number of tags than tokens and misaligns the model inputs.
    doc = Doc(nlp.vocab, words=words)
    # Run each pipeline component on the pre-tokenized Doc, skipping only the
    # tokenizer step that nlp(text) would otherwise perform.
    for _, component in nlp.pipeline:
        doc = component(doc)
    tags = np.array([token.tag_ for token in doc]).reshape(-1, 1)
    return tags

In [4]:
import numpy as np

def generate_vectors(text):
    """Embed each whitespace token and pair it with the sentence embedding.

    Args:
        text (str): text whose tokens are defined by ``text.split()``.

    Returns:
        tuple: ``(token_embeddings, sentence_embeddings)``. Both arrays have
        one row per token; every row of ``sentence_embeddings`` is the same
        vector, ``embedding_model.encode(text)``. For text without tokens both
        are empty arrays and the embedding model is not called.
    """
    tokens = text.split()
    token_embeddings = np.array([embedding_model.encode(token) for token in tokens])
    if not tokens:
        # Keep the no-token result unchanged: empty arrays, no encode call.
        return token_embeddings, np.array([])
    # The sentence embedding does not depend on the token, so encode it once
    # and repeat the row instead of re-encoding the sentence for every token.
    sentence_vector = np.asarray(embedding_model.encode(text))
    sentence_embeddings = np.tile(sentence_vector, (len(tokens), 1))
    return token_embeddings, sentence_embeddings

In [16]:
text = """I am watching a movie on the TV, can you tell Nathan to please be quiet?"""

In [17]:
# Map BIO tags to character-level spans
def bio_to_offsets(text, tags):
    """Convert per-token BIO tags into character-level entity spans.

    Args:
        text (str): the text that was tagged; tokens are ``text.split()``.
        tags (iterable of str): one tag per token. ``"B-<type>"`` opens an
            entity, ``"I-<type>"`` extends the currently open entity, and any
            other value closes it. An ``"I-"`` tag with no open entity is
            treated as outside. Tokens and tags are paired with ``zip``, so
            surplus items on either side are ignored.

    Returns:
        list of tuple: ``(start, end, entity_type)`` per entity in text order,
        where ``text[start:end]`` is the entity surface (``end`` exclusive).
    """
    entities = []
    start, end, entity_type = None, None, None
    cursor = 0  # position in `text` just past the previously consumed token

    for token, tag in zip(text.split(), tags):
        # Locate the token in the real text instead of assuming that tokens
        # are separated by exactly one space. Searching from the cursor also
        # avoids re-joining the whole prefix for every token.
        token_start = text.find(token, cursor)
        token_end = token_start + len(token)
        cursor = token_end

        if tag.startswith("B-"):  # Beginning of a new entity
            if start is not None:
                # A new entity starts right after another one: flush it first.
                entities.append((start, end, entity_type))
            start, end = token_start, token_end
            # maxsplit=1 keeps entity types that themselves contain a hyphen.
            entity_type = tag.split("-", 1)[1]
        elif tag.startswith("I-") and start is not None:  # Inside entity
            end = token_end
        elif start is not None:  # Outside entity: close the open one
            entities.append((start, end, entity_type))
            start, end, entity_type = None, None, None

    if start is not None:  # Entity still open at the end of the text
        entities.append((start, end, entity_type))
    return entities

# # Convert BIO tags to offsets
# entity_offsets = bio_to_offsets(tokens, tags)

# # Create spaCy Doc with entities
# nlp = spacy.blank("en")
# doc = nlp(text)

# # Add entities to doc
# ents = [Span(doc, doc.char_span(start, end).start, doc.char_span(start, end).end, label=label) 
#         for start, end, label in entity_offsets if doc.char_span(start, end)]
# doc.ents = ents

# # Define custom colors for entity types
# colors = {
#     "per": "#a781f9",
#     "tim": "#e59edb",
#     "gpe": "#faa419",
#     "geo": "#80e5d9",
#     "org": "#4ea8de",
#     "art": "#d3c8a8",
#     "nat": "#81c784",
#     "eve": "#ffb74d"
# }
# options = {"ents": list(colors.keys()), "colors": colors}

# # Visualize with displacy
# displacy.render(doc, style="ent", options=options, jupyter=True)


In [14]:
def main(text):
    """Run the NER pipeline on ``text`` and render the entities with displaCy.

    The text is cleaned, embedded per token and per sentence, POS-tagged and
    one-hot encoded, then passed to ``NER_model``. The arg-max class of each
    row is mapped back to a label with ``label_encoder`` and the resulting BIO
    tags are converted to character spans for visualization.

    Args:
        text (str): raw input sentence.

    Returns:
        np.ndarray: the predicted label for each token of the cleaned text;
        an empty object array when cleaning leaves no tokens.
    """
    cleaned_text = clean_text(text)
    if not cleaned_text.split():
        # Nothing left to tag after cleaning: skip the model and the rendering.
        return np.array([], dtype=object)

    token_embedding, sentence_embedding = generate_vectors(cleaned_text)
    pos_tags = one_hot_encoder.transform(fetch_pos_tag(cleaned_text)).toarray()
    class_scores = NER_model.predict([token_embedding, pos_tags, sentence_embedding])
    prediction = label_encoder.inverse_transform(np.argmax(class_scores, axis=1))

    entity_offsets = bio_to_offsets(cleaned_text, prediction)
    # Only tokenization is needed for display: the entities come from
    # NER_model, so running the full spaCy pipeline here would be wasted work.
    doc = nlp.make_doc(cleaned_text)
    ents = []
    for start, end, label in entity_offsets:
        # char_span returns None when the offsets do not line up with spaCy's
        # token boundaries; such entities are left out of the rendering.
        span = doc.char_span(start, end, label=label)
        if span is not None:
            ents.append(span)
    doc.ents = ents

    # Display color for each entity type.
    colors = {
        "per": "#a781f9",
        "tim": "#e59edb",
        "gpe": "#faa419",
        "geo": "#80e5d9",
        "org": "#4ea8de",
        "art": "#d3c8a8",
        "nat": "#81c784",
        "eve": "#ffb74d",
    }
    options = {"ents": list(colors.keys()), "colors": colors}
    displacy.render(doc, style="ent", options=options, jupyter=True)
    return prediction


main(text)





array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
      dtype=object)

In [15]:
text

'I am watching a movie on the TV, can you please be quiet?'

In [164]:
one_hot_encoder.transform([pos_tags])

ValueError: Found array with dim 3. None expected <= 2.

In [65]:
matches = re.finditer('the', text)
[(match.start(), match.end()) for match in matches]

[(31, 34),
 (72, 75),
 (102, 105),
 (162, 165),
 (219, 222),
 (245, 248),
 (317, 320),
 (359, 362),
 (429, 432),
 (485, 488)]

In [58]:
text

"Amara and David strolled along the winding gravel path of Oakwood Park, the crisp autumn air carrying the scent of fallen leaves. Amara, her dark braids catching the sunlight, paused to admire a squirrel darting across the grass. 'Reminds me of the parks back home,' she said softly. David, his blond hair tousled by the breeze, nodded with a smile. ‘This is the perfect escape, isn’t it?’ he replied, watching children laugh as they chased after a kite fluttering like a bird against the clear blue sky."

In [69]:
a = "a b c d e f t t t t"

a = a.replace('t', "", 1)

In [70]:
a

'a b c d e f  t t t'

In [71]:
a = a.replace('t', "", 1)
a

'a b c d e f   t t'

In [72]:
text = "the boy jumped the fence"

# Whitespace tokens, plus a cursor marking where the next search begins
tokens = text.split()
positions = []
current_position = 0

# Record (token, first index, last index) for each token; `end` is inclusive
for token in tokens:
    start = text.index(token, current_position)
    current_position = start + len(token)  # first character after this token
    end = current_position - 1
    positions.append((token, start, end))

# Report one line per token
print("\n".join(
    f"Token: '{token}', Start: {start}, End: {end}" for token, start, end in positions
))

Token: 'the', Start: 0, End: 2
Token: 'boy', Start: 4, End: 6
Token: 'jumped', Start: 8, End: 13
Token: 'the', Start: 15, End: 17
Token: 'fence', Start: 19, End: 23


In [None]:
combined