In [418]:
import re

import nltk

import numpy as np
import pandas as pd
import lightgbm as lgbm
import seaborn as sns
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

from nltk.corpus import stopwords

from IPython.display import Markdown, display

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

In [420]:
checkpoint = "intfloat/multilingual-e5-base"

embedding_model = SentenceTransformer(checkpoint)

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [421]:
data = pd.read_csv('data/processed_data.csv')
sentence_data = pd.read_csv('data/sentence_df.csv').sample(frac = 0.05)
combined_data = data.merge(sentence_data, on = 'Sentence #', how = 'inner')
combined_data

Unnamed: 0,index,Sentence #,Word,POS,Tag,is_stopword,contains_special_char,contains_numerical_char,POS_contains_spec_char,POS_clean,list_,length
0,524,Sentence: 23,Iraqi,JJ,B-gpe,No,Yes,No,False,JJ,Iraqi military officials say tanks and troops ...,25
1,525,Sentence: 23,military,JJ,O,No,Yes,No,False,JJ,Iraqi military officials say tanks and troops ...,25
2,526,Sentence: 23,officials,NNS,O,No,Yes,No,False,NNS,Iraqi military officials say tanks and troops ...,25
3,527,Sentence: 23,say,VBP,O,No,Yes,No,False,VBP,Iraqi military officials say tanks and troops ...,25
4,528,Sentence: 23,tanks,NNS,O,No,Yes,No,False,NNS,Iraqi military officials say tanks and troops ...,25
...,...,...,...,...,...,...,...,...,...,...,...,...
47804,1048393,Sentence: 47950,year,NN,O,No,Yes,No,False,NN,The Joint Coordination and Monitoring Board Mo...,25
47805,1048395,Sentence: 47950,most,JJS,O,Yes,Yes,No,False,JJS,The Joint Coordination and Monitoring Board Mo...,25
47806,1048396,Sentence: 47950,of,IN,O,Yes,Yes,No,False,IN,The Joint Coordination and Monitoring Board Mo...,25
47807,1048397,Sentence: 47950,them,PRP,O,Yes,Yes,No,False,PRP,The Joint Coordination and Monitoring Board Mo...,25


In [422]:
train_sentence_ids, test_sentence_ids = train_test_split(combined_data['Sentence #'].unique(), test_size = 0.30)

In [423]:
train_set = combined_data.loc[combined_data['Sentence #'].isin(train_sentence_ids)]
test_set = combined_data.loc[combined_data['Sentence #'].isin(test_sentence_ids)]

In [396]:
# import pickle

# with open('models/token_embedding_dict', 'rb') as file:
#     token_embedding_dict = pickle.load(file)


# with open('models/sentence_embedding_dict', 'rb') as file:
#     sentence_embedding_dict = pickle.load(file)

In [424]:
from tqdm import tqdm

one_hot_encoder = OneHotEncoder(handle_unknown = 'ignore')
one_hot_encoder.fit(combined_data[['POS_clean']])

token_embedding_dict = {token:embedding_model.encode(token) for token in tqdm(combined_data['Word'].unique())}

import pickle

with open('models/token_embedding_dict_e5_base', 'wb') as file:
    pickle.dump(token_embedding_dict, file)

100%|███████████████████████████████████████| 8220/8220 [02:31<00:00, 54.34it/s]


In [425]:
sentence_embedding_dict = {sentence:embedding_model.encode(sentence) for sentence in tqdm(combined_data['list_'].unique())}

with open('models/sentence_embedding_dict', 'wb') as file:
    pickle.dump(sentence_embedding_dict, file)

100%|███████████████████████████████████████| 2398/2398 [01:11<00:00, 33.50it/s]


In [426]:
from tqdm import tqdm

train_token_embeddings = np.array([token for token in train_set['Word'].map(token_embedding_dict)])
train_sentence_embeddings = np.array([sentence for sentence in train_set['list_'].map(sentence_embedding_dict)])
train_pos_vector = one_hot_encoder.transform(train_set[['POS_clean']])

test_token_embeddings = np.array([token for token in test_set['Word'].map(token_embedding_dict)])
test_sentence_embeddings = np.array([sentence for sentence in test_set['list_'].map(sentence_embedding_dict)])
test_pos_vector = one_hot_encoder.transform(test_set[['POS_clean']])

In [427]:
test_token_embeddings.shape

(14133, 768)

In [428]:
label_encoder = LabelEncoder()
label_encoder.fit(combined_data['Tag'])

y_train = label_encoder.transform(train_set['Tag']).reshape(-1, 1)
y_test = label_encoder.transform(test_set['Tag']).reshape(-1, 1)

In [429]:
y_train.reshape(-1, 1)

array([[16],
       [ 7],
       [16],
       ...,
       [16],
       [ 7],
       [15]])

In [436]:
from tensorflow.keras import *
token_input = layers.Input(shape=(768,))  # Token embeddings
pos_input = layers.Input(shape=(32,))  # POS one-hot
sentence_input = layers.Input(shape=(768,))  # Sentence embedding

combined_features = layers.Concatenate()([token_input, pos_input, sentence_input])
dot_product = layers.Dot(axes = 1)([token_input, sentence_input])
combined_features_with_dot = layers.Concatenate()([combined_features, dot_product])


pre_output = layers.Dense(units=128)(combined_features_with_dot)
output = layers.Dense(17, activation='softmax')(pre_output)

model = Model(inputs = [token_input, pos_input, sentence_input], outputs = output)

In [437]:
model.compile(
    loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'], optimizer = 'Adam'
)

In [438]:
history = model.fit(
    [train_token_embeddings, train_pos_vector, train_sentence_embeddings], y_train,
    validation_data = ([test_token_embeddings, test_pos_vector, test_sentence_embeddings], y_test), epochs=10
)

Epoch 1/10


2024-12-19 10:31:51.074605: I tensorflow/core/common_runtime/executor.cc:1210] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype float and shape [33676,768]
	 [[{{node Placeholder/_2}}]]
2024-12-19 10:31:51.074965: I tensorflow/core/common_runtime/executor.cc:1210] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [33676,768]
	 [[{{node Placeholder/_0}}]]




2024-12-19 10:31:52.453503: I tensorflow/core/common_runtime/executor.cc:1210] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [14133,768]
	 [[{{node Placeholder/_0}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [439]:
import spacy

nlp = spacy.load("en_core_web_sm")

def fetch_pos_tag(text):
    tags = np.array([token.tag_ for token in nlp(text)]).reshape(-1, 1)
    return tags
    
import contractions

def clean_text(text):
    text = ' '.join([contractions.fix(word) for word in text.split()])
    text = re.sub("[^a-zA-Z0-9 ]", "", text)
    return text    

text = """Praise is hugging Nathan, such love."""

cleaned_text = clean_text(text)
cleaned_text

'Praise is hugging Nathan such love'

In [440]:
oof_pos_tags = one_hot_encoder.transform(fetch_pos_tag(cleaned_text)).toarray()
oof_pos_tags



array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [441]:
token_embs = np.array([embedding_model.encode(token) for token in cleaned_text.split()])
token_embs

array([[ 0.01119403,  0.0317075 , -0.024415  , ..., -0.06224062,
        -0.03176974,  0.04287673],
       [-0.00859165,  0.03210774,  0.00502753, ..., -0.02235029,
        -0.0382512 ,  0.01503805],
       [ 0.02012217,  0.04156946, -0.00516126, ..., -0.05958466,
        -0.04319851,  0.04035001],
       [ 0.0037807 ,  0.02059454, -0.02499169, ..., -0.0433039 ,
        -0.03441045,  0.00815656],
       [-0.00976721,  0.01835852, -0.00804532, ..., -0.01832934,
        -0.04130954,  0.00316312],
       [-0.0018905 ,  0.02066201, -0.00041843, ..., -0.05644192,
        -0.01762111,  0.00366273]], dtype=float32)

In [442]:
sent_embs = embedding_model.encode(cleaned_text)

final_sent_embs = np.array([sent_embs for i in range(len(cleaned_text.split()))])

final_sent_embs

array([[ 0.00320957,  0.01888477, -0.0182494 , ..., -0.10023823,
        -0.04328684,  0.03327112],
       [ 0.00320957,  0.01888477, -0.0182494 , ..., -0.10023823,
        -0.04328684,  0.03327112],
       [ 0.00320957,  0.01888477, -0.0182494 , ..., -0.10023823,
        -0.04328684,  0.03327112],
       [ 0.00320957,  0.01888477, -0.0182494 , ..., -0.10023823,
        -0.04328684,  0.03327112],
       [ 0.00320957,  0.01888477, -0.0182494 , ..., -0.10023823,
        -0.04328684,  0.03327112],
       [ 0.00320957,  0.01888477, -0.0182494 , ..., -0.10023823,
        -0.04328684,  0.03327112]], dtype=float32)

In [443]:
l = label_encoder.inverse_transform(np.argmax(model.predict([token_embs, oof_pos_tags, final_sent_embs]), axis = 1))







In [444]:
{t:l for t, l in zip(cleaned_text.split(), l)}

{'Praise': 'O',
 'is': 'O',
 'hugging': 'O',
 'Nathan': 'B-per',
 'such': 'O',
 'love': 'O'}

In [445]:
train_set.loc[train_set['Word'] == 'Sade']

Unnamed: 0,index,Sentence #,Word,POS,Tag,is_stopword,contains_special_char,contains_numerical_char,POS_contains_spec_char,POS_clean,list_,length


In [446]:
model.save('models/NER_tensorflow_3_input_model')

INFO:tensorflow:Assets written to: models/NER_tensorflow_3_input_model/assets


INFO:tensorflow:Assets written to: models/NER_tensorflow_3_input_model/assets


In [447]:
with open('models/one_hot_encoder.pkl', 'wb') as f:
    pickle.dump(one_hot_encoder, f)

with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [448]:
combined_data.loc[combined_data['Word'] == 'open ai']

Unnamed: 0,index,Sentence #,Word,POS,Tag,is_stopword,contains_special_char,contains_numerical_char,POS_contains_spec_char,POS_clean,list_,length


In [449]:
combined_data['Tag'].unique()

array(['B-gpe', 'O', 'B-geo', 'B-org', 'I-org', 'B-tim', 'B-per', 'I-per',
       'B-art', 'I-art', 'B-eve', 'B-nat', 'I-nat', 'I-geo', 'I-tim',
       'I-eve', 'I-gpe'], dtype=object)

In [342]:
import spacy
from spacy.tokens import Span
from spacy import displacy

# Example text and BIO-tagged tokens
tokens = ["Michael", "R.", "Bloomberg", "launched", "a", "US$85", "million", "campaign",
          "in", "New", "York", "City", "to", "stop", "pollution", "by", "the", "UN", "Special", "Envoy"]

tags = ["B-per", "I-per", "I-per", "O", "O", "B-tim", "I-tim", "O",
        "O", "B-gpe", "I-gpe", "I-gpe", "O", "O", "O", "O", "O", "B-org", "I-org", "I-org"]

# Combine tokens into text
text = " ".join(tokens)

# Map BIO tags to character-level spans
def bio_to_offsets(tokens, tags):
    entities = []
    start, entity_type = None, None
    
    for idx, (token, tag) in enumerate(zip(tokens, tags)):
        if tag.startswith("B-"):  # Beginning of a new entity
            if start is not None:
                # Save previous entity
                entities.append((start, end, entity_type))
            start = len(" ".join(tokens[:idx])) + (1 if idx > 0 else 0)  # Start char
            end = start + len(token)  # End char
            entity_type = tag.split("-")[1]  # Extract entity type
        elif tag.startswith("I-") and start is not None:  # Inside entity
            end = len(" ".join(tokens[:idx+1]))  # Update end char
        else:  # Outside entity
            if start is not None:
                entities.append((start, end, entity_type))
                start, entity_type = None, None

    if start is not None:  # Save last entity
        entities.append((start, end, entity_type))
    return entities

# Convert BIO tags to offsets
entity_offsets = bio_to_offsets(tokens, tags)

# Create spaCy Doc with entities
nlp = spacy.blank("en")
doc = nlp(text)

# Add entities to doc
ents = [Span(doc, doc.char_span(start, end).start, doc.char_span(start, end).end, label=label) 
        for start, end, label in entity_offsets if doc.char_span(start, end)]
doc.ents = ents

# Define custom colors for entity types
colors = {
    "per": "#a781f9",
    "tim": "#e59edb",
    "gpe": "#faa419",
    "geo": "#80e5d9",
    "org": "#4ea8de",
    "art": "#d3c8a8",
    "nat": "#81c784",
    "eve": "#ffb74d"
}
options = {"ents": list(colors.keys()), "colors": colors}

# Visualize with displacy
displacy.render(doc, style="ent", options=options, jupyter=True)
