# Data Loading and Splitting

In [1]:
# Read the dataset 

import pandas as pd

# Read the CSV file containing sentences and tags
dataset_path = "ner_dataset.csv"
ner_df = pd.read_csv(dataset_path, encoding="latin1")


In [2]:
ner_df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


In [3]:
# Split the dataset 
from sklearn.model_selection import train_test_split

# Split the dataset into train, validation, and test sets
train_df, test_df = train_test_split(ner_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Data Preprocessing

In [4]:
# Function to preprocess the data
def preprocess_data(df):
    sentences = []
    ner_tags = []
    current_sentence = []
    current_tags = []

    for index, row in df.iterrows():
        # Check if the Sentence # column is not NaN
        if not pd.isnull(row["Sentence #"]):
            # If it's not a blank sentence, add the current sentence and its tags to the lists
            if current_sentence:
                sentences.append(current_sentence)
                ner_tags.append(current_tags)
            # Reset current sentence and tags for the new sentence
            current_sentence = []
            current_tags = []
        # Add word and tag to the current sentence and tags
        current_sentence.append(row["Word"])
        current_tags.append(row["Tag"])
    
    # Append the last sentence and tags
    if current_sentence:
        sentences.append(current_sentence)
        ner_tags.append(current_tags)
    
    return sentences, ner_tags

# Preprocess training, validation, and test data
train_sentences, train_ner_tags = preprocess_data(train_df)
val_sentences, val_ner_tags = preprocess_data(val_df)
test_sentences, test_ner_tags = preprocess_data(test_df)


In [5]:
df=ner_df

In [6]:
#Both Conditional Random Fields (CRF) and 
#Bidirectional LSTM with a CRF layer (BiLSTM-CRF) are popular choices for Named Entity Recognition (NER) tasks

In [7]:
# If you have limited training data or prefer an interpretable model with explicit modeling of sequential dependencies, CRF could be a suitable choice.
# If you have abundant training data and prioritize performance over interpretability, BiLSTM-CRF may offer 
# better results by capturing richer contextual information.

# Feature Extraction:

In [16]:
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Function to preprocess the data
def preprocess_data(df):
    sentences = []
    ner_tags = []
    current_sentence = []
    current_tags = []

    for index, row in df.iterrows():
        # Check if the Sentence # column is not NaN
        if not pd.isnull(row["Sentence #"]):
            # If it's not a blank sentence, add the current sentence and its tags to the lists
            if current_sentence:
                sentences.append(current_sentence)
                ner_tags.append(current_tags)
            # Reset current sentence and tags for the new sentence
            current_sentence = []
            current_tags = []
        # Add word and tag to the current sentence and tags
        current_sentence.append(row["Word"])
        current_tags.append(row["Tag"])
    
    # Append the last sentence and tags
    if current_sentence:
        sentences.append(current_sentence)
        ner_tags.append(current_tags)
    
    return sentences, ner_tags


In [17]:
def word_features(sentence, index):
    word = sentence[index]

    # Define features for the word
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if index > 0:
        prev_word = sentence[index - 1]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True  # Beginning of sentence
    
    if index < len(sentence) - 1:
        next_word = sentence[index + 1]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True  # End of sentence
    
    return features

In [18]:
# Function to extract features from a sentence
def sentence_features(sentence):
    return [word_features(sentence, index) for index in range(len(sentence))]

# Function to define labels for a sentence
def sentence_labels(sentence):
    return [label for label in sentence]

In [19]:
#Feature extraction function call
# Extract features and labels for the training data
X_train = [sentence_features(sentence) for sentence in train_sentences]
y_train = train_ner_tags


In [20]:
# Extract features and labels for the validation data
X_val = [sentence_features(sentence) for sentence in val_sentences]
y_val = val_ner_tags


In [21]:
# Extract features and labels for the test data
X_test = [sentence_features(sentence) for sentence in test_sentences]
y_test = test_ner_tags


# Model Training

# Bidirectional LSTM with a CRF layer (BiLSTM-CRF)

In [21]:
#pip install tensorflow keras keras-contrib numpy pandas scikit-learn

In [8]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Tokenization
word_tokenizer = Tokenizer()  # Initialize tokenizer
word_tokenizer.fit_on_texts(train_sentences)  # Fit tokenizer on training sentences

# Convert words to sequences of integers
X_train_seq = word_tokenizer.texts_to_sequences(train_sentences)
X_val_seq = word_tokenizer.texts_to_sequences(val_sentences)
X_test_seq = word_tokenizer.texts_to_sequences(test_sentences)

# Padding
max_seq_length = 100  # Define maximum sequence length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_seq_length, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_seq_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_seq_length, padding='post')

# Label Encoding
label_encoder = LabelEncoder()  # Initialize label encoder
label_encoder.fit([tag for tag_seq in train_ner_tags for tag in tag_seq])  # Fit label encoder on training tags

# Encode NER tags
y_train_encoded = [label_encoder.transform(tag_seq) for tag_seq in train_ner_tags]
y_val_encoded = [label_encoder.transform(tag_seq) for tag_seq in val_ner_tags]
y_test_encoded = [label_encoder.transform(tag_seq) for tag_seq in test_ner_tags]

# Train-Validation Split
# Already split during preprocessing


In [9]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Bidirectional, TimeDistributed, Dense, Dropout, Masking, Lambda
from tensorflow.keras import backend as K

# Define the input shape
input_layer = Input(shape=(max_seq_length,))

# Add an embedding layer to convert input words into dense vectors
embedding_layer = Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=100, input_length=max_seq_length)(input_layer)

# Add a Bidirectional LSTM layer to capture bidirectional context
lstm_layer = Bidirectional(LSTM(units=100, return_sequences=True, dropout=0.5))(embedding_layer)

# Add a TimeDistributed layer to apply a dense layer to each time step
dense_layer = TimeDistributed(Dense(units=len(label_encoder.classes_), activation="softmax"))(lstm_layer)

# Define the model
model = Model(inputs=input_layer, outputs=dense_layer)

# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Display the model summary
model.summary()




In [10]:
# Train the BiLSTM-CRF Model

# Convert NER tags to numpy arrays
y_train_np = pad_sequences(y_train_encoded, maxlen=max_seq_length, padding='post')
y_val_np = pad_sequences(y_val_encoded, maxlen=max_seq_length, padding='post')

# Train the model
history = model.fit(X_train_pad, y_train_np, validation_data=(X_val_pad, y_val_np), epochs=5, batch_size=32, verbose=1)



Epoch 1/5
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 111ms/step - accuracy: 0.9521 - loss: 0.2153 - val_accuracy: 0.9729 - val_loss: 0.1735
Epoch 2/5
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 127ms/step - accuracy: 0.9861 - loss: 0.0489 - val_accuracy: 0.9735 - val_loss: 0.1808
Epoch 3/5
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 126ms/step - accuracy: 0.9878 - loss: 0.0415 - val_accuracy: 0.9737 - val_loss: 0.1825
Epoch 4/5
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 121ms/step - accuracy: 0.9882 - loss: 0.0389 - val_accuracy: 0.9735 - val_loss: 0.1850
Epoch 5/5
[1m1076/1076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 117ms/step - accuracy: 0.9884 - loss: 0.0375 - val_accuracy: 0.9734 - val_loss: 0.1896


# Model Evaluation

In [11]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test_pad, pad_sequences(y_test_encoded, maxlen=max_seq_length, padding='post'), verbose=1)

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.9740 - loss: 0.1852
Test Loss: 0.18620410561561584
Test Accuracy: 0.9740674495697021


In [14]:
import numpy as np
# Predict labels for the test data
predictions = model.predict(X_test_pad)

# Decode the predicted labels
decoded_predictions = []

for i in range(len(predictions)):
    # Get the predicted label indices for the current sequence
    predicted_indices = np.argmax(predictions[i], axis=1)
    
    # Decode the predicted label indices using the label encoder
    predicted_labels = label_encoder.inverse_transform(predicted_indices)
    
    # Add the decoded predictions to the list
    decoded_predictions.append(predicted_labels)

# Print the first few decoded predictions
for i in range(5):
    print("Sentence:", test_sentences[i])
    print("True Labels:",test_ner_tags[i])
    print("Predicted Labels:", decoded_predictions[i])
    print()


[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step
Sentence: ['the', 'investors', 'New', 'his', 'the', 'Earth', 'modest', 'on', 'war', ',', 'officers', 'prompted', 'Sri', 'monthly', 'political', 'former', 'Palestinian', 'eliminate', '60', '.', 'Haiti', 'Jordan', 'under', 'Hezbollah', 'dialogue']
True Labels: ['O', 'O', 'B-org', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'B-geo', 'B-gpe', 'O', 'B-org', 'O']
Predicted Labels: ['O' 'O' 'O' 'O' 'O' 'B-geo' 'O' 'O' 'O' 'O' 'O' 'O' 'B-per' 'O' 'O' 'O'
 'B-gpe' 'O' 'O' 'O' 'B-geo' 'B-gpe' 'O' 'B-org' 'O' 'B-art' 'B-art'
 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art'
 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art'
 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art'
 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art'
 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B-art' 'B

In [15]:
# #  We can adjust the model architecture or hyperparameters, increase the training data, or introduce 
# regularization techniques like dropout. For the Bert model, we can again check proper data preprocessing, 
# including tokenization and padding, and fine-tune the model on domain-specific data to enhance 
# performance.