# Data
Dataset:
Each row in the CSV file is a complete sentence, list of POS tags for each word in the sentence, and list of NER tags for each word in the sentence

In [24]:
import pandas as pd
import ast
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

In [25]:
df = pd.read_csv('test.csv',)
print(df.shape)
df.head(5)

(1046, 4)


Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [26]:
# Preprocess the data
sentences = df['Sentence'].tolist()
tags = df['Tag'].tolist()

In [27]:
print(sentences[:5])
print(tags[:5])

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "', 'They marched from the Houses of Parliament to a rally in Hyde Park .', 'Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .', "The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton ."]
["['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']", "['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']", "['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O']", 

## BiLSTM

In [28]:
# Create a vocabulary and word-to-index mapping
words = set()
for sentence in sentences:
    for word in sentence.split():
        words.add(word.lower())
word2idx = {w: i + 1 for i, w in enumerate(words)}

In [29]:
# Convert sentences and tags to numerical representations
X = [[word2idx[word.lower()] for word in sentence.split()] for sentence in sentences]

In [61]:
df['Tag']

0       ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '...
1       ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...
2       ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...
3       ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...
4       ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...
                              ...                        
1041    ['O', 'O', 'O', 'O', 'B-nat', 'O', 'O', 'O', '...
1042    ['B-nat', 'I-nat', 'O', 'O', 'O', 'B-geo', 'B-...
1043    ['B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...
1044    ['O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', '...
1045     ['O', 'O', 'B-gpe', 'O', 'O', 'B-gpe', 'O', 'O']
Name: Tag, Length: 1046, dtype: object

In [30]:
# Convert string representations to lists
tags = [ast.literal_eval(tag) for tag in df['Tag']]

# Create tag to index mapping
tag2idx = {tag: idx for idx, tag in enumerate(set([tag for tags in tags for tag in tags]))}

# Convert tags to indices
y = [[tag2idx[tag] for tag in tags] for tags in tags]

In [63]:
y

array([[14, 14, 14, ...,  0,  0,  0],
       [14, 14, 14, ...,  0,  0,  0],
       [14, 14, 14, ...,  0,  0,  0],
       ...,
       [ 7, 14, 14, ...,  0,  0,  0],
       [14, 14, 14, ...,  0,  0,  0],
       [14, 14, 13, ...,  0,  0,  0]])

In [31]:
# Pad sequences to a maximum length
max_len = max(len(sentence) for sentence in X)
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create idx2tag mapping
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Convert labels to one-hot encoded format
num_classes = len(tag2idx)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val = tf.keras.utils.to_categorical(y_val, num_classes=num_classes)

In [32]:
X

array([[1251, 2453, 1931, ...,    0,    0,    0],
       [1953, 2453, 2460, ...,    0,    0,    0],
       [3072, 1490,  929, ...,    0,    0,    0],
       ...,
       [1241, 3078,  102, ...,    0,    0,    0],
       [1884, 4291, 3777, ...,    0,    0,    0],
       [3741, 2165,  691, ...,    0,    0,    0]])

In [33]:
# Define the model architecture
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=len(word2idx) + 1, output_dim=64, input_length=max_len))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_classes, activation='softmax')))

# Compile the model
loss = tf.keras.losses.CategoricalCrossentropy()
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32)

Epoch 1/10




[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 98ms/step - accuracy: 0.6313 - loss: 1.7205 - val_accuracy: 0.9495 - val_loss: 0.3163
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - accuracy: 0.9481 - loss: 0.3066 - val_accuracy: 0.9495 - val_loss: 0.2706
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 66ms/step - accuracy: 0.9475 - loss: 0.2707 - val_accuracy: 0.9495 - val_loss: 0.2543
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - accuracy: 0.9466 - loss: 0.2613 - val_accuracy: 0.9495 - val_loss: 0.2444
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 61ms/step - accuracy: 0.9493 - loss: 0.2395 - val_accuracy: 0.9495 - val_loss: 0.2325
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.9455 - loss: 0.2367 - val_accuracy: 0.9495 - val_loss: 0.2150
Epoch 7/10
[1m27/27[0m [32m━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1ff82eded50>

In [49]:
# Predict entities in new text
test_sentence = "Thousands of protesters gathered in New York City"
test_sequence = [word2idx.get(word.lower(), 0) for word in test_sentence.split(" ")]
test_sequence = pad_sequences([test_sequence], maxlen=max_len, padding='post')
predictions = model.predict(test_sequence)

# Convert predicted entity tags to their original labels
predicted_tags = np.argmax(predictions, axis=-1)[0]
predicted_labels = [idx2tag[idx] for idx in predicted_tags]
print(predicted_labels)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim', 'I-tim']


## Transformer


In [35]:
# from sklearn.svm import SVC
# from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# import nltk
# from nltk.tokenize import word_tokenize, sent_tokenize

In [37]:
# nltk.download('punkt')

In [38]:
# import torch
# import transformers
# from transformers import BertTokenizer, BertForTokenClassification
# from torch.utils.data import TensorDataset, DataLoader

# # Tokenize texts
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [39]:
# # Convert string representations to lists
# tags = [ast.literal_eval(tag) for tag in df['Tag']]

# # Create tag to index mapping
# tag2idx = {tag: idx for idx, tag in enumerate(set([tag for tags in tags for tag in tags]))}

# # Convert tags to indices
# labels = [[tag2idx[tag] for tag in tags] for tags in tags]

In [40]:
# # Convert tokens to input IDs and attention masks
# input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_texts]

# # Pad input sequences to a fixed length
# max_length = max(len(sent) for sent in input_ids)
# input_ids = pad_sequences(input_ids, maxlen=max_length, dtype="long", value=0, truncating="post", padding="post")

# # Convert attention masks
# attention_masks = [[1] * len(sent) for sent in input_ids]


In [41]:
# # Pad labels to match the sequence length
# labels = pad_sequences(labels, maxlen=max_length, dtype="long", value=0, truncating="post", padding="post")

# # Convert labels to tensors
# labels = torch.tensor(labels)

In [42]:
# # Create dataset and dataloader
# dataset = TensorDataset(torch.tensor(input_ids), torch.tensor(attention_masks), labels)
# dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [43]:
# # Load pre-trained BERT model for token classification
# model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag2idx))

# # Set device to GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

In [44]:
# # Set optimizer and loss function
# optimizer = transformers.AdamW(model.parameters(), lr=2e-5)
# loss_function = torch.nn.CrossEntropyLoss()

# # Training loop
# for epoch in range(5):
#     model.train()
#     total_loss = 0

#     for batch in dataloader:
#         input_ids, attention_masks, labels = batch
#         input_ids = input_ids.to(device)
#         attention_masks = attention_masks.to(device)
#         labels = labels.to(device)

#         outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
#         loss = outputs.loss
#         total_loss += loss.item()

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#     avg_loss = total_loss / len(dataloader)
#     print(f"Epoch {epoch+1} - Average Loss: {avg_loss}")

# HMM