In [None]:
import ast
import codecs
import json
import matplotlib.pyplot as plt
import numpy as np
from keras.layers import Bidirectional, Dense, InputLayer, Embedding, Activation, LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

# Load tagged sentences from your dataset.
# The context manager closes the file handle as soon as the lines are read
# (a bare codecs.open(...).readlines() leaves it open).
with codecs.open("data.txt", encoding="utf-8") as data_file:
    tagged_sentences = data_file.readlines()

# Preprocess your data: every non-blank line is parsed as a Python literal and
# unzipped into two parallel sequences (words and their tags).
# NOTE(review): assumes each line is a list of (word, tag) pairs — confirm against data.txt.
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    if not tagged_sentence.strip():
        # A blank line (e.g. a trailing newline at end of file) would make
        # ast.literal_eval raise a SyntaxError, so skip it.
        continue
    sentence, tags = zip(*ast.literal_eval(tagged_sentence))
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

# Split your data into train and test sets (80% train / 20% test)
(train_sentences, test_sentences, train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

# Define utility functions
def get_words(sentences):
    """Return the set of distinct words found across all sentences.

    Args:
        sentences: Iterable of sentences, each an iterable of hashable words.

    Returns:
        A ``set`` with every word that occurs in at least one sentence.
    """
    return {token for tokens in sentences for token in tokens}

def get_tags(sentences_tags):
    """Return the set of distinct tags found across all tag sequences.

    Args:
        sentences_tags: Iterable of tag sequences, each an iterable of
            hashable tags.

    Returns:
        A ``set`` with every tag that occurs in at least one sequence.
    """
    unique_tags = set()
    for tag_sequence in sentences_tags:
        unique_tags.update(tag_sequence)
    return unique_tags

# Create vocabulary and tag mappings
words = get_words(sentences)
tags = get_tags(sentence_tags)

# A set of strings has no stable iteration order across interpreter runs, so
# sort before numbering. Otherwise the word/tag indices the saved model was
# trained with cannot be rebuilt in a later session.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0  # index 0 is the value pad_sequences fills with by default
word2index['-OOV-'] = 1  # fallback index used for words missing from the mapping

tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # tag index assigned to padded positions

# Convert sentences and tags to numerical indices
def get_sequences_x(sentences, word2index):
    """Encode each sentence as a list of integer word indices.

    Args:
        sentences: Iterable of sentences, each an iterable of words.
        word2index: Mapping from word to index; must contain ``'-OOV-'``,
            whose index is used for any word not present in the mapping.

    Returns:
        A list with one list of indices per input sentence.
    """
    return [
        [word2index.get(token, word2index['-OOV-']) for token in tokens]
        for tokens in sentences
    ]

def get_sequences_y(tags, tag2index):
    """Encode each tag sequence as a list of integer tag indices.

    Args:
        tags: Iterable of tag sequences, each an iterable of tags.
        tag2index: Mapping from tag to index; must contain ``'-PAD-'``,
            whose index is used for any tag not present in the mapping.

    Returns:
        A list with one list of indices per input tag sequence.
    """
    def encode(label):
        # Unknown tags fall back to the padding index.
        return tag2index.get(label, tag2index['-PAD-'])

    return [[encode(label) for label in tag_sequence] for tag_sequence in tags]

# Encode words and tags as integer index sequences
train_sentences_x, test_sentences_x = (
    get_sequences_x(train_sentences, word2index),
    get_sequences_x(test_sentences, word2index),
)
train_tags_y, test_tags_y = (
    get_sequences_y(train_tags, tag2index),
    get_sequences_y(test_tags, tag2index),
)

# Pad sequences for consistent length: every sequence is padded at the end
# up to the length of the longest training sentence.
MAX_LENGTH = max(map(len, train_sentences_x))
train_sentences_x, test_sentences_x, train_tags_y, test_tags_y = (
    pad_sequences(index_sequences, maxlen=MAX_LENGTH, padding='post')
    for index_sequences in (train_sentences_x, test_sentences_x, train_tags_y, test_tags_y)
)

# Build a custom NER model with a Bidirectional LSTM
model = Sequential()
# Input: one integer word index per position, MAX_LENGTH positions per sentence
model.add(InputLayer(input_shape=(MAX_LENGTH,)))
# Map each of the len(word2index) word indices to a 128-dimensional vector
model.add(Embedding(len(word2index), 128))
# return_sequences=True keeps an output for every position, as needed for per-token tagging
model.add(Bidirectional(LSTM(128, return_sequences=True)))
# One score per tag index at every position, normalised by the softmax below
model.add(Dense(len(tag2index)))
model.add(Activation('softmax'))

# Define a utility function to convert sequences to categorical data
def to_categorical(sequences, categories):
    """One-hot encode sequences of integer class indices.

    Args:
        sequences: Array-like of equal-length sequences of integer indices
            (for example the output of ``pad_sequences``).
        categories: Number of classes, i.e. the length of each one-hot vector.

    Returns:
        A float ``numpy.ndarray`` of shape ``np.shape(sequences) + (categories,)``
        holding 1.0 at each item's index and 0.0 everywhere else.

    Raises:
        IndexError: If an index lies outside ``[-categories, categories)``.
    """
    indices = np.asarray(sequences, dtype=np.intp)
    # Row i of the identity matrix is the one-hot vector for class i, so
    # indexing with the whole index array encodes every position at once
    # instead of looping over tokens in Python.
    return np.eye(categories)[indices]

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

# Train the model; the tag indices are one-hot encoded to match the
# categorical cross-entropy loss, and 20% of the training data is held out
# for validation. Only the per-epoch metrics dict (.history) is kept.
history = model.fit(train_sentences_x, to_categorical(train_tags_y, len(tag2index)),
                    batch_size=32, epochs=10, validation_split=0.2).history

# Save the trained model
model.save("brahui_ner_model.h5")

# Evaluate the model on the held-out test set
scores = model.evaluate(test_sentences_x, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")  # Accuracy

# Use the model for inference
# You can use the model for predicting NER tags on new Brahui text data


In [None]:
import ast
import codecs
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Bidirectional, Dense, InputLayer, Embedding, Activation, LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical, pad_sequences
from sklearn.model_selection import train_test_split

# Load tagged sentences from your dataset.
# The context manager closes the file handle as soon as the lines are read
# (a bare codecs.open(...).readlines() leaves it open).
with codecs.open("data.txt", encoding="utf-8") as data_file:
    tagged_sentences = data_file.readlines()

# Preprocess your data: every non-blank line is parsed as a Python literal and
# unzipped into two parallel lists (words and their tags).
# NOTE(review): assumes each line is a list of (word, tag) pairs — confirm against data.txt.
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    if not tagged_sentence.strip():
        # A blank line (e.g. a trailing newline at end of file) would make
        # ast.literal_eval raise a SyntaxError, so skip it.
        continue
    sentence, tags = zip(*ast.literal_eval(tagged_sentence))
    sentences.append(list(sentence))
    sentence_tags.append(list(tags))

# Split your data into train and test sets (80% train / 20% test)
(train_sentences, test_sentences, train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

# NOTE: the code that builds word2index and tag2index, encodes and pads the sequences,
# and defines the model is omitted here; it must be run before the steps below.

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

# Train the model; the tag indices are one-hot encoded to match the
# categorical cross-entropy loss, and 20% of the training data is held out
# for validation. Only the per-epoch metrics dict (.history) is kept, which
# the plots below read from.
history = model.fit(train_sentences_x, to_categorical(train_tags_y, len(tag2index)),
                    batch_size=32, epochs=10, validation_split=0.2).history

# Save the trained model
model.save("brahui_ner_model.h5")

# Evaluate the model on the held-out test set
scores = model.evaluate(test_sentences_x, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")  # Accuracy

# Plot accuracy and loss graphs side by side in a single figure
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: training & validation accuracy per epoch
accuracy_ax.plot(history['accuracy'])
accuracy_ax.plot(history['val_accuracy'])
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.legend(['Train', 'Validation'], loc='upper left')

# Right panel: training & validation loss per epoch
loss_ax.plot(history['loss'])
loss_ax.plot(history['val_loss'])
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(['Train', 'Validation'], loc='upper right')

plt.show()