In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, TimeDistributed, Bidirectional, Dropout, SpatialDropout1D
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

def load_conll_data(filepath):
    sentences = []
    sentence = []
    
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                word, pos, chunk, ner = line.strip().split()
                sentence.append((word, pos, ner))
    if sentence:
        sentences.append(sentence)
        
    return sentences

train_file = r"D:\SIH_POC_MODEL\Nikunj Dataset Train.txt"
test_file = r"D:\SIH_POC_MODEL\Nikunj Dataset Test.txt"  

train_sentences = load_conll_data(train_file)
test_sentences = load_conll_data(test_file)

train_words = [[word for word, _, ner in sentence] for sentence in train_sentences]
train_labels = [[ner for _, _, ner in sentence] for sentence in train_sentences]

test_words = [[word for word, _, ner in sentence] for sentence in test_sentences]
test_labels = [[ner for _, _, ner in sentence] for sentence in test_sentences]

all_words = set([word for sentence in train_words for word in sentence])
all_tags = set([tag for tags in train_labels for tag in tags])

word2idx = {word: idx + 2 for idx, word in enumerate(all_words)}
word2idx['PAD'] = 0
word2idx['UNK'] = 1

tag2idx = {tag: idx + 1 for idx, tag in enumerate(all_tags)}
tag2idx['PAD'] = 0

idx2word = {idx: word for word, idx in word2idx.items()}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

MAX_LEN = 50

def preprocess(sentences, labels):
    X = [[word2idx.get(word, word2idx['UNK']) for word in sentence] for sentence in sentences]
    X = pad_sequences(X, maxlen=MAX_LEN, padding='post')

    y = [[tag2idx[tag] for tag in label] for label in labels]
    y = pad_sequences(y, maxlen=MAX_LEN, padding='post')

    return X, y

X_train, y_train_raw = preprocess(train_words, train_labels)
X_test, y_test_raw = preprocess(test_words, test_labels)

flattened_labels = np.concatenate(y_train_raw)
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(flattened_labels), y=flattened_labels)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

y_train = [to_categorical(i, num_classes=len(tag2idx)) for i in y_train_raw]
y_train = np.array(y_train)

y_test = [to_categorical(i, num_classes=len(tag2idx)) for i in y_test_raw]
y_test = np.array(y_test)

def create_sample_weights(y_raw, class_weight_dict):
    sample_weights = np.ones((y_raw.shape[0], y_raw.shape[1]))
    
    for i in range(y_raw.shape[0]):
        for j in range(y_raw.shape[1]):
            tag_idx = y_raw[i][j]
            if tag_idx != tag2idx['PAD']:
                sample_weights[i][j] = class_weight_dict[tag_idx]
    
    return sample_weights

sample_weights = create_sample_weights(y_train_raw, class_weight_dict)

model = Sequential([
    Embedding(input_dim=len(word2idx), output_dim=100, input_length=MAX_LEN),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(len(tag2idx), activation="softmax"))
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.1, verbose=1, sample_weight=sample_weights)

y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, axis=-1)

y_pred_flat = y_pred.flatten()
y_test_flat = y_test_true.flatten()

mask = y_test_flat != tag2idx['PAD']
y_pred_flat = y_pred_flat[mask]
y_test_flat = y_test_flat[mask]

precision = precision_score(y_test_flat, y_pred_flat, average='weighted')
recall = recall_score(y_test_flat, y_pred_flat, average='weighted')
f1 = f1_score(y_test_flat, y_pred_flat, average='weighted')

print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")

print(classification_report(y_test_flat, y_pred_flat, target_names=list(tag2idx.keys())))


Epoch 1/5




[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 71ms/step - accuracy: 0.7818 - loss: 1.8739 - val_accuracy: 0.9458 - val_loss: 0.7508
Epoch 2/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 66ms/step - accuracy: 0.9677 - loss: 0.3715 - val_accuracy: 0.9710 - val_loss: 0.5719
Epoch 3/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 63ms/step - accuracy: 0.9901 - loss: 0.0975 - val_accuracy: 0.9746 - val_loss: 0.5435
Epoch 4/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 63ms/step - accuracy: 0.9947 - loss: 0.0430 - val_accuracy: 0.9757 - val_loss: 0.6055
Epoch 5/5
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 64ms/step - accuracy: 0.9966 - loss: 0.0235 - val_accuracy: 0.9785 - val_loss: 0.6513
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step
Precision: 93.45%
Recall: 89.58%
F1-Score: 91.03%


ValueError: Number of classes, 9, does not match size of target_names, 10. Try specifying the labels parameter