In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, TimeDistributed, Bidirectional
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the tab-separated dataset; the file has no header row, so the three
# column names are supplied explicitly.
df = pd.read_csv(
    'dataset2_2.csv',
    sep='\t',
    header=None,
    names=['Sentence', 'Tags', 'Intent'],
)
df

Unnamed: 0,Sentence,Tags,Intent
0,من یک گوشی با صفحه نمایش کوچکتر از 4 می خواهم,O O O O O O O O B-displaySizeless O O,suggest_mobile
1,من یک گوشی با برندی به جز هوآوی می خواهم,O O O O O O O B-brand_not O O,suggest_mobile
2,من یک گوشی با حداکثر کیفیت دوربین 12 مگاپیکسل ...,O O O O O O O B-maincameraless O O O,suggest_mobile
3,من یک گوشی دوربینش حداقل برابر با 25 می خواهم,O O O O O O O B-maincameramore O O,suggest_mobile
4,من یک گوشی غیر آیفون و همچنین رم اش با رمی برا...,O O O O B-brand_not O O O O O O O O B-RAM O O O,suggest_mobile
...,...,...,...
13486,دوست ندارم با قابلیت شارژ سریع باشد,O O O O B-fastCharge_yes_not I-fastCharge_yes_...,suggest_mobile
13487,دوست ندارم دارای قلم باشد,O O O B-pen_yes_not O,suggest_mobile
13488,دوست ندارم با ظاهر شیک باشد,O O O O B-appearanceScore1_not O,suggest_mobile
13489,دوست ندارم دارای پردازنده خوب باشد,O O O O B-chipsetScore1_not O,suggest_mobile


In [None]:
# Pull the raw sentence strings out of the frame as a NumPy object array.
sentences = df.loc[:, 'Sentence'].values
sentences

array(['من یک گوشی با صفحه نمایش کوچکتر از 4 می خواهم',
       'من یک گوشی با برندی به جز هوآوی می خواهم',
       'من یک گوشی با حداکثر کیفیت دوربین 12 مگاپیکسل می خواهم', ...,
       'دوست ندارم با ظاهر شیک باشد',
       'دوست ندارم دارای پردازنده خوب باشد',
       'دوست ندارم مال کشور سنگاپوری باشد'], dtype=object)

In [None]:
# Space-separated BIO tag strings, one string per sentence.
labels = df.loc[:, 'Tags'].values
labels

array(['O O O O O O O O B-displaySizeless O O',
       'O O O O O O O B-brand_not O O',
       'O O O O O O O B-maincameraless O O O', ...,
       'O O O O B-appearanceScore1_not O',
       'O O O O B-chipsetScore1_not O', 'O O O O B-madeIn_not O'],
      dtype=object)

In [None]:
# Tokenize the sentences
# The labels carry exactly one tag per whitespace-separated word, so the
# tokenizer must split on whitespace only. The default `filters` replace
# punctuation with the split character (e.g. "6.5" would become two tokens),
# which would shift the words out of alignment with their tags.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(sentences)
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x7c2c666e3c40>

In [None]:
# Convert every sentence into its list of integer word ids.
X = tokenizer.texts_to_sequences(sentences)
# Preview only the first few sequences instead of dumping the whole corpus
# into the notebook output.
X[:5]

[[5, 7, 2, 1, 40, 69, 325, 4, 104, 8, 17],
 [5, 7, 2, 1, 113, 6, 91, 224, 8, 17],
 [5, 7, 2, 1, 78, 37, 20, 116, 56, 8, 17],
 [5, 7, 2, 220, 55, 200, 1, 268, 8, 17],
 [5, 7, 2, 57, 230, 15, 14, 29, 21, 1, 192, 200, 1, 190, 127, 8, 17],
 [5, 7, 2, 4, 231, 44, 45, 30, 338, 150, 53, 8, 17],
 [5, 7, 2, 4, 58, 231, 15, 14, 57, 86, 8, 17],
 [5, 7, 2, 1, 83, 57, 4, 274, 1, 20, 81, 146, 4, 111, 56, 8, 17],
 [5, 7, 2, 1, 83, 57, 4, 266, 140, 74, 43, 85, 8, 17],
 [5, 7, 2, 57, 287, 30, 38, 39, 20, 105, 4, 111, 56, 8, 17],
 [5, 7, 2, 1, 211, 212, 213, 234, 6, 16, 1, 25, 71, 4, 306, 8, 17],
 [5, 7, 2, 3, 78, 13, 234, 127, 6, 16, 1, 70, 147, 6, 91, 88, 8, 17],
 [5, 7, 2, 13, 112, 111, 294, 8, 17],
 [5, 7, 2, 1, 13, 29, 190, 1, 40, 69, 132, 90, 8, 17],
 [5, 7, 2, 29, 21, 190, 58, 21, 236, 8, 17],
 [5, 7, 2, 1, 29, 71, 4, 87, 115, 15, 14, 1, 78, 76, 19, 261, 8, 17],
 [5, 7, 2, 140, 89, 43, 85, 46, 1, 29, 71, 4, 108, 115, 8, 17],
 [5, 7, 2, 3, 74, 43, 41, 1, 29, 128, 190, 127, 8, 17],
 [5, 7, 2, 3, 89

In [None]:
# Pad sequences
# Pad on the right with 0: the Keras Tokenizer never assigns id 0 to a word,
# and the Embedding layer only accepts ids in [0, input_dim), so a negative
# padding value is not a valid lookup index.
X = pad_sequences(X, padding='post', value=0)
X

array([[  5,   7,   2, ...,   0,   0,   0],
       [  5,   7,   2, ...,   0,   0,   0],
       [  5,   7,   2, ...,   0,   0,   0],
       ...,
       [ 73, 315,   1, ...,   0,   0,   0],
       [ 73, 315,   3, ...,   0,   0,   0],
       [ 73, 315, 123, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Collect the distinct tag names that occur anywhere in the label strings.
all_tags = {tag for label in labels for tag in label.split()}
all_tags

{'B-5G_yes',
 'B-5G_yes_not',
 'B-RAM',
 'B-RAM1',
 'B-RAM1_not',
 'B-RAM2',
 'B-RAM2_not',
 'B-RAM3',
 'B-RAM3_not',
 'B-RAM_not',
 'B-RAMless',
 'B-RAMless_not',
 'B-RAMmore',
 'B-RAMmore_not',
 'B-appearanceScore1',
 'B-appearanceScore1_not',
 'B-batteryC',
 'B-batteryC1',
 'B-batteryC1_not',
 'B-batteryC2',
 'B-batteryC2_not',
 'B-batteryC3',
 'B-batteryC3_not',
 'B-batteryC_not',
 'B-batteryCless',
 'B-batteryCless_not',
 'B-batteryCmore',
 'B-batteryCmore_not',
 'B-batteryLife1',
 'B-batteryLife1_not',
 'B-batteryLife2',
 'B-batteryLife2_not',
 'B-batteryLife3',
 'B-batteryLife3_not',
 'B-brand',
 'B-brand1',
 'B-brand1_not',
 'B-brand2',
 'B-brand2_not',
 'B-brand_not',
 'B-chipsetScore1',
 'B-chipsetScore1_not',
 'B-chipsetScore2',
 'B-chipsetScore2_not',
 'B-color',
 'B-color1',
 'B-color1_not',
 'B-color2',
 'B-color2_not',
 'B-color_not',
 'B-displaySize',
 'B-displaySize1',
 'B-displaySize1_not',
 'B-displaySize2',
 'B-displaySize2_not',
 'B-displaySize3',
 'B-displaySize3_

In [None]:
# Count how often each tag id occurs in the one-hot encoded test labels.
# y_test is only created by the train/test split cell further down, so this
# cell is guarded to keep a top-to-bottom "Run All" from failing here.
# NOTE(review): consider moving this cell below the split cell.
if 'y_test' in globals():
    unique, counts = np.unique(y_test.argmax(axis=-1), return_counts=True)
    print(dict(zip(unique, counts)))
else:
    print("y_test is not defined yet - run the train/test split cell first.")

In [None]:
# Map every tag name to an integer id.
# Iterating a set of strings yields an order that can change between Python
# sessions (string hash randomisation), which would make the ids - and hence
# the meaning of a saved model's output units - irreproducible. Sorting the
# tags first makes the mapping deterministic.
# NOTE(review): id 0 is still a real tag while 0 is also the conventional
# padding value; reserving 0 for padding would additionally require one more
# output class wherever len(all_tags) is used as the class count.
tag_encoder = {tag: i for i, tag in enumerate(sorted(all_tags))}
tag_encoder

{'B-maincameraless_not': 0,
 'B-pen_yes_not': 1,
 'B-5G_yes_not': 2,
 'B-color1': 3,
 'B-memorycard_yes_not': 4,
 'I-5G_yes_not': 5,
 'B-weight_not': 6,
 'B-madeIn': 7,
 'I-brand1_not': 8,
 'B-intermemory2_not': 9,
 'B-weight1': 10,
 'B-maincameramore': 11,
 'B-displaySize1_not': 12,
 'B-weight1_not': 13,
 'B-batteryLife3_not': 14,
 'B-intermemory_not': 15,
 'I-batteryC3_not': 16,
 'B-displaySize1': 17,
 'B-os': 18,
 'B-RAM3': 19,
 'B-price_not': 20,
 'I-priceless': 21,
 'B-brand2_not': 22,
 'I-selficamera3': 23,
 'B-color_not': 24,
 'I-maincamera3_not': 25,
 'B-maincamera2_not': 26,
 'B-weight': 27,
 'I-batteryC3': 28,
 'B-RAM1_not': 29,
 'B-displaySize2': 30,
 'B-intermemorymore_not': 31,
 'B-weight2': 32,
 'I-maincamera2': 33,
 'B-color2_not': 34,
 'I-maincamera1': 35,
 'B-brand': 36,
 'B-price3': 37,
 'I-memorycard_yes_not': 38,
 'B-color1_not': 39,
 'I-price': 40,
 'B-batteryC_not': 41,
 'B-displaySize': 42,
 'B-RAMless': 43,
 'B-pricemore_not': 44,
 'B-batteryC1_not': 45,
 'B-bra

In [None]:
# Translate each label string into its list of integer tag ids.
y = [[tag_encoder[tag] for tag in label.split()] for label in labels]
# Preview only the first few rows instead of printing every encoded sentence.
y[:5]

[[63, 63, 63, 63, 63, 63, 63, 63, 98, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 144, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 143, 63, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 11, 63, 63],
 [63, 63, 63, 63, 144, 63, 63, 63, 63, 63, 63, 63, 63, 106, 63, 63, 63],
 [63, 63, 63, 63, 36, 63, 63, 63, 63, 77, 40, 63, 63],
 [63, 63, 63, 63, 63, 36, 63, 63, 63, 149, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 24, 63, 63, 63, 63, 63, 63, 143, 63, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 24, 63, 75, 63, 63, 63, 63],
 [63, 63, 63, 63, 24, 63, 63, 63, 63, 63, 63, 143, 63, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 51, 63, 63, 63, 63, 63, 63, 151, 63, 63],
 [63, 63, 63, 63, 63, 63, 52, 63, 63, 63, 63, 63, 63, 63, 63, 149, 63, 63],
 [63, 63, 63, 63, 63, 52, 63, 7, 63, 63],
 [63, 63, 63, 63, 63, 63, 106, 63, 63, 63, 63, 63, 42, 63, 63],
 [63, 63, 63, 63, 63, 106, 63, 63, 63, 36, 63, 63],
 [63, 63, 63, 63, 63, 63, 63, 117, 63, 63, 63, 63, 63, 63, 63, 184, 63, 63],
 [63, 63, 63, 63, 75, 63, 63, 63, 63, 63, 63, 63, 117, 6

In [None]:
# Pad label sequences
# Pad with 0 so that padded label positions match the "padding == 0"
# assumption used when the predictions are masked during evaluation. A value
# of -1 would be turned into the one-hot vector of the *last* class by
# to_categorical (negative NumPy indexing) instead of marking padding.
# NOTE(review): 0 is also the id of a real tag in tag_encoder, so padded
# positions are indistinguishable from that tag in the label space.
y = pad_sequences(y, padding='post', value=0)
y

array([[63, 63, 63, ...,  0,  0,  0],
       [63, 63, 63, ...,  0,  0,  0],
       [63, 63, 63, ...,  0,  0,  0],
       ...,
       [63, 63, 63, ...,  0,  0,  0],
       [63, 63, 63, ...,  0,  0,  0],
       [63, 63, 63, ...,  0,  0,  0]], dtype=int32)

In [None]:
# One-hot encode the labels
# Every integer tag id in y becomes a vector of length len(all_tags), the
# target format expected by the categorical_crossentropy loss the model is
# compiled with.
# NOTE(review): y is overwritten in place; re-running only this cell would
# one-hot encode the already encoded array - re-run from the label-encoding
# cell instead.
y = tf.keras.utils.to_categorical(y, num_classes=len(all_tags))
y

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [None]:
# Split the dataset into training and testing sets
# A fixed random_state makes the split reproducible, so that later cells
# which inspect hard-coded test indices keep referring to the same samples
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Inspect the padded training inputs (integer word ids, one row per sentence).
X_train

array([[ 62, 235, 334, ...,   0,   0,   0],
       [ 71,   4,  90, ...,   0,   0,   0],
       [155,  50,  12, ...,   0,   0,   0],
       ...,
       [ 62, 295, 153, ...,   0,   0,   0],
       [ 11,  65,   2, ...,   0,   0,   0],
       [ 11,  65,   7, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Inspect the padded test inputs (integer word ids, one row per sentence).
X_test

array([[154,   7,  61, ...,   0,   0,   0],
       [160,   4,  34, ...,   0,   0,   0],
       [158,  63,   7, ...,   0,   0,   0],
       ...,
       [ 52,   8, 102, ...,   0,   0,   0],
       [162,   7,   2, ...,   0,   0,   0],
       [ 73, 315,   1, ...,   0,   0,   0]], dtype=int32)

In [None]:
# Inspect the one-hot encoded training labels.
y_train

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.

In [None]:
# One-hot label vector of the first token of the first test sentence.
y_test[0][0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Number of label '0' occurrences (including padding): 347682
There is a label with encoding 0 in y_test.


In [None]:
# Define the GRU model
# An explicit Input layer fixes the sequence length. The Embedding argument
# `input_length` is deprecated in Keras 3 and is ignored there, which leaves
# the model unbuilt when summary() is called.
model = Sequential([
    tf.keras.Input(shape=(X.shape[1],), dtype='int32'),
    # +1 because Tokenizer word ids start at 1 and id 0 is kept for padding.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128),
    # return_sequences=True keeps one output vector per token, as required
    # for per-token tagging.
    Bidirectional(GRU(128, return_sequences=True)),
    # Per-token softmax over all tag classes.
    TimeDistributed(Dense(len(all_tags), activation='softmax')),
])

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels.
# NOTE(review): padded positions are included in loss and accuracy, so the
# reported accuracy is dominated by padding; consider mask_zero=True on the
# Embedding layer once padding uses id 0 everywhere.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Fit the tagger; the held-out test split doubles as the validation set.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - accuracy: 0.8597 - loss: 0.9356 - val_accuracy: 0.9558 - val_loss: 0.2055
Epoch 2/10
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9633 - loss: 0.1628 - val_accuracy: 0.9752 - val_loss: 0.0927
Epoch 3/10
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9804 - loss: 0.0749 - val_accuracy: 0.9842 - val_loss: 0.0508
Epoch 4/10
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.9870 - loss: 0.0409 - val_accuracy: 0.9873 - val_loss: 0.0376
Epoch 5/10
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9894 - loss: 0.0292 - val_accuracy: 0.9873 - val_loss: 0.0331
Epoch 6/10
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9902 - loss: 0.0249 - val_accuracy: 0.9878 - val_loss: 0.0300
Epoch 7/10
[1m338/33

In [None]:
# Invert the tokenizer vocabulary (word -> id) into an id -> word lookup and
# show which word is stored under id 10.
index_to_word = dict(
    (token_id, token) for token, token_id in tokenizer.word_index.items()
)
word = index_to_word.get(10, None)
print(word)

باشد


In [None]:
# One-hot label vector of token 5 in test sample 2698.
print(y_test[2698, 5])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
def predict_tags(sentence):
    """Predict one tag name per whitespace-separated word of ``sentence``.

    The sentence is converted to word ids with the fitted ``tokenizer``,
    padded to the training sequence length ``X.shape[1]`` and passed through
    ``model``.

    Args:
        sentence: Raw input sentence as a single string.

    Returns:
        A list of tag-name strings, cut to ``len(sentence.split())`` entries
        so that padded positions are not reported.
    """
    # Tokenize and pad the input sentence
    sequence = tokenizer.texts_to_sequences([sentence])
    # Pad with 0 (the id the Tokenizer never assigns to a word; negative ids
    # are not valid Embedding indices) and must match the padding used for X.
    # truncating='post' keeps the *first* words of an over-long sentence, so
    # the returned tags stay aligned with the words they belong to; the
    # default would silently drop the leading words instead.
    padded_sequence = pad_sequences(
        sequence,
        maxlen=X.shape[1],
        padding='post',
        truncating='post',
        value=0,
    )

    # Make predictions
    prediction = model.predict(padded_sequence)

    # Convert predictions to tag ids (most probable class per position)
    predicted_ids = np.argmax(prediction[0], axis=-1)

    # Convert indices to tag names; the reverse mapping is built once per call
    # instead of rebuilding key/value lists for every token.
    index_to_tag_name = {index: tag for tag, index in tag_encoder.items()}
    predicted_tags = [index_to_tag_name[int(index)] for index in predicted_ids]

    # Return only the tags corresponding to the non-padding tokens
    return predicted_tags[:len(sentence.split())]  # Only return tags for actual words

# Smoke-test the tagger on a sample sentence (the first sentence of the
# dataset) and print the predicted tag for each word.
sentence = "من یک گوشی با صفحه نمایش کوچکتر از 4 می خواهم"
predicted_tags = predict_tags(sentence)
print(predicted_tags)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 695ms/step
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-displaySizeless', 'O', 'O']


In [None]:

# Predict tags for the entire test set at once
y_pred_prob = model.predict(X_test)

# Convert probabilities to predicted tag indices
y_pred_indices = np.argmax(y_pred_prob, axis=-1)

# Convert the one-hot encoded y_test to indices for true tags
y_true_indices = np.argmax(y_test, axis=-1)

# Remove padding from both predictions and true labels.
# The mask is derived from the *inputs*: Tokenizer word ids start at 1, so a
# position holds a real word exactly when its id is positive. Masking on the
# label value 0 instead would also discard every real token whose tag id
# happens to be 0.
# NOTE(review): this assumes each sentence has as many word ids as tags.
mask = X_test > 0

# Boolean indexing already yields flat 1-D arrays.
y_pred_filtered = y_pred_indices[mask]
y_true_filtered = y_true_indices[mask]

# Convert indices to tag names for both predictions and true labels.
# tag_encoder assigns ids in insertion order, so the position in the key list
# equals the tag id; the list is built once rather than once per token.
tag_names = list(tag_encoder)
y_pred_tags = [tag_names[index] for index in y_pred_filtered]
y_true_tags = [tag_names[index] for index in y_true_filtered]

[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [None]:
# Sequence length (time steps) of one predicted row; the empty subscript in
# the previous version was a syntax error.
len(y_pred_indices[1])

43

In [None]:
# Sequence length (time steps) of one row of true tag ids.
y_true_indices[1].shape[0]

43

In [None]:
# Word ids of test sample 2698 (trailing entries are padding).
X_test[2698]

array([ 73, 315,   1,  13,  47, 133,  10,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0], dtype=int32)

In [None]:
# Predicted tag ids for test sample 2698, one per time step (padding included).
y_pred_indices[2698]

array([63, 63, 63, 63, 63, 99, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
# Preview the first true tag names instead of dumping the full list.
y_true_tags[:20]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-fastCharge_yes',
 'I-fastCharge_yes',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-intermemory3',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-displaySize',
 'O',
 'O',
 'O',
 'B-brand',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-weight2',
 'O',
 'O',
 'O',
 'O',
 'B-selficamera3',
 'I-selficamera3',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-memorycard_yes',
 'I-memorycard_yes',
 'O',
 'B-5G_yes',
 'O',
 'O',
 'B-nsim',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-price1',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-displaySizemore',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-pen_yes',
 'O',
 'O',
 'O',
 'O',
 'B-batteryLife2',
 'O',
 'O',
 'B-madeIn',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-priceless',
 'I-priceless',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-nsim',
 'O',
 'O',
 'O',
 'O',
 'B-fastCharge_yes',
 'I-fastCh

In [None]:
# Get unique classes in y_true_tags and y_pred_tags
unique_classes = set(y_true_tags) | set(y_pred_tags)
class_labels = sorted(unique_classes)

# Generate classification report
# zero_division=0 reports 0.0 for classes that are never predicted (or never
# occur) without emitting one UndefinedMetricWarning per affected metric; the
# reported numbers are the same as with the default setting.
print("Classification Report:")
print(classification_report(
    y_true_tags,
    y_pred_tags,
    labels=class_labels,
    target_names=class_labels,
    zero_division=0,
))

# Calculate and print overall accuracy
accuracy = accuracy_score(y_true_tags, y_pred_tags)
print(f"Overall Accuracy: {accuracy:.4f}")


Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                        precision    recall  f1-score   support

              B-5G_yes       0.97      0.93      0.95       156
          B-5G_yes_not       1.00      1.00      1.00         2
                 B-RAM       0.96      0.96      0.96        74
                B-RAM1       0.90      0.88      0.89        51
            B-RAM1_not       1.00      1.00      1.00         3
                B-RAM2       0.92      0.92      0.92        38
            B-RAM2_not       1.00      1.00      1.00         3
                B-RAM3       0.98      0.92      0.95        52
             B-RAM_not       1.00      1.00      1.00         1
             B-RAMless       0.95      0.93      0.94        85
         B-RAMless_not       1.00      1.00      1.00         1
             B-RAMmore       0.97      0.85      0.91        68
    B-appearanceScore1       0.91      0.93      0.92       228
B-appearanceScore1_not       1.00      1.00      1.00         8
            B-batteryC       0.92      

In [None]:
# Save the model for later use
# NOTE(review): only the network is written here. Decoding its predictions in
# a new session also needs the fitted tokenizer vocabulary and the tag_encoder
# mapping, neither of which is persisted by this cell - confirm they are
# stored elsewhere.
model.save('gru_bio_tagger_2.h5')



In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Convert index-based predictions to tag names
def index_to_tag(index_seq):
    """Translate a sequence of integer tag ids into their tag names.

    Args:
        index_seq: Iterable of integer tag ids (Python or NumPy integers).

    Returns:
        A list with the tag name for every id, in the same order.

    Raises:
        KeyError: If an id is not present in ``tag_encoder``.
    """
    # Build the reverse mapping once per call instead of materialising the
    # key and value lists of tag_encoder for every single id.
    index_to_tag_name = {index: tag for tag, index in tag_encoder.items()}
    return [index_to_tag_name[int(idx)] for idx in index_seq]

# Generate predictions for the entire test set
# A single batched predict call replaces one model.predict call per sentence
# (and the id -> text -> id round trip that fed it); the rows of X_test are
# already the padded sequences the model expects.
pred_index_matrix = model.predict(X_test).argmax(axis=-1)
true_index_matrix = y_test.argmax(axis=-1)  # Convert one-hot to index

# Number of real (non-padding) tokens per sentence: Tokenizer word ids start
# at 1, so every positive id is an actual word.
token_counts = (X_test > 0).sum(axis=1)

y_pred = []
y_true = []
for pred_row, true_row, n_tokens in zip(pred_index_matrix, true_index_matrix, token_counts):
    # Keep only the positions that belong to real words of this sentence.
    y_pred.extend(index_to_tag(pred_row[:n_tokens]))
    y_true.extend(index_to_tag(true_row[:n_tokens]))

# Restrict the evaluation to tokens whose TRUE tag is an entity tag.
# Padding has already been cut off above; 'O' is the non-entity tag.
# NOTE(review): tokens that are truly 'O' but predicted as an entity are
# dropped by this filter, so false positives on 'O' tokens are not counted.
non_pad_indices = [i for i, tag in enumerate(y_true) if tag != 'O']

y_pred_filtered = [y_pred[i] for i in non_pad_indices]
y_true_filtered = [y_true[i] for i in non_pad_indices]

# Get unique classes in y_true and y_pred
all_classes = set(y_true_filtered) | set(y_pred_filtered)
class_labels = sorted(all_classes)

# Generate classification report
# zero_division=0 avoids a warning for every class without predicted or true
# samples while reporting the same 0.0 values.
print("Classification Report:")
print(classification_report(
    y_true_filtered,
    y_pred_filtered,
    labels=class_labels,
    target_names=class_labels,
    zero_division=0,
))

# Calculate and print overall accuracy
accuracy = accuracy_score(y_true_filtered, y_pred_filtered)
print(f"Overall Accuracy: {accuracy:.4f}")
