In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import InputLayer, Embedding, Bidirectional, LSTM, SpatialDropout1D, TimeDistributed, Dense
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv(r"D:\f\Level 4\Second term\NLP\project\project\ANERCORP.csv", names=["Word", "Tag"], encoding="utf-8")

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148563 entries, 0 to 148562
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Word    148556 non-null  object
 1   Tag     148563 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [4]:
data.head(10)

Unnamed: 0,Word,Tag
0,فرانكفورت,B-LOC
1,(د,O
2,ب,O
3,أ),O
4,أعلن,O
5,اتحاد,B-ORG
6,صناعة,I-ORG
7,السيارات,I-ORG
8,في,O
9,ألمانيا,B-LOC


In [5]:
unique_tags = data["Tag"].unique()
print(unique_tags)

['B-LOC' 'O' 'B-ORG' 'I-ORG' 'B-PERS' 'I-PERS' 'I-LOC' 'B-MISC' 'I-MISC']


In [6]:
sentences = []
current_sentence = []

for index, row in data.iterrows():
    word = row[0]
    tag = row[1]

    if pd.notna(word) and pd.notna(tag):
        current_sentence.append((word, tag))
        if word == '.':
            sentences.append(current_sentence)
            current_sentence = []
    else:
        if current_sentence:
            sentences.append(current_sentence)
        current_sentence = []


  word = row[0]
  tag = row[1]


In [7]:
print("the total number of sentences: ",len(sentences))

the total number of sentences:  4876


In [8]:
words = list(set(data["Word"].values))
tags = list(set(data["Tag"].values))

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

tag2idx = {t: i for i, t in enumerate(tags)}

idx2word = {i: w for w, i in word2idx.items()}
idx2tag = {i: t for t, i in tag2idx.items()}

In [9]:
import pickle

with open("word2idx_2.pkl", "wb") as f:
    pickle.dump(word2idx, f)

with open("idx2tag_2.pkl", "wb") as f:
    pickle.dump(idx2tag, f)

In [10]:
max_len = 50
X = [[word2idx.get(w[0], word2idx["UNK"]) for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
y = np.expand_dims(y, -1)

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("training samples count: ", len(x_train))
print("testing samples count: ",len(x_test))

training samples count:  3900
testing samples count:  976


In [12]:
model = keras.Sequential([
    InputLayer(input_shape=(max_len,)),
    Embedding(input_dim=len(word2idx), output_dim=64, input_length=max_len),
    SpatialDropout1D(0.1),
    Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(len(tag2idx), activation="softmax"))
])



In [13]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [37]:
callbacks = [
    ModelCheckpoint("arabic_ner_model.keras", save_best_only=True, monitor="val_loss"),
    EarlyStopping(monitor="val_accuracy", patience=2, restore_best_weights=True)
]

In [38]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    batch_size=32,
    epochs=10,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/10


[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 153ms/step - accuracy: 0.9929 - loss: 0.0261 - val_accuracy: 0.9687 - val_loss: 0.1416
Epoch 2/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 152ms/step - accuracy: 0.9952 - loss: 0.0191 - val_accuracy: 0.9719 - val_loss: 0.1386
Epoch 3/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 153ms/step - accuracy: 0.9963 - loss: 0.0145 - val_accuracy: 0.9683 - val_loss: 0.1484
Epoch 4/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 151ms/step - accuracy: 0.9973 - loss: 0.0113 - val_accuracy: 0.9720 - val_loss: 0.1518
Epoch 5/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 150ms/step - accuracy: 0.9977 - loss: 0.0090 - val_accuracy: 0.9684 - val_loss: 0.1581
Epoch 6/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 154ms/step - accuracy: 0.9982 - loss: 0.0078 - val_accuracy: 0.9727 - val_loss: 0.1640
Epoch 7/10
[1m122/12

In [39]:
print("\nEvaluating on test set:")
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Evaluating on test set:
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.9750 - loss: 0.1488
Test Loss: 0.1640
Test Accuracy: 0.9727


In [40]:
num_examples = 4
indices = np.random.choice(range(2, x_test.shape[0]), size=num_examples, replace=False)

for i in indices:
    sample_input = x_test[i:i+1]
    pred = model.predict(sample_input)
    pred = np.argmax(pred, axis=-1)[0]
    true = y_test[i].flatten()

    print("\nExample Index:", i)
    print("{:15} {:8} {:8}".format("Word", "True", "Pred"))
    print("-" * 40)
    for w, t, p in zip(sample_input[0], true, pred):
        if w != word2idx["PAD"]:
            print("{:15} {:8} {:8}".format(idx2word[w], idx2tag[t], idx2tag[p]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step

Example Index: 523
Word            True     Pred    
----------------------------------------
وشن             O        O       
المندوب         O        O       
الإسرائيلي      O        O       
دان             B-PERS   B-PERS  
غيلرمان         I-PERS   I-PERS  
هجوما           O        O       
عنيفا           O        O       
على             O        O       
أمين            O        O       
الأمم           B-ORG    B-ORG   
المتحدة         I-ORG    I-ORG   
لأن             O        O       
تقريره          O        O       
لم              O        O       
يشر             O        O       
إلى             O        O       
الإرهاب         O        O       
وإيران          B-LOC    B-LOC   
وسوريا          B-LOC    B-LOC   
.               O        O       
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step

Example Index: 673
Word            True     Pred    
--------------------------

In [41]:
def predict_sentence(sentence, model=model, word2idx=word2idx, idx2tag=idx2tag, max_len=50):
    tokens = sentence.split()  # or use a tokenizer if needed
    input_ids = [word2idx.get(w, word2idx.get("UNK", 1)) for w in tokens]
    
    padded_input = input_ids + [word2idx["PAD"]] * (max_len - len(input_ids))
    padded_input = np.array(padded_input).reshape(1, max_len)

    pred = model.predict(padded_input)
    pred = np.argmax(pred, axis=-1)[0]

    tags = [idx2tag[idx] for idx in pred[:len(tokens)]]

    return list(zip(tokens, tags))

In [43]:
predict_sentence("ولد بشمهندس عمر طارق في مصر")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step


[('ولد', 'O'),
 ('بشمهندس', 'O'),
 ('عمر', 'B-PERS'),
 ('طارق', 'I-PERS'),
 ('في', 'O'),
 ('مصر', 'B-LOC')]

In [45]:
predict_sentence("عقد اليوم اجتماع الامم المتحدة في المملكة العربية السعودية بحضور كل من الرئيس المصري عبد الفتاح السيسي و الرئيس الامريكي دونالد ترامب")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step


[('عقد', 'O'),
 ('اليوم', 'O'),
 ('اجتماع', 'O'),
 ('الامم', 'B-ORG'),
 ('المتحدة', 'I-ORG'),
 ('في', 'O'),
 ('المملكة', 'B-LOC'),
 ('العربية', 'I-LOC'),
 ('السعودية', 'B-LOC'),
 ('بحضور', 'O'),
 ('كل', 'O'),
 ('من', 'O'),
 ('الرئيس', 'O'),
 ('المصري', 'O'),
 ('عبد', 'B-PERS'),
 ('الفتاح', 'I-PERS'),
 ('السيسي', 'I-PERS'),
 ('و', 'O'),
 ('الرئيس', 'O'),
 ('الامريكي', 'O'),
 ('دونالد', 'B-PERS'),
 ('ترامب', 'I-PERS')]