# 10 Implement RNN for sequence labelling.

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, TimeDistributed, Input, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the Arabic IOB dataset with the correct delimiter
df = pd.read_csv(
    'data/Arabic_IOB_dataset.csv',
    sep=",",  # Adjust the delimiter if it's not a comma
    header=None,
    names=["Word i", "Word i entity tag", "Word i POS", "Stopword"]
)

In [38]:
# Display dataset preview and column details
print("Dataset Preview:")
print(df.head())
print("\nDataset Info:")
print(df.info())


Dataset Preview:
      Word i  Word i entity tag   Word i POS  Stopword
0     Word i  Word i entity tag   Word i POS  Stopword
1         إن                  O  verb_pseudo       yes
2  اللوكيميا                  B         noun        no
3         أو                  O         conj       yes
4          (                  O         punc        no

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62507 entries, 0 to 62506
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Word i             62507 non-null  object
 1   Word i entity tag  62507 non-null  object
 2   Word i POS         62507 non-null  object
 3   Stopword           62507 non-null  object
dtypes: object(4)
memory usage: 1.9+ MB
None


In [39]:
# Drop rows where all columns are NaN (fully empty rows)
df = df.dropna(how="all")

In [40]:
# Check for rows with valid 'Word i' and 'Word i entity tag'
df = df[df["Word i"].notna() & df["Word i entity tag"].notna()]
print("\nDataset after cleaning:")
print(df.head())


Dataset after cleaning:
      Word i  Word i entity tag   Word i POS  Stopword
0     Word i  Word i entity tag   Word i POS  Stopword
1         إن                  O  verb_pseudo       yes
2  اللوكيميا                  B         noun        no
3         أو                  O         conj       yes
4          (                  O         punc        no


In [41]:
# Initialize lists for sentences and labels
sentences = []
labels = []

In [42]:
# Group words and labels into sentences
sentence = []
label = []
for i, row in df.iterrows():
    word = row["Word i"]
    entity_tag = row["Word i entity tag"]

    if word == "." or pd.isnull(word):  # End of a sentence
        if sentence:  # Append the completed sentence
            sentences.append(sentence)
            labels.append(label)
            sentence = []
            label = []
    else:
        sentence.append(word)
        label.append(entity_tag)

In [43]:
# Append the last sentence if not empty
if sentence:
    sentences.append(sentence)
    labels.append(label)

In [44]:
# Print the first few sentences and labels
print("\nFirst few sentences and labels:")
for i in range(min(len(sentences), 3)):
    print(f"Sentence {i+1}: {sentences[i]}")
    print(f"Labels {i+1}: {labels[i]}")


First few sentences and labels:
Sentence 1: ['Word i', 'إن', 'اللوكيميا', 'أو', '(', 'ابيضاض', 'الدم', ')', 'هو', 'سرطان', 'خلايا', 'الدم', 'البيضاء', ',', 'و', 'خلايا', 'الدم', 'البيضاء', 'تساعد', 'الجسم', 'على', 'محاربة', 'العدوى']
Labels 1: ['Word i entity tag', 'O', 'B', 'O', 'O', 'B', 'I', 'O', 'O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sentence 2: ['تتشكل', 'خلايا', 'الدم', 'في', 'نقي', 'العظام', ',', 'أما', 'في', 'اللوكيميا', 'ف', 'يقوم', 'نقي', 'العظام', 'على', 'كل', 'حال', 'ب', 'إنتاج', 'خلايا', 'دم', 'بيضاء', 'غير', 'طبيعية', ',', 'ب', 'حيث', 'تتجمع', 'تلك', 'الخلايا', 'حول', 'خلايا', 'الدم', 'الصحيحة', 'مما', 'يجعل', 'من', 'الصعب', 'أن', 'تقوم', 'هذه', 'الخلايا', 'ب', 'عملها', 'ب', 'شكل', 'عادي']
Labels 2: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Sent

In [45]:
# Ensure sentences and labels are populated
if not sentences or not labels:
    raise ValueError("No sentences or labels were extracted. Check dataset formatting.")

In [46]:
# Encode words and labels
word2idx = {word: i + 1 for i, word in enumerate(set(df["Word i"].values))}
label2idx = {label: i for i, label in enumerate(set([item for sublist in labels for item in sublist]))}


In [47]:
# Encode the words and labels
X = [[word2idx.get(word, 0) for word in sent] for sent in sentences]
y = [[label2idx[label] for label in lab] for lab in labels]

In [48]:
# Pad sequences to the same length
max_len = max(len(sent) for sent in sentences)
X = pad_sequences(X, maxlen=max_len, padding="post")
y = pad_sequences(y, maxlen=max_len, padding="post")


In [49]:
# Convert labels to categorical format
y = [to_categorical(i, num_classes=len(label2idx)) for i in y]


In [50]:
# Check the shapes of X and y
print("\nShape of X (features):", np.array(X).shape)
print("Shape of y (labels):", np.array(y).shape)



Shape of X (features): (3758, 127)
Shape of y (labels): (3758, 127, 4)
