<a href="https://colab.research.google.com/github/Koanah/IsizuluPosTagging/blob/main/isiZulu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cleaning the dataset for training a linear-chain conditional random field (CRF)

In [None]:
# Data preprocessing: notebook setup.

# Mount Google Drive into the Colab runtime so the corpus file can be read
# from the local filesystem path below.
from google.colab import drive
drive.mount('/content/drive')
# Used later to split the loaded sentences into training and test sets.
from sklearn.model_selection import train_test_split

# Location of the tab-separated corpus file on the mounted drive.
filepath = '/content/drive/My Drive/IsizuluData.txt'

# preparing sentences in text for crf training and testing
def load_data(filepath):
  """Read a tab-separated corpus file and group its rows into sentences.

  The first line of the file is treated as a header and skipped. Every
  following non-blank line is expected to hold exactly three tab-separated
  fields (token, morphological segmentation, tag); lines with a different
  number of fields are ignored. Blank lines mark sentence boundaries.

  Args:
    filepath: Path of the UTF-8 encoded corpus file.

  Returns:
    A list of sentences, each a non-empty list of (token, morph, tag) tuples.
    An empty file yields an empty list.
  """
  sentences = []  # all completed sentences
  sentence = []   # rows of the sentence currently being collected

  with open(filepath, 'r', encoding='utf-8') as file:
    # Skip the header row. The default stops an empty file from raising
    # StopIteration here.
    next(file, None)

    for line in file:
      line = line.strip()
      if not line:
        # Blank line: close the current sentence, if any rows were collected.
        if sentence:
          sentences.append(sentence)
          sentence = []
        continue

      morph_parts = line.split('\t')
      if len(morph_parts) == 3:  # silently drop malformed rows
        token, morph, tag = morph_parts
        sentence.append((token, morph, tag))

  # The file may not end with a blank line, so flush the last sentence.
  if sentence:
    sentences.append(sentence)
  return sentences




Mounted at /content/drive


# Splitting the data

In [None]:
# Load the corpus and hold out a quarter of the sentences for testing.
# The fixed random_state keeps the split reproducible between runs.
zulu_data = load_data(filepath)
train_data, test_data = train_test_split(
    zulu_data,
    test_size=0.25,
    random_state=42,
)

# Sentence counts, one per line: total, training, testing.
print(len(zulu_data), len(train_data), len(test_data), sep="\n")

'''visualizing the formart of the data after processing
for i in train_data:
  print(i)'''

1430
1072
358


'visualizing the formart of the data after processing\nfor i in train_data:\n  print(i)'

In [None]:
# Rare-word (<UNK>) handling for the training set, based on token frequency.
from collections import Counter

# Every token of every training sentence, in corpus order.
train_tokens = [tok for sent in train_data for tok, _morph, _tag in sent]

# Occurrence count per distinct token.
word_frequency = Counter()
word_frequency.update(train_tokens)

# The rare-word cut-off is the mean frequency: total number of tokens divided
# by the number of distinct tokens. The result is a float.
threshHold = len(train_tokens) / len(word_frequency)
print("UNK threshold :", threshHold)


# Feature extraction with <UNK> handling for rare words.

# Common isiZulu affix patterns, kept as sets for constant-time membership tests.
_COMMON_PREFIXES = frozenset([
    'u', 'isi', 'aba', 'a', 'o', 'zi', 'izi', 'li', 'ili', 'ku', 'imi', 'um',
    'umu', 'i', 'e', 'na', 'ma',
])
_COMMON_SUFFIXES = frozenset([
    'ana', 'ile', 'ani', 'eni', 'yo', 'a', 'e', 'ane', 'azi', 'eka', 'ela',
    'elela', 'i', 'isa', 'isisa', 'kazi', 'ke', 'ni', 'pha', 'ulula', 'wa',
])


def word_Features(sentence, i, word_frequency, threshold=1):
  """Build the CRF feature dictionary for the word at position ``i``.

  Words whose frequency is below ``threshold`` are replaced by the
  placeholder ``'<UNK>'`` for the surface-form features (``word.lower``,
  ``word.isupper``, ``prefix3``, ``suffix3``). ``word.istitle`` and
  ``word.isdigit`` are always computed from the original word. The
  morphological features come from splitting the segmentation on ``'-'``.

  Args:
    sentence: Sequence of (token, morph, tag) triples.
    i: Index of the word within ``sentence``.
    word_frequency: Mapping from token to its count (anything with ``.get``).
    threshold: Minimum frequency a word needs to keep its own surface form.

  Returns:
    A dict of feature name to value. It contains ``'BOS'`` for the first
    word and ``'EOS'`` for the last word; otherwise it holds the
    corresponding ``-1:`` / ``+1:`` neighbour features.
  """
  word, morph, _ = sentence[i]  # the tag is the label, never a feature

  # Replace rare words with the <UNK> placeholder.
  base_word = '<UNK>' if word_frequency.get(word, 0) < threshold else word

  # Morphological segmentation. str.split always returns at least one
  # element, so the first and last parts are always available.
  morph_parts = morph.split('-')
  prefix = morph_parts[0]
  suffix = morph_parts[-1]

  features = {
      'bias': 1.0,  # constant feature, present for every word
      'word.lower': base_word.lower(),
      'word.isupper': base_word.isupper(),
      'word.istitle': word.istitle(),
      'word.isdigit': word.isdigit(),
      'prefix3': base_word[:3],
      'suffix3': base_word[-3:],
      'prefix_morph': prefix,
      'suffix_morph': suffix,
      'morph_len': len(morph_parts),
      'has_commonPrefix': prefix in _COMMON_PREFIXES,
      'has_commonSuffix': suffix in _COMMON_SUFFIXES,
      'prefix+suffix_combo': f"{prefix}+{suffix}",
  }

  # Context from the previous word, or a beginning-of-sentence marker.
  if i > 0:
    prev_word = sentence[i - 1][0]
    features['-1:word.lower'] = prev_word.lower()
    features['-1:prefix3'] = prev_word[:3]
  else:
    features['BOS'] = True

  # Context from the following word, or an end-of-sentence marker. This check
  # is independent of the previous-word check so that every position gets
  # its right-hand context, not only the first word of the sentence.
  if i < len(sentence) - 1:
    next_word = sentence[i + 1][0]
    features['+1:word.lower'] = next_word.lower()
    features['+1:suffix3'] = next_word[-3:]
  else:
    features['EOS'] = True

  return features

# converting a full sentence into a list of feature dictionaries

def sentence_features(sentence, word_freq, threshold=1):
  """Return one feature dictionary per word of ``sentence``, in order.

  Each position is passed to ``word_Features`` together with the frequency
  table and the rare-word threshold.
  """
  feature_dicts = []
  for position in range(len(sentence)):
    feature_dicts.append(word_Features(sentence, position, word_freq, threshold))
  return feature_dicts

# obtaining the tag sequence labels from the sentence.

def sentence_labels(sentence):
  """Return the tag (third field) of every (token, morph, tag) triple."""
  tags = []
  for _token, _morph, tag in sentence:
    tags.append(tag)
  return tags


# Build the CRF inputs: feature sequences (X) and tag sequences (Y) for the
# training and test sentences.

# Tag sequences. The test tags are kept apart for evaluation.
Y_train = list(map(sentence_labels, train_data))
y_test = list(map(sentence_labels, test_data))

# Feature sequences. The test set reuses the training frequencies and
# threshold to decide which words count as rare.
X_train = [
    sentence_features(sent, word_frequency, threshHold) for sent in train_data
]
X_test = [
    sentence_features(sent, word_frequency, threshHold) for sent in test_data
]

''' for visual purposes only
for i in X_test :
  print (i)
'''


UNK threshold : 3.102037394147732


' for visual purposes only\nfor i in X_test :\n  print (i)\n'

# Training the LC-CRF

In [None]:
# IPython shell escape: install the CRF library into the notebook runtime.
!pip install sklearn_crfsuite
import sklearn_crfsuite

# Configure a linear-chain CRF trained with the L-BFGS optimizer.

crf = sklearn_crfsuite.CRF(
    algorithm= 'lbfgs',
    c1=0.1, # coefficient for L1 regularization
    c2=0.1, # coefficient for L2 regularization
    max_iterations=100, # upper bound on the number of optimizer iterations
    all_possible_transitions=True # also generate transition features for tag pairs not seen in training

)

# Fit on the per-sentence feature sequences and their tag sequences.
crf.fit(X_train, Y_train)




# TESTING AND EVALUATING

In [None]:
# Predict POS tags for the test sentences and report token-level accuracy.

y_pred = crf.predict(X_test)  # one predicted tag sequence per test sentence

from sklearn.metrics import accuracy_score

# accuracy_score compares flat label lists, so flatten both the true and the
# predicted per-sentence sequences.
y_true_flat = [label for sent in y_test for label in sent]  # all true tags
y_pred_flat = [label for sent in y_pred for label in sent]  # all predicted tags

# Show up to three example sentences. The bound is capped at the size of the
# test set so a very small split does not raise IndexError.
for i in range(min(3, len(test_data))):
  print(f"\n Sentence {i+1}:")
  tokens = [token for token, _, _ in test_data[i]]
  print("tokens:    ", tokens)
  print("Predicted:  ", y_pred[i])
  print("Actual:      ", y_test[i])


accuracy = accuracy_score(y_true_flat, y_pred_flat)
print(f"CRF POS Tagging Accuracy: {accuracy:.4f}")


 Sentence 1:
tokens:     ['Hlola', 'imisebenzi', 'yohlelo', 'lwe-CBP', 'ngokuyiqhathanisa', 'nephakanyiswe', 'ohlelweni', 'lwe-IDP', 'ukuthola', 'ukuthi', 'kukhona', 'yini', 'ukuphindaphindeka', 'kwemisebenzi', '.', 'Ngisho', 'noma', 'ngabe', 'akukho', ',', 'kungenzeka', 'noma', 'kube', 'yinto', 'efanele', 'ukucela', 'ngqo', 'ukuxhaswa', 'ngezimali', '.']
Predicted:   ['V', 'N', 'POSS', 'POSS', 'ADV', 'ADV', 'N', 'POSS', 'V', 'CONJ', 'ADV', 'INT', 'N', 'POSS', 'PUNC', 'V', 'CONJ', 'CONJ', 'ADV', 'PUNC', 'V', 'CONJ', 'V', 'COP', 'REL', 'V', 'IDEO', 'N', 'ADV', 'PUNC']
Actual:       ['V', 'N', 'POSS', 'POSS', 'ADV', 'ADV', 'REL', 'POSS', 'V', 'CONJ', 'ADV', 'INT', 'N', 'POSS', 'PUNC', 'V', 'CONJ', 'CONJ', 'ADV', 'PUNC', 'V', 'CONJ', 'V', 'COP', 'REL', 'V', 'IDEO', 'N', 'ADV', 'PUNC']

 Sentence 2:
tokens:     ['Kuveza', ',', 'kuqinisa', 'futhi', 'kufaka', 'ukuziphatha', 'okwenzekayo', 'kokucabanga', 'izinto', 'ezingelona', 'iqiniso', 'ngabanye', 'kanye', 'nokucwasa', 'iqembu', 'elithile