In [1]:
import gensim
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support


2024-02-21 19:21:36.572387: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading the pretrained word-vector model (the vectorizer explained in the train notebook).

In [2]:
# Load the pretrained 300-dimensional word vectors from 'cc.hr.300.vec' (word2vec format).
# Presumably these are the same vectors used in the train notebook — confirm.
vec_model = gensim.models.KeyedVectors.load_word2vec_format('cc.hr.300.vec')

In [3]:
# Read the test split; the "Rijeci" (words) and "Leme" (lemmas) columns are used below.
df_test = pd.read_csv('test_dataset.csv') # Loading the test dataset

**Feedforward neural network Evaluation**

Splitting the dataset into x and y (words and lemmas).

In [4]:
# Model inputs (x): the word forms from the "Rijeci" column.
test_words = df_test["Rijeci"] 
# Targets (y): the lemmas from the "Leme" column, aligned row by row with test_words.
test_lemmas = df_test["Leme"]

This function is explained in the train notebook; it converts words and lemmas into their vector representations.

In [5]:
# Collects every word or lemma that could not be looked up in vec_model
# (i.e. the out-of-vocabulary items, despite the name).
vocab = set()

def get_vec_form(words, lemmas):
    """Convert aligned words and lemmas into their vector representations.

    A (word, lemma) pair is kept only when BOTH items are found in the
    module-level ``vec_model``. If the word lookup fails, the word is added to
    the module-level ``vocab`` set and the pair is skipped; otherwise, if the
    lemma lookup fails, the lemma is added to ``vocab`` and the pair is skipped.

    Args:
        words: iterable of word forms.
        lemmas: iterable of lemmas, aligned with ``words``.

    Returns:
        Tuple ``(x, y)`` of dicts with keys ``'form'`` (list of kept items) and
        ``'vec'`` (numpy array stacking the corresponding vectors).
    """
    x = {'form': [], 'vec': []}
    y = {'form': [], 'vec': []}
    for w, l in zip(words, lemmas):
        # Only lookup failures count as "out of vocabulary": KeyError for an
        # unknown key, TypeError for a non-string cell (e.g. NaN read from the
        # CSV). Anything else (KeyboardInterrupt, backend errors) propagates
        # instead of being silently recorded as an unknown token.
        try:
            new_x = vec_model[w]
        except (KeyError, TypeError):
            vocab.add(w)
            continue
        try:
            new_y = vec_model[l]
        except (KeyError, TypeError):
            vocab.add(l)
            continue
        x['vec'].append(new_x)
        x['form'].append(w)
        y['vec'].append(new_y)
        y['form'].append(l)
    x['vec'] = np.array(x['vec'])
    y['vec'] = np.array(y['vec'])
    return x, y

In [6]:
# Vectorize the test pairs; pairs with an unknown word or lemma are dropped by get_vec_form.
test_x, test_y = get_vec_form(test_words, test_lemmas) 

In [7]:
# Number of (word, lemma) pairs that survived the vocabulary filtering.
len(test_x['vec'])

51548

In [9]:
# Load the feedforward network exported by the train notebook.
model_ffnn = tf.keras.models.load_model("model_ffnn.keras") # loading exported model from train notebook

In [10]:
from sklearn.metrics import f1_score

def evaluate_f1(predicted_vecs, lemmas):
    """Decode predicted vectors to lemmas and score them against the gold lemmas.

    Each predicted vector is mapped to its single nearest neighbour in the
    module-level ``vec_model``; an empty string is used when no neighbour is
    returned. The decoded lemmas are then compared with ``lemmas`` using
    micro-averaged F1.

    Args:
        predicted_vecs: iterable of predicted vectors, one per token.
        lemmas: gold lemmas, aligned with ``predicted_vecs``.

    Returns:
        Tuple ``(f1, predicted_lemmas)``.
    """
    def closest_word(vector):
        # most_similar returns a list of (word, similarity) pairs.
        hits = vec_model.most_similar(positive=[vector], topn=1)
        if not hits:
            return ""
        return hits[0][0]

    predicted_lemmas = [closest_word(vector) for vector in predicted_vecs]

    f1 = f1_score(lemmas, predicted_lemmas, average='micro')
    return f1, predicted_lemmas

# Predict a lemma vector for every test word vector with the feedforward network
test_pred_ffnn = model_ffnn.predict(test_x['vec'])

# Decode the predictions to lemma strings and compare them with the gold lemma strings
f1_ffnn, predicted_lemmas_ffnn = evaluate_f1(test_pred_ffnn, test_y['form'])


# Report the micro-averaged F1 as a percentage
print('{:.2f}%\t- F1 score on test set'.format(100 * f1_ffnn))

68.23%	- F1 score on test set


Evaluation on real-life examples to check our model

In [11]:
def lemmatize(tokens):
    """Lemmatize tokens one by one with the feedforward model.

    For each token the vector is looked up in the module-level ``vec_model``,
    passed through ``model_ffnn``, and the predicted vector is decoded to its
    nearest neighbour in ``vec_model``.

    Tokens missing from ``vec_model`` are reported (error message and token are
    printed) and returned unchanged. Errors raised by the model or by the
    nearest-neighbour search are NOT treated as unknown tokens; they propagate.

    Args:
        tokens: iterable of token strings.

    Returns:
        List of lemmas, one per input token, in order.
    """
    lemmas = []
    for token in tokens:
        try:
            vec = vec_model[token]
        except KeyError as e:
            # Out-of-vocabulary token: report it and fall back to the token itself.
            print(e)
            print(token)
            lemmas.append(token)
            continue
        # The model expects a batch dimension; -1 avoids hard-coding the vector size.
        pred = model_ffnn.predict(vec.reshape((1, -1)))[0]
        # Decode the predicted vector to the closest known word.
        lemmas.append(vec_model.most_similar(positive=[pred], topn=1)[0][0])
    return lemmas

In [12]:
# Smoke test on a raw sentence. Splitting on single spaces leaves punctuation attached
# (e.g. 'posao,'), so such a token is not found in vec_model and is returned unchanged.
lemmatize("Išao sam danas na posao, bio mi je jako zanimljiv dan na poslu".split(' ')) # let's try it

"Key 'posao,' not present"
posao,


['ići',
 'biti',
 'danas',
 'na',
 'posao,',
 'biti',
 'ja',
 'biti',
 'jako',
 'zanimljiv',
 'dan',
 'na',
 'posao']

**LSTM Evaluation**

In [13]:
# Load the LSTM exported by the train notebook.
LSTMmodel = tf.keras.models.load_model("lstm_model.keras") # loading exported model

Again using a test set reduced to one tenth of its size, because I can't run the LSTM on the full dataset; this keeps the same ratio as in training.

In [15]:
# Keep only the first 5154 rows (about one tenth of the 51548 pairs used for the FFNN evaluation).
# NOTE: this rebinds test_words/test_lemmas from pandas Series to plain numpy arrays.
test_words = df_test["Rijeci"].values[:5154]
test_lemmas = df_test["Leme"].values[:5154]

Reused function from training for generating test batches

In [29]:
# Shape constants read by generate_data and the LSTM lemmatize function.
# Presumably they must match the values used when the LSTM was trained — confirm against the train notebook.
B = 50  # Batch size: number of sequences per yielded batch
R = 300  # RNN size (not referenced by the code visible in this notebook)
S = 4   # Max sequence length: number of tokens per sequence
E = 300  # Embedding size -> dimensionality of the vector space in which words or tokens are represented

In [27]:
def generate_data(words, lemmas, vec_model, line_limit=5145, mode='test'):
    """Yield fixed-size batches of embedded (word, lemma) sequences.

    Tokens are grouped into sequences of ``S`` tokens and sequences into
    batches of ``B`` sequences (``B``, ``S`` and ``E`` are module-level
    constants). A sequence is stored as soon as its ``S``-th token has been
    appended, so the last complete sequence of the input is no longer lost when
    the input (or ``line_limit``) ends exactly on a sequence boundary.

    Only full batches are yielded: a trailing batch with fewer than ``B``
    sequences, and a trailing sequence with fewer than ``S`` tokens, are dropped.

    Args:
        words: iterable of word forms.
        lemmas: iterable of lemmas, aligned with ``words``.
        vec_model: mapping from token to embedding vector; must raise
            ``KeyError`` for unknown tokens.
        line_limit: maximum number of (word, lemma) pairs to consume;
            ``None`` means no limit.
        mode: ``'train'`` slides the window by one token after each stored
            sequence; any other value starts a fresh, non-overlapping sequence.

    Yields:
        Tuples ``(x, y, word_seqs, lemma_seqs)`` where ``x`` and ``y`` are
        arrays of shape ``(B, S, E)`` and ``word_seqs`` / ``lemma_seqs`` are
        lists of ``B`` token lists.
    """
    def empty_batch():
        # Fresh buffers for every batch so already-yielded batches are never overwritten.
        return (np.zeros((B, S, E)), np.zeros((B, S, E)),
                [None for _ in range(B)], [None for _ in range(B)])

    x, y, word_seqs, lemma_seqs = empty_batch()

    word_seq = []   # Tokens of the sequence being built
    lemma_seq = []  # Lemmas of the sequence being built
    x_seq = []      # Word embeddings of the sequence being built
    y_seq = []      # Lemma embeddings of the sequence being built

    i = 0  # Index of the next free slot in the current batch

    for line_number, (word, lemma) in enumerate(zip(words, lemmas), start=1):
        if line_limit is not None and line_number > line_limit:
            break

        try:
            word_embedding = vec_model[word]
            lemma_embedding = vec_model[lemma]
        except KeyError:
            # If either item is unknown, both positions become zero vectors;
            # consumers can detect such positions by the all-zero input.
            word_embedding = np.zeros(E)
            lemma_embedding = np.zeros(E)

        x_seq.append(word_embedding)
        y_seq.append(lemma_embedding)
        word_seq.append(word)
        lemma_seq.append(lemma)

        if len(x_seq) < S:
            continue

        # The sequence is complete: copy it into the batch right away.
        x[i] = np.array(x_seq)
        y[i] = np.array(y_seq)
        word_seqs[i] = word_seq[:]
        lemma_seqs[i] = lemma_seq[:]
        i += 1

        if mode == 'train':
            # Sliding window: drop the oldest token so the next token completes a new window.
            x_seq.pop(0)
            y_seq.pop(0)
            word_seq.pop(0)
            lemma_seq.pop(0)
        else:
            # Non-overlapping sequences: start from scratch.
            x_seq = []
            y_seq = []
            word_seq = []
            lemma_seq = []

        if i >= B:
            yield x, y, word_seqs, lemma_seqs
            x, y, word_seqs, lemma_seqs = empty_batch()
            i = 0

Loading the test data using the `generate_data` function.

In [33]:
# Materialize every (x, y, word_seqs, lemma_seqs) batch so they can be iterated during evaluation.
test_batches = list(generate_data(test_words, test_lemmas, vec_model, line_limit=5154))

In [34]:
# Token-level evaluation of the LSTM on the test batches.
correct = 0
count = 0
true_labels = []
predicted_labels = []

# Each batch holds: input embeddings, target embeddings (unused here),
# the word strings and the gold lemma strings.
for batch_x, _batch_y, batch_words, batch_lemmas in test_batches:

    # One forward pass per batch
    batch_pred = LSTMmodel.predict_on_batch(batch_x)

    for seq_idx, seq_pred in enumerate(batch_pred):
        for tok_idx, pred_vec in enumerate(seq_pred):
            gold = batch_lemmas[seq_idx][tok_idx]

            if np.sum(batch_x[seq_idx][tok_idx]) == 0:
                # The input was zeroed by generate_data (unknown word or lemma):
                # identity backoff, i.e. the word itself is used as the lemma.
                guess = batch_words[seq_idx][tok_idx]
            else:
                # Decode the predicted vector to the closest known word
                guess = vec_model.most_similar(positive=[pred_vec], topn=1)[0][0]

            true_labels.append(gold)
            predicted_labels.append(guess)

            # Tally exact matches
            correct += int(guess == gold)
            count += 1

# Support-weighted precision / recall / F1 over all lemma labels.
# NOTE(review): zero_division=1 scores an undefined per-label metric as 1.0, which
# likely inflates the weighted precision for lemmas that are never predicted —
# consider zero_division=0 and re-running.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predicted_labels, average='weighted', zero_division=1
)

print('final test precision: {0:.4f}'.format(precision))
print('final test recall: {0:.4f}'.format(recall))
print('final test F1 score: {0:.4f}'.format(f1))
print('correctly lemmatized tokens:', correct)
print('all tokens:', count)

final test precision: 0.9795
final test recall: 0.7148
final test F1 score: 0.7258
correctly lemmatized tokens: 3574
all tokens: 5000


Evaluation on real-life examples to check our model

In [37]:
def lemmatize(tokens):
    """Lemmatize tokens with the LSTM, in non-overlapping windows of ``S`` tokens.

    Each window is embedded into a ``(1, S, E)`` array (``S`` and ``E`` are
    module-level constants), passed through ``LSTMmodel``, and every predicted
    vector is decoded to its nearest neighbour in the module-level ``vec_model``.

    Tokens missing from ``vec_model`` keep a zero vector as input and are
    returned unchanged. Only ``KeyError`` is treated as "unknown token"; other
    errors (e.g. a vector of the wrong size) propagate instead of being hidden.

    NOTE: this definition shadows the earlier feedforward ``lemmatize`` defined
    in this notebook.

    Args:
        tokens: sequence of token strings (must support ``len`` and slicing).

    Returns:
        List of lemmas, one per input token, in order.
    """
    lemmas = []
    for i in range(0, len(tokens), S):
        # Slicing clamps at the end, so the last window may hold fewer than S tokens.
        window = tokens[i:i + S]

        x = np.zeros((1, S, E))
        oov = set()  # Positions inside the window whose token is unknown
        for j, t in enumerate(window):
            try:
                x[0][j] = vec_model[t]
            except KeyError:
                oov.add(j)

        # Predict lemma vectors for the whole window in one call
        y = LSTMmodel.predict([x], batch_size=1)

        for j, t in enumerate(window):
            if j in oov:
                # Unknown token: keep the original token as the lemma
                lemmas.append(t)
            else:
                # Decode the predicted vector to the closest known word
                lemmas.append(vec_model.most_similar(positive=[y[0][j]], topn=1)[0][0])
    return lemmas

In [38]:
# Same smoke-test sentence as for the feedforward model, now run through the LSTM lemmatize.
# Splitting on single spaces leaves punctuation attached (e.g. 'posao,').
lemmatize("Išao sam danas na posao, bio mi je jako zanimljiv dan na poslu".split(' '))



['Išao',
 'biti',
 'danas',
 'na',
 'posao,',
 'biti',
 'ja',
 'biti',
 'jako',
 'zanimljiv',
 'dan',
 'na',
 'poslu']

**Final word:**
    
- Two models were tested to achieve text lemmatization using the HR500k dataset. 
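- The feedforward neural network was trained, validated and tested on the full dataset and achieved an F1 score of 68.23%.
- The LSTM was trained on a dataset 10 times smaller, due to resource limitations on the laptop, and achieved an F1 score of 72.58% on that data.
- Note that the two scores are not directly comparable: the feedforward score is a micro-averaged F1 on the full test set, while the LSTM score is a weighted F1 on the reduced test set.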

My prediction is that the LSTM trained on the full dataset would give a much better score, but I am satisfied with the result achieved.