In [None]:
pip install tensorflow

In [None]:
!pip install gdown

In [1]:
import numpy as np
import pandas as pd
import gensim
import sklearn
from tensorflow.keras.losses import cosine_similarity
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Masking, Dropout, TimeDistributed
from tensorflow.keras.optimizers import RMSprop

2024-02-21 18:28:16.043579: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


**Vectorizer**

- We used a pre-trained vectorizer to map words to vectors that capture semantic relationships between words, allowing the model to understand similarities and differences in meaning.

- The vec_model variable contains the loaded word vectors (a gensim KeyedVectors object read from a word2vec-format file), which can be used to look up the vector of a word in NLP tasks.

- We chose this vectorizer because it is pre-trained on Croatian, as can be seen from the files we uploaded.

In [None]:
# Download the Croatian vectorizer that was uploaded to Google Drive (uncomment the next line to run it).
#!gdown --id 1L953pNrGZTiI8vKTXDIUYcGc92VUD9wB

In [2]:
# Load the pre-trained Croatian word vectors from the word2vec-format file into a
# KeyedVectors object (word -> vector lookup).
vec_model = KeyedVectors.load_word2vec_format('cc.hr.300.vec')

**Loading Processed data**

In [3]:
# Read the preprocessed training and validation splits from their CSV files.
df_train, df_dev = (
    pd.read_csv(csv_path)
    for csv_path in ('train_dataset.csv', 'val_dataset.csv')
)

**Feedforward neural network**

In [4]:
# Inputs are the word forms (column "Rijeci"); targets are their lemmas (column "Leme").
train_words, train_lemmas = df_train["Rijeci"], df_train["Leme"]
dev_words, dev_lemmas = df_dev["Rijeci"], df_dev["Leme"]

In [5]:
# Despite its name, this set collects the words/lemmas that were NOT found in the
# embedding model (out-of-vocabulary tokens); get_vec_form() fills it as a side effect.
vocab = set()


def get_vec_form(words, lemmas, model=None):
    """Look up the embedding of every (word, lemma) pair and keep the pairs where both exist.

    Parameters
    ----------
    words : iterable
        Input word forms.
    lemmas : iterable
        Target lemmas, aligned one-to-one with ``words`` (pairs are formed with ``zip``).
    model : mapping, optional
        Object supporting ``model[token] -> vector``. Defaults to the notebook-level
        ``vec_model`` when omitted.

    Returns
    -------
    (x, y) : tuple of dict
        Two dictionaries with keys ``'form'`` (list of kept tokens) and ``'vec'``
        (NumPy array of their vectors); ``x`` describes the words, ``y`` the lemmas.
        Entry ``k`` of ``x`` and entry ``k`` of ``y`` belong to the same pair.

    Side effects
    ------------
    Every token without a vector is added to the module-level ``vocab`` set, and the
    pair it belongs to is skipped. If the word is missing, the lemma is not looked up.
    """
    if model is None:
        model = vec_model  # fall back to the embeddings loaded earlier in the notebook

    x = {'form': [], 'vec': []}
    y = {'form': [], 'vec': []}
    for w, l in zip(words, lemmas):
        # Only lookup failures are treated as "token unknown". The previous bare
        # `except:` also swallowed KeyboardInterrupt and genuine programming errors.
        # TypeError is kept so that non-string cells (presumably e.g. NaN from an empty
        # CSV field - TODO confirm) are still skipped instead of aborting the whole run.
        try:
            new_x = model[w]
        except (KeyError, TypeError):
            vocab.add(w)
            continue
        try:
            new_y = model[l]
        except (KeyError, TypeError):
            vocab.add(l)
            continue

        x['vec'].append(new_x)
        x['form'].append(w)
        y['vec'].append(new_y)
        y['form'].append(l)

    # Convert the lists of vectors to NumPy arrays so they can be fed to the model directly.
    x['vec'] = np.array(x['vec'])
    y['vec'] = np.array(y['vec'])
    return x, y

Splitting the data into train_x, train_y and dev_x, dev_y (validation data) by applying the get_vec_form function defined above to the real data

In [6]:
# Vectorise the training split first, then the validation split; pairs whose word or
# lemma has no embedding are left out of the result.
(train_x, train_y), (dev_x, dev_y) = (
    get_vec_form(split_words, split_lemmas)
    for split_words, split_lemmas in ((train_words, train_lemmas), (dev_words, dev_lemmas))
)

In [7]:
# Number of usable (word, lemma) examples in the training and validation splits.
tuple(len(split['vec']) for split in (train_x, dev_x))

(308780, 51467)

**Model structure -> Feedforward neural network**

**Architecture:**
- Input Layer: Dense layer with 512 units and ReLU activation, taking a 300-dimensional input vector.
- Dropout Layer: 50% dropout applied after the first layer.
- Hidden Layer: Dense layer with 256 units and ReLU activation.
- Dropout Layer: 30% dropout applied after the second layer.
- Output Layer: Dense layer with 300 units and linear activation, indicating a regression task.

**Activation Functions:**
- ReLU activation is used in the hidden layers to introduce non-linearity.
- Linear activation in the output layer for regression.

**Regularization:**
- Dropout regularization is applied to mitigate overfitting after the first and second dense layers.

In [8]:
# Feed-forward regressor mapping a 300-dimensional word vector to a 300-dimensional
# lemma vector: two ReLU hidden layers, each followed by dropout, then a linear output.
model_ffnn = Sequential([
    Dense(512, activation='relu', input_shape=(300,)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(300, activation='linear'),
])

**Loss Function:**
- The model is compiled with Keras' `cosine_similarity` loss. Keras defines this loss as the negative cosine similarity between the predicted and true vectors, so minimizing it maximizes their cosine similarity (this is why the loss values reported during training are negative). It is suitable for similarity-based tasks.

**Optimizer:**
- RMSprop is chosen as the optimizer for updating the model weights during training, offering adaptive learning rates that can help converge faster in non-convex optimization problems.

In [9]:
# Configure training: RMSprop with its default settings, cosine-similarity loss.
model_ffnn.compile(
    optimizer=RMSprop(),
    loss='cosine_similarity',
)

**Training the model**

In [10]:
# Train for 50 epochs in mini-batches of 128, evaluating on the validation vectors
# after every epoch. The returned History object is the cell's output.
model_ffnn.fit(
    x=train_x['vec'],
    y=train_y['vec'],
    batch_size=128,
    epochs=50,
    verbose=True,
    validation_data=(dev_x['vec'], dev_y['vec']),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7fcbf1cddff0>

In [11]:
model_ffnn.save("model_ffnn.keras") # export the trained feed-forward model to the file model_ffnn.keras

**LSTM**

HyperParameters

In [12]:
# Hyperparameters shared by the batch generator and the LSTM model.
B, R, S, E = (
    50,   # B: batch size (windows per batch)
    300,  # R: RNN size (number of LSTM units)
    4,    # S: maximum sequence length (time steps per window)
    300,  # E: embedding size, i.e. dimensionality of the word-vector space
)

- The `generate_data` function generates sequences of word and lemma embeddings for training an LSTM model. It uses a sliding-window approach to create batches of data with a specified batch size (B), sequence length (S), and embedding size (E). 
- The `mode` parameter determines whether the function is used for training (overlapping windows that advance one word at a time) or for evaluation (consecutive, non-overlapping windows). Once `line_limit` input lines have been read, the function stops generating data. The data is yielded in batches as NumPy arrays, together with the corresponding word and lemma sequences.

In [13]:
def generate_data(words, lemmas, vec_model, line_limit=30878, mode='train',
                  batch_size=None, seq_len=None, emb_size=None):
    """Yield fixed-size batches of word/lemma embedding windows.

    The aligned ``words``/``lemmas`` streams are cut into windows of ``seq_len``
    consecutive tokens. Windows are collected until ``batch_size`` of them are
    available, then one batch is yielded.

    Parameters
    ----------
    words, lemmas : iterable
        Aligned word forms and lemmas (paired with ``zip``).
    vec_model : mapping
        Object supporting ``vec_model[token] -> vector`` and raising ``KeyError``
        for unknown tokens.
    line_limit : int or None
        Maximum number of (word, lemma) pairs to read; ``None`` reads everything.
    mode : str
        ``'train'`` produces overlapping windows that advance by one token; any other
        value produces consecutive, non-overlapping windows.
    batch_size, seq_len, emb_size : int, optional
        Override the notebook-level hyperparameters ``B``, ``S`` and ``E``. When left
        as ``None`` the global value is used.

    Yields
    ------
    (x, y, word_seqs, lemma_seqs)
        ``x`` and ``y`` are arrays of shape ``(batch_size, seq_len, emb_size)`` holding
        the word and lemma embeddings; ``word_seqs`` and ``lemma_seqs`` are lists with
        the tokens of every window in the batch.

    Notes
    -----
    * If the word *or* the lemma is unknown to ``vec_model``, both embeddings of that
      position are replaced by zero vectors.
    * A window is stored as soon as it is complete. The previous implementation stored
      it at the start of the following iteration, so the window completed by the last
      token read was silently lost.
    * Only full batches are yielded; windows left over in a partially filled final
      batch are dropped.
    """
    # Fall back to the notebook-level hyperparameters when no override is given.
    batch_size = B if batch_size is None else batch_size
    seq_len = S if seq_len is None else seq_len
    emb_size = E if emb_size is None else emb_size

    def _empty_batch():
        """Return fresh, zero-filled containers for one batch."""
        return (np.zeros((batch_size, seq_len, emb_size)),
                np.zeros((batch_size, seq_len, emb_size)),
                [None] * batch_size,
                [None] * batch_size)

    x, y, word_seqs, lemma_seqs = _empty_batch()

    # Rolling window over the most recent tokens and their embeddings.
    x_seq, y_seq = [], []
    word_seq, lemma_seq = [], []

    i = 0  # index of the next free slot in the current batch

    for line_number, (word, lemma) in enumerate(zip(words, lemmas), start=1):
        if line_limit is not None and line_number > line_limit:
            break

        try:
            word_embedding = vec_model[word]
            lemma_embedding = vec_model[lemma]
        except KeyError:
            # Unknown word or lemma: represent the whole position with zero vectors.
            word_embedding = np.zeros(emb_size)
            lemma_embedding = np.zeros(emb_size)

        x_seq.append(word_embedding)
        y_seq.append(lemma_embedding)
        word_seq.append(word)
        lemma_seq.append(lemma)

        if len(x_seq) < seq_len:
            continue  # window not complete yet

        # Store the completed window in the current batch (copy the token lists,
        # because the rolling window keeps being modified).
        x[i] = np.array(x_seq)
        y[i] = np.array(y_seq)
        word_seqs[i] = list(word_seq)
        lemma_seqs[i] = list(lemma_seq)
        i += 1

        if mode == 'train':
            # Slide the window forward by one token.
            x_seq.pop(0)
            y_seq.pop(0)
            word_seq.pop(0)
            lemma_seq.pop(0)
        else:
            # Start a completely new window.
            x_seq, y_seq = [], []
            word_seq, lemma_seq = [], []

        if i >= batch_size:
            yield x, y, word_seqs, lemma_seqs
            # Allocate new containers so the batch just yielded is not overwritten.
            x, y, word_seqs, lemma_seqs = _empty_batch()
            i = 0

Applying the generate_data function to the real data.
The `w` and `l` entries in these lists give access to the word and lemma sequences associated with each batch, which can be useful for evaluation purposes.

- Loading the data again to ensure that it is fresh for the next model; it is also reduced about 10 times because my laptop cannot train on the full dataset

In [17]:
# Re-extract the word ("Rijeci") and lemma ("Leme") columns as arrays and keep only the
# leading rows: 30878 for training and 5154 for validation.
train_words, train_lemmas = (
    df_train[column].values[:30878] for column in ("Rijeci", "Leme")
)
dev_words, dev_lemmas = (
    df_dev[column].values[:5154] for column in ("Rijeci", "Leme")
)

(X, Y) -> input and output pairs.
(x, y, w, l) -> x is the input, y is the output, w is the words, l is the lemmas

In [18]:
# Materialise all batches up front so that every epoch can iterate over the same lists.
# Training only needs the embedding arrays (first two items of each batch); the dev
# batches also keep the word and lemma sequences.
train_set = [
    batch[:2]
    for batch in generate_data(train_words, train_lemmas, vec_model, line_limit=30878)
]
dev_batches = list(
    generate_data(dev_words, dev_lemmas, vec_model, line_limit=5154, mode='dev')
)

**Model Structure:** Long Short-Term Memory (LSTM) network

**Architecture:**
- Input Layer: Masking layer applied to sequences with a mask value of 0.0, shaping input data with dimensions (S, E).
- LSTM Layer: R units in the LSTM layer configured to return sequences
- Dropout Layer: 20% dropout rate applied after the LSTM layer to prevent overfitting
- TimeDistributed Dense Layer: Linear activation applied independently to each time step, with E units, indicating the output size for each sequence
  
**Model Compilation:**
- Loss Function: Cosine similarity chosen as the loss function for training
- Optimizer: RMSprop utilized as the optimizer for weight updates during training

In [19]:
# Sequence model: time steps equal to the mask value 0.0 are masked, an LSTM with R
# units returns one output per time step, dropout is applied, and a linear Dense layer
# maps every time step to an E-dimensional vector.
LSTMmodel = Sequential([
    Masking(mask_value=.0, input_shape=(S, E)),
    LSTM(R, return_sequences=True),
    Dropout(.2),
    TimeDistributed(Dense(E, activation='linear')),
])
LSTMmodel.compile(optimizer='rmsprop', loss='cosine_similarity')

**Running the model**

In [20]:
for epoch in range(100):  # 100 passes over the pre-built training batches
    # Training pass: accumulate the per-batch loss and count the batches.
    train_loss, train_batch_c = 0, 0
    for X, Y in train_set:
        train_batch_c += 1
        train_loss += LSTMmodel.train_on_batch(X, Y)

    # Validation pass: same bookkeeping, but without weight updates.
    dev_loss, dev_batch_c = 0, 0
    for X, Y, _, _ in dev_batches:
        dev_batch_c += 1
        dev_loss += LSTMmodel.test_on_batch(X, Y)

    # Without batches on either side no average can be computed, so skip the report.
    if not (train_batch_c and dev_batch_c):
        print('Warning: train_batch_c or dev_batch_c is zero. Skipping epoch', epoch + 1)
        continue

    # Report only the first epoch and every tenth epoch (10, 20, 30, ...).
    if epoch != 0 and (epoch + 1) % 10 != 0:
        continue
    print('epoch:', epoch + 1,
          '\ttrain loss: {0:.4f}'.format(train_loss / train_batch_c),
          '\tdev loss: {0:.4f}'.format(dev_loss / dev_batch_c))

epoch: 1 	train loss: -0.6671 	dev loss: -0.7605
epoch: 10 	train loss: -0.8131 	dev loss: -0.8364
epoch: 20 	train loss: -0.8228 	dev loss: -0.8482
epoch: 30 	train loss: -0.8279 	dev loss: -0.8538
epoch: 40 	train loss: -0.8313 	dev loss: -0.8572
epoch: 50 	train loss: -0.8341 	dev loss: -0.8599
epoch: 60 	train loss: -0.8363 	dev loss: -0.8619
epoch: 70 	train loss: -0.8383 	dev loss: -0.8639
epoch: 80 	train loss: -0.8401 	dev loss: -0.8656
epoch: 90 	train loss: -0.8417 	dev loss: -0.8669
epoch: 100 	train loss: -0.8430 	dev loss: -0.8682


In [21]:
LSTMmodel.save("lstm_model.keras") # export the trained LSTM model to the file lstm_model.keras