### Kaggle CommonLit Readability Challenge
Functions & Classes

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM encoder followed by a three-layer feed-forward head.

    The LSTM is built with PyTorch's default ``batch_first=False``, so
    ``forward`` expects input laid out as ``(seq_len, batch, in_size)``.

    Args:
        in_size: Number of features of each sequence element.
        hidden_layer_size: Hidden size of each LSTM direction; the head
            receives ``2 * hidden_layer_size`` features.
        output_size: Indexable with (at least) three entries giving the
            output widths of ``L1``, ``L2`` and ``L3`` respectively.
        use_final_states: Keyword-only. When ``False`` (default, the
            historical behaviour) the head is fed the LSTM output at the
            last time step. When ``True`` it is fed the concatenation of
            the final forward and final backward hidden states, so both
            directions summarise the whole sequence.
    """

    def __init__(self, in_size, hidden_layer_size, output_size, *,
                 use_final_states=False):
        super().__init__()
        # Plain Python attribute: it is not a parameter or buffer, so the
        # state_dict layout is identical for both settings.
        self.use_final_states = use_final_states
        self.lstm = nn.LSTM(
            input_size=in_size,
            hidden_size=hidden_layer_size,
            bidirectional=True,
        )
        self.L1 = nn.Linear(2 * hidden_layer_size, output_size[0])
        self.L2 = nn.Linear(output_size[0], output_size[1])
        self.L3 = nn.Linear(output_size[1], output_size[2])

    def forward(self, input_seq):
        """Encode ``input_seq`` and return the output of the final linear layer.

        Args:
            input_seq: Tensor accepted by ``nn.LSTM`` with the sequence
                dimension first.

        Returns:
            The un-activated output of ``L3``; its last dimension is
            ``output_size[2]``.
        """
        lstm_output, (final_hidden, _) = self.lstm(input_seq)

        if self.use_final_states:
            # h_n stacks the final state of every layer/direction; for this
            # single-layer bidirectional LSTM the last two entries are the
            # forward and the backward direction. Both have consumed the
            # complete sequence.
            features = torch.cat((final_hidden[-2], final_hidden[-1]), dim=-1)
        else:
            # Output at the last time step. Indexing only the first axis
            # drops the sequence dimension.
            # NOTE: for a bidirectional LSTM the backward half of this slice
            # has processed only the last element of the sequence; set
            # use_final_states=True to use full-sequence summaries instead.
            features = lstm_output[-1]

        ffd_step_1 = torch.relu(self.L1(features))
        ffd_step_2 = torch.relu(self.L2(ffd_step_1))
        return self.L3(ffd_step_2)

In [None]:
# Generate the word embedding matrix for an excerpt
def excerpt_embedding(in_excerpt, return_idx, embedding_dim=50):
    """Build the word-embedding matrix of an excerpt, skipping stop words.

    The excerpt is split with ``nltk.wordpunct_tokenize``. Stop words and
    tokens that have no entry in ``embeddings_dict`` are dropped; the
    remaining tokens are looked up in lower case, in their original order.

    Args:
        in_excerpt: Text to embed.
        return_idx: ``0`` to return only the number of embedded tokens; any
            other value returns the embedding matrix itself.
        embedding_dim: Length of each vector stored in ``embeddings_dict``.
            Defaults to 50, the width that was previously hard-coded.

    Returns:
        The number of embedded tokens (``int``) when ``return_idx == 0``,
        otherwise a float array of shape ``(n_embedded_tokens,
        embedding_dim)``.

    Raises:
        ValueError: If a looked-up vector cannot be stored in a row of
            length ``embedding_dim``.
    """
    vectors = []
    for token in nltk.wordpunct_tokenize(in_excerpt):
        lowered = token.lower()

        # The lookup below is done in lower case, so the stop-word test has
        # to be as well; otherwise capitalised stop words ("The", "And")
        # slip through the filter and are embedded. The raw token is still
        # checked so nothing that used to be filtered is let through.
        if token in spacy_stopwords or lowered in spacy_stopwords:
            continue

        # NOTE(review): this replaces `keyset.index(word)`, a linear scan
        # over the vocabulary whose result was only used to index
        # `embeddings_dict` with the very same string. It assumes `keyset`
        # lists exactly the keys of `embeddings_dict` -- confirm where
        # `keyset` is built.
        vector = embeddings_dict.get(lowered)
        if vector is None:
            # Out-of-vocabulary token: contributes no row.
            continue
        vectors.append(vector)

    if return_idx == 0:
        return len(vectors)

    # Only vectors that were actually found are kept, so there is no need
    # to detect missing words afterwards via "row sums to zero" (which
    # would also discard a real vector whose components cancel out).
    word_embedding = np.zeros((len(vectors), embedding_dim))
    for row, vector in enumerate(vectors):
        word_embedding[row, :] = vector

    return word_embedding