### Kaggle CommonLit Readability Challenge
Functions & Classes

In [None]:
# Generate the word embedding matrix for an excerpt
def excerpt_embedding(in_excerpt, return_idx, vec_dim=50):
    """Build the matrix of word embeddings for one text excerpt.

    The excerpt is tokenized with ``nltk.wordpunct_tokenize``. Tokens that
    appear in the module-level ``spacy_stopwords`` collection are discarded,
    the remaining tokens are lower-cased, and each one that occurs in the
    module-level ``keyset`` has its vector copied from ``embeddings_dict``.
    Tokens without an embedding (and embeddings that are entirely zero)
    contribute no row to the result.

    NOTE: the stop-word check runs on the raw token, *before* lower-casing,
    so it is case-sensitive with respect to ``spacy_stopwords``.

    Args:
        in_excerpt: Text to embed.
        return_idx: If equal to 0, only the number of embedded words is
            returned; any other value returns the embedding matrix itself.
        vec_dim: Length of one embedding vector. It has to match the
            vectors stored in ``embeddings_dict``.

    Returns:
        The number of rows (``int``) when ``return_idx == 0``; otherwise a
        2-D ``numpy`` array with one row per embedded word and ``vec_dim``
        columns, in the order the words occur in the excerpt.

    Raises:
        ValueError: If a stored vector cannot be assigned to a row of width
            ``vec_dim``. (This used to be swallowed silently, which made a
            wrong ``vec_dim`` look like "no word has an embedding".)
    """
    tokens = nltk.wordpunct_tokenize(in_excerpt)
    candidate_words = [tok.lower() for tok in tokens if tok not in spacy_stopwords]

    word_embedding = np.zeros((len(candidate_words), vec_dim))
    for row, word in enumerate(candidate_words):
        # Only the membership test may legitimately fail; the assignment is
        # kept outside of any exception handling so shape errors surface.
        if word in keyset:
            word_embedding[row, :] = embeddings_dict[word]

    # Remove rows that are entirely zero (words without an embedding).
    # Testing every component, rather than the row sum, keeps vectors whose
    # positive and negative components happen to cancel out.
    non_zero_rows = np.any(word_embedding != 0, axis=1)
    word_embedding_mod = word_embedding[non_zero_rows]

    if return_idx == 0:
        return word_embedding_mod.shape[0]

    return word_embedding_mod

In [None]:
class BiLSTM(nn.Module):
    """Bidirectional LSTM followed by a three-layer feed-forward head.

    Args:
        in_size: Number of features of each element of the input sequence.
        hidden_layer_size: Hidden size of the LSTM, per direction.
        output_size: Indexable holding the output widths of the three linear
            layers at positions 0, 1 and 2.
    """

    def __init__(self, in_size, hidden_layer_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_size,
            hidden_size=hidden_layer_size,
            bidirectional=True,
        )
        # The LSTM concatenates both directions, hence twice the hidden size.
        lstm_features = 2 * hidden_layer_size
        self.L1 = nn.Linear(lstm_features, output_size[0])
        self.L2 = nn.Linear(output_size[0], output_size[1])
        self.L3 = nn.Linear(output_size[1], output_size[2])

    def forward(self, input_seq):
        """Run the sequence through the LSTM and map its last step to the output.

        Args:
            input_seq: 3-D tensor; the LSTM is built with the default layout,
                so the sequence dimension comes first.

        Returns:
            Output of the final linear layer, one row per batch entry.
        """
        per_step_output, _ = self.lstm(input_seq)

        # Keep only the output at the last sequence position. Indexing a
        # single position of the first dimension yields a 2-D tensor.
        last_step = per_step_output[-1, :, :]

        hidden = torch.relu(self.L1(last_step))
        hidden = torch.relu(self.L2(hidden))
        return self.L3(hidden)

In [None]:
# Key Features
# - No positional embedding
# - Transformer + FFN
class Transformer_Encoder(nn.Module):
    """Transformer encoder whose mean-pooled output feeds a three-layer head.

    Args:
        embed_size: Feature size of the input (``d_model`` of the encoder).
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        output_size: Indexable holding the output widths of the three linear
            layers at positions 0, 1 and 2.
    """

    def __init__(self, embed_size, n_heads, n_layers, output_size):
        super().__init__()
        # NOTE(review): the prototype layer stays registered under
        # `encoder_layer` in addition to the stack built from it; it is kept
        # as an attribute so the module's parameter names do not change.
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=n_heads,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=n_layers,
        )
        self.L1 = nn.Linear(embed_size, output_size[0])
        self.L2 = nn.Linear(output_size[0], output_size[1])
        self.L3 = nn.Linear(output_size[1], output_size[2])

    def forward(self, input_seq):
        """Encode the sequence, average over dimension 0 and apply the head.

        Args:
            input_seq: Tensor accepted by the transformer encoder; dimension 0
                is the one that gets averaged away (the word positions).

        Returns:
            Output of the final linear layer.
        """
        encoded = self.transformer_encoder(input_seq)
        # Average over all the words for each batch / embedding combination.
        pooled = encoded.mean(dim=0)

        hidden = torch.relu(self.L1(pooled))
        hidden = torch.relu(self.L2(hidden))
        return self.L3(hidden)

In [None]:
# Key Features
# - No positional embedding
# - Transformer with no FFN
class Transformer_Encoder_Self_Decoder(nn.Module):
    """Transformer encoder whose mean-pooled output feeds one linear layer.

    Args:
        embed_size: Feature size of the input (``d_model`` of the encoder).
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        output_size: Indexable; only the entry at position 2 is read and used
            as the width of the output layer.
    """

    def __init__(self, embed_size, n_heads, n_layers, output_size):
        super().__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=n_heads,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=n_layers,
        )
        self.L1 = nn.Linear(embed_size, output_size[2])

    def forward(self, input_seq):
        """Encode the sequence, average over dimension 0 and project it.

        Args:
            input_seq: Tensor accepted by the transformer encoder; dimension 0
                is the one that gets averaged away (the word positions).

        Returns:
            Output of the single linear layer.
        """
        encoded = self.transformer_encoder(input_seq)
        # Average over all the words for each batch / embedding combination.
        pooled = encoded.mean(dim=0)
        return self.L1(pooled)

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then dropout.

    A table of shape ``[max_len, 1, d_model]`` is precomputed once: even
    feature columns hold ``sin(position * div_term)`` and odd feature columns
    hold ``cos(position * div_term)``. The table is registered as the buffer
    ``pe``, so it follows the module across devices but is not a parameter.

    Args:
        d_model: Feature size of the tensors passed to ``forward``. Both even
            and odd sizes are supported.
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions for which the table is precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one odd-indexed column fewer than there are
        # even-indexed ones, so the surplus frequency is dropped for the
        # cosine half. For even d_model the slice selects everything.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            ``x`` with the first ``seq_len`` rows of the position table added
            (broadcast over the batch dimension), passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
# Key Features
# - Positional embedding
# - Transformer + FFN
class Transformer_Encoder_Pos_Embed(nn.Module):
    """Positional encoding + transformer encoder + three-layer feed-forward head.

    Args:
        embed_size: Feature size of the input (``d_model`` of the encoder).
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        output_size: Indexable holding the output widths of the three linear
            layers at positions 0, 1 and 2.
        dim_feedfwd: Width of the feed-forward sub-layer inside each encoder
            layer.
        dropout: Dropout probability shared by the positional encoding and
            the encoder layers. Defaults to 0.1, the value that was
            previously hard-coded in both places.
    """

    def __init__(self, embed_size, n_heads, n_layers, output_size, dim_feedfwd=2048,
                 dropout=0.1):
        super().__init__()
        self.pos_encoder = PositionalEncoding(embed_size, dropout)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=n_heads,
            dropout=dropout,
            dim_feedforward=dim_feedfwd,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)
        self.L1 = nn.Linear(embed_size, output_size[0])
        self.L2 = nn.Linear(output_size[0], output_size[1])
        self.L3 = nn.Linear(output_size[1], output_size[2])

    def forward(self, input_seq):
        """Add position information, encode, average over dimension 0, apply head.

        Args:
            input_seq: Tensor accepted by the positional encoder and the
                transformer encoder; dimension 0 is the one that gets
                averaged away (the word positions).

        Returns:
            Output of the final linear layer.
        """
        input_seq_pos = self.pos_encoder(input_seq)
        trf_output = self.transformer_encoder(input_seq_pos)
        # Average over all the words for each batch / embedding combination.
        in_matrix = torch.mean(trf_output, 0)

        ffd_step_1 = torch.relu(self.L1(in_matrix))
        ffd_step_2 = torch.relu(self.L2(ffd_step_1))
        ffd_step_3 = self.L3(ffd_step_2)

        return ffd_step_3