### Download the pretrained fastText weights (Persian, `cc.fa.300`)

In [1]:
# Download the pretrained Persian fastText vectors (about 4.2 GB).
# -nc (no-clobber) skips the transfer when the archive is already in the working
# directory, so re-running this cell does not fetch a second copy saved as "*.gz.1".
!wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz

--2024-11-07 19:03:46--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.227.219.33, 13.227.219.70, 13.227.219.59, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.227.219.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4502524724 (4.2G) [application/octet-stream]
Saving to: ‘cc.fa.300.bin.gz’


2024-11-07 19:08:01 (16.9 MB/s) - ‘cc.fa.300.bin.gz’ saved [4502524724/4502524724]



In [2]:
# Decompress the downloaded archive; gunzip replaces the .gz file with
# /content/cc.fa.300.bin, which is the file the model-loading cell opens.
# NOTE(review): the absolute path assumes the notebook's working directory is /content.
!gunzip /content/cc.fa.300.bin.gz

### Download the dataset

In [3]:
# Fetch the assignment archive from Google Drive by its file ID
# (the cell output shows it is saved as /content/NLU_Assignment1.zip).
!gdown 1CmseUCVqDNTLouAv9nGVn80kLjuh-rxd

Downloading...
From: https://drive.google.com/uc?id=1CmseUCVqDNTLouAv9nGVn80kLjuh-rxd
To: /content/NLU_Assignment1.zip
100% 22.2M/22.2M [00:00<00:00, 69.2MB/s]


In [4]:
# Extract the dataset archive.
# -o overwrites existing files without prompting, so a re-run cannot stall waiting for input.
# -d /content pins the destination to the directory the later read_csv calls use.
!unzip -o /content/NLU_Assignment1.zip -d /content

Archive:  /content/NLU_Assignment1.zip
  inflating: Assignment1_NLU.pdf     
   creating: Poem Meter Dataset/
  inflating: Poem Meter Dataset/test_samples.csv  
  inflating: Poem Meter Dataset/train_samples.csv  
  inflating: Poem Meter Dataset/validation_samples.csv  


### Preprocess the training set

In [1]:
import pandas as pd
import regex as re

# Load the training split and shuffle its rows.
# A fixed random_state makes the shuffle (and therefore the batch order seen
# during training) reproducible across kernel restarts.
train_df = pd.read_csv('/content/Poem Meter Dataset/train_samples.csv')
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

def clean_text(text):
    """Remove zero-width non-joiner characters (U+200C) from ``text``.

    Args:
        text: The string to normalise. Non-string values (for example the
            float NaN that pandas uses for an empty CSV cell) are returned
            unchanged instead of raising.

    Returns:
        ``text`` with every U+200C removed, or the original object when it
        is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None; leave them for the caller to handle.
        return text
    # A literal replacement needs no regular expression.
    return text.replace('\u200c', '')

# Run clean_text over every entry of the training 'poem_text' column
# and store the result back in the same column.
train_df['poem_text'] = train_df['poem_text'].map(clean_text)

In [2]:
val_df = pd.read_csv('/content/Poem Meter Dataset/validation_samples.csv')
test_data = pd.read_csv('/content/Poem Meter Dataset/test_samples.csv')

# Prepare input and output sequences.
# The training poems have already been passed through clean_text, so the
# validation and test poems get the same normalisation here. Otherwise words
# that contain a zero-width non-joiner would not match the training vocabulary.
# The cast to str happens first so that missing cells cannot reach clean_text
# as non-string values.
X_train_texts = train_df['poem_text'].astype(str).tolist()
y_train_texts = train_df['metre'].astype(str).tolist()
X_val_texts = [clean_text(poem) for poem in val_df['poem_text'].astype(str)]
y_val_texts = val_df['metre'].astype(str).tolist()
X_test_texts = [clean_text(poem) for poem in test_data['poem_text'].astype(str)]
# Kept for backward compatibility; the tokenization cell rebinds X_test to the padded array.
X_test = list(X_test_texts)


In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# -q keeps the installer log out of the notebook output.
%pip install -q fasttext



In [None]:
import fasttext
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, Embedding, Activation, dot, concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

# Load the pretrained fastText model (path is relative to the working directory)
# and read the vector size from the model itself, so the embedding layers
# always agree with the loaded weights (300 for cc.fa.300.bin).
ft = fasttext.load_model('cc.fa.300.bin')
embedding_dim = ft.get_dimension()

# Tokenization: both vocabularies are built from the training split only.
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(X_train_texts)

output_tokenizer = Tokenizer()
output_tokenizer.fit_on_texts(y_train_texts)

# The encoder Input below is fixed at the padded training length, so validation
# and test inputs are padded (or truncated) to that same length instead of to
# the longest sequence of their own split.
X_train = pad_sequences(input_tokenizer.texts_to_sequences(X_train_texts), padding='post')
input_length = X_train.shape[1]
X_val = pad_sequences(input_tokenizer.texts_to_sequences(X_val_texts),
                      maxlen=input_length, padding='post')
X_test = pad_sequences(input_tokenizer.texts_to_sequences(X_test_texts),
                       maxlen=input_length, padding='post')

# Target sequences: the decoder Input has a variable time dimension, so each
# split may keep its own padded length.
y_train = pad_sequences(output_tokenizer.texts_to_sequences(y_train_texts), padding='post')
y_val = pad_sequences(output_tokenizer.texts_to_sequences(y_val_texts), padding='post')

# Embedding matrix: one row per tokenizer index, filled with the fastText
# vector of the corresponding word. Row 0 is never assigned and stays all zeros.
vocab_size = len(input_tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in input_tokenizer.word_index.items():
    embedding_matrix[row] = ft.get_word_vector(token)

# Model definition: Bi-LSTM encoder, LSTM decoder and dot-product attention.
# n_classes is the output vocabulary size plus one extra slot for index 0,
# the value pad_sequences uses for padding.
n_classes = len(output_tokenizer.word_index) + 1
# The encoder input length is fixed to the padded length of the training inputs.
max_seq_length = X_train.shape[1]

# Encoder
# Token indices -> embeddings initialised from embedding_matrix and left
# trainable, so they are updated during training.
encoder_input = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(input_dim=len(input_tokenizer.word_index) + 1,
                              output_dim=embedding_dim, weights=[embedding_matrix], trainable=True)(encoder_input)
# return_sequences keeps the per-timestep outputs (used by the attention below);
# return_state additionally yields the final h/c state of each direction.
encoder_bilstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(512, return_sequences=True, return_state=True)
)(encoder_embedding)
# Concatenating the two 512-unit states gives 1024 units, matching the decoder LSTM size.
encoder_last_h = concatenate([forward_h, backward_h])
encoder_last_c = concatenate([forward_c, backward_c])

# Decoder with attention
# shape=(None,) leaves the decoder sequence length unspecified.
decoder_input = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=len(output_tokenizer.word_index) + 1, output_dim=embedding_dim)(decoder_input)
# The decoder starts from the encoder's final states; its own final states are discarded.
decoder_lstm, _, _ = LSTM(1024, return_sequences=True, return_state=True)(decoder_embedding, initial_state=[encoder_last_h, encoder_last_c])

# Score every decoder step against every encoder step (dot product over the
# feature axis), then normalise the scores with a softmax.
attention = Activation('softmax')(dot([decoder_lstm, encoder_bilstm], axes=[2, 2]))
# Attention-weighted sum of the encoder outputs for each decoder step.
context = dot([attention, encoder_bilstm], axes=[2, 1])
decoder_combined_context = concatenate([context, decoder_lstm])

# Per-timestep class distribution over the output vocabulary.
output = TimeDistributed(Dense(n_classes, activation="softmax"))(decoder_combined_context)

# sparse_categorical_crossentropy takes integer class indices as targets.
# NOTE(review): neither Embedding sets mask_zero, so padded positions appear to
# count towards the loss and the reported accuracy - confirm this is intended.
model = Model([encoder_input, decoder_input], output)
model.compile(optimizer=Adam(learning_rate=2e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


def _shift_right(targets):
    """Return ``targets`` shifted one step right, with a leading column of zeros."""
    start_column = np.zeros((targets.shape[0], 1))
    return np.concatenate([start_column, targets[:, :-1]], axis=1)


# Decoder inputs for teacher forcing: the target sequence delayed by one step,
# with index 0 acting as the first input.
y_train_in = _shift_right(y_train)
y_val_in = _shift_right(y_val)

# Add a trailing axis of size 1 to the integer targets.
y_train = np.expand_dims(y_train, axis=-1)
y_val = np.expand_dims(y_val, axis=-1)

# Callbacks: both watch the validation loss.
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-5,
    verbose=1,
)

# Train the model with teacher forcing: the decoder receives the shifted
# targets as input and is scored against the unshifted targets.
train_inputs = [X_train, y_train_in]
val_inputs = [X_val, y_val_in]
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    epochs=2,
    batch_size=16,
    callbacks=[es, lr_scheduler],
    verbose=1,
)

# Save the trained model to the working directory.
model.save('bilstm_attention_model.h5')

# Decoding functions
def greedy_decode(input_seq):
    """Decode one padded input sequence by taking the most likely token at each step.

    During training the decoder's first input is index 0 and the targets are
    padded with index 0, so decoding starts from 0 and stops when 0 is
    predicted. Explicit '<start>' / '<end>' entries are used instead when the
    output vocabulary happens to contain them.

    Args:
        input_seq: Array of shape (1, max_seq_length) holding encoder token indices.

    Returns:
        The decoded tokens joined by single spaces (an empty string if the
        model predicts the end of the sequence immediately).
    """
    start_index = output_tokenizer.word_index.get('<start>', 0)
    end_index = output_tokenizer.word_index.get('<end>', 0)

    target_seq = np.array([[start_index]])
    decoded_sentence = []

    # Hard cap on the output length so decoding always terminates.
    while len(decoded_sentence) < max_seq_length:
        output_tokens = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is padding, which is what the model emits after the last real token.
        if sampled_token_index in (0, end_index):
            break

        sampled_word = output_tokenizer.index_word.get(sampled_token_index)
        if sampled_word is not None:
            decoded_sentence.append(sampled_word)
        target_seq = np.append(target_seq, [[sampled_token_index]], axis=1)

    return ' '.join(decoded_sentence)

def beam_search_decode(input_seq, k=3):
    """Decode one padded input sequence with beam search.

    The decoder input for a hypothesis is the start index followed by the
    tokens generated so far, mirroring the shifted targets used in training.
    Hypotheses are ranked by summed log-probability, which orders them like
    the product of probabilities but does not underflow on long sequences.

    Args:
        input_seq: Array of shape (1, max_seq_length) holding encoder token indices.
        k: Beam width, i.e. the number of hypotheses kept after each step.

    Returns:
        The tokens of the best hypothesis joined by single spaces.
    """
    start_index = output_tokenizer.word_index.get('<start>', 0)
    end_index = output_tokenizer.word_index.get('<end>', 0)

    # Each beam is (generated token indices, cumulative log-probability, finished flag).
    beams = [([], 0.0, False)]

    for _ in range(max_seq_length):
        all_candidates = []
        for seq, score, finished in beams:
            if finished:
                # A finished hypothesis is carried forward unchanged.
                all_candidates.append((seq, score, True))
                continue

            target_seq = np.array([[start_index] + seq])
            step_probs = model.predict([input_seq, target_seq], verbose=0)[0, -1, :]

            for token_index in np.argsort(step_probs)[-k:]:
                token_index = int(token_index)
                # The small constant guards against log(0).
                candidate_score = score + float(np.log(step_probs[token_index] + 1e-12))
                if token_index in (0, end_index):
                    # Padding / end marker closes the hypothesis without adding a token.
                    all_candidates.append((seq, candidate_score, True))
                else:
                    all_candidates.append((seq + [token_index], candidate_score, False))

        beams = sorted(all_candidates, key=lambda beam: beam[1], reverse=True)[:k]

        # Scores can only decrease as a hypothesis grows, so once the best beam
        # is finished no open beam can overtake it.
        if beams[0][2]:
            break

    best_seq = beams[0][0]
    words = (output_tokenizer.index_word.get(idx, '') for idx in best_seq)
    return ' '.join(word for word in words if word)

def predict_single_sentence(sentence, method='greedy'):
    """Predict the metre string for a single poem line.

    Args:
        sentence: Raw poem text. It is cast to ``str`` and passed through
            ``clean_text`` so it is normalised like the training text.
        method: ``'greedy'`` for greedy decoding or ``'beam'`` for beam search.

    Returns:
        The decoded metre as a space-separated string.

    Raises:
        ValueError: If ``method`` is neither ``'greedy'`` nor ``'beam'``.
    """
    decoders = {'greedy': greedy_decode, 'beam': beam_search_decode}
    if method not in decoders:
        # Fail loudly instead of silently returning None for a mistyped method.
        raise ValueError(f"Unknown decoding method {method!r}; expected 'greedy' or 'beam'.")

    sentence = clean_text(str(sentence))
    input_seq = pad_sequences(input_tokenizer.texts_to_sequences([sentence]), maxlen=max_seq_length, padding='post')
    return decoders[method](input_seq)

# Decode every poem in the test split's 'poem_text' column with greedy search
# and print the list of predicted metre strings.
test_sentences = test_data['poem_text'].values
results = []
for sentence in test_sentences:
    results.append(predict_single_sentence(sentence, method='greedy'))
print(results)


Epoch 1/2
[1m46824/46824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1773s[0m 38ms/step - accuracy: 0.8978 - loss: 0.2840 - val_accuracy: 0.9487 - val_loss: 0.1983 - learning_rate: 2.0000e-04
Epoch 2/2
[1m  323/46824[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m28:50[0m 37ms/step - accuracy: 0.9887 - loss: 0.0348