In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Embedding, Dense, Concatenate, TimeDistributed
import re

train_data = pd.read_csv("./dataset/TED_Talks_by_ID_plus-transcripts-and-LIWC-and-MFT-plus-views.csv")
def remove_multiple_spaces(s):
    return re.sub(r'\s+', ' ', s)
START = '÷'
END = '■'
punct = '\r0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~'   

transtab = str.maketrans(dict.fromkeys(punct, ''))
train_data['transcript'] = [START+s.translate(str.maketrans(punct, ' '* len(punct))) + END for s in train_data['transcript'].astype(str)]
train_data['transcript'] = train_data['transcript'].apply(remove_multiple_spaces)
# print(train_data['transcript'])


transtab = str.maketrans(dict.fromkeys(punct, ''))
train_data['headline'] = [START+' '  + s.translate(str.maketrans(punct, ' '* len(punct))) + ' '+END for s in train_data['headline'].astype(str)]
train_data['headline'] = train_data['headline'].apply(remove_multiple_spaces)
# print(train_data['headline'])


MAX_LEN_TXT = 2500
MAX_LEN = 15
latent_dim = 512
embedding_dim = 256
VOCAB_SIZE = 5000

X_tokenizer = Tokenizer(num_words=VOCAB_SIZE+1, oov_token="<OOV>")
X_tokenizer.fit_on_texts(train_data['transcript'])
adjusted_index_word = {index - 1: word for word, index in X_tokenizer.word_index.items()}
X_tokenizer.index_word = adjusted_index_word

Y_tokenizer = Tokenizer(num_words=VOCAB_SIZE+1, oov_token="<OOV>")
Y_tokenizer.fit_on_texts(train_data['headline'])
adjusted_index_word = {index - 1: word for word, index in Y_tokenizer.word_index.items()}
Y_tokenizer.index_word = adjusted_index_word

X = X_tokenizer.texts_to_sequences(train_data['transcript'])
X = pad_sequences(X, maxlen=MAX_LEN_TXT, truncating='post')
y = Y_tokenizer.texts_to_sequences(train_data['headline'])
y = pad_sequences(y, maxlen=MAX_LEN, truncating='post')

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_voc = X_tokenizer.num_words 
Y_voc = Y_tokenizer.num_words

encoder_inputs = Input(shape=(MAX_LEN_TXT,))
enc_emb = Embedding(X_voc, embedding_dim, trainable=True)(encoder_inputs)

encoder_gru1 = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.2)
encoder_output1, state_h1 = encoder_gru1(enc_emb)

encoder_gru2 = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.2)
encoder_output2, state_h2 = encoder_gru2(encoder_output1)

encoder_gru3 = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.4)
encoder_outputs, state_h = encoder_gru3(encoder_output2)

decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(Y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_gru = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.4)
decoder_outputs, _ = decoder_gru(dec_emb, initial_state=state_h)

decoder_dense = TimeDistributed(Dense(Y_voc, activation='relu'))
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit([X_train, y_train[:, :-1]], y_train[:, 1:], batch_size=32, epochs=20)



Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2500)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 2500, 256)    1280256     input_1[0][0]                    
__________________________________________________________________________________________________
gru (GRU)                       [(None, 2500, 512),  1182720     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

<tensorflow.python.keras.callbacks.History at 0x24aa5db1bc8>

In [2]:
encoder_model = Model(encoder_inputs, state_h)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h]

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2 = decoder_gru(dec_emb2, initial_state=decoder_state_inputs)

decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs2] + [state_h2])


def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = Y_tokenizer.word_index[START]

    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h = decoder_model.predict([target_seq] + [states_value])
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
#         print(Y_tokenizer.index_word, output_tokens[0, -1, :].shape)
        sampled_word = Y_tokenizer.index_word[sampled_token_index]
        
        decoded_sentence.append(sampled_word)
        if sampled_word == END or len(decoded_sentence) > MAX_LEN:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = h
    return ' '.join(decoded_sentence)


In [3]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def bleu_score(model, X_val, y_val):
    smoothing_function = SmoothingFunction().method1
    scores = []

    for i, input_sequence in enumerate(X_val):
        reference = [Y_tokenizer.sequences_to_texts([y_val[i]])[0].split()]
        input_sequence = input_sequence.reshape(1, -1)
        candidate = decode_sequence(input_sequence).split()
        score = sentence_bleu(reference, candidate, smoothing_function=smoothing_function)
        scores.append(score)
#         print(score)
    return np.mean(scores)

validation_bleu_score = bleu_score(model, X_val, y_val)
print("Validation BLEU Score:", validation_bleu_score)

Validation BLEU Score: 0.35611440760988794
