In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Embedding, Dense, Concatenate, TimeDistributed
import re

# Load your dataset
train_data = pd.read_csv("./dataset/TED_Talks_by_ID_plus-transcripts-and-LIWC-and-MFT-plus-views.csv")

def remove_multiple_spaces(s):
    return re.sub(r'\s+', ' ', s)
START = '÷'
END = '■'

punct = '\r0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~'   # `|` is not present here
transtab = str.maketrans(dict.fromkeys(punct, ''))
# million['transcript'] = '|'.join(million['transcript'].tolist()).translate(transtab).split('|')
train_data['transcript'] = [START+s.translate(str.maketrans(punct, ' '* len(punct))) + END for s in train_data['transcript'].astype(str)]
train_data['transcript'] = train_data['transcript'].apply(remove_multiple_spaces)
print(train_data['transcript'])


transtab = str.maketrans(dict.fromkeys(punct, ''))
# million['transcript'] = '|'.join(million['transcript'].tolist()).translate(transtab).split('|')
train_data['headline'] = [START+' '  + s.translate(str.maketrans(punct, ' '* len(punct))) + ' '+END for s in train_data['headline'].astype(str)]
train_data['headline'] = train_data['headline'].apply(remove_multiple_spaces)
print(train_data['headline'])


# Preprocess the data
MAX_LEN_TXT = 2500
MAX_LEN = 15
latent_dim = 512
embedding_dim = 256
VOCAB_SIZE = 50000

# Tokenizers
X_tokenizer = Tokenizer(num_words=VOCAB_SIZE+1, oov_token="<OOV>")
X_tokenizer.fit_on_texts(train_data['transcript'])
adjusted_index_word = {index - 1: word for word, index in X_tokenizer.word_index.items()}
X_tokenizer.index_word = adjusted_index_word

Y_tokenizer = Tokenizer(num_words=VOCAB_SIZE+1, oov_token="<OOV>")
Y_tokenizer.fit_on_texts(train_data['headline'])
adjusted_index_word = {index - 1: word for word, index in Y_tokenizer.word_index.items()}
Y_tokenizer.index_word = adjusted_index_word

# Tokenize and pad the input and target sequences
X = X_tokenizer.texts_to_sequences(train_data['transcript'])
X = pad_sequences(X, maxlen=MAX_LEN_TXT, truncating='post')
y = Y_tokenizer.texts_to_sequences(train_data['headline'])
y = pad_sequences(y, maxlen=MAX_LEN, truncating='post')

# Split the data into train and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Vocabulary sizes
X_voc = X_tokenizer.num_words 
Y_voc = Y_tokenizer.num_words

# Build the GRU model
# Encoder
encoder_inputs = Input(shape=(MAX_LEN_TXT,))
enc_emb = Embedding(X_voc, embedding_dim, trainable=True)(encoder_inputs)

encoder_gru1 = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.4)
encoder_output1, state_h1 = encoder_gru1(enc_emb)

encoder_gru2 = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.4)
encoder_output2, state_h2 = encoder_gru2(encoder_output1)

encoder_gru3 = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.4)
encoder_outputs, state_h = encoder_gru3(encoder_output2)

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(Y_voc, embedding_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_gru = GRU(latent_dim, return_sequences=True, return_state=True, dropout=0.4)
decoder_outputs, _ = decoder_gru(dec_emb, initial_state=state_h)

# Dense layer
decoder_dense = TimeDistributed(Dense(Y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.summary()


# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit([X_train, y_train[:, :-1]], y_train[:, 1:], batch_size=32, epochs=20)



0       ÷ Thank you so much Chris And it s truly a gre...
1       ÷ In terms of invention I d like to tell you t...
2       ÷ A public Dewey long ago observed is constitu...
3       ÷ I want to start off by saying Houston we hav...
4       ÷ What I want to talk about is as background i...
                              ...                        
2470    ÷ Imagine that when you walked in here this ev...
2471    ÷ Paying close attention to something Not that...
2472    ÷ So this happy pic of me was taken in I was a...
2473    ÷ My seven year old grandson sleeps just down ...
2474    ÷ Michael Browning engineer innovator inventor...
Name: transcript, Length: 2475, dtype: object
0                         ÷ Averting the climate crisis ■
1                       ÷ Simple designs to save a life ■
2                       ÷ How to rebuild a broken state ■
3                ÷ The real future of space exploration ■
4                            ÷ Great cars are great art ■
                          

<tensorflow.python.keras.callbacks.History at 0x226a5817308>

In [9]:
# Inference models

# Encoder inference model
encoder_model = Model(encoder_inputs, state_h)

# Decoder inference model
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_inputs = [decoder_state_input_h]

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2 = decoder_gru(dec_emb2, initial_state=decoder_state_inputs)

decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs2] + [state_h2])


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))

    # Set the first token of target sequence as the start token.
    target_seq[0, 0] = Y_tokenizer.word_index[START]

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h = decoder_model.predict([target_seq] + [states_value])
        
#         print(output_tokens[0, -1, :])
#         print(Y_tokenizer.index_word)
        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = Y_tokenizer.index_word[sampled_token_index]
        decoded_sentence.append(sampled_word)
        
#         print(output_tokens,sampled_token_index, Y_tokenizer.index_word[sampled_token_index])
        # Exit condition: either hit max length or find stop token.
        if sampled_word == END or len(decoded_sentence) > MAX_LEN:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = h
#         print()
    return ' '.join(decoded_sentence)

# Generate a headline for a given transcript
input_sequence = X_val[0:1]
generated_headline = decode_sequence(input_sequence)
print("Generated Headline:", generated_headline)


Generated Headline: s a to to to to to to a a a the the the the the


In [10]:
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def bleu_score(model, X_val, y_val):
    smoothing_function = SmoothingFunction().method1
    scores = []

    for i, input_sequence in enumerate(X_val):
        reference = [Y_tokenizer.sequences_to_texts([y_val[i]])[0].split()]
        input_sequence = input_sequence.reshape(1, -1)
        candidate = decode_sequence(input_sequence).split()
        score = sentence_bleu(reference, candidate, smoothing_function=smoothing_function)
        scores.append(score)
        print(score)
    return np.mean(scores)

# Calculate the BLEU score for the validation set
validation_bleu_score = bleu_score(model, X_val, y_val)
print("Validation BLEU Score:", validation_bleu_score)

0.017395797375642234
0.014628063653657535
0.014628063653657535
0.012300686288463768
0.012300686288463768
0.014628063653657535
0
0
0.012300686288463768
0.012300686288463768
0.012300686288463768
0.01618861356572822
0.01618861356572822
0.014628063653657535
0.014628063653657535
0.012300686288463768
0.014628063653657535
0.01839381624963888
0.030934588294313718
0.014628063653657535
0.012300686288463768
0.014628063653657535
0
0
0.012300686288463768
0.012300686288463768
0.012300686288463768
0.014628063653657535
0.012300686288463768
0.01618861356572822
0
0.014628063653657535
0.012300686288463768
0.012300686288463768
0.014628063653657535
0.012300686288463768
0.012300686288463768
0.01618861356572822
0.012300686288463768
0.012300686288463768
0.012300686288463768
0.012300686288463768
0.014628063653657535
0
0.014628063653657535
0.012300686288463768
0.01618861356572822
0.012300686288463768
0
0.012300686288463768
0.012300686288463768
0.012300686288463768
0.01618861356572822
0
0
0.014628063653657535
0.

KeyboardInterrupt: 