In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load a subset of the data for testing.
# A fixed random_state makes the 10,000-row sample (and everything derived from it:
# vocabularies, train/validation split contents, metrics) reproducible across kernel restarts.
data = pd.read_csv('cnn_dailymail/train.csv').sample(n=10000, random_state=42)

In [3]:
# Extract the model inputs (articles) and targets (highlights) as plain lists of
# Python strings; every value is cast to str first.
text, summary = (
    data[column].astype(str).to_list()
    for column in ('article', 'highlights')
)

In [4]:
# Preview the first ten rows of the sampled frame.
data.head(n=10)

Unnamed: 0,id,article,highlights
192931,85c68823b7734962a6cf37b67b352f2790d27ff4,"(CNN) -- Wearing a floppy hat, 3-year-old Thor...",Up-close tours mark 50th anniversary of Kenned...
258743,dae5171c87c9f8424f09471a2dd6eff9b32733fe,By . Daily Mail Reporter . PUBLISHED: . 19:21 ...,Etan Patz was last seen in 1979 when he walked...
89967,ff7846022814b4503bfc98a23f43e94e89ab0561,Atlanta (CNN) -- Basketball star Lisa Leslie b...,The 27th annual National Girls and Women in Sp...
251510,d18ec0876e37e94c0772f55debaa1614b238c5a9,"John Balyo, 35, (pictured at his arrest last w...","John Balyo, 35, was arrested last Friday while..."
28902,5204aeb065cef0b08a62837941e64a668da11b66,(CNN) -- The Republican Party is in the midst ...,Edward Alden: GOP in the midst of historic deb...
70628,c83faf99c08fd4d44d9ee38d1c3ef84c273909f2,By . Daily Mail Reporter . UPDATED: . 06:33 ES...,Microsoft in second place with 905 million vis...
283034,fa9f62da24f8245ba3bfd1e1756a25bedee8c6a2,The explosive Senate Committee torture report ...,Experts say yesterday's Senate torture report ...
106725,15ada6585ac8c3a34f1e2e7921cacefa0bf6ae38,"The world's largest ship, which is the size of...","World's largest ship, the Hong-Kong registered..."
148621,4c2d73ce21cb138f21d2316fb83c650497be8298,"(CNN) -- The millennial generation is big, div...",A new book details changing U.S. demographics ...
226740,b19793ac6dc789534ac9b3d0291ac39a2dc5efeb,By . Will Stewart . Shocking pictures show the...,Bathers ran out of the Ob River clutching thei...


In [5]:
# Build the article vocabulary and turn every article into a list of word indices.
max_text_len = 300  # fixed article length (in tokens) used when padding
text_tokenizer = Tokenizer(num_words=50000)  # cap on the vocabulary size
text_tokenizer.fit_on_texts(text)
# Vocabulary size = distinct words seen + 1 (index 0 is never assigned to a word),
# but never more than the tokenizer's num_words cap.
text_vocab_size = min(text_tokenizer.num_words, 1 + len(text_tokenizer.word_index))
text_sequences = text_tokenizer.texts_to_sequences(text)

In [6]:
# Same procedure for the highlights, with a smaller vocabulary and shorter sequences.
max_summary_len = 50  # fixed summary length (in tokens) used when padding
summary_tokenizer = Tokenizer(num_words=10000)  # cap on the vocabulary size
summary_tokenizer.fit_on_texts(summary)
# Distinct words seen + 1 (index 0 is never assigned to a word), capped at num_words.
summary_vocab_size = min(summary_tokenizer.num_words, 1 + len(summary_tokenizer.word_index))
summary_sequences = summary_tokenizer.texts_to_sequences(summary)

In [7]:
# Bring every sequence to a fixed length: short ones are zero-padded at the end
# ('post'); long ones lose tokens from the front (pad_sequences' default
# truncating='pre', spelled out here).
text_sequences = pad_sequences(
    text_sequences,
    maxlen=max_text_len,
    padding='post',
    truncating='pre',
)
summary_sequences = pad_sequences(
    summary_sequences,
    maxlen=max_summary_len,
    padding='post',
    truncating='pre',
)

In [8]:
# Define the model: a bidirectional-LSTM encoder whose final output initialises
# the state of an LSTM decoder that predicts one summary token per timestep.
embedding_dim = 50
latent_dim = 128

# Encoder
# The sequence length is already fixed by Input(shape=...), so the deprecated
# `input_length` argument is no longer passed to Embedding.
encoder_inputs = Input(shape=(max_text_len,))
encoder_embedding = Embedding(input_dim=text_vocab_size, output_dim=embedding_dim)(encoder_inputs)
# return_sequences=False: keep only the final output of the bidirectional LSTM.
encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=False))(encoder_embedding)
# Project the encoder vector to latent_dim units so it matches the decoder LSTM's state size.
encoder_outputs = Dense(latent_dim, activation='relu')(encoder_lstm)

# Decoder
decoder_inputs = Input(shape=(max_summary_len,))
decoder_embedding = Embedding(input_dim=summary_vocab_size, output_dim=embedding_dim)(decoder_inputs)
# The same encoder vector is used for both initial LSTM states.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=False)(decoder_embedding, initial_state=[encoder_outputs, encoder_outputs])
# Per-timestep softmax over the summary vocabulary.
decoder_outputs = Dense(summary_vocab_size, activation='softmax')(decoder_lstm)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)



In [9]:
# Configure training: Adam optimizer, integer-label cross-entropy loss, and
# accuracy reported as a metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [10]:
# Prepare target data for training: append a trailing length-1 axis to the
# padded 2-D summary array.
# Guarded on ndim so the cell is idempotent: without the check, every re-run
# would append yet another axis to summary_sequences.
if summary_sequences.ndim == 2:
    summary_sequences = np.expand_dims(summary_sequences, -1)

In [16]:
# Train the model with teacher forcing.
# At each timestep the decoder must receive the *previous* target token, so the
# decoder input is the target sequence shifted right by one position. Index 0
# (never assigned to a word by the Keras Tokenizer) fills the first slot and
# acts as the start token. Feeding the unshifted targets as decoder input would
# let the network simply copy its input to its output.
# NOTE(review): the previously recorded "Unknown variable" optimizer error means the
# optimizer was built for a different set of variables; re-run the model-definition
# and compile cells in order on a fresh kernel before this cell — TODO confirm root cause.
decoder_target_ids = np.asarray(summary_sequences).reshape(len(summary_sequences), -1)
decoder_input_ids = np.pad(
    decoder_target_ids[:, :-1],   # drop the last token ...
    ((0, 0), (1, 0)),             # ... and prepend one slot per sequence
    mode='constant',
    constant_values=0,
)
decoder_targets = decoder_target_ids[..., np.newaxis]  # (samples, timesteps, 1)

model.fit(
    [text_sequences, decoder_input_ids],
    decoder_targets,
    epochs=10,
    batch_size=64,
    validation_split=0.25,
    callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
)


Epoch 1/10


ValueError: Unknown variable: <KerasVariable shape=(50000, 50), dtype=float32, path=embedding/embeddings>. This optimizer can only be called for the variables it was originally built with. When working with a new set of variables, you should recreate a new optimizer instance.

In [None]:
# Persist the model to disk; the .h5 extension selects the HDF5 file format.
model.save(filepath='text_summarization_rnn_light_2.h5')
