# Next

In [4]:
# Install required libraries
# !pip install tensorflow transformers datasets

# Import libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from datasets import load_dataset
import numpy as np
import re

# Step 1: Load Dataset
data = load_dataset("cnn_dailymail", "3.0.0", split="train[:3%]")  # Load a small sample for demonstration

# Step 2: Preprocess the Data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

data = data.map(lambda x: {"article": clean_text(x["article"]),
                           "highlights": f"<start> {clean_text(x['highlights'])} <end>"})
# Tokenizer for text
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
articles = data["article"]
summaries = data["highlights"]

# Fit tokenizer on both articles and summaries
tokenizer.fit_on_texts(articles + summaries)

# Convert text to sequences
article_sequences = tokenizer.texts_to_sequences(articles)
summary_sequences = tokenizer.texts_to_sequences(summaries)

# Padding sequences
article_sequences = tf.keras.preprocessing.sequence.pad_sequences(article_sequences, maxlen=500, padding="post")
summary_sequences = tf.keras.preprocessing.sequence.pad_sequences(summary_sequences, maxlen=50, padding="post")

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Split into train and validation sets
split_index = int(len(article_sequences) * 0.8)
X_train, X_val = article_sequences[:split_index], article_sequences[split_index:]
y_train, y_val = summary_sequences[:split_index], summary_sequences[split_index:]

# Step 3: Define Encoder-Decoder Model Without Attention
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(enc_units, return_state=True)

    def call(self, x):
        x = self.embedding(x)
        _, state_h, state_c = self.lstm(x)
        return state_h, state_c

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(dec_units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size, activation="softmax")

    def call(self, x, enc_hidden):
        x = self.embedding(x)
        x, _, _ = self.lstm(x, initial_state=enc_hidden)
        output = self.fc(x)
        return output


Map:   0%|          | 0/8613 [00:00<?, ? examples/s]

In [5]:
# Define hyperparameters
embedding_dim = 256
units = 512

encoder = Encoder(vocab_size, embedding_dim, units)
decoder = Decoder(vocab_size, embedding_dim, units)

# Step 4: Combine Encoder and Decoder
class Seq2SeqModel(tf.keras.Model):
    def __init__(self, encoder, decoder):
        super(Seq2SeqModel, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        enc_input, dec_input = inputs
        enc_hidden = self.encoder(enc_input)
        dec_output = self.decoder(dec_input, enc_hidden)
        return dec_output

# Create the Seq2Seq Model
model = Seq2SeqModel(encoder, decoder)

# Compile the Model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Step 5: Train the Model
model.fit([X_train, y_train[:, :-1]], y_train[:, 1:],
          validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:]),
          epochs=15, batch_size=16)

Epoch 1/15
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 360ms/step - accuracy: 0.1707 - loss: 7.7221 - val_accuracy: 0.1928 - val_loss: 6.7570
Epoch 2/15
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 360ms/step - accuracy: 0.2127 - loss: 6.3834 - val_accuracy: 0.2098 - val_loss: 6.6451
Epoch 3/15
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 362ms/step - accuracy: 0.2330 - loss: 6.0751 - val_accuracy: 0.2230 - val_loss: 6.5067
Epoch 4/15
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 375ms/step - accuracy: 0.2534 - loss: 5.7220 - val_accuracy: 0.2311 - val_loss: 6.4360
Epoch 5/15
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 375ms/step - accuracy: 0.2657 - loss: 5.4065 - val_accuracy: 0.2367 - val_loss: 6.3963
Epoch 6/15
[1m431/431[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 362ms/step - accuracy: 0.2754 - loss: 5.0969 - val_accuracy: 0.2408 - val_loss: 6.3939
Epoc

<keras.src.callbacks.history.History at 0x785a6cb136d0>

In [6]:
# Step 6: Inference Function Without Attention
def generate_summary(input_text):
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_sequence = tf.keras.preprocessing.sequence.pad_sequences(input_sequence, maxlen=500, padding="post")
    enc_hidden = encoder(input_sequence)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = ''

    for _ in range(50):
        predictions = decoder(dec_input, enc_hidden)
        predicted_id = tf.argmax(predictions[0][0]).numpy()
        result += tokenizer.index_word.get(predicted_id, '') + ' '
        if tokenizer.index_word.get(predicted_id, '') == '<end>':
            break
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

# Test the model with a sample
sample_article = articles[0]
print("Generated Summary:", generate_summary(sample_article))

Generated Summary: new briant kelly 107 of the us says manufacturing jenkins in the us says manufacturing jenkins in the us says manufacturing jenkins in the us says manufacturing jenkins in the us says manufacturing jenkins in the us says manufacturing jenkins in the us says manufacturing jenkins in the us says 
