In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
class NewsSummarizer:
    def __init__(self, max_news_length=500, max_summary_length=100, embedding_dim=256, lstm_units=256):
        """Initialize the news summarization model with given parameters."""
        self.max_news_length = max_news_length
        self.max_summary_length = max_summary_length
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.news_tokenizer = None
        self.summary_tokenizer = None
        self.model = None
        self.encoder_model = None
        self.decoder_model = None
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        """Clean and preprocess the input text"""
        if isinstance(text, float) or text is None:  # Handle NaN values and None
            return ""

        # Convert to lowercase
        text = str(text).lower()

        # Remove URLs, HTML tags, numbers and punctuation
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def load_dataset(self, filepath, num_samples=None):
        """Load and prepare the dataset"""
        try:
            # Load the dataset
            df = pd.read_csv(filepath)

            # Check if we want to limit the number of samples
            if num_samples and num_samples < len(df):
                df = df.sample(n=num_samples, random_state=42)

            # Extract articles and summaries
            news_articles = df['text'].tolist()
            summaries = df['headlines'].tolist()

            # Filter out any empty articles or summaries
            valid_indices = [i for i, (article, summary) in enumerate(zip(news_articles, summaries))
                            if isinstance(article, str) and isinstance(summary, str)
                            and len(article.strip()) > 0 and len(summary.strip()) > 0]

            news_articles = [news_articles[i] for i in valid_indices]
            summaries = [summaries[i] for i in valid_indices]

            print(f"Loaded {len(news_articles)} valid articles with summaries")

            return news_articles, summaries
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return [], []

    def prepare_data(self, news_articles, summaries):
        """Prepare the data for training"""
        if not news_articles or not summaries:
            raise ValueError("Empty news articles or summaries list provided")

        # Preprocess news articles and summaries
        processed_news = [self.preprocess_text(news) for news in news_articles]
        processed_summaries = [self.preprocess_text(summary) for summary in summaries]

        # Add start and end tokens to summaries
        processed_summaries = ['startseq ' + summary + ' endseq' for summary in processed_summaries]

        # Create tokenizers
        self.news_tokenizer = Tokenizer()
        self.news_tokenizer.fit_on_texts(processed_news)

        self.summary_tokenizer = Tokenizer()
        self.summary_tokenizer.fit_on_texts(processed_summaries)

        # Get vocabulary sizes
        news_vocab_size = len(self.news_tokenizer.word_index) + 1
        summary_vocab_size = len(self.summary_tokenizer.word_index) + 1

        # Convert texts to sequences
        news_sequences = self.news_tokenizer.texts_to_sequences(processed_news)
        summary_sequences = self.summary_tokenizer.texts_to_sequences(processed_summaries)

        # Pad sequences
        news_padded = pad_sequences(news_sequences, maxlen=self.max_news_length, padding='post')
        summary_padded = pad_sequences(summary_sequences, maxlen=self.max_summary_length, padding='post')

        # Split data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            news_padded, summary_padded, test_size=0.2, random_state=42
        )

        # Verify dimensions
        print(f"Training data shape: {X_train.shape}, {y_train.shape}")
        print(f"Validation data shape: {X_val.shape}, {y_val.shape}")

        return X_train, X_val, y_train, y_val, news_vocab_size, summary_vocab_size

    def build_model(self, news_vocab_size, summary_vocab_size):
        """Build the sequence-to-sequence model with bidirectional LSTM"""
        # Encoder
        encoder_inputs = Input(shape=(self.max_news_length,), name='encoder_inputs')
        encoder_embedding = Embedding(news_vocab_size, self.embedding_dim, name='encoder_embedding')(encoder_inputs)

        # Bidirectional LSTM for encoder
        encoder_bilstm = Bidirectional(LSTM(self.lstm_units, return_sequences=True, return_state=True), name='encoder_bilstm')
        encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_embedding)

        # Concatenate forward and backward states
        state_h = Concatenate(name='concatenate_h')([forward_h, backward_h])
        state_c = Concatenate(name='concatenate_c')([forward_c, backward_c])
        encoder_states = [state_h, state_c]

        # Decoder
        decoder_inputs = Input(shape=(None,), name='decoder_inputs')
        decoder_embedding = Embedding(summary_vocab_size, self.embedding_dim, name='decoder_embedding')(decoder_inputs)
        decoder_lstm = LSTM(self.lstm_units * 2, return_sequences=True, return_state=True, name='decoder_lstm')
        decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

        # Output projection
        decoder_dense = Dense(summary_vocab_size, activation='softmax', name='decoder_dense')(decoder_outputs)

        # Define the model
        self.model = Model([encoder_inputs, decoder_inputs], decoder_dense)

        # Compile the model
        self.model.compile(
            optimizer='adam',
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return self.model

    def train(self, X_train, X_val, y_train, y_val, batch_size=64, epochs=10):
        """Train the model"""
        if self.model is None:
            raise ValueError("Model must be built before training")

        # Create decoder input data (shifted by one step)
        decoder_input_train = y_train[:, :-1]
        decoder_target_train = y_train[:, 1:]

        decoder_input_val = y_val[:, :-1]
        decoder_target_val = y_val[:, 1:]

        # Create callbacks
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=3, restore_best_weights=True
        )

        # Train the model
        history = self.model.fit(
            [X_train, decoder_input_train],
            tf.expand_dims(decoder_target_train, -1),
            validation_data=([X_val, decoder_input_val], tf.expand_dims(decoder_target_val, -1)),
            batch_size=batch_size,
            epochs=epochs,
            callbacks=[early_stopping],
            verbose=1
        )

        return history

    def infer_init(self):
        """Initialize inference models"""
        if self.model is None:
            raise ValueError("Model must be trained before initializing inference models")

        # Print all layers for debugging
        print("Available layers in the model:")
        for i, layer in enumerate(self.model.layers):
            print(f"{i}: {layer.name}")

        try:
            # Get the encoder inputs layer - this is a tensor now, not the layer itself
            encoder_inputs = self.model.input[0]  # First input in the model (encoder_inputs)

            # Find the bidirectional layer
            encoder_bilstm = None
            for layer in self.model.layers:
                if isinstance(layer, Bidirectional):
                    encoder_bilstm = layer
                    print(f"Found bidirectional layer: {layer.name}")
                    break

            if encoder_bilstm is None:
                raise ValueError("Could not find bidirectional layer in the model")

            # Get the encoder embedding layer
            encoder_embedding = self.model.get_layer('encoder_embedding')

            # Create a new input tensor for the encoder model
            encoder_model_input = Input(shape=(self.max_news_length,), name='encoder_model_input')

            # Apply the embedding layer to the new input
            encoder_embedded = encoder_embedding(encoder_model_input)

            # Apply the bidirectional LSTM
            _, forward_h, forward_c, backward_h, backward_c = encoder_bilstm(encoder_embedded)

            # Concatenate the states
            state_h = Concatenate(name='encoder_model_concat_h')([forward_h, backward_h])
            state_c = Concatenate(name='encoder_model_concat_c')([forward_c, backward_c])

            # Create encoder model
            self.encoder_model = Model(encoder_model_input, [state_h, state_c])

            # Create decoder model
            decoder_inputs = Input(shape=(1,), name='inference_decoder_inputs')
            decoder_state_h = Input(shape=(self.lstm_units * 2,), name='inference_decoder_state_h')
            decoder_state_c = Input(shape=(self.lstm_units * 2,), name='inference_decoder_state_c')

            decoder_embedding = self.model.get_layer('decoder_embedding')
            decoder_lstm = self.model.get_layer('decoder_lstm')
            decoder_dense = self.model.get_layer('decoder_dense')

            decoder_embedding_out = decoder_embedding(decoder_inputs)
            decoder_outputs, state_h, state_c = decoder_lstm(
                decoder_embedding_out, initial_state=[decoder_state_h, decoder_state_c]
            )

            decoder_outputs = decoder_dense(decoder_outputs)

            self.decoder_model = Model(
                [decoder_inputs, decoder_state_h, decoder_state_c],
                [decoder_outputs, state_h, state_c]
            )

            print("Inference models initialized successfully")

        except Exception as e:
            import traceback
            print(f"Error in infer_init: {e}")
            print(traceback.format_exc())
            raise

    def generate_summary(self, news_article, max_length=100):
        """Generate a summary for a news article"""
        if self.encoder_model is None or self.decoder_model is None:
            raise ValueError("Inference models must be initialized before generating summaries")

        # Preprocess input
        processed_article = self.preprocess_text(news_article)

        # Convert to sequence and pad
        article_seq = self.news_tokenizer.texts_to_sequences([processed_article])
        if not article_seq[0]:  # Check if sequence is empty
            return "Unable to generate summary: input text contains no recognized tokens."

        article_padded = pad_sequences(article_seq, maxlen=self.max_news_length, padding='post')

        try:
            # Encode the input
            state_h, state_c = self.encoder_model.predict(article_padded, verbose=0)

            # Get start token
            target_seq = np.zeros((1, 1))
            start_token_idx = self.summary_tokenizer.word_index.get('startseq')
            if not start_token_idx:
                raise ValueError("Start token not found in tokenizer")

            target_seq[0, 0] = start_token_idx

            # Initialize result
            result = []
            stop_condition = False

            while not stop_condition:
                # Predict next token
                output_tokens, h, c = self.decoder_model.predict([target_seq, state_h, state_c], verbose=0)

                # Get predicted token
                sampled_token_index = np.argmax(output_tokens[0, 0, :])

                # Convert token to word
                sampled_word = None
                for word, index in self.summary_tokenizer.word_index.items():
                    if index == sampled_token_index:
                        sampled_word = word
                        break

                if sampled_word is None:
                    break

                # Check for end token
                if sampled_word == 'endseq' or len(result) >= max_length:
                    stop_condition = True
                else:
                    result.append(sampled_word)

                # Update target sequence
                target_seq = np.zeros((1, 1))
                target_seq[0, 0] = sampled_token_index

                # Update states
                state_h, state_c = h, c

            return ' '.join(result)
        except Exception as e:
            import traceback
            print(f"Error generating summary: {e}")
            print(traceback.format_exc())
            return f"Error generating summary: {str(e)}"

    def save_model(self, filepath):
        """Save the model to a file"""
        if self.model is None:
            raise ValueError("No model to save")

        try:
            # Save main model
            self.model.save(filepath + '.keras')

            # Save tokenizers
            import pickle
            with open(filepath + '_news_tokenizer.pkl', 'wb') as f:
                pickle.dump(self.news_tokenizer, f)

            with open(filepath + '_summary_tokenizer.pkl', 'wb') as f:
                pickle.dump(self.summary_tokenizer, f)

            print(f"Model saved successfully to {filepath}")
            return True
        except Exception as e:
            print(f"Error saving model: {e}")
            return False

    def load_model(self, filepath):
        """Load the model from a file"""
        try:
            # Load main model
            self.model = tf.keras.models.load_model(filepath + '.keras')

            # Load tokenizers
            import pickle
            with open(filepath + '_news_tokenizer.pkl', 'rb') as f:
                self.news_tokenizer = pickle.load(f)

            with open(filepath + '_summary_tokenizer.pkl', 'rb') as f:
                self.summary_tokenizer = pickle.load(f)

            print(f"Model loaded successfully from {filepath}")
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False




In [11]:
# Main execution function
def run_summarizer(dataset_path, num_samples=None):
    # Configuration parameters
    MAX_NEWS_LENGTH = 500
    MAX_SUMMARY_LENGTH = 100
    EMBEDDING_DIM = 256
    LSTM_UNITS = 256
    BATCH_SIZE = 64
    EPOCHS = 10

    # Initialize the summarizer
    summarizer = NewsSummarizer(
        max_news_length=MAX_NEWS_LENGTH,
        max_summary_length=MAX_SUMMARY_LENGTH,
        embedding_dim=EMBEDDING_DIM,
        lstm_units=LSTM_UNITS
    )

    try:
        # Load and prepare the dataset
        print("Loading dataset...")
        news_articles, summaries = summarizer.load_dataset(dataset_path, num_samples=num_samples)

        if not news_articles or not summaries:
            print("No valid data loaded. Exiting.")
            return None

        print("Preparing data...")
        X_train, X_val, y_train, y_val, news_vocab_size, summary_vocab_size = summarizer.prepare_data(
            news_articles, summaries
        )

        # Build model
        print("Building model...")
        model = summarizer.build_model(news_vocab_size, summary_vocab_size)
        print(model.summary())

        # Train model
        print("Training model...")
        history = summarizer.train(
            X_train, X_val, y_train, y_val,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS
        )

        # Save model
        print("Saving model...")
        summarizer.save_model("news_summarizer_model")

        # Initialize inference models
        print("Initializing inference models...")
        summarizer.infer_init()

        # Example summary generation
        print("Generating sample summaries...")
        for i in range(min(3, len(news_articles))):  # Generate for up to 3 sample articles
            article = news_articles[i]
            original_summary = summaries[i]
            generated_summary = summarizer.generate_summary(article)

            print(f"\nOriginal Article: {article[:100]}...")
            print(f"Original Summary: {original_summary}")
            print(f"Generated Summary: {generated_summary}")
            print("-" * 50)

        return summarizer
    except Exception as e:
        import traceback
        print(f"Error in run_summarizer: {e}")
        print(traceback.format_exc())  # Print the full stack trace for better debugging
        return None

# Run if executed directly
if __name__ == "__main__":
    run_summarizer('news_summary_more.csv', num_samples=1000)  # Limit to 1000 samples for faster training

Loading dataset...
Loaded 1000 valid articles with summaries
Preparing data...
Training data shape: (800, 500), (800, 100)
Validation data shape: (200, 500), (200, 100)
Building model...


None
Training model...
Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 231ms/step - accuracy: 0.6879 - loss: 5.1224 - val_accuracy: 0.8973 - val_loss: 1.0623
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 184ms/step - accuracy: 0.8923 - loss: 1.0188 - val_accuracy: 0.8873 - val_loss: 1.0130
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 183ms/step - accuracy: 0.8895 - loss: 0.9531 - val_accuracy: 0.8989 - val_loss: 0.8134
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 187ms/step - accuracy: 0.8996 - loss: 0.7734 - val_accuracy: 0.9007 - val_loss: 0.7659
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 188ms/step - accuracy: 0.9010 - loss: 0.7329 - val_accuracy: 0.9047 - val_loss: 0.7552
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 186ms/step - accuracy: 0.9043 - loss: 0.7131 - val_accuracy: 0.9068 - val_loss: 0.7555
Epoch