In [1]:
!nvidia-smi

Fri Jun  2 11:50:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import math

import warnings 
from google.colab import drive
drive.mount('/content/drive')
warnings.filterwarnings("ignore")

Mounted at /content/drive


In [3]:
# Folder on the mounted Google Drive that the CSV paths in this notebook are built from.
data_path = "/content/drive/MyDrive"
# Load the data (disabled: these three reads are commented out, so no frame is created by this cell)
# train_df = pd.read_csv(data_path + '/train.csv')
# val_df = pd.read_csv(data_path + '/validation.csv')
# test_df = pd.read_csv(data_path + '/test.csv')

In [16]:
import numpy as np
from scipy.spatial.distance import cosine

def BM25_(sim_score, D, Q, k1=1.5, b=0.75, k2=100, d=0.85, embeddings=None):
    """
    Computes two BM25-style similarity scores between a document and a query.

    Parameters:
    sim_score (function): A similarity function that takes in two vectors and returns their similarity score.
    D (list): A list of words in the document.
    Q (list): A list of words in the query.
    k1 (float): A constant that controls the impact of term frequency on the BM25 score.
    b (float): A constant that controls the impact of document length on the BM25 score.
    k2 (float): A constant that controls the impact of query term frequency on the BM25 score.
    d (float): Damping factor used in the TextRank-style score of each query term.
    embeddings (dict): A dictionary mapping terms to their GloVe embeddings.
        None is treated as an empty mapping.

    Returns:
    A tuple (bm25_score, q_bm25_score):
      - bm25_score: sum over query terms that have an embedding of the per-term
        BM25 score multiplied by sim_score(embedding, embedding).
      - q_bm25_score: sum over query terms of the per-term BM25 score multiplied
        by the term's TextRank-style score.
    Both are 0.0 when the document is empty (or consists only of empty words),
    because no term statistics can be computed for it.
    """
    if embeddings is None:
        # The default used to crash on embeddings.get(); an empty mapping simply
        # contributes nothing to bm25_score.
        embeddings = {}

    doc_len = len(D)
    if doc_len == 0:
        return 0.0, 0.0

    # Compute the average word length
    # NOTE(review): standard BM25 normalises by the average *document* length of
    # the corpus; this function divides by the average *word* length of D instead.
    avg_wrd_len = sum(len(word) for word in D) / doc_len
    if avg_wrd_len == 0:
        return 0.0, 0.0

    # Count each distinct query term once in the document and in the query
    unique_terms = set(Q)
    dfs = {term: sum(1 for word in D if word == term) for term in unique_terms}
    qfs = {term: Q.count(term) for term in unique_terms}

    # Compute the inverse document frequency for each query term
    # (here "documents" are the words of D, so nq is the in-document count)
    idfs = {
        term: math.log((doc_len - nq + 0.5) / (nq + 0.5))
        for term, nq in dfs.items()
    }

    # Compute the BM25 score for each query term
    length_norm = k1 * (1 - b + b * (doc_len / avg_wrd_len))
    bm25_scores = {}
    for term in unique_terms:
        fij = dfs[term]
        qfi = qfs[term]
        term1 = (idfs[term] * (fij * (k1 + 1)) / (fij + length_norm))
        term2 = ((k2 + 1) * qfi / (k2 + qfi))
        bm25_scores[term] = term1 * term2

    # Compute the TextRank score for each query term.
    # The loop runs over Q in order (repeats included) because each score reads
    # the scores already stored for its neighbours.
    textrank_scores = {}
    last_index = doc_len - 1
    for term in Q:
        neighbors = []
        for i, word in enumerate(D):
            if word == term:
                if i > 0:
                    neighbors.append(D[i - 1])
                if i < last_index:
                    neighbors.append(D[i + 1])
        # Every neighbour has weight 1, so the normaliser is just the neighbour count.
        total_weight = len(neighbors)
        score = sum(textrank_scores.get(neighbor, 0) / total_weight for neighbor in neighbors)
        textrank_scores[term] = (1 - d) + d * score

    # Compute the embedding-weighted BM25 score for the document
    # NOTE(review): sim_score receives the same vector twice, i.e. a term's
    # similarity with itself; confirm whether a document-side vector was intended.
    bm25_score = 0
    for term in Q:
        vector = embeddings.get(term, None)
        if vector is not None:
            bm25_score += bm25_scores[term] * sim_score(vector, vector)

    # Compute the TextRank-weighted BM25 score for the query
    q_bm25_score = sum(bm25_scores[term] * textrank_scores[term] for term in Q)

    return bm25_score, q_bm25_score

def glove_similarity(v1, v2):
    """
    Cosine similarity of two embedding vectors, tolerant of missing vectors.

    Parameters:
    v1 (ndarray): Embedding vector of the first term, or None if unavailable.
    v2 (ndarray): Embedding vector of the second term, or None if unavailable.

    Returns:
    0.0 when either vector is None; otherwise one minus the SciPy cosine
    distance between v1 and v2.
    """
    # A missing embedding on either side means "no similarity".
    if v1 is None:
        return 0.0
    if v2 is None:
        return 0.0

    distance = cosine(v1, v2)
    return 1.0 - distance

def get_embedding(embeddings, term):
    """
    Looks up the embedding stored for a term.

    Parameters:
    embeddings (dict): Mapping from terms to their GloVe embedding vectors.
    term (str): The term to look up.

    Returns:
    The vector stored under `term`, or None when the term has no entry.
    """
    vector = embeddings.get(term, None)
    return vector




# Define the similarity function for BM25_
def cosine_similarity(x,y):
    """
    Computes the cosine similarity between two vectors.

    Parameters:
    x (ndarray): A numpy array representing a vector.
    y (ndarray): A numpy array representing a vector.

    Returns:
    A float value representing the cosine similarity between x and y.
    Returns 0.0 when either vector has zero norm, where the cosine is undefined
    and the plain formula would produce NaN with a division warning.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator





    
def preprocess_text(text):
    """
    Preprocesses the input text into a list of normalized tokens.

    Steps, in order: strip HTML markup, lowercase and tokenize, drop English
    stop words, then Porter-stem the remaining tokens.

    Parameters:
    text (str): A string representing the input text.

    Returns:
    A list of preprocessed tokens.
    """
    # Remove HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed (and so that
    # BeautifulSoup does not emit its "no parser was explicitly specified" warning).
    text = BeautifulSoup(text, "html.parser").get_text()

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stem the tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens

In [5]:
import nltk
# Download the NLTK data packages this notebook relies on:
# 'punkt' (tokenizer models; presumably what word_tokenize needs) and
# 'stopwords' (the corpus read by stopwords.words('english') in preprocess_text).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Run preprocess_text over both text columns of every split:
# first all 'article' columns, then all 'highlights' columns.
for column in ('article', 'highlights'):
    for frame in (train_df, val_df, test_df):
        frame[column] = frame[column].apply(preprocess_text)

NameError: ignored

In [None]:
# Remove the 'id' column from every split.
# `columns=` is required: a bare drop('id') targets the row index (axis=0)
# and raises KeyError because no row is labelled 'id'.
train_df.drop(columns='id', inplace=True)
val_df.drop(columns='id', inplace=True)
test_df.drop(columns='id', inplace=True)

In [None]:
# Display the training frame as this cell's output.
train_df

In [None]:
# Fit a single tokenizer on the training articles and highlights together
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['article'] + train_df['highlights'])

# Encode every split as integer sequences: documents first, then summaries
train_doc_seqs, val_doc_seqs, test_doc_seqs = [
    tokenizer.texts_to_sequences(frame['article'])
    for frame in (train_df, val_df, test_df)
]
train_summ_seqs, val_summ_seqs, test_summ_seqs = [
    tokenizer.texts_to_sequences(frame['highlights'])
    for frame in (train_df, val_df, test_df)
]

# The common padded length is the longest training document or summary
max_len_doc = max(map(len, train_doc_seqs))
max_len_summ = max(map(len, train_summ_seqs))
max_len = max(max_len_doc, max_len_summ)

# Pad (or truncate) at the end so that every array is max_len wide
(train_doc_seqs, val_doc_seqs, test_doc_seqs,
 train_summ_seqs, val_summ_seqs, test_summ_seqs) = [
    pad_sequences(seqs, maxlen=max_len, padding='post', truncating='post')
    for seqs in (train_doc_seqs, val_doc_seqs, test_doc_seqs,
                 train_summ_seqs, val_summ_seqs, test_summ_seqs)
]

In [6]:
# Load the train/validation frames from the '*_preprocess.csv' files and the test frame from 'test.csv'.
data_path = "/content/drive/MyDrive"
# Only the training file is read with an explicit iso-8859-1 encoding.
train_df = pd.read_csv(data_path + '/train_df_preprocess.csv', encoding='iso-8859-1')
# NOTE(review): 'val_df_df_preprocess.csv' repeats "_df" - confirm this is the real file name.
val_df = pd.read_csv(data_path + '/val_df_df_preprocess.csv')
# NOTE(review): the test split comes from 'test.csv', not a '*_preprocess.csv' file like the
# other two - presumably it is not preprocessed; confirm before comparing against train/val.
test_df = pd.read_csv(data_path + '/test.csv')
# NOTE(review): read_csv does not parse list literals, so the token lists shown in the output
# are presumably plain strings at this point - verify before treating cells as lists.
train_df

Unnamed: 0.1,Unnamed: 0,article,highlights
0,0,"['.', 'associ', 'press', '.', 'publish', ':', ...","['bishop', 'john', 'folda', ',', 'north', 'dak..."
1,1,"['(', 'cnn', ')', '--', 'ralph', 'mata', 'inte...","['crimin', 'complaint', ':', 'cop', 'use', 'ro..."
2,2,"['drunk', 'driver', 'kill', 'young', 'woman', ...","['craig', 'eccleston-todd', ',', '27', ',', 'd..."
3,3,"['(', 'cnn', ')', '--', 'breezi', 'sweep', 'pe...","['nina', 'do', 'santo', 'say', 'europ', 'must'..."
4,4,"['fleetwood', 'team', 'still', '100', '%', 're...","['fleetwood', 'top', 'leagu', 'one', '2-0', 'w..."
...,...,...,...
287108,287108,"['.', 'jame', 'rush', '.', 'former', 'first', ...","['chelsea', 'clinton', 'said', 'question', 'ru..."
287109,287109,"['apologet', 'vanilla', 'ice', 'given', 'first...","['vanilla', 'ice', ',', '47', '-', 'real', 'na..."
287110,287110,"['america', ""'s"", 'lethal', 'sniper', 'claim',...","['america', ""'s"", 'lethal', 'sniper', 'made', ..."
287111,287111,"['.', 'sara', 'malm', '.', 'publish', ':', '.'...","['swarm', 'one', 'million', 'cross', 'border',..."


In [7]:
# Drop bookkeeping columns in place: 'Unnamed: 0' from train/val and 'id' from test.
# Because the frames are mutated in place, re-running this cell on the same frames
# raises KeyError (the columns are already gone); reload the CSVs first.
train_df.drop('Unnamed: 0',axis = 1,inplace = True)
val_df.drop('Unnamed: 0',axis = 1,inplace = True)
test_df.drop('id',axis = 1,inplace = True)
train_df

Unnamed: 0,article,highlights
0,"['.', 'associ', 'press', '.', 'publish', ':', ...","['bishop', 'john', 'folda', ',', 'north', 'dak..."
1,"['(', 'cnn', ')', '--', 'ralph', 'mata', 'inte...","['crimin', 'complaint', ':', 'cop', 'use', 'ro..."
2,"['drunk', 'driver', 'kill', 'young', 'woman', ...","['craig', 'eccleston-todd', ',', '27', ',', 'd..."
3,"['(', 'cnn', ')', '--', 'breezi', 'sweep', 'pe...","['nina', 'do', 'santo', 'say', 'europ', 'must'..."
4,"['fleetwood', 'team', 'still', '100', '%', 're...","['fleetwood', 'top', 'leagu', 'one', '2-0', 'w..."
...,...,...
287108,"['.', 'jame', 'rush', '.', 'former', 'first', ...","['chelsea', 'clinton', 'said', 'question', 'ru..."
287109,"['apologet', 'vanilla', 'ice', 'given', 'first...","['vanilla', 'ice', ',', '47', '-', 'real', 'na..."
287110,"['america', ""'s"", 'lethal', 'sniper', 'claim',...","['america', ""'s"", 'lethal', 'sniper', 'made', ..."
287111,"['.', 'sara', 'malm', '.', 'publish', ':', '.'...","['swarm', 'one', 'million', 'cross', 'border',..."


In [None]:
import tensorflow as tf
# Ask TensorFlow for the name of a GPU device; the recorded output is '/device:GPU:0'.
tf.test.gpu_device_name()

'/device:GPU:0'

In [None]:
import tensorflow as tf
from keras import backend as K

# Create a TF1-compat session with allow_growth enabled (GPU memory is meant to be
# allocated on demand rather than reserved up front) and register it with the Keras backend.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)
K.set_session(sess)


In [8]:
def generate_summary(document, embeddings, train_embeddings):
    """
    Generates a summary for the given document by reusing the highlights of the
    most similar training documents.

    Note: this notebook defines `generate_summary` in several cells; whichever
    cell ran last is the definition in effect.

    Parameters:
    document (str): A string representing the input document.
    embeddings (gensim.models.keyedvectors.Word2VecKeyedVectors): Pre-trained GloVe embeddings.
    train_embeddings (list): List of training document embeddings, in the same
        row order as `train_df`. Assumed to hold one 1-D vector per document
        (TODO confirm against the code that builds it).

    Returns:
    A string with the highlights of the top-3 most similar training documents,
    each followed by a space. Returns '' when none of the document's tokens has
    an embedding.
    """

    # Preprocess the document
    doc_tokens = preprocess_text(document)

    # Convert the document tokens to their corresponding embeddings
    token_vectors = [embeddings[word] for word in doc_tokens if word in embeddings]
    if not token_vectors:
        # Nothing to compare with: no token of the document is in the vocabulary.
        return ''

    # Collapse the per-token vectors into one document vector. Cosine distance
    # needs 1-D inputs, so the list of token vectors cannot be passed directly.
    doc_embedding = np.mean(token_vectors, axis=0)

    # Compute the similarity scores between the document and all other documents in the dataset
    sim_scores = [glove_similarity(doc_embedding, train_embedding) for train_embedding in train_embeddings]

    # Sort the documents by their similarity score (highest first)
    sorted_indices = np.argsort(sim_scores)[::-1]

    # Select the top-k documents to use for summarization
    k = 3
    topk_indices = sorted_indices[:k]

    # Concatenate the highlights of the selected documents
    # NOTE(review): ' '.join assumes each 'highlights' cell is a list of tokens;
    # if the cell is a plain string, join will space out its characters.
    final_summary = ''
    for index in topk_indices:
        final_summary += ' '.join(train_df.iloc[index]['highlights']) + ' '

    return final_summary


In [None]:
# import required libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# preprocess the data
# Lowercase both text columns in place, fit one tokenizer on articles and then highlights,
# and turn both columns into fixed-width integer arrays X (inputs) and Y (targets).
# The two fit_on_texts calls update the same tokenizer, so their order is kept as is.
train_df['article'] = train_df['article'].apply(lambda x: x.lower())  # convert text to lowercase
train_df['highlights'] = train_df['highlights'].apply(lambda x: x.lower())  # convert headlines to lowercase
max_text_len = 100  # maximum length of text sequences
max_headline_len = 20  # maximum length of headline sequences
tokenizer = Tokenizer(num_words=10000)  # initialize tokenizer with a 10,000 word limit
tokenizer.fit_on_texts(train_df['article'].values)  # fit tokenizer on text sequences
tokenizer.fit_on_texts(train_df['highlights'].values)  # fit tokenizer on headline sequences
text_sequences = tokenizer.texts_to_sequences(train_df['article'].values)  # convert text sequences to integer sequences
headline_sequences = tokenizer.texts_to_sequences(train_df['highlights'].values)  # convert headline sequences to integer sequences
# padding='post' appends the padding; no `truncating` argument is given, so pad_sequences' default applies
text_sequences = pad_sequences(text_sequences, maxlen=max_text_len, padding='post')  # pad text sequences
headline_sequences = pad_sequences(headline_sequences, maxlen=max_headline_len, padding='post')  # pad headline sequences
X = text_sequences  # input sequences
Y = headline_sequences  # target sequences



# endless batch generator used for both training and validation
def data_generator(X, Y, batch_size):
    """
    Yields teacher-forcing batches forever, cycling over the data in order.

    Parameters:
    X: Indexable batch of encoder inputs (sliced along the first axis).
    Y: 2-D array of target sequences aligned with X.
    batch_size (int): Number of rows per batch; the last batch of a pass may be smaller.

    Yields:
    ([encoder_input, decoder_input], decoder_target) where decoder_input is the
    target rows without their last column and decoder_target is the same rows
    without their first column.
    """
    while True:
        for start in range(0, len(X), batch_size):
            rows = slice(start, start + batch_size)
            target_rows = Y[rows]
            yield ([X[rows], target_rows[:, :-1]], target_rows[:, 1:])

# create a MirroredStrategy object for data parallelism
strategy = tf.distribute.MirroredStrategy()

# build, compile and train the CNN + GRU encoder-decoder inside the strategy scope
with strategy.scope():
    # split the data into training and validation sets
    train_X, val_X, train_Y, val_Y = train_test_split(X, Y, test_size=0.2)

    # define the batch size and number of batches (a trailing partial batch is not counted)
    batch_size = 16
    num_train_batches = len(train_X) // batch_size
    num_val_batches = len(val_X) // batch_size

    # define the data generators for training and validation
    train_generator = data_generator(train_X, train_Y, batch_size)
    val_generator = data_generator(val_X, val_Y, batch_size)

    # define the model architecture
    embedding_dim = 64
    latent_dim = 256
    # Token ids start at 1 (0 is the padding id), so the full index space has
    # len(word_index) + 1 entries. When the tokenizer has a num_words cap, only
    # ids below that cap are emitted, so the embedding and softmax do not need
    # to cover the whole (much larger) fitted vocabulary.
    full_vocab_size = len(tokenizer.word_index) + 1
    vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)

    # define the encoder model
    encoder_inputs = tf.keras.layers.Input(shape=(None,))
    encoder_embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)(encoder_inputs)
    encoder_conv = tf.keras.layers.Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')
    encoder_cnn = encoder_conv(encoder_embed)
    # With return_state=True a GRU returns (last output, final state); it has no
    # separate cell state, so only the final state is carried to the decoder.
    encoder_last_output, encoder_state = tf.keras.layers.GRU(latent_dim, return_state=True)(encoder_cnn)
    encoder_states = encoder_state

    # define the decoder model
    # NOTE(review): the decoder GRU is bidirectional, so during teacher forcing its
    # backward pass reads later target tokens - confirm this is intended.
    decoder_inputs = tf.keras.layers.Input(shape=(None,))
    decoder_embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)(decoder_inputs)
    decoder_conv = tf.keras.layers.Conv1D(filters=latent_dim, kernel_size=5, padding='causal', activation='relu')
    decoder_cnn = decoder_conv(decoder_embed)
    decoder_gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(latent_dim, return_sequences=True, return_state=True))
    # the same encoder state initialises both the forward and the backward GRU
    decoder_states, forward_h, backward_h = decoder_gru(decoder_cnn, initial_state=[encoder_states, encoder_states])
    decoder_dense = tf.keras.layers.Dense(vocab_size, activation='softmax')
    decoder_outputs = decoder_dense(decoder_states)

    # define the model
    model = tf.keras.models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

    # compile the model (once; targets are integer ids, hence the sparse loss)
    optimizer = tf.keras.optimizers.Adam()
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit(train_generator, epochs=2, steps_per_epoch=num_train_batches, validation_data=val_generator, validation_steps=num_val_batches)

Epoch 1/2
Epoch 2/2


In [9]:
# Lowercase both text columns in place, fit one tokenizer on articles and then highlights,
# and turn both columns into fixed-width integer arrays X (inputs) and Y (targets).
# The two fit_on_texts calls update the same tokenizer, so their order is kept as is.
train_df['article'] = train_df['article'].apply(lambda x: x.lower())  # convert text to lowercase
train_df['highlights'] = train_df['highlights'].apply(lambda x: x.lower())  # convert headlines to lowercase
max_text_len = 100  # maximum length of text sequences
max_headline_len = 20  # maximum length of headline sequences
tokenizer = Tokenizer(num_words=10000)  # initialize tokenizer with a 10,000 word limit
tokenizer.fit_on_texts(train_df['article'].values)  # fit tokenizer on text sequences
tokenizer.fit_on_texts(train_df['highlights'].values)  # fit tokenizer on headline sequences
text_sequences = tokenizer.texts_to_sequences(train_df['article'].values)  # convert text sequences to integer sequences
headline_sequences = tokenizer.texts_to_sequences(train_df['highlights'].values)  # convert headline sequences to integer sequences
# padding='post' appends the padding; no `truncating` argument is given, so pad_sequences' default applies
text_sequences = pad_sequences(text_sequences, maxlen=max_text_len, padding='post')  # pad text sequences
headline_sequences = pad_sequences(headline_sequences, maxlen=max_headline_len, padding='post')  # pad headline sequences
X = text_sequences  # input sequences
Y = headline_sequences  # target sequences

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from rouge import Rouge
from gensim.models import KeyedVectors
from gensim.scripts import glove2word2vec

# Path to the GloVe word embeddings file
glove_path = '/content/drive/MyDrive/glove.6B.50d.txt'
# Path to save the converted Word2Vec format
word2vec_path = '/content/drive/MyDrive/glove.6B.50d.word2vec'

# Convert GloVe to Word2Vec format
# This runs (and rewrites word2vec_path) every time the cell is executed.
# NOTE(review): newer gensim releases can presumably load GloVe text directly via
# load_word2vec_format(..., no_header=True) - confirm against the installed version.
glove2word2vec.glove2word2vec(glove_path, word2vec_path)

# Load the converted GloVe embeddings in Word2Vec format
word_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

# Define the maximum sequence lengths
# (used below as the padded width of the encoder and decoder arrays)
max_encoder_seq_length = 512
max_decoder_seq_length = 128

# Define the batch size
batch_size = 8

# Define the data generator function
def data_generator(encoder_input_data, decoder_input_data, decoder_target_data, batch_size):
    """
    Yields aligned batches from three pre-built arrays forever, cycling in order.

    Parameters:
    encoder_input_data: Indexable encoder inputs; its length sets the pass length.
    decoder_input_data: Indexable decoder inputs aligned with the encoder inputs.
    decoder_target_data: Indexable decoder targets aligned with the encoder inputs.
    batch_size (int): Number of rows per batch; the last batch of a pass may be smaller.

    Yields:
    ([encoder_batch, decoder_batch], target_batch) for consecutive row windows.
    """
    while True:
        for start in range(0, len(encoder_input_data), batch_size):
            window = slice(start, start + batch_size)
            model_inputs = [encoder_input_data[window], decoder_input_data[window]]
            yield (model_inputs, decoder_target_data[window])


# Build the encoder/decoder arrays for the training and validation splits.
# `tokenizer` is not created in this cell; it must already exist from an earlier cell.

# Tokenize the input text
train_input_sequences = tokenizer.texts_to_sequences(train_df['article'])
# Pad the input sequences
train_encoder_input_data = pad_sequences(train_input_sequences, maxlen=max_encoder_seq_length, padding='post')

# Tokenize the target text
train_target_sequences = tokenizer.texts_to_sequences(train_df['highlights'])
# Pad the target sequences
train_decoder_input_data = pad_sequences(train_target_sequences, maxlen=max_decoder_seq_length, padding='post')

# Shift the target sequences by one time step
# (np.roll moves every column one step to the left and wraps column 0 to the end;
#  the wrapped value is then overwritten with 0 on the next line)
train_decoder_target_data = np.roll(train_decoder_input_data, -1, axis=1)
train_decoder_target_data[:, -1] = 0  # Set the last element to 0 as it's not needed

# Tokenize the input text for validation data
val_input_sequences = tokenizer.texts_to_sequences(val_df['article'])
# Pad the input sequences
val_encoder_input_data = pad_sequences(val_input_sequences, maxlen=max_encoder_seq_length, padding='post')

# Tokenize the target text for validation data
val_target_sequences = tokenizer.texts_to_sequences(val_df['highlights'])
# Pad the target sequences
val_decoder_input_data = pad_sequences(val_target_sequences, maxlen=max_decoder_seq_length, padding='post')

# Shift the target sequences by one time step for validation data
val_decoder_target_data = np.roll(val_decoder_input_data, -1, axis=1)
val_decoder_target_data[:, -1] = 0  # Set the last element to 0 as it's not needed

# define the model architecture
embedding_dim = 64
latent_dim = 256
# Token ids start at 1 (0 is the padding id), so the full index space has
# len(word_index) + 1 entries; with a num_words cap only ids below the cap are
# ever emitted, so that cap bounds the embedding and softmax sizes.
full_vocab_size = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)

# encoder: embedding -> LSTM, keeping only the final hidden and cell states
encoder_inputs = Input(shape=(max_encoder_seq_length,))
encoder_embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embed)
encoder_states = [state_h, state_c]

# decoder: the input width must equal the width of the padded decoder arrays
# (max_decoder_seq_length); declaring one column fewer makes fit() reject the data.
decoder_inputs = Input(shape=(max_decoder_seq_length,))
decoder_embed = tf.keras.layers.Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embed, initial_state=encoder_states)
# The targets are tokenizer ids, so the softmax is sized by the tokenizer's
# index space rather than by the (unrelated) GloVe vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the model
# Steps are whole batches only (integer division); the generators loop forever,
# so steps_per_epoch / validation_steps decide where each epoch ends.
train_steps = len(train_df) // batch_size
val_steps = len(val_df) // batch_size

history = model.fit(
    data_generator(train_encoder_input_data, train_decoder_input_data, train_decoder_target_data, batch_size),
    steps_per_epoch=train_steps,
    epochs=10,
    validation_data=data_generator(val_encoder_input_data, val_decoder_input_data, val_decoder_target_data, batch_size),
    validation_steps=val_steps
)


# Save the trained model
# (written to Google Drive in HDF5 format, per the '.h5' extension)
model.save('/content/drive/MyDrive/new_trained.h5')

# Define the Rouge scorer
# `Rouge` comes from the `rouge` package, whose scoring entry point is
# get_scores(hypothesis, reference); it returns one dict per pair, keyed
# 'rouge-1' / 'rouge-2' / 'rouge-l', each holding 'p', 'r' and 'f'.
scorer = Rouge()

score_columns = ['Rouge1 Precision', 'Rouge1 Recall', 'Rouge1 F-measure',
                 'Rouge2 Precision', 'Rouge2 Recall', 'Rouge2 F-measure',
                 'RougeL Precision', 'RougeL Recall', 'RougeL F-measure']
rouge_keys = ('rouge-1', 'rouge-2', 'rouge-l')

# Collect one row of scores per test example; the dataframe is built once at the end
score_rows = []

# Generate summaries for the test data and compute Rouge scores
# NOTE(review): the decoder is fed the reference summaries, so the predictions are
# teacher-forced rather than free-running generations - confirm this is the intended metric.
for i in range(0, len(test_df), batch_size):
    batch_inputs = test_df['article'].iloc[i:i+batch_size]
    batch_targets = test_df['highlights'].iloc[i:i+batch_size]

    # Prepare inputs for the model
    input_encoder = tokenizer.texts_to_sequences(batch_inputs)
    input_decoder = tokenizer.texts_to_sequences(batch_targets)

    # Pad sequences to the same length
    input_encoder = pad_sequences(input_encoder, maxlen=max_encoder_seq_length, padding='post')
    input_decoder = pad_sequences(input_decoder, maxlen=max_decoder_seq_length, padding='post')

    # Predict summaries
    predicted_summaries = model.predict([input_encoder, input_decoder])
    for j in range(len(batch_inputs)):
        # Greedy decoding: take the most probable token id at every position
        predicted_ids = np.argmax(predicted_summaries[j], axis=1)
        predicted_summary_text = tokenizer.sequences_to_texts([predicted_ids])[0]

        target_summary = batch_targets.iloc[j]

        if predicted_summary_text.strip() and target_summary.strip():
            scores = scorer.get_scores(predicted_summary_text, target_summary)[0]
        else:
            # get_scores rejects empty text; an empty prediction or reference scores zero
            scores = {key: {'p': 0.0, 'r': 0.0, 'f': 0.0} for key in rouge_keys}

        # Store the Rouge scores (precision, recall, F-measure for each variant)
        score_rows.append([scores[key][part] for key in rouge_keys for part in ('p', 'r', 'f')])

        print('Input:', batch_inputs.iloc[j])
        print('Target Summary:', target_summary)
        print('Generated Summary:', predicted_summary_text)
        print('Rouge Scores:', scores)
        print('-----------------------------')

# One row per test example, in test_df order
scores_df = pd.DataFrame(score_rows, columns=score_columns)

scores_df.head()

Epoch 1/10
Epoch 2/10

In [10]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
# Persist the per-example ROUGE table to Google Drive without the index column.
scores_df.to_csv('/content/drive/MyDrive/scores.csv', index=False)

In [None]:
def generate_summary(document, embeddings, d=0.85):
    """
    Generates a summary for the given document from the highlights of the
    training documents that BM25_ ranks as most similar.

    Note: this notebook defines `generate_summary` in several cells; whichever
    cell ran last is the definition in effect. This version relies on the
    notebook-level names `tokenizer`, `max_len`, `train_df`, `BM25_`,
    `glove_similarity` and `preprocess_text`.

    Parameters:
    document (str): A string representing the input document.
    embeddings (dict): A dictionary mapping terms to their GloVe embeddings.
    d (float): Damping factor of the TextRank-style token score. It was read as
        an undefined name before; the default matches BM25_'s default.

    Returns:
    A string representing the generated summary ('' when the selected
    highlights contain no tokens).
    """

    # Preprocess the document
    doc_tokens = preprocess_text(document)

    # Convert the document to a sequence of integers
    doc_seq = tokenizer.texts_to_sequences([doc_tokens])
    doc_seq = pad_sequences(doc_seq, maxlen=max_len, padding='post', truncating='post')

    # Compute the BM25_ similarity scores between the document and all other documents in the dataset.
    # BM25_ returns (bm25_score, q_bm25_score); ranking needs one number per
    # document, so the embedding-weighted document score (first element) is used.
    sim_scores = [
        BM25_(glove_similarity, train_df.iloc[i]['article'], doc_tokens, embeddings=embeddings)[0]
        for i in range(len(train_df))
    ]

    # Sort the documents by their similarity score (highest first)
    sorted_indices = np.argsort(sim_scores)[::-1]

    # Select the top-k documents to use for summarization
    k = 3
    topk_indices = sorted_indices[:k]
    topk_docs = [train_df.iloc[i]['article'] for i in topk_indices]

    # Convert the top-k documents to sequences of integers
    topk_doc_seqs = tokenizer.texts_to_sequences(topk_docs)
    topk_doc_seqs = pad_sequences(topk_doc_seqs, maxlen=max_len, padding='post', truncating='post')

    # Collect the highlights of the selected documents
    # NOTE(review): glove_similarity is applied to padded token-id sequences here,
    # not to embedding vectors - confirm this is the intended similarity.
    final_summary = ''
    sim_scores = []
    for i in range(len(topk_doc_seqs)):
        sim_score = glove_similarity(doc_seq.flatten(), topk_doc_seqs[i].flatten())
        sim_scores.append(sim_score)
        final_summary += ' '.join(train_df.iloc[topk_indices[i]]['highlights']) + ' '
    sim_scores = np.array(sim_scores)

    final_summary_tokens = preprocess_text(final_summary)
    if not final_summary_tokens:
        return ''

    # Score each distinct token with a TextRank-style update over its neighbours.
    # Tokens are visited in first-occurrence order (not set order) because each
    # score reads the scores computed so far, so the visiting order affects the result.
    tr_scores = {}
    last_index = len(final_summary_tokens) - 1
    for token in dict.fromkeys(final_summary_tokens):
        neighbors = []
        for i, word in enumerate(final_summary_tokens):
            if word == token:
                if i > 0:
                    neighbors.append(final_summary_tokens[i - 1])
                if i < last_index:
                    neighbors.append(final_summary_tokens[i + 1])
        # Every neighbour has weight 1, so the normaliser is the neighbour count.
        total_weight = len(neighbors)
        score = sum(tr_scores.get(neighbor, 0) / total_weight for neighbor in neighbors)
        tr_scores[token] = (1 - d) + d * score

    # Keep the tokens whose score reaches the 25th percentile of the similarity scores
    threshold = np.percentile(sim_scores, 25)
    final_summary = ' '.join([token for token in final_summary_tokens if tr_scores[token] >= threshold])

    return final_summary


In [None]:
def generate_summary(document, embeddings, train_embeddings):
    """
    Generates a summary for the given document by reusing the highlights of the
    most similar training documents.

    Note: this notebook defines `generate_summary` in several cells; whichever
    cell ran last is the definition in effect.

    Parameters:
    document (str): A string representing the input document.
    embeddings (gensim.models.keyedvectors.Word2VecKeyedVectors): Pre-trained GloVe embeddings.
    train_embeddings (list): List of training document embeddings, in the same
        row order as `train_df`. Assumed to hold one 1-D vector per document
        (TODO confirm against the code that builds it).

    Returns:
    A string with the highlights of the top-3 most similar training documents,
    each followed by a space. Returns '' when none of the document's tokens has
    an embedding.
    """

    # Preprocess the document
    doc_tokens = preprocess_text(document)

    # Convert the document tokens to their corresponding embeddings
    token_vectors = [embeddings[word] for word in doc_tokens if word in embeddings]
    if not token_vectors:
        # Nothing to compare with: no token of the document is in the vocabulary.
        return ''

    # Collapse the per-token vectors into one document vector. Cosine distance
    # needs 1-D inputs, so the list of token vectors cannot be passed directly.
    doc_embedding = np.mean(token_vectors, axis=0)

    # Compute the similarity scores between the document and all other documents in the dataset
    sim_scores = [glove_similarity(doc_embedding, train_embedding) for train_embedding in train_embeddings]

    # Sort the documents by their similarity score (highest first)
    sorted_indices = np.argsort(sim_scores)[::-1]

    # Select the top-k documents to use for summarization
    k = 3
    topk_indices = sorted_indices[:k]

    # Concatenate the highlights of the selected documents
    # NOTE(review): ' '.join assumes each 'highlights' cell is a list of tokens;
    # if the cell is a plain string, join will space out its characters.
    final_summary = ''
    for index in topk_indices:
        final_summary += ' '.join(train_df.iloc[index]['highlights']) + ' '

    return final_summary


In [None]:
# Save the model currently bound to `model` under data_path.
# The recorded output of this cell is a NameError, so a name it uses was undefined when it ran.
model.save(data_path+'/cnn_gru_final.h5')

NameError: ignored

In [None]:
def generate_summary(document):
    """
    Generates a summary for the given document.

    Relies on notebook-level names that are not parameters: `tokenizer`,
    `max_len`, `max_len_summ`, `train_df`, `model`, `BM25_`, `glove_similarity`
    and `preprocess_text`. This notebook defines `generate_summary` in several
    cells; whichever cell ran last is the definition in effect.

    Parameters:
    document (str): A string representing the input document.

    Returns:
    A string representing the generated summary. The returned text is built
    from the highlights of the top-ranked training documents; the
    model-decoded `initial_summary` is not part of the return value.
    """

    # Preprocess the document
    doc_tokens = preprocess_text(document)

    # Convert the document to a sequence of integers
    doc_seq = tokenizer.texts_to_sequences([doc_tokens])
    doc_seq = pad_sequences(doc_seq, maxlen=max_len, padding='post', truncating='post')

    # Compute the BM25_ similarity scores between the document and all other documents in the dataset
    # BM25_ is called without `embeddings`, so its default is used.
    # NOTE(review): BM25_ returns a 2-tuple, so sim_scores is a list of pairs and the
    # argsort below operates on a 2-D array - confirm which score should drive the ranking.
    sim_scores = [BM25_(glove_similarity, train_df.iloc[i]['article'], doc_tokens) for i in range(len(train_df))]

    # Sort the documents by their similarity score
    sorted_indices = np.argsort(sim_scores)[::-1]

    # Select the top-k documents to use for summarization
    k = 3
    topk_indices = sorted_indices[:k]
    topk_docs = [train_df.iloc[i]['article'] for i in topk_indices]

    # Convert the top-k documents to sequences of integers
    topk_doc_seqs = tokenizer.texts_to_sequences(topk_docs)
    topk_doc_seqs = pad_sequences(topk_doc_seqs, maxlen=max_len, padding='post', truncating='post')

    # Generate an initial summary using the model
    # NOTE(review): this loop needs '<start>' / '<end>' entries in the tokenizer vocabulary;
    # no visible cell adds such markers to the texts - confirm they exist.
    initial_summary = '<start> '
    curr_seq = np.zeros((1, max_len))
    curr_seq[0, -1] = tokenizer.word_index['<start>']
    while True:
        preds = model.predict([doc_seq, curr_seq])
        # greedy choice: most probable token at the last position
        next_token = np.argmax(preds[0, -1, :])
        # stop on the end marker or once the summary reaches max_len_summ words
        if tokenizer.index_word[next_token] == '<end>' or len(initial_summary.split()) >= max_len_summ:
            break
        initial_summary += tokenizer.index_word[next_token] + ' '
        curr_seq = np.zeros((1, max_len))
        # NOTE(review): doc_seq was padded to max_len columns while the target slice has
        # max_len - 1 - presumably a shape mismatch; verify.
        curr_seq[0, :-1] = doc_seq
        curr_seq[0, -len(initial_summary.split()) + 1:] = tokenizer.texts_to_sequences([initial_summary])[0][1:]

    # Generate a final summary using the TextRank algorithm
    final_summary = ''
    sim_scores = []
    for i in range(len(topk_doc_seqs)):
        # similarity is computed on the padded token-id sequences, not on embedding vectors
        sim_score = glove_similarity(doc_seq.flatten(), topk_doc_seqs[i].flatten())
        sim_scores.append(sim_score)
        final_summary += ' '.join(train_df.iloc[topk_indices[i]]['highlights']) + ' '
    sim_scores = np.array(sim_scores)
    final_summary_tokens = preprocess_text(final_summary)
    tr_scores = {}
    # Each token's score reads the scores stored so far, so the iteration order of the set matters.
    for token in set(final_summary_tokens):
        neighbors = []
        for i, word in enumerate(final_summary_tokens):
            if word == token:
                if i > 0:
                    neighbors.append(final_summary_tokens[i - 1])
                if i < len(final_summary_tokens) - 1:
                    neighbors.append(final_summary_tokens[i + 1])
        wji = {neighbor: 1 for neighbor in neighbors}
        score = sum(wji[neighbor] * tr_scores.get(neighbor, 0) / sum(wji.get(neighbor, 0) for neighbor in neighbors) for neighbor in neighbors)
        # NOTE(review): `d` is neither a parameter nor assigned in this function, and no visible
        # cell defines it at notebook level - this line presumably raises NameError.
        tr_scores[token] = (1 - d) + d * score
    # keep tokens whose score reaches the 25th percentile of the similarity scores
    final_summary = ' '.join([token for token in final_summary_tokens if tr_scores[token] >= np.percentile(sim_scores, 25)])

    return final_summary


In [11]:
from keras.models import load_model 

# Load the Keras model stored in the .h5 file on the mounted Drive; later cells call
# `model.predict` on this object.
model = load_model('/content/drive/MyDrive/cnn_gru.h5')

In [None]:
# Split the validation frame into batches and generate summaries batch by batch.

# Upper bound on the number of validation rows per batch.
batch_size = 32

# Longest document / headline sequence and the larger of the two.
# NOTE(review): presumably `text_sequences` / `headline_sequences` are token-id lists
# produced by the tokenizer -- confirm against the cell that builds them.
max_len_doc = max(len(seq) for seq in text_sequences)
max_len_summ = max(len(seq) for seq in headline_sequences)
max_len = max(max_len_doc, max_len_summ)

# Ceiling division keeps every batch at or below `batch_size` (floor division produced
# fewer, larger batches). At least one section is requested because np.array_split
# raises ValueError when asked for 0 sections, which happened whenever
# len(df_val) < batch_size.
num_batches = max(1, (len(df_val) + batch_size - 1) // batch_size)

# np.array_split tolerates sizes that do not divide evenly, so no row is dropped.
val_batches = np.array_split(df_val.to_numpy(), num_batches)

# Generate summaries for each batch of validation data.
generated_summaries = []
for batch in val_batches:
    # Every column except the last one is passed on as model input.
    val_X = batch[:, :-1]
    summaries = generate_summary(val_X)
    generated_summaries.append(summaries)

# Combine the per-batch results into a single array.
# NOTE(review): np.concatenate needs each `summaries` to be at least 1-D array-like;
# confirm what `generate_summary` returns for a batch.
generated_summaries = np.concatenate(generated_summaries, axis=0)

In [None]:
# Install the `rouge_score` package into the notebook runtime.
# NOTE(review): the evaluation cell further down imports `from rouge import Rouge`;
# confirm this is the distribution that provides that module (PyPI also hosts a
# separate `rouge` package).
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np

def generate_summary(document, embeddings, k=3):
    """
    Generates a summary for the given document.

    The summary consists of the `k` vocabulary words whose embedding is most
    similar (according to `glove_similarity`) to the mean embedding of the
    document's tokens, joined by single spaces in descending order of similarity.

    Parameters:
    document (str): A string representing the input document.
    embeddings (dict): Pre-trained GloVe embeddings, mapping word -> vector.
    k (int): Number of top-ranked words to keep. Defaults to 3.

    Returns:
    A string representing the generated summary. An empty string is returned
    when none of the document's tokens has an embedding.
    """
    # Function-scope import so the cell does not depend on another cell's imports.
    import heapq

    # Preprocess the document
    doc_tokens = preprocess_text(document)

    # Collect the vectors of the in-vocabulary tokens. Without this guard,
    # np.mean([]) yields NaN (plus a RuntimeWarning) and the ranking below
    # would be arbitrary.
    known_vectors = [embeddings[word] for word in doc_tokens if word in embeddings]
    if not known_vectors:
        return ''

    # Average embedding vector of the document
    doc_embedding = np.mean(known_vectors, axis=0)

    # Similarity between the document and every pre-trained embedding
    sim_scores = {word: glove_similarity(doc_embedding, vector)
                  for word, vector in embeddings.items()}

    # heapq.nlargest(k, it, key) is documented as equivalent to
    # sorted(it, key=key, reverse=True)[:k] but avoids sorting the whole vocabulary.
    topk_words = heapq.nlargest(k, sim_scores, key=sim_scores.get)

    # Final summary built from the selected top-k words
    return ' '.join(topk_words)
# Path to the GloVe embeddings file
glove_path = '/content/drive/MyDrive/glove.6B.50d.txt'

# Load the GloVe embeddings: every line holds a word followed by its vector components.
embeddings_index = {}
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a stray newline at the end of the file): nothing to index,
            # and values[0] would raise IndexError.
            continue
        # float32 matches `load_embeddings` in the evaluation cell; float16 keeps only
        # about three significant digits of each component.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
from rouge import Rouge
import nltk
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the 'punkt' and 'stopwords' NLTK data packages into the runtime
# (nltk reports them as already up-to-date when they are present).
nltk.download('punkt')
nltk.download('stopwords')

# Load GloVe embeddings
def load_embeddings(path):
    """
    Reads a GloVe-style text file into a word -> vector dictionary.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components. Blank or whitespace-only lines
    are skipped instead of raising IndexError.

    Parameters:
    path (str): Location of the embeddings text file (read as UTF-8).

    Returns:
    A dict mapping each word to a 1-D float32 numpy array. If a word occurs
    more than once, the last occurrence wins.

    Raises:
    ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # Nothing on this line: no token to use as a key.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

# One example per predict call (the batch size is deliberately kept small).
batch_size = 1

# Sequence-length caps used when padding model inputs: 512 for the encoder side and
# 128 for the decoder side, or the longest entry of the column if that is smaller.
# NOTE(review): len() is applied to the raw column values; if those are strings this
# measures characters rather than tokens -- confirm that is intended.
max_encoder_seq_length = min(512, max(map(len, test_df['article'])))
max_decoder_seq_length = min(128, max(map(len, test_df['highlights'])))

# ROUGE scorer shared by every evaluated example.
scorer = Rouge()

# Empty results table; the evaluation loop adds one row per scored example.
results_df = pd.DataFrame(columns=['Input', 'Rouge Scores', 'BM25 Scores'])

# Word -> vector lookup read from the 50-dimensional GloVe file on Drive.
glove_embeddings = load_embeddings('/content/drive/MyDrive/glove.6B.50d.txt')

# Cosine similarity function
def cosine_similarity_function(vector1, vector2):
    """
    Returns the cosine similarity between two vectors as a single value.

    Each vector is reshaped into a one-row 2-D array before being handed to the
    pairwise `cosine_similarity` helper; the only entry of the resulting matrix
    is returned.

    Parameters:
    vector1: First vector; must provide a `reshape` method.
    vector2: Second vector; must provide a `reshape` method.

    Returns:
    The [0][0] entry of the pairwise similarity matrix.
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0][0]

# Iterate over the test data in batches and score every non-empty predicted summary.
# Result rows are collected in a plain list and converted to a DataFrame once at the
# end: DataFrame.append copied the whole frame on every call (quadratic) and was
# removed in pandas 2.0.
result_records = []
for i in range(0, len(test_df), batch_size):
    batch_inputs = test_df['article'].iloc[i:i+batch_size]
    batch_targets = test_df['highlights'].iloc[i:i+batch_size]

    # Prepare inputs for the model
    input_encoder = tokenizer.texts_to_sequences(batch_inputs)
    input_decoder = tokenizer.texts_to_sequences(batch_targets)

    # Pad sequences to the same length
    input_encoder = tf.keras.preprocessing.sequence.pad_sequences(input_encoder, maxlen=max_encoder_seq_length, padding='post')
    input_decoder = tf.keras.preprocessing.sequence.pad_sequences(input_decoder, maxlen=max_decoder_seq_length, padding='post')

    # Predict summaries
    # NOTE(review): the reference highlights are passed to the model as decoder input,
    # so predictions are conditioned on the targets they are later scored against --
    # confirm this is intended for evaluation.
    predicted_summaries = model.predict([input_encoder, input_decoder])

    for j in range(len(batch_inputs)):
        predicted_summary = predicted_summaries[j]
        # Take the highest-scoring token id at every position and map ids back to text.
        predicted_summary_text = tokenizer.sequences_to_texts([np.argmax(predicted_summary, axis=1)])[0]

        target_summary = batch_targets.iloc[j]

        if not predicted_summary_text.strip():
            # Nothing to score for an empty prediction.
            print('Input:', batch_inputs.iloc[j])
            print('Predicted Summary is empty.')
            continue

        # Calculate Rouge scores
        scores = scorer.get_scores(predicted_summary_text, target_summary)

        # Calculate BM25 scores for the predicted summary
        tokenized_input = nltk.word_tokenize(batch_inputs.iloc[j].lower())
        tokenized_summary = nltk.word_tokenize(predicted_summary_text.lower())

        # Map terms to GloVe embeddings (None for out-of-vocabulary terms)
        embeddings = {term: glove_embeddings.get(term, None)
                      for term in set(tokenized_input + tokenized_summary)}

        bm25_score, q_bm25_score = BM25_(cosine_similarity_function, tokenized_input, tokenized_summary, embeddings=embeddings)

        # Queue the row for the results table
        result_records.append({'Input': batch_inputs.iloc[j], 'Rouge Scores': scores, 'BM25 Scores': bm25_score})

        print('Input:', batch_inputs.iloc[j])
        print('Rouge Scores:', scores)
        print('BM25 Scores:', bm25_score)

# Build the results table in one step, keeping any rows `results_df` already held.
new_rows = pd.DataFrame(result_records, columns=results_df.columns)
results_df = new_rows if results_df.empty else pd.concat([results_df, new_rows], ignore_index=True)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Input: France legend Serge Betsen tells Sportsmail where he thinks Clermont's clash with Saracens at Stade Geoffrey-Guichard will be won... HEAD TO HEADS . Wesley Fofana v Brad Barritt . If there is one genius in French rugby at the moment, it’s Wesley Fofana. The kid is so talented and skilful that something happens every time he gets the ball so Saracens will need to focus on him. He is up against Brad Barritt, who is not as easy on the eye but is a very solid unit for Saracens — hence why they have made him captain. He reminds me of former British Lions flanker Richard Hill — someone you don’t really notice but is so important for the team. Clermont centre Wesley Fofana was in scintillating form during his side's recent destruction of Northampton . Brad Barritt is back to anchor the Saracens midfield against the Top 14 giants . Julien Bonnaire vs Billy Vunipola . Julien Bonnaire may be a flanker and Billy Vunipola a No

In [24]:
# Remove pandas' limit on the number of displayed columns (global option).
pd.set_option('display.max_columns',None)
# Preview the first rows of the results table as the cell output.
results_df.head()

Unnamed: 0,Input,Rouge Scores,BM25 Scores
0,Ever noticed how plane seats appear to be gett...,"[{'rouge-1': {'r': 0.06060606060606061, 'p': 0...",1.378268
1,A drunk teenage boy had to be rescued by secur...,"[{'rouge-1': {'r': 0.08823529411764706, 'p': 0...",12.740705
2,Dougie Freedman is on the verge of agreeing a ...,"[{'rouge-1': {'r': 0.10714285714285714, 'p': 0...",14.649037
3,Liverpool target Neto is also wanted by PSG an...,"[{'rouge-1': {'r': 0.16216216216216217, 'p': 0...",4.675266
4,Bruce Jenner will break his silence in a two-h...,"[{'rouge-1': {'r': 0.16326530612244897, 'p': 0...",16.509613


In [25]:
# NOTE(review): no path is passed, so pandas returns the CSV text as a string (shown as
# the cell output) and nothing is written to disk. If the results are meant to be
# saved, pass the destination file path as the first argument.
results_df.to_csv(index = False)