In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so that the saved models, tokenizers and
# images referenced by the later cells (under /content/drive/MyDrive/...) can be read.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## ***Encoder-decoder model without attention***

In [None]:
pip install keras_self_attention

Collecting keras_self_attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keras_self_attention
  Building wheel for keras_self_attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras_self_attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18895 sha256=864d93ae4b6693a76e2a90059e2a62a5c7fd95e5e3623484a57182202cb67e1f
  Stored in directory: /root/.cache/pip/wheels/b8/f7/24/607b483144fb9c47b4ba2c5fba6b68e54aeee2d5bf6c05302e
Successfully built keras_self_attention
Installing collected packages: keras_self_attention
Successfully installed keras_self_attention-0.51.0


In [None]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention as AttentionLayer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings

In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# --- Inference artefacts for the encoder-decoder model WITHOUT attention ---

# Saved Keras models: the encoder and the decoder that is stepped one token at a time.
encoder_model = load_model('/content/drive/MyDrive/deeplab/non attention/noattention_encoder_model.h5')
decoder_model = load_model('/content/drive/MyDrive/deeplab/non attention/noattention_decoder_model.h5')

# Pickled tokenizers for the source texts and the target summaries.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('/content/drive/MyDrive/deeplab/non attention/noattention_source_tokenizer.pkl', 'rb') as source_file:
    source_tokenizer = pickle.load(source_file)
with open('/content/drive/MyDrive/deeplab/non attention/noattention_target_tokenizer.pkl', 'rb') as target_file:
    target_tokenizer = pickle.load(target_file)

# Vocabulary lookup tables exposed by the tokenizers, in both directions.
source_word_index, reverse_source_word_index = source_tokenizer.word_index, source_tokenizer.index_word
target_word_index, reverse_target_word_index = target_tokenizer.word_index, target_tokenizer.index_word

# Length limits: max_text_len is used when padding the input, max_summary_len caps decoding.
# These are example values and should be set to the lengths used in the training setup.
max_text_len, max_summary_len = 50, 15

# Decode sequence function
def decode_sequence(input_seq):
    """Greedily decode a summary with the no-attention encoder/decoder models.

    Args:
        input_seq: a batch holding one padded source sequence, in the form
            returned by ``preprocess_input``.

    Returns:
        str: the generated words joined by single spaces. The ``sostok`` and
        ``eostok`` markers are never part of the result.
    """
    # Encode the input once; verbose=0 keeps Keras from printing a progress
    # bar for every single predict call.
    e_out, e_h, e_c = encoder_model.predict(input_seq, verbose=0)

    # Seed the decoder with a length-1 sequence holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_tokens = []
    # Bound the loop by decoder steps, not by words produced: an index without
    # a vocabulary entry decodes to '' and would never advance a word count,
    # which could otherwise loop forever. At least one step is always taken.
    for _ in range(max(max_summary_len - 1, 1)):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, e_out, e_h, e_c], verbose=0
        )

        # Greedy choice: the most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index.get(sampled_token_index, '')

        if sampled_token == 'eostok':
            break
        if sampled_token:
            decoded_tokens.append(sampled_token)

        # Feed the prediction and the updated states back in for the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ' '.join(decoded_tokens)

# Preprocess user input
def preprocess_input(input_text):
    """Convert one raw text string into a padded batch of token ids.

    The text is mapped to ids with ``source_tokenizer`` and padded at the end
    to ``max_text_len`` by ``pad_sequences``; the returned batch has one row.
    """
    token_ids = source_tokenizer.texts_to_sequences([input_text])
    return pad_sequences(token_ids, maxlen=max_text_len, padding='post')

# Main flow for user input
if __name__ == "__main__":
    print("Text Summarization Model")
    print("========================\n")

    # Keep prompting until the user types 'exit' (compared case-insensitively).
    input_text = input("Enter a text to summarize (or type 'exit' to quit): ")
    while input_text.lower() != 'exit':
        # Tokenize and pad the text, then decode a summary with the no-attention model.
        input_sequence = preprocess_input(input_text)
        predicted_summary = decode_sequence(input_sequence)

        print("\nOriginal Text: ", input_text)
        print("Predicted Summary: ", predicted_summary)
        print("\n")

        input_text = input("Enter a text to summarize (or type 'exit' to quit): ")

    print("Exiting...")




Text Summarization Model

Enter a text to summarize (or type 'exit' to quit): exit
Exiting...


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Image stored next to the no-attention model on Drive (presumably a results screenshot).
image_path = '/content/drive/MyDrive/deeplab/non attention/without attention5).png'

img = mpimg.imread(image_path)   # read the PNG into an array
fig, ax = plt.subplots()         # explicit figure/axes rather than the implicit pyplot state
ax.imshow(img)
ax.axis('off')                   # hide ticks and frame around the picture
plt.show()


## ***Encoder-decoder model with attention***

In [None]:
pip install keras-self-attention



In [None]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention as AttentionLayer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings

In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# --- Inference artefacts for the encoder-decoder model WITH attention ---

# Saved Keras models: the encoder and the decoder that is stepped one token at a time.
encoder_model2 = load_model('/content/drive/MyDrive/deeplab/attention/encoder_model.h5')
decoder_model2 = load_model('/content/drive/MyDrive/deeplab/attention/decoder_model.h5')

# Pickled tokenizers for the source texts and the target summaries.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
# NOTE(review): these assignments rebind the same global names (source_tokenizer,
# target_tokenizer and the four index tables) that the no-attention cell set, so
# every later call reads the attention model's tokenizers.
with open('/content/drive/MyDrive/deeplab/attention/source_tokenizer.pkl', 'rb') as source_file:
    source_tokenizer = pickle.load(source_file)
with open('/content/drive/MyDrive/deeplab/attention/target_tokenizer.pkl', 'rb') as target_file:
    target_tokenizer = pickle.load(target_file)

# Vocabulary lookup tables exposed by the tokenizers, in both directions.
source_word_index, reverse_source_word_index = source_tokenizer.word_index, source_tokenizer.index_word
target_word_index, reverse_target_word_index = target_tokenizer.word_index, target_tokenizer.index_word

# Length limits: max_text_len is used when padding the input, max_summary_len caps decoding.
# These are example values and should be set to the lengths used in the training setup.
max_text_len, max_summary_len = 50, 15

# Decode sequence function
def decode_sequence2(input_seq):
    """Greedily decode a summary with the attention encoder/decoder models.

    Args:
        input_seq: a batch holding one padded source sequence, in the form
            returned by ``preprocess_input``.

    Returns:
        str: the generated words joined by single spaces. The ``sostok`` and
        ``eostok`` markers are never part of the result.
    """
    # Encode the input once; verbose=0 keeps Keras from printing a progress
    # bar for every single predict call.
    e_out, e_h, e_c = encoder_model2.predict(input_seq, verbose=0)

    # Seed the decoder with a length-1 sequence holding the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_tokens = []
    # Bound the loop by decoder steps, not by words produced: an index without
    # a vocabulary entry decodes to '' and would never advance a word count,
    # which could otherwise loop forever. At least one step is always taken.
    for _ in range(max(max_summary_len - 1, 1)):
        # Step the ATTENTION decoder (decoder_model2), which pairs with
        # encoder_model2; decoder_model belongs to the no-attention model.
        output_tokens, h, c = decoder_model2.predict(
            [target_seq, e_out, e_h, e_c], verbose=0
        )

        # Greedy choice: the most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index.get(sampled_token_index, '')

        if sampled_token == 'eostok':
            break
        if sampled_token:
            decoded_tokens.append(sampled_token)

        # Feed the prediction and the updated states back in for the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ' '.join(decoded_tokens)

# Preprocess user input
def preprocess_input(input_text):
    """Tokenize a single text and pad it for the encoder.

    Uses the module-level ``source_tokenizer`` and ``max_text_len`` at call
    time; padding is added after the tokens (``padding='post'``).
    """
    return pad_sequences(
        source_tokenizer.texts_to_sequences([input_text]),
        maxlen=max_text_len,
        padding='post',
    )

# Main flow for user input (attention model)
if __name__ == "__main__":
    print("Text Summarization Model")
    print("========================\n")

    # Keep prompting until the user types 'exit' (compared case-insensitively).
    while True:
        input_text = input("Enter a text to summarize (or type 'exit' to quit): ")
        if input_text.lower() == 'exit':
            print("Exiting...")
            break

        # Preprocess input
        input_sequence = preprocess_input(input_text)

        # Generate the summary with the attention model defined in this cell
        # (decode_sequence2), not with the no-attention decode_sequence.
        predicted_summary = decode_sequence2(input_sequence)

        print("\nOriginal Text: ", input_text)
        print("Predicted Summary: ", predicted_summary)
        print("\n")




Text Summarization Model

Enter a text to summarize (or type 'exit' to quit): hello
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 883ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 372ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Path to the image (absolute path on the mounted Drive, single leading slash)
image_path = '/content/drive/MyDrive/deeplab/attention/attention).png'

# Display the image
img = mpimg.imread(image_path)  # Read the image
plt.imshow(img)                 # Display the image
plt.axis('off')                 # Turn off axes for better visualization
plt.show()


## **T5**

In [None]:
import zipfile

# Extract the saved T5 model archive from Drive into a local folder.
t5_extract_dir = 'extracted_filest5'
with zipfile.ZipFile('/content/drive/MyDrive/deeplab/T5/saved_t5_model.zip', 'r') as zip_ref:
    zip_ref.extractall(t5_extract_dir)

# List the files in the folder that was just written to verify the extraction
# (the same directory that extractall targeted).
import os
os.listdir(t5_extract_dir)


['generation_config.json',
 'special_tokens_map.json',
 'tokenizer_config.json',
 'spiece.model',
 'model.safetensors',
 'config.json',
 'added_tokens.json']

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Directory holding the saved T5 model and tokenizer. This cell only loads from it;
# the save_model call further down is commented out.
save_directory = "/content/extracted_filest5"

# Save function
def save_model(model, tokenizer, directory):
    """Write the model, then the tokenizer, into ``directory`` and report it.

    Both objects only need to provide a ``save_pretrained(directory)`` method.
    """
    for artefact in (model, tokenizer):
        artefact.save_pretrained(directory)
    print(f"Model and tokenizer saved to {directory}")

# Load function
def load_model(directory):
    """Load the T5 model and its tokenizer from ``directory``.

    Returns:
        tuple: ``(model, tokenizer)``.

    Note: this definition reuses the name of the Keras ``load_model`` imported
    in the earlier cells, so once this cell has run ``load_model`` refers to
    this function until that import is executed again.
    """
    t5_model = T5ForConditionalGeneration.from_pretrained(directory)
    t5_tokenizer = T5Tokenizer.from_pretrained(directory)
    print(f"Model and tokenizer loaded from {directory}")
    return t5_model, t5_tokenizer

# Example usage (left disabled; this notebook only loads an already-saved model):
# Save the trained model and tokenizer
# save_model(model, tokenizer, save_directory)

# Reload the T5 model and tokenizer, then move the model to the selected device.
# NOTE(review): the BERT cell later rebinds `loaded_model` / `loaded_tokenizer`,
# so after that cell has run these names no longer refer to the T5 objects.
loaded_model, loaded_tokenizer = load_model(save_directory)
loaded_model.to(device)

# Generate summaries using the reloaded model
def generate_summary_with_loaded_model(text, model, tokenizer, max_length=15,
                                       max_input_length=100, num_beams=4):
    """Summarize ``text`` with a T5-style model using beam search.

    Args:
        text: the text to summarize; it is prefixed with ``"summarize: "``.
        model: model providing ``generate`` and a ``device`` attribute.
        tokenizer: tokenizer matching ``model``.
        max_length: maximum length passed to ``model.generate``.
        max_input_length: inputs are truncated to this many tokens
            (previously hard-coded to 100).
        num_beams: beam width passed to ``model.generate`` (previously
            hard-coded to 4).

    Returns:
        str: the decoded first generated sequence, special tokens removed.
    """
    encoded = tokenizer(
        f"summarize: {text}",
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level `device` variable that other cells also assign.
    input_ids = encoded.input_ids.to(model.device)
    outputs = model.generate(
        input_ids, max_length=max_length, num_beams=num_beams, early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test the reloaded T5 model on one lower-cased news snippet.
sample_text = "new zealand defeated india by wickets in the fourth odi at hamilton on thursday to win their first match of the five-match odi series india lost an international match under rohit sharma captaincy after 12 consecutive victories dating back to march 2018 the match witnessed india getting all out for 92 their seventh lowest total in odi cricket history."
summary = generate_summary_with_loaded_model(
    text=sample_text,
    model=loaded_model,
    tokenizer=loaded_tokenizer,
)
print("Generated Summary:", summary)


Using device: cpu
Model and tokenizer loaded from /content/extracted_filest5
Generated Summary: new zealand defeat india by wickets in od


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Screenshot stored next to the T5 model on Drive.
image_path = '/content/drive/MyDrive/deeplab/T5/Screenshot 2024-11-22 110957.png'

img = mpimg.imread(image_path)   # read the PNG into an array
fig, ax = plt.subplots()         # explicit figure/axes rather than the implicit pyplot state
ax.imshow(img)
ax.axis('off')                   # hide ticks and frame around the picture
plt.show()


## ***BERT***

In [None]:
import zipfile

# Extract the saved BERT encoder-decoder archive from Drive into a local folder.
bert_extract_dir = 'extracted_filesbert'
with zipfile.ZipFile('/content/drive/MyDrive/deeplab/bert/trained_model.zip', 'r') as zip_ref:
    zip_ref.extractall(bert_extract_dir)

# List the files in the folder that was just written to verify the extraction
# (the same directory that extractall targeted).
import os
os.listdir(bert_extract_dir)


['generation_config.json',
 'special_tokens_map.json',
 'tokenizer_config.json',
 'spiece.model',
 'model.safetensors',
 'config.json',
 'added_tokens.json']

In [None]:
!pip install torch



In [None]:
from transformers import BertTokenizer, EncoderDecoderModel
# Set device to CUDA if available, else use CPU.
# This assigns the same `device` global, with the same expression, as the T5 cell.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model and tokenizer from the saved directory
def load_model_and_tokenizer(load_dir='/content/extracted_filesbert'):
    """Load the BERT tokenizer and encoder-decoder model stored in ``load_dir``.

    The model is moved to the notebook-level ``device`` before it is returned.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(load_dir)
    seq2seq_model = EncoderDecoderModel.from_pretrained(load_dir)
    seq2seq_model.to(device)
    print(f"Model and tokenizer loaded from {load_dir}")
    return seq2seq_model, bert_tokenizer

# Load the model.
# NOTE(review): this reuses the names `loaded_model` / `loaded_tokenizer` that the
# T5 cell assigned, so from here on they refer to the BERT objects.
loaded_model, loaded_tokenizer = load_model_and_tokenizer(load_dir='/content/extracted_filesbert')


Using device: cpu
Model and tokenizer loaded from /content/extracted_filesbert


In [None]:
# Generate summaries for new texts
def generate_summary(model, tokenizer, text, max_input_length=512, max_output_length=40):
    """Summarize ``text`` with an encoder-decoder model using beam search.

    Args:
        model: model providing ``generate``, ``config.decoder_start_token_id``
            and a ``device`` attribute.
        tokenizer: tokenizer matching ``model``.
        text: the text to summarize.
        max_input_length: the input is truncated/padded to this many tokens.
        max_output_length: maximum length passed to ``model.generate``.

    Returns:
        str: the decoded first generated sequence, special tokens removed.
    """
    # Tokenize as a batch of one and place it on the model's own device.
    inputs = tokenizer(
        [text],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ).to(model.device)

    # The input is padded up to max_input_length, so the attention mask is
    # passed explicitly to keep the encoder from attending to padding tokens.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_output_length,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
        decoder_start_token_id=model.config.decoder_start_token_id,  # Ensure this is set
    )

    # Decode the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Quick check of the BERT summarizer on a short example text.
text = "The quick brown fox jumps over the lazy dog. This is just an example sentence for testing."
summary = generate_summary(
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    text=text,
)
print(f"Generated Summary: {summary}")


Generated Summary: short - swift dog jumps over dog jumps around over him working working gets congress hairs indian


In [None]:
# Second check of the BERT summarizer, on a lower-cased news-style snippet.
text = "saurav kant an alumnus of upgrad and iiit-b pg program in machine learning and artificial intelligence was sr systems engineer at infosys with almost years of work experience the program and upgrad 360-degree career support helped him transition to data scientist at tech mahindra with 90% salary hike upgrad online power learning has powered lakh+ careers."
summary = generate_summary(
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    text=text,
)
print(f"Generated Summary: {summary}")


Generated Summary: upgrad is up - up - srura - ypros worked at info assistantssssss


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Image stored next to the BERT model on Drive (presumably a results screenshot).
image_path = '/content/drive/MyDrive/deeplab/bert/bert.png'

img = mpimg.imread(image_path)   # read the PNG into an array
fig, ax = plt.subplots()         # explicit figure/axes rather than the implicit pyplot state
ax.imshow(img)
ax.axis('off')                   # hide ticks and frame around the picture
plt.show()


### **Final check: run all four models on the same input**

In [None]:

# text = "The quick brown fox jumps over the lazy dog. This is just an example sentence for testing."

In [None]:

# Load the T5 model and tokenizer under their own names. `loaded_model` and
# `loaded_tokenizer` were rebound by the BERT cell, so passing them to the T5
# helper would print BERT output under the "t5" label.
t5_tokenizer = T5Tokenizer.from_pretrained("/content/extracted_filest5")
t5_model = T5ForConditionalGeneration.from_pretrained("/content/extracted_filest5").to(device)

# NOTE(review): decode_sequence, decode_sequence2 and preprocess_input all read the
# tokenizer globals, which the attention cell assigned last. If the two Keras models
# were trained with different tokenizers, the no-attention summary below is decoded
# with the wrong vocabulary; confirm against the training setup.
while True:
    input_text = input("Enter a text to summarize (or type 'exit' to quit): ")
    if input_text.lower() == 'exit':
        print("Exiting...")
        break

    # Preprocess input for the two Keras encoder-decoder models
    input_sequence = preprocess_input(input_text)

    # 1) Encoder-decoder without attention
    print("\nOriginal Text: ", input_text)
    print("\n")
    predicted_summary = decode_sequence(input_sequence)
    print("Predicted Summary: ", predicted_summary)
    print("\n")

    # 2) Encoder-decoder with attention
    predicted_summary = decode_sequence2(input_sequence)
    print("\nOriginal Text: ", input_text)
    print("Predicted Summarywith attention: ", predicted_summary)
    print("\n")

    # 3) BERT encoder-decoder (`loaded_model` is the BERT model at this point)
    text = input_text
    summary = generate_summary(loaded_model, loaded_tokenizer, text)
    print(f"Generated Summary bert: {summary}")

    print("\n")
    # 4) T5, using the T5 objects loaded at the top of this cell
    summary = generate_summary_with_loaded_model(text, t5_model, t5_tokenizer)
    print("Generated Summary t5:", summary)
    print("\n")


Original Text:  new zealand defeated india by wickets in the fourth odi at hamilton on thursday to win their first match of the five-match odi series india lost an international match under rohit sharma captaincy after 12 consecutive victories dating back to march 2018 the match witnessed india getting all out for 92 their seventh lowest total in odi cricket history.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
!pip install rouge_score

In [None]:
import torch
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

# Generate summaries and calculate scores for 30 samples
def generate_summary(model, tokenizer, text, max_input_length=512, max_output_length=40):
    """Summarize ``text`` with an encoder-decoder model using beam search.

    Args:
        model: model providing ``generate``, ``config.decoder_start_token_id``
            and a ``device`` attribute.
        tokenizer: tokenizer matching ``model``.
        text: the text to summarize.
        max_input_length: the input is truncated/padded to this many tokens.
        max_output_length: maximum length passed to ``model.generate``.

    Returns:
        str: the decoded first generated sequence, special tokens removed.
    """
    # Tokenize as a batch of one and place it on the model's own device.
    inputs = tokenizer(
        [text],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    ).to(model.device)

    # The input is padded up to max_input_length, so the attention mask is
    # passed explicitly to keep the encoder from attending to padding tokens.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_output_length,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


file_path = '/content/post_prewithno_start.csv'  # Replace with the path to your CSV file
# Evaluate on (at most) the first 30 rows of the CSV.
df = pd.read_csv(file_path).head(30)

# Reference summaries and source texts as plain lists, so the positional
# pairing below does not depend on the DataFrame index.
reference_summaries = df["summary"].tolist()
texts = df["text"].tolist()
n_samples = len(texts)  # may be fewer than 30 if the CSV is short

# Generate one summary per text with the currently loaded model/tokenizer.
generated_summaries = []
for text in texts:
    summary = generate_summary(loaded_model, loaded_tokenizer, text)
    generated_summaries.append(summary)
    print(summary)

# Whitespace-tokenize once; corpus_bleu expects a list of reference lists per sample.
reference_tokens = [[ref.split()] for ref in reference_summaries]
generated_tokens = [gen.split() for gen in generated_summaries]

# Calculate the corpus-level BLEU score for the generated summaries
bleu_score = corpus_bleu(reference_tokens, generated_tokens)

print(f"BLEU Score: {bleu_score:.4f}")

# Per-sample BLEU: corpus_bleu applied to a corpus of exactly one sample.
sample_bleu_scores = [
    corpus_bleu([refs], [hyp]) for refs, hyp in zip(reference_tokens, generated_tokens)
]

plt.figure(figsize=(10, 6))
plt.plot(range(1, n_samples + 1), sample_bleu_scores, marker='o', linestyle='-', color='b')
plt.title(f"BLEU Scores for {n_samples} Summaries")
plt.xlabel("Sample Number")
plt.ylabel("BLEU Score")
plt.grid(True)
plt.show()

# ROUGE scores using the rouge_score library
from rouge_score import rouge_scorer

# Bind the scorer instance to its own name so the `rouge_scorer` module is not shadowed.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [
    scorer.score(reference, generated)
    for reference, generated in zip(reference_summaries, generated_summaries)
]

# Print out average ROUGE F-measures across the evaluated samples
average_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
average_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
average_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

print(f"Average ROUGE-1: {average_rouge1:.4f}")
print(f"Average ROUGE-2: {average_rouge2:.4f}")
print(f"Average ROUGE-L: {average_rougeL:.4f}")


In [None]:

# Per-sample F-measure for each ROUGE variant, pulled out of the stored score dicts.
rouge1_scores, rouge2_scores, rougeL_scores = (
    [score[variant].fmeasure for score in rouge_scores]
    for variant in ('rouge1', 'rouge2', 'rougeL')
)

# Draw BLEU and the three ROUGE series on one figure, one line per metric.
plt.figure(figsize=(12, 6))
for series, label in (
    (sample_bleu_scores, 'BLEU Scores'),
    (rouge1_scores, 'ROUGE-1 F1'),
    (rouge2_scores, 'ROUGE-2 F1'),
    (rougeL_scores, 'ROUGE-L F1'),
):
    plt.plot(series, label=label, marker='o')
plt.title("Evaluation Metrics for 30 Samples", fontsize=16)
plt.xlabel("Sample Index", fontsize=12)
plt.ylabel("Scores", fontsize=12)
plt.legend()
plt.grid()
plt.show()