!pip install scikit-learn
!pip install evaluate
!pip install rouge-score

In [None]:
# Load the ROUGE metric from the `evaluate` library; `rouge.compute(...)` is
# called in the evaluation cells to score each generated summary against the
# reference summary
import evaluate
rouge = evaluate.load('rouge')  # Load ROUGE metric for evaluating summarization performance

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the model
# files are written to, paths under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

**Data Processing**

In [None]:
# Import pandas for data processing
import pandas as pd

In [None]:
# Load the dataset from Google Drive, reading only the 'highlights' and
# 'article' columns and only the first 5000 rows of the CSV
df = pd.read_csv("/content/drive/MyDrive/AI LAB PROJECT/train.csv", usecols=['highlights', 'article'], nrows=5000)
df  # Display the dataset

In [None]:
# Rename 'article' -> 'source_text' (model input) and 'highlights' ->
# 'target_text' (expected summary), then put the source column first
# NOTE(review): presumably these are the column names SimpleT5 expects — confirm
df = df.rename(columns={"highlights": "target_text", "article": "source_text"})
df = df[['source_text', 'target_text']]
df.head()  # Display the first few rows

In [None]:
# Add the "summarize: " task prefix to every source text for the T5 input format
# NOTE: the prefix is prepended to whatever the column currently holds, so
# re-running this cell without reloading `df` adds the prefix a second time
df['source_text'] = "summarize: " + df['source_text']
df.head()  # Check the modified data

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the same rows land in each split on every run.
# Without it, each execution trains and evaluates on a different random
# partition, so losses and scores are not comparable between runs.
SPLIT_SEED = 42
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)  # 80% training, 20% testing
train_df.shape, test_df.shape  # Display shapes of training and testing sets

In [None]:
# Address encoding issues for T5 training by forcing the reported preferred
# encoding to UTF-8 for the rest of the session
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Return "UTF-8" regardless of the environment's locale settings.

    The parameter mirrors the standard-library signature of
    ``locale.getpreferredencoding(do_setlocale=True)`` so that callers which
    pass the argument (e.g. ``locale.getpreferredencoding(False)``) keep
    working after the function is replaced; its value is ignored.
    """
    return "UTF-8"


locale.getpreferredencoding = _utf8_preferred_encoding

**Google T5 Model Tuning**

! pip install simplet5 -q

In [None]:
# Import the SimpleT5 wrapper (installed by the pip command above)
from simplet5 import SimpleT5
# Create the wrapper and load the pre-trained "t5-base" weights as the
# starting point for fine-tuning
model = SimpleT5()
model.from_pretrained(model_type="t5", model_name="t5-base")

In [None]:
# Count the training samples (number of entries in the 'source_text' column)
size = train_df['source_text'].size

size  # Display the size (number of training samples)

4000

In [None]:
# Fine-tune the T5 model on the training split; checkpoints are written to
# `outputdir` on Google Drive
model.train(
    train_df=train_df[:4000],         # Training data (at most the first 4000 rows)
    eval_df=test_df[:100],           # Evaluation data (first 100 rows of the test split)
    source_max_token_len=512,        # Maximum token length for the input
    target_max_token_len=150,        # Maximum token length for the output
    outputdir="/content/drive/MyDrive/Colab Notebooks/Model",  # Directory to save the model
    batch_size=16,                   # Batch size
    max_epochs=8,                    # Number of epochs
    use_gpu=True                     # Request GPU training (NOTE(review): behaviour without a GPU not verified here)
)

In [None]:
# Load the fine-tuned model for inference
# NOTE: the checkpoint directory name is hard-coded; it presumably encodes the
# epoch and the train/validation losses of one particular training run, so
# after re-training it must be updated to the directory actually written
# under the training output directory
model.load_model(
    "t5",
    "/content/drive/MyDrive/Colab Notebooks/Model/simplet5-epoch-7-train-loss-0.9762-val-loss-1.6444",
    use_gpu=True
)

**YouTube Video to Text Processing**

!pip install deepmultilingualpunctuation
!pip install deepmultilingualpunctuation youtube-transcript-api -q

In [None]:
# Load the punctuation-restoration model used to punctuate the raw transcript
from deepmultilingualpunctuation import PunctuationModel
# Kept under its own name so it does not overwrite `model`, the T5 model that
# is used for prediction further down
punctuation_model = PunctuationModel()

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# Provide the YouTube video link and extract its video ID
from urllib.parse import parse_qs, urlparse


def extract_video_id(url):
    """Return the video ID contained in a YouTube URL.

    Supports ``watch?v=<id>`` links (with any additional query parameters in
    any order), ``youtu.be/<id>`` short links and ``/embed/<id>``,
    ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths.

    Raises:
        ValueError: if no video ID can be found in ``url``.
    """
    parsed = urlparse(url)

    # Standard watch links carry the ID in the "v" query parameter; parsing the
    # query avoids picking up the value of a later parameter such as "t".
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]

    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    # Short links: https://youtu.be/<id>
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]

    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    raise ValueError(f"Could not find a video ID in {url!r}")


link = "https://www.youtube.com/watch?v=veMFCFyOwFI"
video_id = extract_video_id(link)
video_id  # Display the video ID

'veMFCFyOwFI'

In [None]:
# Fetch the transcript for the YouTube video as a list of segment dicts
# (the following cell reads the "text" field of each segment)
if hasattr(YouTubeTranscriptApi, "get_transcript"):
    # Releases of youtube-transcript-api that still ship the static helper
    pretext = YouTubeTranscriptApi.get_transcript(video_id)
else:
    # Releases without the static helper: fetch through an instance and
    # convert the result back to the plain list-of-dicts form
    pretext = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
pretext  # Display the raw transcript

In [None]:
# Concatenate the "text" field of every transcript segment into one string;
# each segment is followed by a single space (so the result ends with a space)
text = "".join(segment["text"] + " " for segment in pretext)

text  # This now contains the entire transcript as a single string


In [None]:
# Restore punctuation in the plain transcript text using the punctuation model;
# `text` is overwritten with the punctuated version, which the later cells use
text = punctuation_model.restore_punctuation(text)  # Processes the text to add punctuation
print(text)  # Displays the text with restored punctuation

In [None]:
# Prepare the punctuated transcript for summarization by adding the same
# "summarize: " prefix that was added to the training inputs
finaltext = "summarize: " + text  # Prefixes "summarize:" to the text as required by the T5 model
print(finaltext)  # Displays the final formatted text

**Google T5 Abstractive Summarization Result**

In [None]:
# Generate a summary of the transcript with the fine-tuned T5 model
# NOTE(review): the whole transcript is passed in a single call; presumably the
# input is truncated to the model's maximum input length — confirm whether
# long transcripts need to be chunked
t5_result = model.predict(finaltext)  # Predict a summary for the given input text (finaltext)
print(t5_result[0])  # Print the first summary from the list of predictions

**Saving Model**

In [None]:
import pickle  # Library for serializing and saving Python objects

# Save the trained model object to Google Drive for later use
# NOTE(review): this pickles the whole `model` wrapper object. Training already
# writes checkpoints to its output directory, and `model.load_model(...)` reads
# them, so confirm this extra pickle file is needed. Only unpickle files that
# come from a trusted source.
with open('/content/drive/MyDrive/AI LAB PROJECT/model.pkl', 'wb') as f:
    pickle.dump(model, f)  # Save the model object as a binary file

**Text Preprocessing with SpaCy**

***Load SpaCy Model***

In [None]:
import spacy  # Library for natural language processing
nlp = spacy.load('en_core_web_sm')  # Load a small English language model

# Convert the transcript into a SpaCy document object for linguistic processing
doc = nlp(text)  # 'text' is the transcript after punctuation restoration

***Define Stopwords and Punctuation***

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS  # Import default English stopwords
stopwords = list(STOP_WORDS)  # Copy the stopwords into a list

from string import punctuation  # Import string punctuation characters
# Rebinds the imported name: `punctuation` is now one string holding all
# punctuation characters plus the newline character
punctuation = punctuation + '\n'  # Include newline as punctuation

***Calculate Word Frequencies***

In [None]:
# Count how often each meaningful word occurs in the transcript and scale the
# counts so that the most frequent word has a value of 1.0

# Membership tests against a set are O(1); `stopwords` itself is a list
stopword_set = set(stopwords)

# Maps lower-cased word -> frequency. Keys are lower-cased because the
# sentence-scoring step looks words up by `word.text.lower()`; keeping the
# original case here would make capitalised words impossible to find.
wordfreq = {}

# Iterate through each token in the processed document
for word in doc:
    token_text = word.text.lower()
    # Skip stopwords and punctuation (`punctuation` is a string, so this is a
    # substring test against the punctuation characters)
    if token_text in stopword_set or token_text in punctuation:
        continue
    wordfreq[token_text] = wordfreq.get(token_text, 0) + 1

# Normalize word frequencies so the most frequent word has a value of 1.0.
# The maximum is taken once, before any value is rewritten: recomputing it
# inside the loop would divide later words by an already-normalized maximum.
if wordfreq:
    max_count = max(wordfreq.values())
    wordfreq = {token_text: count / max_count for token_text, count in wordfreq.items()}


In [None]:
# Split the processed transcript into sentences: `doc.sents` yields one SpaCy
# `Span` per sentence, and the spans are collected into a list
senttokens = list(doc.sents)

# Show the list of sentence spans
senttokens


In [None]:
# Score each sentence by adding up the normalized frequencies of its words
sentweight = {}  # Maps sentence span -> accumulated weight

for sentence in senttokens:
    for token in sentence:
        lookup_key = token.text.lower()
        # Words without a frequency entry contribute nothing to the score
        if lookup_key not in wordfreq:
            continue
        if sentence in sentweight:
            # Sentence already has a score: add this word's weight to it
            sentweight[sentence] += wordfreq[lookup_key]
        else:
            # First scored word of this sentence: start from its weight
            sentweight[sentence] = wordfreq[lookup_key]

# Show the mapping; sentences with no scored word have no entry
sentweight

**SpaCy Extractive Summarization Result**

In [None]:
# Importing `nlargest` to retrieve the highest weighted sentences for summarization
from heapq import nlargest

# Fraction of the transcript's sentences to keep in the summary
SUMMARY_RATIO = 0.05

# Number of sentences in the summary: 5% of all sentences, but never fewer
# than one. For a transcript with fewer than 20 sentences the plain
# int(len * 0.05) is 0, which would produce an empty summary.
sizee = max(1, int(len(senttokens) * SUMMARY_RATIO))

# Pick the `sizee` sentences with the highest weights; iterating `sentweight`
# yields its keys (the sentences) and `key=sentweight.get` ranks them by weight
summ = nlargest(sizee, sentweight, key=sentweight.get)

# `summ` contains the list of selected sentences for the summary
summ

In [None]:
# Convert the selected sentences to plain strings: `summ` holds the
# top-weighted sentences as SpaCy `Span` objects, and `.text` gives the text
# content of each one
spacy_res = [sentence.text for sentence in summ]

# Show the summary sentences as a list of strings
print(spacy_res)

In [None]:
# Join the selected sentences into a single string separated by spaces;
# this string is the final summary of the SpaCy-based extractive method

spacy_result = ' '.join(spacy_res)

# Print the final summarized text
print(spacy_result)

**Evaluation**

In [None]:
# Reference summary that both generated summaries are scored against with
# ROUGE; kept as a one-element list to pair with the single prediction passed
# in each `rouge.compute(...)` call
references = [
    "The Saudis and Iranians have never actually declared war on each other. Instead, they fight indirectly by supporting opposing sides in other countries and inciting conflicts. This is known as proxy warfare. And it’s had a devastating effect on the region. Countries, especially poor ones, can’t function if there are larger countries pulling strings within their borders. And that’s exactly what's happening in the Middle East. The Saudi-Iranian rivalry has become a fight over influence, and the whole region is a battlefield."
]

In [None]:
# Prediction from the T5 model: the first generated summary, wrapped in a
# one-element list to pair with the one-element `references` list
predictions_t5 = [t5_result[0]]

# Compute the ROUGE score between the T5 summary and the reference text
results_t5 = rouge.compute(predictions=predictions_t5, references=references)

# Print the ROUGE score results
print(results_t5)

In [None]:
# Prediction from the SpaCy-based extractive method, wrapped in a one-element
# list to pair with the one-element `references` list
predictions_spacy = [spacy_result]

# Compute the ROUGE score between the SpaCy-generated summary and the reference text
results_spacy = rouge.compute(predictions=predictions_spacy, references=references)

# Print the ROUGE score results
print(results_spacy)