In [None]:
# Import Necessary Libraries
# These are the libraries we need to work with data and the model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

# To ignore warnings for a cleaner output
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Dataset
# Fetch the "3.0.0" configuration of CNN/Daily Mail through the Hugging Face datasets library
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Show the returned object so the available splits and columns are visible
print(dataset)

In [None]:
# Explore the Dataset
# Print the first example of each split to understand the record structure
for split_name in ('train', 'validation', 'test'):
    print(dataset[split_name][0])

# Convert each split to a pandas DataFrame for easier manipulation.
# Dataset.to_pandas() is the conversion the datasets library provides; it avoids
# handing the split to the pd.DataFrame constructor, which has to walk it one
# example at a time.
df_train = dataset['train'].to_pandas()
df_valid = dataset['validation'].to_pandas()
df_test = dataset['test'].to_pandas()

# Print basic statistics about the data (train, validation, test, in that order)
for split_frame in (df_train, df_valid, df_test):
    print(split_frame.describe())

In [None]:
# Visualize Data
def _count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(text.split())


def _show_length_histogram(lengths, title):
    """Draw a 50-bin histogram of ``lengths`` with the given ``title`` and display it."""
    plt.hist(lengths, bins=50)
    plt.title(title)
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()


# How long are the training articles, measured in words?
article_lengths = df_train['article'].apply(_count_words)
_show_length_histogram(article_lengths, 'Distribution of Article Lengths')

# How long are the reference summaries (the 'highlights' column), measured in words?
summary_lengths = df_train['highlights'].apply(_count_words)
_show_length_histogram(summary_lengths, 'Distribution of Summary Lengths')

In [None]:
# Tokenizer round trip on one training article.
# T5Tokenizer is already imported in the notebook's first cell, so it is not
# imported again here.

# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Tokenize a sample article; anything beyond 512 tokens is truncated
sample_article = df_train['article'][0]
tokenized_article = tokenizer.encode(sample_article, max_length=512, truncation=True)
print(tokenized_article)

# Decode the token ids back to text to check what the tokenizer kept
decoded_article = tokenizer.decode(tokenized_article)
print(decoded_article)



In [None]:
# Initialize Tokenizer and Model
# Both objects are loaded from the same pretrained checkpoint so that the
# tokenizer's vocabulary matches the model's.
checkpoint_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [None]:
# Tokenize a Sample Article
# Encode the first training article as token ids, truncated to at most 512 tokens,
# and print the ids to see what the tokenized output looks like.
sample_article = df_train['article'][0]
tokenized_article = tokenizer.encode(
    sample_article,
    truncation=True,
    max_length=512,
)
print(tokenized_article)

In [None]:
# Decode the Tokenized Article
# Map the token ids back to text to verify the tokenization. Special tokens are
# kept (the default), so any end-of-sequence marker stays visible in the output.
decoded_article = tokenizer.decode(
    tokenized_article,
    skip_special_tokens=False,
)
print(decoded_article)

In [None]:
# Input Your Own Article
# Replace the placeholder text below with the article you want summarized.
custom_article = """
Your long article text goes here. Make sure to keep it within a reasonable length
to avoid issues with the model's maximum input length.
"""

# Tokenize the custom article.
# T5 is a text-to-text model that picks its task from a textual prefix; the
# pretrained checkpoints expect "summarize: " in front of text to be summarized.
# The article is stripped so the prefix sits directly before the first word.
inputs = tokenizer(
    "summarize: " + custom_article.strip(),
    return_tensors="pt",
    max_length=512,  # longer inputs are truncated to 512 tokens
    truncation=True,
)

# Generate summary with 4-beam search, between 40 and 150 tokens long.
# The attention mask is passed explicitly so generate() does not have to infer it.
summary_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=150,
    min_length=40,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)

# Decode the best beam back to plain text, dropping special tokens
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print("Summary:")
print(summary)
