In [1]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
val_df = pd.read_csv('validation.csv')


In [2]:
train_df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [3]:
test_df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [4]:
val_df.head()

Unnamed: 0,id,article,highlights
0,61df4979ac5fcc2b71be46ed6fe5a46ce7f071c3,"Sally Forrest, an actress-dancer who graced th...","Sally Forrest, an actress-dancer who graced th..."
1,21c0bd69b7e7df285c3d1b1cf56d4da925980a68,A middle-school teacher in China has inked hun...,Works include pictures of Presidential Palace ...
2,56f340189cd128194b2e7cb8c26bb900e3a848b4,A man convicted of killing the father and sist...,"Iftekhar Murtaza, 29, was convicted a year ago..."
3,00a665151b89a53e5a08a389df8334f4106494c2,Avid rugby fan Prince Harry could barely watch...,Prince Harry in attendance for England's crunc...
4,9f6fbd3c497c4d28879bebebea220884f03eb41a,A Triple M Radio producer has been inundated w...,Nick Slater's colleagues uploaded a picture to...


In [5]:
test_df.isnull().sum()

id            0
article       0
highlights    0
dtype: int64

In [6]:
def clean_text(text):
    text = text.lower()
    text = text.replace('\n', ' ').strip()
    return text

# Clean the 'article' and 'highlights' columns
train_df['article'] = train_df['article'].apply(clean_text)
train_df['highlights'] = train_df['highlights'].apply(clean_text)

test_df['article'] = test_df['article'].apply(clean_text)
test_df['highlights'] = test_df['highlights'].apply(clean_text)
val_df['article'] = val_df['article'].apply(clean_text)
val_df['highlights'] = val_df['highlights'].apply(clean_text)

print(train_df.head())


                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  by . associated press . published: . 14:11 est...   
1  (cnn) -- ralph mata was an internal affairs li...   
2  a drunk driver who killed a young woman in a h...   
3  (cnn) -- with a breezy sweep of his pen presid...   
4  fleetwood are the only team still to have a 10...   

                                          highlights  
0  bishop john folda, of north dakota, is taking ...  
1  criminal complaint: cop used his role to help ...  
2  craig eccleston-todd, 27, had drunk at least t...  
3  nina dos santos says europe must be ready to a...  
4  fleetwood top of league one after 2-0 win at s...  


In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary resources from NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    preprocessed_sentences = []
    
    for sentence in sentences:
        # Remove punctuation using regex
        sentence = re.sub(r'[^\w\s]', '', sentence)
        
        # Tokenize the sentence into words
        words = word_tokenize(sentence)
        
        # Remove stopwords and lemmatize words
        filtered_words = [
            lemmatizer.lemmatize(word.lower())  # Lemmatize and convert to lowercase
            for word in words if word.lower() not in stop_words and word.isalnum()
        ]
        
        # Rebuild the sentence from the filtered words
        preprocessed_sentence = ' '.join(filtered_words)
        preprocessed_sentences.append(preprocessed_sentence)
    
    # Join the sentences back into a full text
    preprocessed_text = ' '.join(preprocessed_sentences)
    return preprocessed_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mariu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mariu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mariu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def extractive_summary(text, top_n=3):
    sentences = sent_tokenize(text)
    
    # Create the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Calculate cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Sum the similarity scores for each sentence
    sentence_scores = cosine_sim.sum(axis=1)
    
    # Get the indices of the top_n sentences with highest scores
    ranked_sentences = sentence_scores.argsort()[-top_n:][::-1]
    
    # Extract the top N sentences
    summary = ' '.join([sentences[i] for i in ranked_sentences])
    
    return summary


In [9]:
from transformers import pipeline

# Load the pre-trained T5 model for summarization
summarizer = pipeline('summarization', model='t5-small')

def abstractive_summary(text):
    summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
    return summary[0]['summary_text']


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [10]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

data = [
    {
        "article": "The Apollo missions landed the first humans on the Moon. The program ran from 1961 to 1972 and achieved six successful lunar landings.",
        "summary": "The Apollo program landed humans on the Moon between 1961 and 1972."
    },
    {
        "article": "The Eiffel Tower is a wrought-iron lattice tower in Paris, France. It is one of the most recognizable structures in the world.",
        "summary": "The Eiffel Tower is a famous landmark in Paris."
    }
]

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare dataset
class SummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=128):
        self.inputs = []
        self.labels = []
        for item in data:
            input_enc = tokenizer(
                "summarize: " + item["article"],
                max_length=max_input_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            label_enc = tokenizer(
                item["summary"],
                max_length=max_output_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )
            self.inputs.append(input_enc.input_ids.squeeze())
            self.labels.append(label_enc.input_ids.squeeze())

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "input_ids": self.inputs[idx],
            "attention_mask": self.inputs[idx].ne(tokenizer.pad_token_id),
            "labels": self.labels[idx]
        }

train_dataset = SummaryDataset(data, tokenizer)

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_strategy="no",
    logging_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

trainer.train()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=1, training_loss=16.43187141418457, metrics={'train_runtime': 14.5338, 'train_samples_per_second': 0.138, 'train_steps_per_second': 0.069, 'total_flos': 270683602944.0, 'train_loss': 16.43187141418457, 'epoch': 1.0})

In [23]:
input_text = "The Great Wall of China is a historic structure built to protect states from invasions. It stretches over 13,000 miles."

# Tokenize input and generate summary
input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary
summary_ids = model.generate(input_ids, max_length=150, num_beams=4, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)


the Great wall of China stretches over 13,000 miles.


In [25]:
model.save_pretrained("./t5-summary-model")
tokenizer.save_pretrained("./t5-summary-model")


('./t5-summary-model\\tokenizer_config.json',
 './t5-summary-model\\special_tokens_map.json',
 './t5-summary-model\\spiece.model',
 './t5-summary-model\\added_tokens.json')