In [None]:
!pip install simplet5
!pip install datasets

In [None]:
!pip install seaborn
!pip install matplotlib
!pip install simpletransformers
!pip install rouge


In [None]:
from datasets import load_dataset

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load a 2% slice of every CNN/DailyMail split. This reproduces the row counts
# recorded further down in this notebook (5742 train / 267 validation / 230 test)
# and keeps the in-memory DataFrame conversion small.
# Set SPLIT_SLICE = "" to load the complete splits instead.
SPLIT_SLICE = "[:2%]"

train_dataset = load_dataset("cnn_dailymail", "3.0.0", split=f"train{SPLIT_SLICE}")

valid_dataset = load_dataset("cnn_dailymail", "3.0.0", split=f"validation{SPLIT_SLICE}")

test_dataset = load_dataset("cnn_dailymail", "3.0.0", split=f"test{SPLIT_SLICE}")


In [None]:
# Materialise each loaded split as a pandas DataFrame (one column per dataset field).
train_data, test_data, validate_data = (
    pd.DataFrame(split.to_dict())
    for split in (train_dataset, test_dataset, valid_dataset)
)


In [None]:
# Number of training rows.
len(train_data)

5742

In [None]:
# Number of test rows.
len(test_data)

230

In [None]:
# Number of validation rows.
len(validate_data)

267

In [None]:
import re
import string

import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus before it is read below.
nltk.download('stopwords')

# English stop words as a set, so the membership test in clean_text is cheap.
stop_words = set(stopwords.words('english'))

def clean_text(article, stopword_set=None):
    """Normalise raw text: lowercase it and drop numbers, stop words and punctuation.

    Args:
        article: Text to clean. Non-string values (e.g. ``None`` or ``NaN``
            coming from a pandas column) are treated as missing text.
        stopword_set: Optional collection of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The surviving words joined by single spaces; ``''`` for missing or
        empty input.
    """
    if not isinstance(article, str):
        return ''
    words_to_drop = stop_words if stopword_set is None else stopword_set

    # Convert text to lowercase and remove standalone numbers
    article = re.sub(r'\b\d+\b', '', article.lower())

    punctuation_table = str.maketrans('', '', string.punctuation)
    kept_words = []
    for token in article.split():
        # Stop-word lists can contain contractions (NLTK's English list has
        # entries such as "don't"). Look the token up with its inner apostrophe
        # intact as well as in its punctuation-free form; stripping punctuation
        # first would turn "don't" into "dont", which never matches.
        trimmed = token.strip(string.punctuation)
        stripped = re.sub(r'[^\w\s]', '', token)
        if trimmed in words_to_drop or stripped in words_to_drop:
            continue

        # Remove any punctuation the regex keeps (underscores count as \w)
        word = stripped.translate(punctuation_table)
        if word:
            kept_words.append(word)

    return ' '.join(kept_words)

# Clean both text columns of the training frame in place, then show the articles.
for column in ('article', 'highlights'):
    train_data[column] = train_data[column].apply(clean_text)

train_data['article']


In [None]:
# Clean both text columns of the test frame in place, then show the articles.
for column in ('article', 'highlights'):
    test_data[column] = test_data[column].apply(clean_text)

test_data['article']

In [None]:
# Clean both text columns of the validation frame in place, then show the articles.
for column in ('article', 'highlights'):
    validate_data[column] = validate_data[column].apply(clean_text)

validate_data['article']

In [None]:
# Switch to the input_text / target_text column names used by the rest of the notebook.
train_data = train_data.rename(
    columns={
        "article": "input_text",
        "highlights": "target_text",
    }
)
train_data

In [None]:
# Switch to the input_text / target_text column names used by the rest of the notebook.
test_data = test_data.rename(
    columns={
        "article": "input_text",
        "highlights": "target_text",
    }
)
test_data

In [None]:
# Switch to the input_text / target_text column names used by the rest of the notebook.
validate_data = validate_data.rename(
    columns={
        "article": "input_text",
        "highlights": "target_text",
    }
)
validate_data

In [None]:
# Keep only the two model columns. .copy() makes this an independent frame, so
# the column assignments performed on train_data in later cells act on a real
# DataFrame rather than on a slice (which triggers SettingWithCopyWarning).
train_data = train_data[['input_text', 'target_text']].copy()
train_data

In [None]:
# Keep only the two model columns of the test frame.
test_data = test_data.loc[:, ['input_text', 'target_text']]
test_data

In [None]:
# Keep only the two model columns of the validation frame.
validate_data = validate_data.loc[:, ['input_text', 'target_text']]
validate_data

In [None]:
# No manual "summarize: " prefix is added to input_text here. simpletransformers
# builds each training input as "<prefix>: <input_text>" from the `prefix`
# column (added in the next cell), so prepending the prefix to input_text as
# well would train the model on "summarize: summarize: ..." inputs.


In [None]:
# Task name stored in the `prefix` column of every training row.
TASK_PREFIX = "summarize"
train_data["prefix"] = TASK_PREFIX

train_data

In [None]:
!pip install torch

In [None]:
import torch
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from simpletransformers.t5 import T5Model
import sklearn


In [None]:
# import required libraries
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# load the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Collects the baseline summaries produced further down.
data = pd.DataFrame()

def generate_summary(text):
    """Summarise ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Article text. The "summarize: " task prefix is added here, so
            callers pass the bare article.

    Returns:
        The first generated sequence decoded to a string, without special
        tokens.
    """
    # preprocess the text: line breaks become spaces (deleting them outright
    # would glue the last word of one line to the first word of the next)
    preprocess_text = text.strip().replace("\n", " ")
    t5_prepared_Text = "summarize: " + preprocess_text

    # encode the text using the tokenizer and place the ids on whatever device
    # the model lives on. Tensor.to() returns a new tensor, so its result must
    # be used rather than discarded.
    tokenized_text = tokenizer.encode(
        t5_prepared_Text, max_length=512, truncation=True, return_tensors="pt"
    ).to(model.device)

    # generate the summary
    summary_ids = model.generate(tokenized_text,
                                    num_beams=4,
                                    no_repeat_ngram_size=2,
                                    min_length=30,
                                    max_length=250,
                                    early_stopping=True)
    summary_ids = summary_ids.to('cpu')

    # decode the summary and return
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output

# Summarise every test article with the baseline model and persist the result.
baseline_summaries = test_data['input_text'].apply(generate_summary)
data['summary'] = baseline_summaries
data.to_csv('output_data.csv', index=False)


In [None]:
from rouge import Rouge

# Reference summaries from the test split.
expected = test_data['target_text'].tolist()
# Read the baseline summaries back from the same relative path they were
# written to ('output_data.csv'); an absolute '/content/...' path only resolves
# when the working directory happens to be /content.
generated = pd.read_csv('output_data.csv')['summary'].tolist()

rouge = Rouge()

# avg=True averages the scores over all (generated, expected) pairs.
scores = rouge.get_scores(generated, expected, avg=True)

print(scores)

In [None]:
# Training / generation options passed to simpletransformers' T5Model.
args = {
    "reprocess_input_data": False,
    "overwrite_output_dir": True,
    "max_seq_length": 512,
    "num_train_epochs": 5,
    "num_beams": None,
    "do_sample": True,
    "top_k": 10,
    "top_p": 0.5,
    "use_multiprocessing": False,
    "save_steps": -1,
    "save_eval_checkpoints": True,
    "evaluate_during_training": False,
    "adam_epsilon": 1e-08,
    "eval_batch_size": 10,
    # The option is spelled "fp16"; the previous "fp_16" key was not a
    # recognised setting, so mixed precision stayed at the library default.
    "fp16": False,
    "gradient_accumulation_steps": 16,
    # NOTE(review): 0.01 looks high for fine-tuning a pretrained T5 - confirm.
    "learning_rate": 0.01,
    "max_grad_norm": 1.0,
    "n_gpu": 1,
    "seed": 42,
    "train_batch_size": 10,
    "warmup_steps": 0,
    "weight_decay": 0.01,
}

In [None]:
# Build the simpletransformers T5 wrapper from the "t5-small" checkpoint.
# From here on `model` refers to this wrapper.
model = T5Model(
    "t5",
    "t5-small",
    args=args,
    use_cuda=True,
)

In [None]:
# Evaluate on the validation split, not the test split, so the test set stays
# untouched for the final ROUGE comparison. simpletransformers expects
# eval_data to carry the same prefix / input_text / target_text columns as
# train_data, so the task prefix is added here.
eval_frame = validate_data.assign(prefix="summarize")

# Extra keyword arguments to train_model() are treated as metric functions,
# which is why `acc` is passed this way. `use_cuda` is a constructor option,
# not a metric, so it is no longer passed here.
train = model.train_model(train_data, eval_data=eval_frame, acc=sklearn.metrics.accuracy_score)

In [None]:
data = pd.DataFrame()

def generate_summary(text):
    """Summarise ``text`` with the fine-tuned network.

    Args:
        text: Article text. The "summarize: " task prefix is added here, so
            callers pass the bare article.

    Returns:
        The first generated sequence decoded to a string, without special
        tokens.
    """
    # `train` holds the return value of train_model(), not a model, so it has
    # no generate(). The fine-tuned network is the Hugging Face model wrapped by
    # the simpletransformers `model` object (its `.model` attribute); fall back
    # to `model` itself when it already is a plain Hugging Face model.
    network = getattr(model, "model", model)

    # preprocess the text: line breaks become spaces (deleting them outright
    # would glue the last word of one line to the first word of the next)
    preprocess_text = text.strip().replace("\n", " ")
    t5_prepared_Text = "summarize: " + preprocess_text

    # encode the text using the tokenizer and place the ids on the network's
    # device. Tensor.to() returns a new tensor, so its result must be used.
    tokenized_text = tokenizer.encode(
        t5_prepared_Text, max_length=512, truncation=True, return_tensors="pt"
    ).to(network.device)

    # generate the summary
    summary_ids = network.generate(tokenized_text,
                                    num_beams=4,
                                    no_repeat_ngram_size=2,
                                    min_length=30,
                                    max_length=250,
                                    early_stopping=True)
    summary_ids = summary_ids.to('cpu')

    # decode the summary and return
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return output
# apply the function to the dataframe and save the summaries.
# Series.apply calls generate_summary once per article; iterating over the
# column yields plain strings, which have no .apply method.
data = pd.DataFrame()
data['summary'] = test_data['input_text'].apply(generate_summary)
data.to_csv('output.csv', index=False)

In [None]:
import os

# Checkpoint directory: "outputs" under the current working directory
# (presumably where the training run above saved the model - confirm).
root_dir = os.getcwd()
trained_model_path = os.path.join(root_dir, "outputs")

# Generation settings for the reloaded model. Note that this rebinds `args`.
args = dict(
    max_seq_length=512,
    num_return_sequences=5,
    min_length=30,
    max_length=250,
    early_stopping=True,
    repetition_penalty=1.5,
    length_penalty=2.0,
    top_k=50,
    top_p=0.5,
)

trained_model = T5Model("t5", trained_model_path, args=args)


In [None]:
# (Removed a bare `predict` expression: that name is only assigned in a later
# cell, so evaluating it here raises NameError on a fresh kernel.)

In [None]:
# Build the model inputs from the 'input_text' column. predict() does not add
# the task prefix itself, so "summarize: " is prepended to every article here.
input_data = ["summarize: " + article for article in test_data['input_text']]

# Pass the input data to the model's predict() method
predictions = trained_model.predict(input_data)

In [None]:
# predict() yields one string per input, or a list of candidates per input when
# num_return_sequences > 1. Keep the first candidate in that case so the column
# holds plain strings; writing lists to CSV would store their Python repr and
# the ROUGE step would then score that repr.
prediction = pd.DataFrame()
prediction['summary'] = [
    candidates[0] if isinstance(candidates, (list, tuple)) else candidates
    for candidates in predictions
]
# Save the summaries to a new CSV file
prediction.to_csv('tested_output.csv', index=False)

In [None]:
from rouge import Rouge

# Reference summaries from the test split.
expected = test_data['target_text'].tolist()
# Read the fine-tuned model's summaries back from the same relative path they
# were written to ('tested_output.csv'); an absolute '/content/...' path only
# resolves when the working directory happens to be /content.
generated = pd.read_csv('tested_output.csv')['summary'].tolist()

rouge = Rouge()

# avg=True averages the scores over all (generated, expected) pairs.
scores = rouge.get_scores(generated, expected, avg=True)

print(scores)

In [None]:
# Spot check of the reloaded model on one hand-written article. The literal
# already starts with the "summarize: " task prefix and, because the closing
# quotes sit on the next line, ends with a newline and a space.
text = '''summarize: Fit-again Betsen in France squad France have brought flanker Serge Betsen back into their squad to face England at Twickenham on Sunday.But the player, who missed the victory over Scotland through injury, must attend a disciplinary hearing on Wednesday after being cited by Wasps. Serge has a good case so we are confident he will play said France coach Bernard Laporte. The inexperienced Nicolas Mas, Jimmy Marlu and Jean-Philippe Grandclaude are also included in a 22-man squad. The trio have been called up after Pieter de Villiers, Ludovic Valbon and Aurelien Rougerie all picked up injuries in France's 16-9 win on Saturday.Laporte said he was confident that Betsen would be cleared by the panel investigating his alleged trip that broke Wasps centre Stuart Abbott's leg. If he was to be suspended, we would call up Imanol Harinordoquy or Thomas Lievremont,said Laporte, who has dropped Patrick Tabacco. We missed Serge badly against Scotland. He has now recovered from his thigh injury and played on Saturday with Biarritz.France's regular back-row combination of Betsen, Harinordoquy and Olivier Magne were all missing from France side at the weekend because of injury. Laporte is expected to announce France starting line-up on Wednesday.Forwards: Nicolas Mas, Sylvain Marconnet, Olivier Milloud, William Servat, Sebastien Bruno, Fabien Pelous, Jerome Thion, Gregory Lamboley, Serge Betsen, Julien Bonnaire, Sebastien Chabal, Yannick Nyanga. Backs: Dimitri Yachvili, Pierre Mignoni, Frederic Michalak, Yann Delaigue, Damien Traille, Brian Liebenberg, Jean-Philippe Grandclaude, Christophe Dominici, Jimmy Marlu, Pepito Elhorga
 '''
# predict() takes a list of inputs, hence the one-element list.
predict = trained_model.predict([text])

In [None]:
# Task prefix interpolated at the start of the f-string below.
prefix = "summarize"

# Predict on a single article (it appears to be the same text as in the `text`
# cell above, here without the trailing newline), passed as a one-element list.
pred = trained_model.predict([f"{prefix}: Fit-again Betsen in France squad France have brought flanker Serge Betsen back into their squad to face England at Twickenham on Sunday.But the player, who missed the victory over Scotland through injury, must attend a disciplinary hearing on Wednesday after being cited by Wasps. Serge has a good case so we are confident he will play said France coach Bernard Laporte. The inexperienced Nicolas Mas, Jimmy Marlu and Jean-Philippe Grandclaude are also included in a 22-man squad. The trio have been called up after Pieter de Villiers, Ludovic Valbon and Aurelien Rougerie all picked up injuries in France's 16-9 win on Saturday.Laporte said he was confident that Betsen would be cleared by the panel investigating his alleged trip that broke Wasps centre Stuart Abbott's leg. If he was to be suspended, we would call up Imanol Harinordoquy or Thomas Lievremont,said Laporte, who has dropped Patrick Tabacco. We missed Serge badly against Scotland. He has now recovered from his thigh injury and played on Saturday with Biarritz.France's regular back-row combination of Betsen, Harinordoquy and Olivier Magne were all missing from France side at the weekend because of injury. Laporte is expected to announce France starting line-up on Wednesday.Forwards: Nicolas Mas, Sylvain Marconnet, Olivier Milloud, William Servat, Sebastien Bruno, Fabien Pelous, Jerome Thion, Gregory Lamboley, Serge Betsen, Julien Bonnaire, Sebastien Chabal, Yannick Nyanga. Backs: Dimitri Yachvili, Pierre Mignoni, Frederic Michalak, Yann Delaigue, Damien Traille, Brian Liebenberg, Jean-Philippe Grandclaude, Christophe Dominici, Jimmy Marlu, Pepito Elhorga"])
print(pred)