# Data Collection

In [1]:
!pip install datasets




In [2]:
import pandas as pd
train_df = pd.read_excel('/content/drive/MyDrive/Gyanu.xlsx')  # change to pd.read_json for json files

train_df

Unnamed: 0,Article,Summary
0,Talk to someone who rode on the Titan submersi...,"Mike Reiss, a writer for “The Simpsons” televi..."
1,"Cairo, Egypt: Egyptian President Abdel Fattah ...",PM Modi Sunday visited the Heliopolis War Ceme...
2,Prime Minister Narendra Modi held a bilateral ...,"After completing his historic visit to the US,..."
3,Prime Minister Narendra Modi’s visit to Washin...,India considers Russia a time-tested ally from...
4,Amid the Wagner private military company’s ins...,Moscow Mayor Sergei Sobyanin asked the residen...
5,Indian-American Neal Katyal has been hailed as...,"Neal Katyal, who served as Acting Solicitor Ge..."
6,Prime Minister Narendra Modi’s recent historic...,PM Modi can be seen placing his palm in front ...
7,Prime Minister Narendra Modi recently made a p...,What is Uniform Civil Code and how it will imp...
8,Prime Minister Narendra Modi flagged off five ...,Prime Minister Narendra Modi shared a special ...
9,Prime Minister Narendra Modi on Wednesday cong...,India finished their campaign with 202 medals ...


In [3]:
train_df.columns = train_df.columns.str.strip()


In [4]:
train_df.columns

Index(['Article', 'Summary'], dtype='object')

# Data Preprocessing


In [5]:
import re


def clean_text(text):
    text = re.sub(r'\n',' ', text)  # remove newline character
    text = re.sub(r'\s+', ' ', text)  # replace multiple spaces with one
    return text.strip()


In [6]:
train_df['Article'] = train_df['Article'].apply(clean_text)
train_df['Summary'] = train_df['Summary'].apply(clean_text)

# Tokenization

In [7]:
!pip install transformers


from transformers import BartTokenizer






In [8]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')



In [9]:
def tokenize_data(example):
    # Tokenize the article and summary
    inputs = tokenizer(example['Article'], truncation=True, padding='max_length', max_length=250)
    outputs = tokenizer(example['Summary'], truncation=True, padding='max_length', max_length=250)


    # Return the inputs and the labels (the tokenized summary ids)
    return pd.Series({**inputs, 'labels': outputs.input_ids})




In [10]:
train_df = train_df.apply(lambda row: tokenize_data(row), axis=1)


In [11]:
pd.DataFrame(train_df)

Unnamed: 0,input_ids,attention_mask,labels
0,"[0, 27743, 7, 951, 54, 12783, 15, 5, 23308, 28...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 15827, 1223, 3006, 6, 10, 3331, 13, 44, 48..."
1,"[0, 347, 14387, 6, 5028, 35, 10377, 270, 18392...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 5683, 4698, 395, 3790, 5, 6851, 28119, 256..."
2,"[0, 25973, 692, 9975, 4698, 547, 10, 9526, 529...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 4993, 8796, 39, 3575, 825, 7, 5, 382, 6, 2..."
3,"[0, 25973, 692, 9975, 4698, 17, 27, 29, 825, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 11015, 9857, 798, 10, 86, 12, 33882, 7564,..."
4,"[0, 42332, 5, 11634, 940, 831, 138, 17, 27, 29...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 41384, 3129, 15458, 21055, 7010, 179, 553,..."
5,"[0, 25767, 12, 4310, 15454, 17454, 337, 34, 57...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 28138, 17454, 337, 6, 54, 1665, 25, 13711,..."
6,"[0, 25973, 692, 9975, 4698, 17, 27, 29, 485, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 5683, 4698, 64, 28, 450, 9405, 39, 14262, ..."
7,"[0, 25973, 692, 9975, 4698, 682, 156, 10, 3242...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 2264, 16, 39555, 5280, 8302, 8, 141, 24, 4..."
8,"[0, 25973, 692, 9975, 4698, 17253, 160, 292, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 25973, 692, 9975, 4698, 1373, 10, 780, 115..."
9,"[0, 25973, 692, 9975, 4698, 15, 307, 19412, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 11015, 1550, 49, 637, 19, 22991, 10214, 36..."


In [12]:
from datasets import load_dataset,Dataset

train_dataset = Dataset.from_pandas(train_df)


# Setting up the model and the training loops

In [13]:
from transformers import BartForConditionalGeneration

# Load pre-trained model
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')


In [14]:
# Defining the training arguments

!pip install accelerate -U

from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir=r'/content/drive/MyDrive/results',           # output directory for model predictions and checkpoints
    num_train_epochs=10,               # total number of training epochs
    per_device_train_batch_size=5,   # batch size per device during training
    weight_decay=0.01,                # strength of weight decay
)




In [15]:
from transformers import Trainer

# Create trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)


In [16]:
# Train the model

trainer.train()



Step,Training Loss


TrainOutput(global_step=80, training_loss=6.3993793487548825, metrics={'train_runtime': 91.1039, 'train_samples_per_second': 4.391, 'train_steps_per_second': 0.878, 'total_flos': 211631308800000.0, 'train_loss': 6.3993793487548825, 'epoch': 10.0})

In [17]:
# Save model
trainer.save_model("/content/drive/MyDrive/my_model")


# Evaluation

In [18]:
column_names = ["Article", "Summary"]



val_df = pd.read_excel(r"/content/drive/MyDrive/Evaluation_abhi.xlsx", sheet_name='Sheet2', header=None, names=column_names)
val_df

Unnamed: 0,Article,Summary
0,When we hear or read the term ‘world’s deadlie...,This creature is smaller than your nail and we...
1,While the 20s is regarded as the decade of has...,"Cici Zhang, a 32-year-old woman, was told by h..."
2,"The day between July 4 and Monday, July 3, acc...",It's a death sentence for people and ecosystem...
3,Leaders of the Shanghai Cooperation Organisati...,Both Putin and Xi are expected to visit New De...
4,"The Taliban has in a new verbal decree, banned...",The Taliban Ministry of Vice and Virtue ordere...
5,Prize-winning Ukrainian writer Victoria Amelin...,At least 11 others were killed and 61 were wou...
6,When we hear or read the term ‘world’s deadlie...,This creature is smaller than your nail and we...
7,Venezuelan’s enthusiasm for beauty pageants is...,he influencer and Instagram model has applied ...
8,A Tunisian man who had been charged with the m...,"Sabita Thanwani, 19, was found under the sheet..."
9,New York City has been overrun by a swarm of t...,Videos shared on social media platforms depict...


In [19]:
val_df['Article'] = val_df['Article'].apply(clean_text)
val_df['Summary'] = val_df['Summary'].apply(clean_text)

In [20]:
val_df = val_df.apply(lambda row: tokenize_data(row), axis=1)


In [21]:
val_dataset = Dataset.from_pandas(val_df)


In [22]:
# Evaluate the model on the unseen data
eval_results = trainer.evaluate(val_dataset)



In [23]:
# Access the evaluation loss
eval_loss = eval_results["eval_loss"]

In [24]:
eval_loss

3.9554550647735596

In [25]:
from transformers import pipeline


input_texts=['''The wedding season in the entertainment industry is in full swing. After Parineeti Chopra-Raghav Chadha’s engagement ceremony and Sunny Deol’s son Karan Deol’s wedding with Drisha Acharya, television industry celebs are also commencing the journey of companionship. Sreejita De, who has been in the news for quite some time ever since her stint at Bigg Boss 16 hosted by Salman Khan also decided to get hitched. She has been dating her German beau Michael Blohm-Pape for quite some time and the couple had been serious for taking their relationship forward.''']


# Load the trained model
model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/my_model')

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Create a summarization pipeline
summarizer = pipeline('summarization', model=model, tokenizer=tokenizer, device=0) # device=0 for using GPU, if available

# Define the generation hyperparameters
gen_config = {
    'max_length': 512,
    'num_beams': 4,
    'early_stopping': True,
}

# You can now use this summarizer on your input text
for text in input_texts:
    print(summarizer(text, **gen_config))



Your max_length is set to 512, but your input_length is only 134. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)


[{'summary_text': "The wedding season in the entertainment industry is in full swing. After Parineeti Chopra-Raghav Chadha’s engagement ceremony and Sunny Deol's son Karan Deela's wedding with Drisha Acharya, television industry celebs are also commencing the journey of companionship. Sreejita De, who has been in the news for quite some time ever since her stint at Bigg Boss 16 hosted by Salman Khan, decided to get hitched."}]
