In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tweets/valid.csv
/kaggle/input/tweets/train.csv
/kaggle/input/tweets/test.csv


**Installing and importing needed libraries**
* tweet-preprocessor for preprocessing of tweets: it handles URLs, Mentions, Reserved words (eg, RT, FAV, etc), Emojis
* Using GPU to run code

In [2]:
!pip install tweet-preprocessor
import preprocessor as p
import numpy as np
import pandas as pd
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.optimization import Adafactor, AdafactorSchedule
import torch
import huggingface_hub
import gc
from torch import nn 
import nltk
import datasets
!pip install rouge_score

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0
[0mCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=4f298b7c117b740e54b72db4fcf2b621be52c671a81adfb9e90618a85d650c63
  Stored in directory: /root/.cache/pip/wheels/8e/6b/70/59daa7c90a238610e34bac5916e001fe3d9bb0ec59c8cf5518
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[0m

**Note : This dataset is generated with help of ChatGPT because there is no Dataset available for Tweets and their Summaries**

**Loading Dataset**
* train.csv -> Contains Training Data (1148 rows)
* valid.csv -> Contains Validation Data (104 rows)
* test.csv -> Contains Testing Data (101 rows)

In [3]:
train = pd.read_csv('/kaggle/input/tweets/train.csv')
valid = pd.read_csv('/kaggle/input/tweets/valid.csv')
test = pd.read_csv('/kaggle/input/tweets/test.csv')
print(len(train))
print(len(test))
print(len(valid))

1148
101
104


In [4]:
train.head()

Unnamed: 0,inputs,summaries
0,Artificial intelligence is transforming the he...,AI is revolutionizing healthcare by improving ...
1,The future of transportation is electric. With...,"EVs are the future of transportation, and as w..."
2,"Remote work is here to stay, and it's changing...",Remote work is changing the way we balance wor...
3,Blockchain technology is transforming the way ...,Blockchain is transforming business by enablin...
4,The global food system is facing unprecedented...,Building a sustainable and equitable food syst...


In [5]:
for i in range(len(train)):
    train['inputs'][i] = p.clean(train['inputs'][i])

for i in range(len(test)):
    test['inputs'][i] = p.clean(test['inputs'][i])

for i in range(len(valid)):
    train['inputs'][i] = p.clean(valid['inputs'][i])

In [6]:
raw_datasets = datasets.DatasetDict({'train_dict': datasets.Dataset.from_dict(train),
                                    'valid_dict': datasets.Dataset.from_dict(valid),
                                    'test_dict': datasets.Dataset.from_dict(test)})

In [7]:
print(raw_datasets['train_dict'])
print(raw_datasets['valid_dict'])
print(raw_datasets['test_dict'])

Dataset({
    features: ['inputs', 'summaries'],
    num_rows: 1148
})
Dataset({
    features: ['inputs', 'summaries'],
    num_rows: 104
})
Dataset({
    features: ['inputs', 'summaries'],
    num_rows: 101
})


**Importing PLM, Tokenizer, metric for evaluation**
* Model: Google:Pegasus-Large (As shown in mandate 2 this model had best performance on dataset before training)
* Evaluation metrics: Rouge Score

In [8]:
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
metric = datasets.load_metric("rouge")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

**Preprocessing of Data**

* This function tokenizes inputs and summaries using pegasus tokenizer and returns data with labels.
* max_length: Max number of input tokens of data.
* We will use this tokenized dataset for training of model.

In [9]:
max_input_length = 512
max_target_length = 32

def preprocess(examples):
    model_inputs = tokenizer(examples['inputs'], max_length=max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summaries'], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(preprocess, batched=True)
print(tokenized_datasets)

  0%|          | 0/2 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train_dict: Dataset({
        features: ['inputs', 'summaries', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1148
    })
    valid_dict: Dataset({
        features: ['inputs', 'summaries', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 104
    })
    test_dict: Dataset({
        features: ['inputs', 'summaries', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 101
    })
})


**Finetuning of Model**

**Why do we freeze layers?**
Freezing layers during training prevents its weights from being modified as well as minimises computational time for training the model.

freeze_params: This function freezez parameters of model.

freeze_embeds: Used to freeze embed_positions and embed_tokens.

embed_positions: encoding that denotes the position of words.

embed_tokens: the pre-trained embeddings for different words.



In [10]:
def freeze_params(model: nn.Module):
    for par in model.parameters():
        par.requires_grad = False

def freeze_embeds(model):
    freeze_params(model.model.shared)
    for d in [model.model.encoder, model.model.decoder]:
        freeze_params(d.embed_positions)
        freeze_params(d.embed_tokens)

freeze_embeds(model)

**Arguments for training Model on Dataset**

* Data is trained using initial learning rate of 2e-5 with 5 train epochs.
* fp16: used 16 bit mixed precision training.
* optimizer: optimizer with weight decay fixed that can be used to fine-tuned models (used Adafactor)
* DataCollator: objects that will form a batch by using a list of dataset elements as input.

In [11]:
batch_size = 1
args = Seq2SeqTrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    output_dir = "none"
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
lr_scheduler = AdafactorSchedule(optimizer)

**Computing evaluation Metrics**
* This function is used to calculate rouge score of the model for model evaluation.

In [12]:
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
  decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
  result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
  result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)
  
  return {k: round(v, 4) for k, v in result.items()}

**Training Model on Dataset**
* train_dataset: tokenized_dataset is given as input for training of model.
* eval_dataset: tokenized validation dataset

In [13]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train_dict"],
    eval_dataset=tokenized_datasets["valid_dict"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics
)

trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.6897,0.85808,66.9326,53.0387,63.6352,63.407,23.1442
2,1.067,0.975797,62.5634,50.3595,59.8063,59.8884,24.4904
3,0.7517,1.175313,35.3475,23.7276,33.1597,33.2014,18.7019
4,0.6409,1.262829,43.7902,31.8434,41.334,41.0805,37.6154
5,0.5844,1.433784,25.5514,15.2676,24.1999,24.1141,19.2404




TrainOutput(global_step=2870, training_loss=0.9005455309505662, metrics={'train_runtime': 4022.4049, 'train_samples_per_second': 1.427, 'train_steps_per_second': 0.714, 'total_flos': 658261755346944.0, 'train_loss': 0.9005455309505662, 'epoch': 5.0})

**Output Prediction**
* trainer.predict: gives us output in encoded form
* tokenizer.decode: decodes the output from encoded form

In [14]:
out = trainer.predict(tokenized_datasets["test_dict"])

predicted_summaries = []
for i in range(0, 101): 
  predicted_summaries.append(tokenizer.decode(out[0][i], skip_special_tokens =  True))


**Rouge Score on test data**

In [15]:
out.metrics

{'test_loss': 1.9140664339065552,
 'test_rouge1': 45.949,
 'test_rouge2': 28.0132,
 'test_rougeL': 39.6724,
 'test_rougeLsum': 39.5181,
 'test_gen_len': 25.4257,
 'test_runtime': 76.4796,
 'test_samples_per_second': 1.321,
 'test_steps_per_second': 0.667}

**Testing model**
Giving set of tweets as input to trained model

In [2]:
input_tweet = ["Just finished an intense workout session at the gym! Feeling energized and ready to take on the day. 💪 #FitnessGoals #Workout. Remember, fitness is not just about the physical aspect. It's also about mental strength and overall well-being. Take care of your mind and body. #Fitness #Wellness. Finding the motivation to exercise can be tough sometimes, but the feeling you get after a great workout is worth it. Push through and stay committed to your fitness journey. #FitnessMotivation #StayActive"]
print(input_tweet)

["Just finished an intense workout session at the gym! Feeling energized and ready to take on the day. 💪 #FitnessGoals #Workout. Remember, fitness is not just about the physical aspect. It's also about mental strength and overall well-being. Take care of your mind and body. #Fitness #Wellness. Finding the motivation to exercise can be tough sometimes, but the feeling you get after a great workout is worth it. Push through and stay committed to your fitness journey. #FitnessMotivation #StayActive"]


**Preprocessing given input**

In [None]:
input_tweet = pd.DataFrame(input_tweet)
input_tweet.columns = ['inputs']

In [19]:
input_tweet['inputs'][0] = p.clean(input_tweet['inputs'][0])
inp = datasets.DatasetDict({'input_dict':datasets.Dataset.from_dict(input_tweet)})
inp

DatasetDict({
    input_dict: Dataset({
        features: ['inputs'],
        num_rows: 1
    })
})

**Tokenizing input**

In [20]:
max_input_length = 512
max_target_length = 32

def prep(examples):
    model_inputs = tokenizer(examples['inputs'], max_length=max_input_length, truncation=True)
    return model_inputs

tokenized_input = inp.map(prep, batched=True)
tokenized_input

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    input_dict: Dataset({
        features: ['inputs', 'input_ids', 'attention_mask'],
        num_rows: 1
    })
})

**Generating summary**

In [21]:
output = trainer.predict(tokenized_input['input_dict'])
predicted_summaries = []
for i in range(0, 1): 
  predicted_summaries.append(tokenizer.decode(output[0][i], skip_special_tokens =  True))

In [22]:
predicted_summaries

['The author completed a challenging workout at the gym and is experiencing a mix of physical and mental fortitude.']