In [None]:
# Install the third-party packages this notebook imports below (quiet mode hides most pip output).
!pip install rouge --quiet
!pip install transformers --quiet

[K     |████████████████████████████████| 2.2MB 10.1MB/s 
[K     |████████████████████████████████| 870kB 43.6MB/s 
[K     |████████████████████████████████| 3.3MB 41.3MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import re
import json
import numpy as np
import pandas as pd

from rouge import Rouge 
from tqdm import tqdm_notebook

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import EncoderDecoderModel, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

from sklearn.model_selection import train_test_split

# Report whether a CUDA device is visible and which torch build is running.
cuda_visible = torch.cuda.is_available()
torch_build = torch.__version__
print("GPU Torch Available = {}".format(cuda_visible))
print("Torch Version = {}".format(torch_build))

GPU Torch Available = True
Torch Version = 1.8.1+cu101


#### **Encoder Decoder Model**

In [None]:
# Model Selection

# Regular Models
bert_base_cased = 'bert-base-cased'
roberta_base = 'roberta-base'
gpt2 = 'gpt2'
electra = 'google/electra-small-discriminator'
t5_base = 't5-base'
bart = 'facebook/bart-base'

# Heavy Memory Dependant Models (For High RAM and High GPU Systems)
bert_large_cased = 'bert-large-cased'
roberta_large = 'roberta-large'
gpt2_medium = 'gpt2-medium'
t5_large = 't5-large'
bart_large = 'facebook/bart-large'

# Select Pretrained Weights
Pretrained_Weight = bert_large_cased                  # Select Pretrained Weights

# Encoder-Decoder: the same checkpoint warm-starts both halves; the decoder's
# cross-attention layers are newly initialised and must be fine-tuned.
seq2seq = EncoderDecoderModel.from_encoder_decoder_pretrained(Pretrained_Weight, Pretrained_Weight)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(Pretrained_Weight)

# Set Special Tokens
# BERT/ELECTRA-style tokenizers define no BOS/EOS token (both ids are None) and
# use [CLS]/[SEP] in those roles. Fall back to them so the config never carries
# None for the decoder start or end-of-sequence token.
decoder_start_id = tokenizer.bos_token_id
if decoder_start_id is None:
  decoder_start_id = tokenizer.cls_token_id
end_of_sequence_id = tokenizer.eos_token_id
if end_of_sequence_id is None:
  end_of_sequence_id = tokenizer.sep_token_id

seq2seq.config.decoder_start_token_id = decoder_start_id
seq2seq.config.eos_token_id = end_of_sequence_id
seq2seq.config.pad_token_id = tokenizer.pad_token_id

# Parameters for Beam Search
seq2seq.config.vocab_size = seq2seq.config.decoder.vocab_size
# Title-scale generation lengths. The previous 142/56 are summary-scale values:
# a 56-token floor is longer than any 32-token training target and, with EOS now
# defined, would suppress EOS for every title. A floor of 2 is intended to stop
# the decoder from ending immediately after the start token.
seq2seq.config.max_length = 32
seq2seq.config.min_length = 2
seq2seq.config.no_repeat_ngram_size = 3
seq2seq.config.early_stopping = True
seq2seq.config.length_penalty = 2.0
seq2seq.config.num_beams = 4

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1338740706.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




#### **Data Loading and Preprocessing**

In [None]:
# Colab only: mount Google Drive so the dataset and the saved weights under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Loading Dataset
# Read the spreadsheet with explicit column names, then discard the ID column
# so only the (Abstract, Title) pairs remain.
file = '/content/drive/MyDrive/Title Generation NLP/Title Generation/Dataset_Title_Summarized_10000.xlsx'
column_names = ['ID','Abstract', 'Title']
df = pd.read_excel(file, names = column_names).drop(columns = ['ID'])
df

Unnamed: 0,Abstract,Title
0,GPs do not scale with big traffic data due to ...,Local Gaussian Processes for Efficient Fine G...
1,a central challenge to using first order metho...,A Generic Approach for Escaping Saddle points
2,automakers have in development or in productio...,Formulation of Deep Reinforcement Learning Ar...
3,a visual relational knowledge graph KG is a mu...,Representation Learning for Visual Relational...
4,se presenta un nuevo enfoque en el contexto de...,Semantic Preserving Embeddings for Generalize...
...,...,...
9995,this paper introduces SC2LE StarCraft II Learn...,StarCraft II A New Challenge for Reinforcemen...
9996,GGQ ID3 is a multi relational decision tree le...,Induction of Decision Trees based on Generali...
9997,many real world reinforcement learning problem...,Reinforcement Learning in POMDPs with Memoryl...
9998,the proposed anytime neural networks ANNs prod...,Anytime Neural Network a Versatile Trade off ...


In [None]:
# Train Test Split of Dataset
# 75/25 split with a fixed random_state so the partition is reproducible.
train_df, test_df = train_test_split(df, test_size = 0.25, random_state = 42)
train_rows = len(train_df)
test_rows = len(test_df)
print('Train Dataset Length = {}'.format(train_rows))
print('Test Dataset Length  = {}'.format(test_rows))

Train Dataset Length = 7500
Test Dataset Length  = 2500


In [None]:
# Data Preparation into Pandas Dataframe for Model Input
def get_data(dataframe):
  """Return a fresh two-column DataFrame (Abstract, Title) built from `dataframe`.

  The column values are copied into plain lists first, so the result gets a
  new default 0..n-1 index regardless of the index carried by the input.
  """
  wanted_columns = ['Abstract', 'Title']
  column_values = {name: list(dataframe[name]) for name in wanted_columns}
  return pd.DataFrame(column_values, columns = wanted_columns)

train_data, test_data = get_data(train_df), get_data(test_df)

# Preview the first three rows of each prepared frame.
for heading, frame in (('Training Data:', train_data), ('\nTesting Data:', test_data)):
  print(heading)
  print(frame[0:3])

Training Data:
                                            Abstract                                              Title
0  the paper proposes an idealistic approach for ...   Modeling State Conditional Observation Distri...
1  we reduce this effort by generating inference ...   Composing inference algorithms as program tra...
2  the autoencoder is trained to reconstruct the ...   Learning Multilingual Word Representations us...

Testing Data:
                                            Abstract                                              Title
0  deep neural network is difficult to train and ...   All You Need is Beyond a Good Init Exploring ...
1  a small set of unique phase synchronised patte...   On the Existence of Synchrostates in Multicha...
2  most important aspect of inferring causal effe...   Causal Effect Inference with Deep Latent Vari...


In [None]:
# Data Preparation for Seq2Seq Model Input
class CustomDataset(Dataset):
    """Torch dataset that tokenizes (Abstract, Title) rows for the encoder-decoder model.

    Args:
        dataframe: frame with `Abstract` and `Title` columns.
        tokenizer: object exposing `encode_plus(...)` that returns a mapping with
            `input_ids` and `attention_mask`.
        max_len_enc: fixed length abstracts are truncated/padded to.
        max_len_dec: fixed length titles are truncated/padded to.
        mask_label_padding: when True, label positions whose decoder attention
            mask is 0 (padding) are replaced with -100, the default ignore index
            of torch's cross-entropy loss, so padding does not contribute to the
            loss. Defaults to False, which keeps the labels equal to the padded
            title ids.

    Each item is a dict of `torch.long` tensors with keys `input_ids`,
    `attention_mask`, `decoder_input_ids`, `decoder_attention_mask` and `labels`.
    """

    def __init__(self, dataframe, tokenizer, max_len_enc, max_len_dec, mask_label_padding=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.abstract = dataframe.Abstract
        self.title = dataframe.Title
        self.encoder_max_len = max_len_enc
        self.decoder_max_len = max_len_dec
        self.mask_label_padding = mask_label_padding

    def __len__(self):
        return len(self.abstract)

    def _encode(self, text, max_len):
        """Tokenize `text` to exactly `max_len` ids; return (input_ids, attention_mask)."""
        encoded = self.tokenizer.encode_plus(str(text),
                                             truncation=True,
                                             add_special_tokens=True,
                                             max_length=max_len,
                                             padding='max_length',
                                             return_token_type_ids=False)
        return encoded['input_ids'], encoded['attention_mask']

    def __getitem__(self, index):
        # Positional access: label-based `series[index]` raises KeyError for any
        # frame whose index is not 0..n-1 (e.g. a raw train/test split) and never
        # raises the IndexError that sequence-style iteration relies on.
        input_ids, input_mask = self._encode(self.abstract.iloc[index], self.encoder_max_len)
        output_ids, output_mask = self._encode(self.title.iloc[index], self.decoder_max_len)

        if self.mask_label_padding:
            label_ids = [token if keep else -100 for token, keep in zip(output_ids, output_mask)]
        else:
            label_ids = output_ids

        # NOTE(review): decoder_input_ids and labels both carry the unshifted title
        # ids, which presumes the decoder shifts labels internally - confirm against
        # the installed transformers version.
        return {'input_ids': torch.tensor(input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(input_mask, dtype=torch.long),
                'decoder_input_ids': torch.tensor(output_ids, dtype=torch.long),
                'decoder_attention_mask': torch.tensor(output_mask, dtype=torch.long),
                'labels': torch.tensor(label_ids, dtype=torch.long)}

# Fixed token lengths every example is padded/truncated to.
ENCODER_MAX_LEN = 256   # Encoder Max Sequence Length (Change)
DECODER_MAX_LEN = 32    # Decoder Max Sequence Length

# For GPU Memory Restrictions (Will Remove in Main Code)
#train_size = 5000
#test_size = 100

# Training and testing sets share the tokenizer and the length limits above.
training_set = CustomDataset(dataframe = train_data,
                             tokenizer = tokenizer,
                             max_len_enc = ENCODER_MAX_LEN,
                             max_len_dec = DECODER_MAX_LEN)
testing_set = CustomDataset(dataframe = test_data,
                            tokenizer = tokenizer,
                            max_len_enc = ENCODER_MAX_LEN,
                            max_len_dec = DECODER_MAX_LEN)

#### **Model Training**

In [None]:
# Device Mapping Select (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print("Model Mapped CUDA::GPU")
  seq2seq = seq2seq.cuda()

# Untuned Parameters (Should be Tuned)
training_args = Seq2SeqTrainingArguments(
    output_dir = "./models/model_name",
    overwrite_output_dir = True,
    seed = 42,
    do_train = True,
    do_eval = True,
    evaluation_strategy = "epoch",
    num_train_epochs = 10,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    predict_with_generate = False,
    warmup_steps = 2,
    logging_steps = 2,
    save_steps = 0,
    load_best_model_at_end = True,
)

# Early Stopping Callback Setup
early_stop = EarlyStoppingCallback(early_stopping_threshold = 0.01)

# Instantiate Seq2Seq Trainer
# Note: testing_set serves as the per-epoch evaluation set here and is also the
# set scored in the evaluation and ROUGE cells further down.
trainer = Seq2SeqTrainer(
    model = seq2seq,
    args = training_args,
    tokenizer = tokenizer,
    train_dataset = training_set,
    eval_dataset = testing_set,
    callbacks = [early_stop],
)

# Train Model
trainer.train()

Model Mapped CUDA::GPU


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,3.0851,4.560835,509.2819,4.909
2,3.9021,4.733779,498.8013,5.012


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,3.0851,4.560835,509.2819,4.909
2,3.9021,4.733779,498.8013,5.012


TrainOutput(global_step=7500, training_loss=4.258297165870666, metrics={'train_runtime': 21478.4863, 'train_samples_per_second': 1.746, 'total_flos': 1.990656839808e+16, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 396685312, 'train_mem_gpu_alloc_delta': 9212246528, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 0})

### **Model Evaluation**

In [None]:
# Evaluate on testing_set, the same dataset the Trainer was given as eval_dataset.
trainer.evaluate(testing_set)

{'epoch': 2.0,
 'eval_loss': 4.560834884643555,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 60844544,
 'eval_runtime': 501.601,
 'eval_samples_per_second': 4.984}

#### **Model Save**

In [None]:
# Model Save
# Only the state dict (weights) is written; the '.pth' suffix is appended to the base path.
model_save_path = '/content/drive/MyDrive/Title Generation NLP/Model Weights/Seq2Seq_state_dict_Bert_Large_10000'
model_weights = seq2seq.state_dict()
torch.save(model_weights, model_save_path + '.pth')

#### **Rouge Score Calculation**

In [14]:
# Rouge Score Calculation
# Generates a title for each test abstract, scores it against the reference
# title with ROUGE-1/2/L, and prints the mean F / precision / recall.
test_size = len(test_data)                       # Derived from the data rather than hard-coded
rouge = Rouge()
test_index_limit = test_size                     # Test Size for GPU Constraints (lower to score a subset)

# Placeholders for Rouge Scores
rouge_1_f, rouge_1_p, rouge_1_r = [], [], []
rouge_2_f, rouge_2_p, rouge_2_r = [], [], []
rouge_l_f, rouge_l_p, rouge_l_r = [], [], []

# Maps each key of the rouge result onto the list that collects it.
score_lists = {'rouge-1': {'f': rouge_1_f, 'p': rouge_1_p, 'r': rouge_1_r},
               'rouge-2': {'f': rouge_2_f, 'p': rouge_2_p, 'r': rouge_2_r},
               'rouge-l': {'f': rouge_l_f, 'p': rouge_l_p, 'r': rouge_l_r}}
zero_score = {metric: {'f': 0.0, 'p': 0.0, 'r': 0.0} for metric in score_lists}

# Start decoding from the token the decoder inputs began with during training
# ([CLS] for BERT-style tokenizers, added by add_special_tokens=True); the
# padding token is kept only as a last resort.
start_token_id = seq2seq.config.decoder_start_token_id
if start_token_id is None:
  start_token_id = tokenizer.cls_token_id
if start_token_id is None:
  start_token_id = seq2seq.config.decoder.pad_token_id

seq2seq.eval()                                   # Disable dropout so generation is deterministic

# Calculation
for i in range(test_index_limit):

  # Inference
  data = str(test_data['Abstract'].iloc[i])
  # Truncate to the training-time encoder length; an untruncated abstract can
  # exceed the encoder's maximum number of positions.
  encoded_abstract = tokenizer.encode(data,
                                      add_special_tokens = True,
                                      truncation = True,
                                      max_length = ENCODER_MAX_LEN)
  # Put the ids on whatever device holds the model instead of assuming CUDA.
  input_sentence_ids = torch.tensor(encoded_abstract).unsqueeze(0).to(seq2seq.device)
  generated = seq2seq.generate(input_sentence_ids,
                               max_length = 20,
                               decoder_start_token_id = start_token_id)
  if (i % 50 == 0):
    print("Inferene Done for Test ID = {}".format(i))

  # Reference and Hypothesis for Rouge Score Calculation
  hypothesis = tokenizer.batch_decode(generated, skip_special_tokens = True)[0]      # Predicted Title
  reference = str(test_data['Title'].iloc[i])                                        # Reference Title

  # Calculating Rouge Scores
  try:
    score = rouge.get_scores(hypothesis, reference)[0]
  except ValueError:
    # rouge raises ValueError when the hypothesis (or reference) is empty;
    # record zero overlap for that pair instead of aborting the whole run.
    score = zero_score

  for metric, parts in score_lists.items():
    for part, values in parts.items():
      values.append(score[metric][part])

# Final Average Rouge Score Calculation
# Divide by the number of pairs actually scored so the mean stays correct when
# test_index_limit is lowered; max(..., 1) guards against an empty test set.
scored_pairs = max(len(rouge_1_f), 1)

rouge_1_f_val = sum(rouge_1_f)/scored_pairs
rouge_1_p_val = sum(rouge_1_p)/scored_pairs
rouge_1_r_val = sum(rouge_1_r)/scored_pairs

rouge_2_f_val = sum(rouge_2_f)/scored_pairs
rouge_2_p_val = sum(rouge_2_p)/scored_pairs
rouge_2_r_val = sum(rouge_2_r)/scored_pairs

rouge_l_f_val = sum(rouge_l_f)/scored_pairs
rouge_l_p_val = sum(rouge_l_p)/scored_pairs
rouge_l_r_val = sum(rouge_l_r)/scored_pairs
print('\n Scores:')
print('Avergae Rouge 1 F Score   : {}'.format(rouge_1_f_val))
print('Avergae Rouge 1 Precision : {}'.format(rouge_1_p_val))
print('Avergae Rouge 1 Recall    : {}'.format(rouge_1_r_val))
print('\n')
print('Avergae Rouge 2 F Score   : {}'.format(rouge_2_f_val))
print('Avergae Rouge 2 Precision : {}'.format(rouge_2_p_val))
print('Avergae Rouge 2 Recall    : {}'.format(rouge_2_r_val))
print('\n')
print('Avergae Rouge L F Score   : {}'.format(rouge_l_f_val))
print('Avergae Rouge L Precision : {}'.format(rouge_l_p_val))
print('Avergae Rouge L Recall    : {}'.format(rouge_l_r_val))

Inferene Done for Test ID = 0
Inferene Done for Test ID = 50
Inferene Done for Test ID = 100
Inferene Done for Test ID = 150
Inferene Done for Test ID = 200
Inferene Done for Test ID = 250
Inferene Done for Test ID = 300
Inferene Done for Test ID = 350
Inferene Done for Test ID = 400
Inferene Done for Test ID = 450
Inferene Done for Test ID = 500
Inferene Done for Test ID = 550
Inferene Done for Test ID = 600
Inferene Done for Test ID = 650
Inferene Done for Test ID = 700
Inferene Done for Test ID = 750
Inferene Done for Test ID = 800
Inferene Done for Test ID = 850
Inferene Done for Test ID = 900
Inferene Done for Test ID = 950
Inferene Done for Test ID = 1000
Inferene Done for Test ID = 1050
Inferene Done for Test ID = 1100
Inferene Done for Test ID = 1150
Inferene Done for Test ID = 1200
Inferene Done for Test ID = 1250
Inferene Done for Test ID = 1300
Inferene Done for Test ID = 1350
Inferene Done for Test ID = 1400
Inferene Done for Test ID = 1450
Inferene Done for Test ID = 1500


#### **Model Load**

In [None]:
# Model Load (Load Already Finetuned Model)
# map_location remaps the stored tensors onto whichever device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_load_path = '/content/drive/MyDrive/Title Generation NLP/Model Weights/Seq2Seq_state_dict_Bert_Large_10000.pth'
finetuned_weights = torch.load(model_load_path, map_location = device)
seq2seq.load_state_dict(finetuned_weights)

#### **Model Inference**

In [None]:
def inference(model, tokenizer, test_list, test_dataset, maximum_length, show_abstracts = True, max_input_length = None):
  """Generate and print a title for each selected row of `test_dataset`.

  Args:
    model: encoder-decoder model exposing `generate`, `config`, `device` and `eval`.
    tokenizer: tokenizer providing `encode` and `batch_decode`.
    test_list: iterable of index labels into `test_dataset`.
    test_dataset: DataFrame with `Abstract` and `Title` columns.
    maximum_length: `max_length` passed to `model.generate`.
    show_abstracts: when truthy, the abstract is printed as well.
    max_input_length: optional cap on the number of input tokens, passed to the
      tokenizer as `max_length`; with None the tokenizer's own limit applies.

  Returns:
    None. The test id, (optionally) the abstract, the reference title and the
    predicted title are printed for every entry of `test_list`.
  """
  # Start decoding from the token the decoder inputs began with during training
  # ([CLS] for BERT-style tokenizers); padding is kept only as a last resort.
  start_token_id = getattr(model.config, 'decoder_start_token_id', None)
  if start_token_id is None:
    start_token_id = getattr(tokenizer, 'cls_token_id', None)
  if start_token_id is None:
    start_token_id = model.config.decoder.pad_token_id

  model.eval()                                      # Disable dropout for generation

  for test_index in test_list:
    # Fetch Input and Reference from Test Dataset
    data = test_dataset['Abstract'][test_index]
    reference = test_dataset['Title'][test_index]

    # Inference
    # Truncate so a long abstract cannot exceed the encoder's position limit,
    # and place the ids on the model's device instead of assuming CUDA.
    encoded_abstract = tokenizer.encode(data,
                                        add_special_tokens = True,
                                        truncation = True,
                                        max_length = max_input_length)
    input_sentence_ids = torch.tensor(encoded_abstract).unsqueeze(0).to(model.device)
    generated = model.generate(input_sentence_ids,
                               max_length = maximum_length,
                               decoder_start_token_id = start_token_id)
    hypothesis = tokenizer.batch_decode(generated, skip_special_tokens = True)[0]

    print('\nTest ID = {}'.format(test_index))
    if show_abstracts:
      print('\nAbstract:')
      print(data)
    print('\nActual Title:')
    print(reference)
    print('\nPredicted Title:')
    print(hypothesis)

# Qualitative spot check on a handful of test rows (abstract text hidden).
test_list = [1, 4, 5, 7, 9, 10, 12, 20, 22, 100, 134, 200]              # Test Indices for Inference
inference(seq2seq, tokenizer, test_list, test_data, maximum_length = 24, show_abstracts = False)