In [1]:
# Install the libraries this notebook imports into the Colab runtime.
# NOTE(review): none of these installs is version-pinned; the commented list
# below presumably records the versions the notebook was last run with —
# consider pinning them so that a fresh runtime behaves the same.
!pip install transformers -q
!pip install sentencepiece
!pip install datasets
!pip install rouge_score
!pip install tqdm

#torch=1.10.0+cu111
#tqdm=4.63.0
#transformers=4.17.0
#sentencepiece==0.1.96

[K     |████████████████████████████████| 4.0 MB 34.9 MB/s 
[K     |████████████████████████████████| 77 kB 8.4 MB/s 
[K     |████████████████████████████████| 895 kB 82.3 MB/s 
[K     |████████████████████████████████| 6.6 MB 62.3 MB/s 
[K     |████████████████████████████████| 596 kB 62.3 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 25.8 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 23.0 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |██████████████████████

In [2]:
# Mount Google Drive: the data, model and experiment-log paths configured in
# this notebook all live under /content/drive/MyDrive.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Show which GPU (if any) is attached to this runtime and how much of its
# memory is in use.
!nvidia-smi

Mon Apr 18 17:19:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import numpy as np
import pandas as pd

from numpy.core.defchararray import find
from numpy.core.defchararray import replace

import random

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer, TrainingArguments, logging
from transformers import EncoderDecoderModel, AutoModelForSeq2SeqLM, AutoModelForCausalLM

In [5]:
# Select the compute device: 'cuda' when torch can see a GPU, otherwise 'cpu'.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [6]:
#Saving into log (Excel file)
import openpyxl 
def SaveToExperimentLog(Experiments_file, LogEntry, data):
    """Write ``data`` to one sheet of the experiment-log workbook.

    The workbook is opened in append mode, so every other sheet is kept. The
    target sheet is replaced as a whole, which means rows left over from an
    earlier, longer version of the table cannot survive underneath the new
    data.

    Requires pandas >= 1.3 (``if_sheet_exists``) and the ``openpyxl`` engine.

    Args:
        Experiments_file: Path of an existing ``.xlsx`` workbook.
        LogEntry: Sheet name; only its first 29 characters are used.
        data: ``pandas.DataFrame`` to store. Its index is not written.

    Raises:
        FileNotFoundError: If ``Experiments_file`` does not exist.
    """
    sheet_name = LogEntry[0:29]
    # The context manager saves and closes the workbook exactly once and also
    # releases the file when ``to_excel`` raises.
    with pd.ExcelWriter(Experiments_file, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        data.to_excel(writer, sheet_name=sheet_name, index=False)

In [7]:
# Locations on the mounted Google Drive and the dataset column names.

# Dataset folder (only referenced by the commented-out lines below).
Data = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Data/Headlines/'
# Output folder: used as the Trainer output_dir and as the target of
# save_pretrained for the model and tokenizer.
Model = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/'
# Folder passed to the Trainer as logging_dir.
Logs = '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/Logs/'

#Train_Messages_filename='Train_Headlines.csv'
#Train_Messages_full_filename=os.path.join(Data, Train_Messages_filename)

#Val_Messages_filename='Val_Headlines.csv'
#Val_Messages_full_filename=os.path.join(Data, Val_Messages_filename)


# Column with the text fed to the model.
source_field='message'
# Column with the reference text used as label and as ROUGE reference.
target_field='Topic'

In [8]:
# Experiment log: one Excel workbook with a sheet per kind of record.
# Path of the workbook.
Experiments_file='/content/drive/MyDrive/Colab Notebooks/Projects/eva/ExperimentLogs/Headlines.xlsx'
# Sheet holding one row of hyper-parameters per experiment.
Experiments_tab='Experiments'
# Value of the 'Name' column that selects the experiment to run.
Experiment_name='Model21'

# Sheet holding the generation settings and scores of each validation run.
Validation_tab='Validation'

# Sheet holding the records to predict on and the generated predictions.
Prediction_tab='Predictions'

In [9]:
# Make runs repeatable: fix the seed of every random number generator in use
# and put cuDNN into its deterministic mode.
seed=42

# Environment switches first, ahead of the torch calls below.
for _env_name, _env_value in (
    ('CUBLAS_WORKSPACE_CONFIG', ':4096:2'),  #https://pytorch.org/docs/stable/cudnn_rnn_determinism.html
    ('PL_GLOBAL_SEED', str(seed)),
    ('PYTHONHASHSEED', str(seed)),
):
  os.environ[_env_name] = _env_value

# Python and NumPy generators.
np.random.seed(seed)
random.seed(seed)

# torch: deterministic cuDNN kernels, then the CPU and all CUDA generators.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [10]:
# Additional training and validation settings; they are passed to
# TrainingArguments under the same names.
# NOTE(review): the training log captured in this notebook reports 306
# optimization steps in total, so with save_steps=500 no intermediate
# checkpoint appears to be written — confirm this is intended, given that
# load_best_model_at_end=True is requested.
logging_steps=100
save_steps=500
save_total_limit=1
eval_steps=100
warmup_steps=100


In [11]:
#logging.set_verbosity_info()

## Experiment configuration

In [12]:
# Read the experiment table from the log workbook and keep only the row whose
# 'Name' equals Experiment_name. The path is handed to pandas directly so that
# pandas opens and closes the file itself; an `open(...)` handle passed in
# would never be closed.
Experiment = pd.read_excel(Experiments_file, sheet_name=Experiments_tab)
Experiment = Experiment[Experiment['Name']==Experiment_name]
if Experiment.empty:
  # Later cells read every hyper-parameter with `.iloc[0]`; stop here with a
  # clear message instead of an IndexError further down.
  raise ValueError("Experiment '%s' not found in sheet '%s' of %s"
                   % (Experiment_name, Experiments_tab, Experiments_file))
Experiment.transpose()

Unnamed: 0,20
Name,Model21
base_model_name,cointegrated/rut5-base
batch_size,8.0
learning_rate,0.0007
num_train_epochs,3.0
gradient_accumulation_steps,96.0
max_target_tokens_count,25.0
max_source_tokens_count,150.0
train_dataset,/content/drive/MyDrive/Colab Notebooks/Project...
val_dataset,/content/drive/MyDrive/Colab Notebooks/Project...


## Tokenizer

In [13]:
# Load the tokenizer that belongs to the base model named in the experiment row.
tokenizer = AutoTokenizer.from_pretrained(Experiment['base_model_name'].iloc[0])

Downloading:   0%|          | 0.00/233 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/664 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/808k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

## Data load

In [14]:
# Read the training and validation CSV files whose paths are stored in the
# experiment row. index_col=False tells pandas not to use the first column as
# the index.
train_records = pd.read_csv(Experiment['train_dataset'].iloc[0],  index_col=False)
val_records = pd.read_csv(Experiment['val_dataset'].iloc[0],  index_col=False)

## Tokenization

In [15]:
def Seq2SeqDataConvertion(data_to_convert):
  """Tokenize a frame of source/target text pairs into seq2seq examples.

  Reads the module-level ``tokenizer``, ``Experiment``, ``source_field`` and
  ``target_field``.

  Args:
    data_to_convert: DataFrame with the columns named by ``source_field`` and
      ``target_field``.

  Returns:
    A list with one dict per row. Each dict holds the tokenizer outputs for
    the source text as 1-D tensors, padded/truncated to the experiment's
    ``max_source_tokens_count``. When the row has a target text, the dict
    also holds ``labels``: the target token ids, padded/truncated to
    ``max_target_tokens_count``, with padding positions set to -100.
  """
  # Both limits are identical for every row: read them once instead of
  # indexing the experiment table inside the loop.
  max_source_len = int(Experiment['max_source_tokens_count'].iloc[0])
  max_target_len = int(Experiment['max_target_tokens_count'].iloc[0])

  data_converted = []
  # `total` lets tqdm show a real progress bar; iterrows() has no length.
  for _, row in tqdm(data_to_convert.iterrows(), total=len(data_to_convert)):
    text=row[source_field]
    summary=row[target_field]
    inputs = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_source_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
    # Drop the batch dimension added by return_tensors='pt'.
    inputs = {i: j.squeeze(0) for i, j in inputs.items()}
    # pandas represents a missing cell as NaN, which `is not None` does not
    # catch; pd.notna covers both None and NaN.
    if pd.notna(summary):
      outputs = tokenizer(
              summary,
              add_special_tokens=True,
              max_length=max_target_len,
              padding='max_length',
              truncation=True,
              return_tensors='pt'
            )
      labels = outputs['input_ids'].squeeze(0)
      # Mark padding positions with -100 wherever the attention mask is 0.
      labels[outputs['attention_mask'].squeeze(0) == 0] = -100
      inputs['labels'] = labels
    data_converted.append(inputs)
  return data_converted

In [16]:
# Tokenize the training records into the list of examples given to the Trainer.
train_dataset = Seq2SeqDataConvertion(train_records)

78658it [01:52, 701.17it/s]


In [17]:
# Tokenize the validation records the same way; used as the Trainer's eval set.
val_dataset = Seq2SeqDataConvertion(val_records)

20809it [00:29, 700.48it/s]


## Pretrained model

In [18]:
# Load the pretrained seq2seq model named in the experiment row.
model = AutoModelForSeq2SeqLM.from_pretrained(Experiment['base_model_name'].iloc[0])

Downloading:   0%|          | 0.00/932M [00:00<?, ?B/s]

In [19]:
# Default generation parameters stored on the model config.
# NOTE(review): 25 repeats the max_target_tokens_count shown for this
# experiment — confirm whether it should be read from `Experiment` instead of
# being hard-coded.
model.config.num_beams = 4
model.config.max_length = 25

## Training

In [20]:
# Hyper-parameters of the selected experiment, read once from its row.
_batch_size = int(Experiment['batch_size'].iloc[0])
_learning_rate = float(Experiment['learning_rate'].iloc[0])
_num_train_epochs = int(Experiment['num_train_epochs'].iloc[0])
_grad_accumulation = int(Experiment['gradient_accumulation_steps'].iloc[0])

# Trainer configuration: where to write, the optimisation settings, and how
# often to log / evaluate / save.
training_args = TrainingArguments(
        # output locations
        output_dir=Model,
        logging_dir=Logs,
        # optimisation
        per_device_train_batch_size=_batch_size,
        per_device_eval_batch_size=_batch_size,
        gradient_accumulation_steps=_grad_accumulation,
        learning_rate=_learning_rate,
        warmup_steps=warmup_steps,
        num_train_epochs=_num_train_epochs,
        # logging, evaluation and checkpointing cadence
        logging_steps=logging_steps,
        evaluation_strategy='steps',
        eval_steps=eval_steps,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        load_best_model_at_end=True
    )

In [21]:
# Build the Trainer from the model, the training arguments and the tokenized
# training / validation examples.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

In [22]:
# Fine-tune the model, then store the resulting model and the tokenizer in the
# `Model` folder so they can be reloaded later with from_pretrained.
trainer.train() # alternative kept by the author: resume_from_checkpoint = True
model.save_pretrained(Model)
tokenizer.save_pretrained(Model)

***** Running training *****
  Num examples = 78658
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 768
  Gradient Accumulation steps = 96
  Total optimization steps = 306


Step,Training Loss,Validation Loss
100,6.2825,2.984684
200,3.2283,2.803259
300,2.9764,2.753646


***** Running Evaluation *****
  Num examples = 20809
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20809
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20809
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/special_tokens_map.json
Copy vocab file to /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/spiece.model


('/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/spiece.model',
 '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/tokenizer.json')

## Validation

### Data

In [23]:
# Records to generate predictions for, read from the predictions sheet of the
# experiment log. The path is passed directly so that pandas opens and closes
# the file itself instead of receiving an `open(...)` handle nothing closes.
test_records = pd.read_excel(Experiments_file, sheet_name=Prediction_tab)

### Metrics

In [24]:
from datasets import load_metric
metric = load_metric("rouge")

def calc_rouge_scores(candidates, references):
    """Score ``candidates`` against ``references`` with the module-level ``metric``.

    The metric is computed with stemming enabled. For every key of the metric
    result, the mid F-measure is converted to percent and rounded to one
    decimal place.

    Returns:
        Dict mapping each metric key to its rounded percentage.
    """
    raw_scores = metric.compute(
        predictions=candidates,
        references=references,
        use_stemmer=True,
    )
    percentages = {}
    for rouge_name, aggregate in raw_scores.items():
        percentages[rouge_name] = round(aggregate.mid.fmeasure * 100, 1)
    return percentages

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

### Load Run Configuration

In [25]:
# Load the generation settings (and any stored scores) of every validation run.
# The path is passed directly so pandas opens and closes the file itself.
Validation = pd.read_excel(Experiments_file, sheet_name=Validation_tab)
# A run counts as "not evaluated yet" while its rouge1 cell is empty (NaN).
# Score columns missing from the sheet therefore have to start out as NaN:
# a 0.0 default would make the filter below skip every run.
for _rouge_column in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum'):
  if _rouge_column not in Validation.columns:
    Validation[_rouge_column] = np.nan
# Display the runs of the current experiment that still have to be evaluated.
Validation[((Validation['Experiment']==Experiment_name) & (Validation['rouge1'].isna()))]

Unnamed: 0,Run,Experiment,input_ids_max_length,no_repeat_ngram_size,repetition_penalty,min_length,max_length,num_beams,num_return_sequences,do_sample,...,temperature,rouge1,rouge2,rougeL,rougeLsum,Comment,Abstractive,From End,From Middle,From Start
28,27.0,Model21,150.0,4.0,5.0,3.0,25.0,4.0,1.0,False,...,1.0,,,,,Model trained in with v3 without fix of the o...,,,,


### Tokenizer and Model (skip this reload if they were just created/trained in this session)

In [26]:
# Reload the fine-tuned tokenizer and model from the `Model` folder.
# NOTE(review): nothing in this cell moves the loaded model to `device` —
# confirm which device generation is expected to run on.
tokenizer = AutoTokenizer.from_pretrained(Model)
model = AutoModelForSeq2SeqLM.from_pretrained(Model)

Didn't find file /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/added_tokens.json. We won't load it.
loading file /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/spiece.model
loading file /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/tokenizer.json
loading file None
loading file /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/special_tokens_map.json
loading file /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/tokenizer_config.json
loading configuration file /content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/config.json
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/Projects/eva/Models/Headlines/",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer

In [27]:
def seq2seqDataValidation(validation_data,prediction_field,input_ids_max_length, no_repeat_ngram_size, repetition_penalty, min_length, max_length, num_beams, num_return_sequences, do_sample, early_stopping, top_k, temperature):
  """Generate a prediction for every row and score the result with ROUGE.

  Reads the module-level ``model``, ``tokenizer``, ``device``, ``chunk_size``,
  ``source_field`` and ``target_field``. The model is switched to eval mode
  and moved to ``device`` as a side effect.

  Args:
    validation_data: DataFrame with the ``source_field`` and ``target_field``
      columns. It is not modified.
    prediction_field: Name of the column that receives the generated text.
    input_ids_max_length: Length the source texts are padded/truncated to.
    no_repeat_ngram_size, repetition_penalty, min_length, max_length,
    num_beams, num_return_sequences, do_sample, early_stopping, top_k,
    temperature: Forwarded unchanged to ``model.generate``.

  Returns:
    Tuple ``(predicted_data, score)``: a copy of ``validation_data`` with the
    ``prediction_field`` column added, and the dict returned by
    ``calc_rouge_scores``.

  Raises:
    ValueError: If ``validation_data`` has no rows.
  """
  if len(validation_data) == 0:
    raise ValueError('validation_data is empty; nothing to predict')

  # The model may still be in training mode (e.g. right after trainer.train())
  # and may sit on a different device than the freshly tokenized inputs.
  model.eval()
  model.to(device)

  # `chunk_size` is the number of sections, as in the original
  # np.array_split(validation_data, chunk_size) call. Splitting row positions
  # yields the same partition without passing the DataFrame through NumPy.
  row_groups = np.array_split(np.arange(len(validation_data)), chunk_size)
  predicted_chunks = []
  for positions in tqdm(row_groups):
    if len(positions) == 0:
      # Fewer rows than sections: nothing to generate for this section.
      continue
    # Work on a copy so the new column is not written into a slice of the input.
    chunk = validation_data.iloc[positions].copy()
    encoded = tokenizer(
            chunk[source_field].tolist(),
            add_special_tokens=True,
            max_length=input_ids_max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
    output_ids = model.generate(
            input_ids=encoded['input_ids'].to(model.device),
            # Pass the mask explicitly so padding is never attended to.
            attention_mask=encoded['attention_mask'].to(model.device),
            no_repeat_ngram_size=no_repeat_ngram_size,
            repetition_penalty=repetition_penalty,
            min_length=min_length,
            max_length=max_length,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            do_sample=do_sample,
            early_stopping=early_stopping,
            top_k=top_k,
            temperature=temperature
        )
    summaries = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # generate() returns num_return_sequences consecutive sequences per input;
    # keep the first one of each group so there is exactly one text per row.
    chunk[prediction_field] = summaries[::num_return_sequences]
    predicted_chunks.append(chunk)
  # Concatenate once instead of growing a DataFrame inside the loop.
  predicted_data = pd.concat(predicted_chunks)
  # Predictions are the candidates and the target column is the reference.
  score = calc_rouge_scores(list(predicted_data[prediction_field]), list(predicted_data[target_field]))
  return (predicted_data, score)

In [28]:
# NOTE: despite its name, seq2seqDataValidation hands this value to
# np.array_split as the number of sections, i.e. it is the number of chunks
# the prediction data is cut into, not the number of rows per chunk.
chunk_size=200

In [29]:
def HeaderType(row,column):
  """Classify ``row[column]`` relative to ``row['size']``.

  Returns:
    'Abstractive' when the value is -1; otherwise 'From Start', 'From Middle'
    or 'From End' depending on whether the value lies in the first, middle or
    last third of ``row['size']`` (both boundaries belong to the outer
    thirds). ``None`` when no comparison holds, e.g. for NaN.
  """
  position = row[column]
  if position == -1:
    return 'Abstractive'

  first_boundary = row['size']/3
  second_boundary = 2*row['size']/3
  if position <= first_boundary:
    return 'From Start'
  if first_boundary < position < second_boundary:
    return 'From Middle'
  if position >= second_boundary:
    return 'From End'
  return None

In [30]:
def _normalize_for_match(values):
  """Upper-case an array of strings and strip the characters . , ! ? : ;"""
  normalized = np.char.upper(values.astype(str))
  for mark in ('.', ',', '!', '?', ':', ';'):
    normalized = np.char.replace(normalized, mark, '')
  return normalized


# Evaluate every run of the current experiment that has no rouge1 score yet.
pending_runs = Validation[(Validation['Experiment']==Experiment_name) & (Validation['rouge1'].isna())]
for index, val in pending_runs.iterrows():
  run_id = int(val['Run'])
  print(run_id)
  # Per-run column names, e.g. Model21_Headline_27 / Model21_pos_27 / Model21_Type_27.
  prediction_field = '%s_Headline_%s' % (Experiment_name, run_id)
  pos_field = '%s_pos_%s' % (Experiment_name, run_id)
  type_field = '%s_Type_%s' % (Experiment_name, run_id)

  test_records,score = seq2seqDataValidation(test_records, prediction_field,
                                     int(val['input_ids_max_length']),
                                     int(val['no_repeat_ngram_size']),
                                     float(val['repetition_penalty']),
                                     int(val['min_length']),
                                     int(val['max_length']),
                                     int(val['num_beams']),
                                     int(val['num_return_sequences']),
                                     bool(val['do_sample']),
                                     bool(val['early_stopping']),
                                     int(val['top_k']),
                                     # Temperature is a real number; int() would
                                     # silently turn e.g. 0.7 into 0.
                                     float(val['temperature']))
  for rouge_name in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum'):
    Validation.at[index, rouge_name] = score[rouge_name]

  # Position of the generated text inside its source text, compared
  # case-insensitively and without basic punctuation; -1 means "not found".
  source_norm = _normalize_for_match(test_records[source_field].values)
  prediction_norm = _normalize_for_match(test_records[prediction_field].values)
  test_records[pos_field] = np.char.find(source_norm, prediction_norm)
  test_records[type_field] = test_records.apply(lambda x: HeaderType(x, pos_field), axis=1)

  # Share of each type in percent. A type that does not occur is left
  # untouched in the log, as before.
  type_counts = test_records[type_field].value_counts()
  Total = len(test_records)
  for header_type in ('Abstractive', 'From End', 'From Middle', 'From Start'):
    if header_type in type_counts.index:
      Validation.at[index, header_type] = 100*type_counts[header_type]/Total

  try:
    SaveToExperimentLog(Experiments_file, Validation_tab, Validation)
    SaveToExperimentLog(Experiments_file, Prediction_tab, test_records)
  except Exception as save_error:
    # Keep evaluating the remaining runs even if the log cannot be written,
    # but report what went wrong. Catching Exception (not a bare except) lets
    # a manual interrupt still stop the cell.
    print('Error saving to file!', repr(save_error))


27


100%|██████████| 200/200 [51:48<00:00, 15.54s/it]
