In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

from tqdm import tqdm
import time
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BartTokenizer, BartForConditionalGeneration



In [2]:
seed_val = 66
MIN_LEN = 20
MAX_LEN = 300
batch_size = 6
EPOCHS = 30
early_stopping_rounds = 5
lr = 5e-5
eps = .005

In [3]:
torch.cuda.empty_cache()

In [4]:
train_input = pd.read_csv('../input/denoising/ALTA_2017/train_input.csv')
train_output = pd.read_csv('../input/denoising/ALTA_2017/train_output.csv')

In [5]:
train_input.head()

Unnamed: 0,id,original
0,0,'Gondoliers' By Teachers Colleae The Adelaide ...
1,1,"Man Cufc Spoilt Bmbti Of Cimw Hertm BrielloE,..."
2,2,OFFENSIVE NOISE WITH HOOTER Woman Motorist Fin...
3,3,PARIS TALKS BEFORE ROME MEETING Mr. Chamberlai...
4,4,REPORTS FROM RURAL CENTRES AVON An evening was...


In [6]:
train_input.original.iloc[0]

"'Gondoliers' By Teachers Colleae The Adelaide . Teachers' College will present its annual Gilbert and Sullivan onera season in the Unley Town ORix irom April £i to 30.- ??-..?;.. Under the direction of Mr. Alva Penrose, who will again conduct, the students will perform 'The Gondoliers' which was given in the Tivoli Theatre two years ago. There will be nine principals and a chorus of 48. Bookings will open at Cawthorne's on April 13."

In [7]:
train_output.head()

Unnamed: 0,id,solution
0,0,"""Gondoliers"" By Teachers College The Adelaide ..."
1,1,"Lion Cub Spoilt Baby of Circus Herts Briellos,..."
2,2,OFFENSIVE NOISE WITH HOOTER Woman Motorist Fin...
3,3,PARIS TALKS BEFORE ROME MEETING Mr. Chamberlai...
4,4,REPORTS FROM RURAL CENTRES AVON An evening was...


In [8]:
train_input.shape, train_output.shape

((6000, 2), (6000, 2))

In [9]:
train = pd.merge(train_input, train_output, how='inner')
#train['solution'] = train.solution.apply(lambda x: re.sub('[^a-z0-9 ]', '', x.strip().lower()))
#train['original'] = train.original.apply(lambda x: re.sub(r'[^a-z0-9 ]', '', x.strip().lower()))

In [10]:
train.head()

Unnamed: 0,id,original,solution
0,0,'Gondoliers' By Teachers Colleae The Adelaide ...,"""Gondoliers"" By Teachers College The Adelaide ..."
1,1,"Man Cufc Spoilt Bmbti Of Cimw Hertm BrielloE,...","Lion Cub Spoilt Baby of Circus Herts Briellos,..."
2,2,OFFENSIVE NOISE WITH HOOTER Woman Motorist Fin...,OFFENSIVE NOISE WITH HOOTER Woman Motorist Fin...
3,3,PARIS TALKS BEFORE ROME MEETING Mr. Chamberlai...,PARIS TALKS BEFORE ROME MEETING Mr. Chamberlai...
4,4,REPORTS FROM RURAL CENTRES AVON An evening was...,REPORTS FROM RURAL CENTRES AVON An evening was...


In [11]:
train.original.iloc[0]

"'Gondoliers' By Teachers Colleae The Adelaide . Teachers' College will present its annual Gilbert and Sullivan onera season in the Unley Town ORix irom April £i to 30.- ??-..?;.. Under the direction of Mr. Alva Penrose, who will again conduct, the students will perform 'The Gondoliers' which was given in the Tivoli Theatre two years ago. There will be nine principals and a chorus of 48. Bookings will open at Cawthorne's on April 13."

In [12]:
train.original.apply(lambda x: len(x.split())).describe()

count    6000.00000
mean      471.93750
std       644.35035
min        18.00000
25%       115.00000
50%       241.00000
75%       577.25000
max      9323.00000
Name: original, dtype: float64

In [13]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [14]:
kf = KFold(n_splits=5)

for train_index, val_index in kf.split(train):
    break

In [15]:
val = train.iloc[val_index]
train = train.iloc[train_index]

In [16]:
trainX = torch.Tensor(np.asarray([tokenizer.encode(i, max_length=MAX_LEN, truncation=True, padding='max_length', add_special_tokens=True) \
                                  for i in tqdm(train.original.values)]))
trainy = torch.Tensor(np.asarray([tokenizer.encode(i, max_length=MAX_LEN, truncation=True, padding='max_length', add_special_tokens=True) \
                                  for i in tqdm(train.solution.values)]))

valX = torch.Tensor(np.asarray([tokenizer.encode(i, max_length=MAX_LEN, truncation=True, padding='max_length', add_special_tokens=True) \
                                for i in tqdm(val.original.values)]))
valy = torch.Tensor(np.asarray([tokenizer.encode(i, max_length=MAX_LEN, truncation=True, padding='max_length', add_special_tokens=True) \
                                for i in tqdm(val.solution.values)]))


100%|██████████| 4800/4800 [00:58<00:00, 82.13it/s]
100%|██████████| 4800/4800 [00:48<00:00, 98.33it/s]
100%|██████████| 1200/1200 [00:12<00:00, 96.48it/s]
100%|██████████| 1200/1200 [00:12<00:00, 97.20it/s]


In [17]:
trainX.shape, trainy.shape, valX.shape, valy.shape

(torch.Size([4800, 300]),
 torch.Size([4800, 300]),
 torch.Size([1200, 300]),
 torch.Size([1200, 300]))

In [18]:
trainX = torch.tensor(trainX, dtype=torch.long)
trainy = torch.tensor(trainy, dtype=torch.long)
valX = torch.tensor(valX, dtype=torch.long)
valy = torch.tensor(valy, dtype=torch.long)


  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [19]:
try:
    model = BartForConditionalGeneration.from_pretrained('../input/denoising/BART_denoise')
except:
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1553.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=557941479.0, style=ProgressStyle(descri…




In [20]:
for task_name in model.config.task_specific_params.keys():

    model.config.task_specific_params[task_name]['max_length'] = MAX_LEN
    model.config.task_specific_params[task_name]['min_length'] = MIN_LEN
    model.config.task_specific_params[task_name]['num_beams'] = 5

In [21]:
model.config

BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel",
    "BartForConditionalGeneration",
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL

In [22]:
train_data_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(trainX, trainy), batch_size=batch_size)

val_data_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(valX, valy), batch_size=batch_size)

In [23]:
print ("Train and val loader length {} and {}".format(len(train_data_loader), len(val_data_loader)))

Train and val loader length 800 and 200


In [24]:
print ("Modeling")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("Device: {}".format(device))

model.to(device)

Modeling
Device: cuda


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): LearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layers): ModuleList(
        (0): EncoderLayer(
          (self_attn): Attention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
     

In [25]:
params = list(model.named_parameters())

In [26]:
from transformers import AdamW
optimizer = AdamW(model.parameters(),
                  lr = lr, # args.learning_rate
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [27]:
from transformers import get_linear_schedule_with_warmup

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_data_loader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [28]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [29]:
class SummarisationLoss(nn.Module):
    def __init__(self):
        super(SummarisationLoss, self).__init__()
        self.criterion = nn.CrossEntropyLoss()
        
    def forward(self, output, target):
        x = F.log_softmax(output, dim=-1)
        norm = (target != 1).data.sum()
        print(x.contiguous().view(-1, x.size(-1)).shape, target.contiguous().view(-1).shape)
        return self.criterion(x.contiguous().view(-1, x.size(-1)), target.contiguous().view(-1)) / norm

In [30]:
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.cuda.empty_cache()

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

bad_epochs = 0

# For each epoch...
for epoch_i in range(0, EPOCHS):
    
    if bad_epochs < early_stopping_rounds:
        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode. Don't be mislead--the call to 
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_data_loader):

            # Progress update every 40 batches.
            if step % 60 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.  Loss: {}'.format(step, len(train_data_loader), elapsed, loss.item()))

            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the 
            # `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            input_ids = batch[0].to(device)
            output_ids = batch[1].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because 
            # accumulating the gradients is "convenient while training RNNs". 
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()        

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of parameters depending on what arguments
            # are given and what flags are set. For our usage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            outputs = model(input_ids=input_ids, labels=output_ids, return_dict=True)
            loss, logits = outputs.loss, outputs.logits

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            #print (loss.item())
            
            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_data_loader)            

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables 
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in val_data_loader:

            # Unpack this training batch from our dataloader. 
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using 
            # the `to` method.
            #
            # `batch` contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            input_ids = batch[0].to(device)
            output_ids = batch[1].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():        

                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which 
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                outputs = model(input_ids=input_ids, labels=output_ids, return_dict=True)
                loss, logits = outputs.loss, outputs.logits


            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(val_data_loader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        if epoch_i >= 1:
            if avg_val_loss < training_stats[-1]['Valid. Loss'] and \
                    training_stats[-1]['Valid. Loss']-avg_val_loss >= eps:
                
                model.save_pretrained('BART_denoise')
                #torch.save(model.state_dict(), 'BERT_denoise.bin')
                bad_epochs = 0
            else:
                bad_epochs += 1

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )
    
    else:
        print ("Early stopping!!")
        break

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    60  of    800.    Elapsed: 0:00:21.  Loss: 0.839974045753479
  Batch   120  of    800.    Elapsed: 0:00:40.  Loss: 0.41727033257484436
  Batch   180  of    800.    Elapsed: 0:01:00.  Loss: 0.33086392283439636
  Batch   240  of    800.    Elapsed: 0:01:20.  Loss: 0.4258337616920471
  Batch   300  of    800.    Elapsed: 0:01:40.  Loss: 0.27351289987564087
  Batch   360  of    800.    Elapsed: 0:01:59.  Loss: 0.5332403779029846
  Batch   420  of    800.    Elapsed: 0:02:19.  Loss: 0.6342299580574036
  Batch   480  of    800.    Elapsed: 0:02:39.  Loss: 0.7897727489471436
  Batch   540  of    800.    Elapsed: 0:02:58.  Loss: 0.49688342213630676
  Batch   600  of    800.    Elapsed: 0:03:18.  Loss: 0.4459868371486664
  Batch   660  of    800.    Elapsed: 0:03:38.  Loss: 0.11577043682336807
  Batch   720  of    800.    Elapsed: 0:03:57.  Loss: 0.3996255099773407
  Batch   780  of    800.    Elapsed: 0:04:17.  Loss: 0.4667585790157318

  Average training loss: 0.59
  T

In [31]:
#model.load_state_dict(torch.load('BERT_denoise.bin'))

In [32]:
model = BartForConditionalGeneration.from_pretrained('./BART_denoise/')

In [33]:
with open('training_stats.txt','w') as f:
    for l in training_stats:
        f.write(str(l))
        f.write('\n')

In [34]:
model.eval()
generate = model.generate(valX[0:1].to(device), num_beams=5, max_length=MAX_LEN)

RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

In [35]:
val.original.iloc[0], val.solution.iloc[0]

("'Gondoliers' By Teachers Colleae The Adelaide . Teachers' College will present its annual Gilbert and Sullivan onera season in the Unley Town ORix irom April £i to 30.- ??-..?;.. Under the direction of Mr. Alva Penrose, who will again conduct, the students will perform 'The Gondoliers' which was given in the Tivoli Theatre two years ago. There will be nine principals and a chorus of 48. Bookings will open at Cawthorne's on April 13.",
 '"Gondoliers" By Teachers College The Adelaide Teachers\' College will present its an nual Gilbert and Sullivan opera season in the Unley Town Hall from April 27 to 30. Under the direction of Mr. Alva Penrose, who will again conduct, the students will perform "The Gondol iers" which was given in the Tivoli Theatre two years ago. There will be nine prin cipals and a chorus of 48. Bookings will open at Cawthorne\'s on April 13.')

In [36]:
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generate])

NameError: name 'generate' is not defined

In [37]:
def generate_text(test_index):
    generated = model.generate(valX[test_index:test_index+1].to(device), max_length=MAX_LEN, num_beams=5)
    gen_text = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in generated][0]
    gen_text = gen_text.replace('[CLS]','')
    gen_text = gen_text.replace('[SEP]','')
    gen_text = gen_text.replace('[PAD]','')
    gen_text = gen_text.strip()
    
    return gen_text

In [38]:
output = []

for i in tqdm(val.index):
    output.append(generate_text(i))
    
val['predicted_text'] = output

  0%|          | 0/1200 [00:00<?, ?it/s]


RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

In [39]:
import numpy as np
import nltk.translate.bleu_score as bleu

def WRR(text1,text2):
    a = set(text1.lower().split())
    b = set(text2.lower().split())
    
    if (len(a) == 0) and (len(b) == 0):
        return .5
    
    c = a.intersection(b)
    return float(len(c))/(len(a) + len(b) - len(c))

def levenshtein(seq1, seq2):
    seq1 = seq1.lower()
    seq2 = seq2.lower()
    
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1,y] + 1,
                    matrix[x-1,y-1] + 1,
                    matrix[x,y-1] + 1
                )
    #print (matrix)
    return (matrix[size_x - 1, size_y - 1])

def CRR(text1, text2):
    try:
        return 1 - float(levenshtein(text1,text2))/max(len(text1),len(text2))
    except:
        return 0

def bleu_score(text1,text2):
    return bleu.sentence_bleu([text1.lower().split()],text2.lower().split())


In [40]:
val['WRR_1'] = val.apply(lambda x: WRR(x.original, x.solution), axis=1)
val['WRR_2'] = val.apply(lambda x: WRR(x.predicted_text, x.solution), axis=1)

#val['CRR_1'] = val.apply(lambda x: CRR(x.original, x.solution), axis=1)
#val['CRR_2'] = val.apply(lambda x: CRR(x.predicted_text, x.solution), axis=1)

val['BLEU_1'] = val.apply(lambda x: bleu_score(x.original, x.solution), axis=1)
val['BLEU_2'] = val.apply(lambda x: bleu_score(x.predicted_text, x.solution), axis=1)

AttributeError: 'Series' object has no attribute 'predicted_text'

In [41]:
val[['WRR_1', 'WRR_2', 'BLEU_1', 'BLEU_2']].describe()

KeyError: "['BLEU_2', 'WRR_2', 'BLEU_1'] not in index"

In [42]:
print (val[val.WRR_1 < val.WRR_2].shape, val[val.BLEU_1 < val.BLEU_2].shape)
val.to_csv('validation_output.csv',index=False)

AttributeError: 'DataFrame' object has no attribute 'WRR_2'