## Load librairies 

In [1]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import average_precision_score,f1_score,recall_score
import pickle
from transformers import AdamW


In [4]:
pd.set_option('display.max_colwidth', -1)

In [5]:
# Check if cuda is available 
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Loading data

In [6]:
sentences_data = pickle.load(open('data/tatoeba_sentences.pkl','rb'))
labels_data = pickle.load(open('data/tatoeba_grouped_labels.pkl','rb'))


## Defining the model 

In [8]:
def encode_tags(labels, encodings):
    """
    Function that adds -100 labels to subtokens or to <pad> and <cls> tokens . These artificial labels will be masked at training.
    More details can be found here https://huggingface.co/transformers/custom_datasets.html , "Token Classification with W-NUT Emerging Entities"
    """
    encoded_labels,index = [],[]
    for i,(doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # set labels whose first offset position is 0 and the second is not 0
        try :  
          doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        except :
          pass
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels

class WNUTDataset(torch.utils.data.Dataset):
    """
    Define class for creating torch dataset.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [9]:
def tokenization(sentences,labels):
    """
    This function tokenize every sentence of the training set, than adds masks to ensure padding. 
    """
    encodings = tokenizer(sentences, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
    label_encodings = encode_tags (labels,encodings)

    return encodings,label_encodings
    

In [10]:
def getting_loader(sentences,labels) :
    """
    Creates train and validation loaders, the ratio of validation is 10% of all examples.
    """
    
    train_sents, validation_sents, train_labels, validation_labels = train_test_split(sentences, labels, 
                                                            random_state=2018, test_size=0.1)

    train_inputs , train_enc_labels = tokenization(train_sents,train_labels)
    val_inputs , val_enc_labels = tokenization(validation_sents,validation_labels)
    

    train_inputs.pop("offset_mapping") # we don't want to pass this to the model
    val_inputs.pop("offset_mapping")
    ## create datasets
    train_dataset = WNUTDataset(train_inputs, train_enc_labels)
    val_dataset = WNUTDataset(val_inputs, val_enc_labels)

    batch_size = 8
    ## create loader for training set
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32 , sampler = train_sampler)

    # Create the DataLoader for our validation set.
    validation_sampler = torch.utils.data.SequentialSampler(val_dataset)
    validation_dataloader = torch.utils.data.DataLoader(val_dataset, sampler=validation_sampler, batch_size=batch_size)

    
    return train_dataloader,validation_dataloader

In [11]:
def get_scores(preds, labels):
    """
    returns f1-score, average precision and recall given the true labels and the predictions of the distillbert model.
    """
    indexes = np.where(labels != -100)
    preds = preds[indexes]
    labels = labels[indexes]
    f1score = f1_score(preds, labels )
    avg_prec_score = average_precision_score(preds, labels)
    recall = recall_score(preds,labels)
    d_score = {'f1':f1score,'avg_precision':avg_prec_score,'recall':recall}
    return d_score

In [13]:
def training(train_loader,epochs,optimizer,model,scheduler) :
    """
    Trains the model and evaluate at the end of each epoch.
    """
    loss_values = []

    # For each epoch...

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
    

        # Reset the total loss for this epoch.
        total_loss = 0


        model.train()

        for step, batch in enumerate(train_loader):
            
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)


            model.zero_grad()        

            outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        

            loss = outputs[0]


            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_loader)            
    
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        


        print("")
        print("Running Validation...")


        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables 
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for l,batch in enumerate(validation_loader):
        
            # Add batch to GPU
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
        
        
            # Telling the model not to compute or store gradients, saving memory and
            # speeding up validation
            with torch.no_grad():        
                outputs = model(b_input_ids, 
                            attention_mask=b_input_mask)
        
            logits = outputs[0]
            

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            
            ## stack all predictions and update scores at the end of the epoch.
            if l == 0 : 
              all_pred_flat = np.argmax(logits, axis=2).flatten()
              all_labels_flat = label_ids.flatten()
            else :
              pred_flat = np.argmax(logits, axis=2).flatten()
              labels_flat = label_ids.flatten()
              all_pred_flat  = np.hstack((all_pred_flat,pred_flat))
              all_labels_flat = np.hstack((all_labels_flat,labels_flat))
            
        eval_scores = get_scores(all_pred_flat, all_labels_flat)
        torch.save(model.state_dict(), 'models/distillbert_model_glose_finetuned_{}.pth'.format(str(epoch_i+2)))
        print('F1-score : {}, average precision : {}, recall : {}'.format(eval_scores['f1'],eval_scores['avg_precision'],eval_scores['recall']))


    return model
    

# Main 

In [14]:
## Defining epochs hyperparameters
epochs=5

## Defining the tokenizer and the pretrained weights of the distillbert model
tokenizer_class, pretrained_weights = (ppb.DistilBertTokenizerFast,'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights,do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [20]:
## Defining the model : we must specify that it is the model for token classification. 

model = ppb.DistilBertForTokenClassification.from_pretrained(
    pretrained_weights, # Use the distillbert model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.to(device)

## load pretrained model
PATH = '/content/drive/MyDrive/glose/distillbert_model_glose_finetuned_3.pth'
model.load_state_dict(torch.load(PATH))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

<All keys matched successfully>

I trained the model only with 40000 examples. With this number the performance

In [None]:
train_loader,validation_loader=getting_loader(sentences_data[:40000],labels_data[:40000])


optimizer =AdamW(model.parameters(), ## Creating the optimizer 
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
            
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_loader) * epochs
            
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)
            
model=training(train_loader,epochs,optimizer,model,scheduler)



k number  5
k number  0

Training...

  Average training loss: 0.03

Running Validation...
F1-score : 0.9235586870211933, average precision : 0.8601030323886507, recall : 0.9102733686067019

Training...

  Average training loss: 0.02

Running Validation...
F1-score : 0.9261238957969954, average precision : 0.8640268459676812, recall : 0.9183218031689355

Training...

  Average training loss: 0.02

Running Validation...
F1-score : 0.9254212780156832, average precision : 0.8640145579218905, recall : 0.9073064340239912

Training...

  Average training loss: 0.02

Running Validation...
F1-score : 0.9245009479201517, average precision : 0.8620838933904672, recall : 0.908692316124082

Training...


KeyboardInterrupt: ignored

Best performance on validation is on epoch 2. The scores are : <br>
F1-score : 0.9235586870211933, average precision : 0.8601030323886507, recall : 0.9102733686067019. 


In [None]:
## Saves the model as pth file 
#torch.save(model.state_dict(), '/content/drive/MyDrive/glose/distillbert_model_glose_finetuned_final.pth')

# Evaluation on one text 

In [27]:
def match_sent(sentences , output_path):
  """
  The reconstruction of sentences with distilbert tokenization to recover 
  the original sentence can't be done with just "join". So we add some preprocessing
  steps to recover approximately the same sentences. Note that we will lose some
  properties such as capital letters. 
  This function adds also spans + save all sentences in a text file.
  """
  sentences = [sentence.replace(' ##','') for sentence in sentences]
  sentences = [sentence.replace(" ' ","'") for sentence in sentences]
  sentences = [sentence.replace("did n't","didn't") for sentence in sentences]

  ## add span 
  sentences = ["<span>"+sentence+'</span>' for sentence in sentences]
  with open(output_path, 'w') as output:
    for sentence in sentences:
        output.write(sentence + '\n')
  return sentences
def normalize (preds): 
  """
  fonction that replaces 11 (i.e two adjacent tokens that both represent the ending of a sentence) 
  with 10 to avoid errors.
  """
  l = list(preds)
  string_list = ''.join(map(str,l))
  string_list = string_list.replace('11', '01')
  new_preds = np.array(list(map(int, list(string_list))))
  return new_preds
def get_sentences (indexes_end,tokens_recov,sentences) :
  """
  given the indexes of tokens that end sentences and the list of all the tokens ,
  This function gives the list of all sentences contained in window_sentences.
  """
  current = []
  for k in range(len(tokens_recov)) :
    current.append(tokens_recov[k])
    if k in indexes_end :
      sentences.append(" ".join(current))
      current = []
  return sentences

In [31]:
import nltk
from scipy.special import softmax

nltk.download('punkt')
full_text = "The first  pig was very lazy he didn't want to work  and he built his house out of straw the second pig worked a little bit harder but he was somewhat lazy too he built his house out of sticks. Then, they sang, danced and played together the rest of the day."


tokenized_text = nltk.word_tokenize(full_text.lower()) ## tokenize all text with nltk
model.eval()

sentences = []
max_length = 10 ## size of sliding window
current_begin = 0 ## beginning index of window_sentences , relative to tokenized_text.
moving_add = 0 ## we will use this if window_sentences is an unfinished sentence.
window_sentences = tokenized_text[:max_length]
j,t=0,0
while len(window_sentences) !=0 : 
  j+=1
  inputs_enc = tokenizer(window_sentences, is_split_into_words= True, return_offsets_mapping=False, 
                       padding=False, truncation=True)
  with torch.no_grad():     
    input_ids_ = torch.tensor(inputs_enc.input_ids).unsqueeze(0).to(device)
    outputs = model(input_ids_)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
  preds = np.argmax(logits, axis=2).flatten()[1:-1] ## take all except cls and sep preds
  
  preds = normalize(preds)
  tokens_recov = tokenizer.convert_ids_to_tokens(inputs_enc['input_ids'])[1:-1]
  
  ## get the indexes of elements that end sentences
  indexes_end = np.where(preds==1)[0]
  sentences = get_sentences (indexes_end,tokens_recov ,sentences)
  

  if len(indexes_end)==1 : # if we have only one ending token , in the end of the sentence
      ## this case means that there is no ending token except the default last one, 
      ## so we add 10 tokens to sentences test
      moving_add +=10  

      ## we stop if we exceed tokenized_text twice.
      if current_begin+max_length+moving_add>len(tokenized_text): 
        t+=1
        if t == 2 : 
          break 
      window_sentences = tokenized_text[current_begin:current_begin+max_length+moving_add]
      sentences.pop(-1)
      continue
      
      #current_begin += max_length

  moving_add=0
  
  ## this is in case we hove more than two ending tokens. 
  last_sent = sentences[-1] # we will remove last sentence.
  first_token = sentences[-1].split()[0]
  indexes_first = np.where(np.array(window_sentences) == first_token)[0]
  for index in reversed(list(indexes_first)) : 
    if index<=(len(window_sentences)-len(sentences[-1].split())+4) :
      index_first = index
      break
  
  ## window_sentences will be defined as the window beginning from the last sentence and we add max_length tokens
  window_sentences = tokenized_text[current_begin+index_first:current_begin+index_first+max_length]
  if current_begin+index_first > len(tokenized_text) : 
    break
  sentences.pop(-1)
  current_begin += index_first
sentences = match_sent(sentences,"sentences.txt")
print(sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
57
['<span>the first pig was very lazy</span>', "<span>he didn't want to work</span>", '<span>and he built his house out of straw</span>', '<span>the second pig worked a little bit harder but he was somewhat lazy too</span>', '<span>he built his house out of sticks .</span>', '<span>then , they sang , danced and played together the rest of the day .</span>']
