In [1]:
import transformers
import json
import numpy as np
import tensorflow as tf
np.object = object
import torch

from datasets import Dataset
print(transformers.__version__)

2024-03-13 09:44:58.963930: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-13 09:44:58.964023: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-13 09:44:59.132907: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


4.38.1


We also quickly upload some telemetry - this tells us which examples and software versions are getting used so we know where to prioritize our maintenance efforts. We don't collect (or care about) any personally identifiable information, but if you'd prefer not to be counted, feel free to skip this step or delete this cell entirely.

## Import the trained model (model, tokenizer and data collator)

In [2]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("/kaggle/input/debertav2-512")
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/debertav2-512')
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 16)

## Import test dataset

In [3]:
data_test = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/train.json"))


In [4]:
ds_original = Dataset.from_dict({
    "full_text": [x["full_text"] for x in data_test],
    "document": [x["document"] for x in data_test],
    "tokens": [x["tokens"] for x in data_test],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data_test],
})
ds_original_w_labels = Dataset.from_dict({ 
    "full_text": [x["full_text"] for x in data_test],
    "document": [x["document"] for x in data_test],
    "tokens": [x["tokens"] for x in data_test],
    "trailing_whitespace": [x["trailing_whitespace"] for x in data_test],
    "provided_labels": [x["labels"] for x in data_test],
})

In [5]:
train_validtest = ds_original.train_test_split(test_size=0.2, seed = 42)
ds_original = train_validtest['test']
train_validtest_w_labels = ds_original_w_labels.train_test_split(test_size=0.2, seed = 42)
ds_original_w_labels = train_validtest_w_labels['test']

## Define the function to tokenize the dataset:
### - The text is reconstructed from the given tokens to ensure that labels correspond to the exact token (this is not strictly necessary but is to avoid any discrepancy).
### - token_map contains a map character <-> word_index, where word_index is a number corresponding to its position in the document (first word in the document -> 0)
### - The offset mapping is returned too

In [6]:
max_model_input_length = 512

In [7]:
def tokenize(example, tokenizer):

    text = []
    token_map = []
    
    idx = 0
    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        
        text.append(t)
        
        token_map.extend([idx]*len(t))
        
        if ws:
            text.append(" ")
            token_map.append(-1)
            
        idx += 1
        
    tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=True, padding = 'max_length',max_length = 512, return_overflowing_tokens = True)
    
    return {
        **tokenized,
        "token_map": token_map,
    }

## Tokenize the dataset

In [8]:
ds_original = ds_original.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 4)

     

#0:   0%|          | 0/341 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/341 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/340 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/340 [00:00<?, ?ex/s]

In [9]:
def find_first_zero_index(lst):
    try:
        return lst.index(0)
    except ValueError:
        return -1

In [11]:
def set_last_sequence(row):
    n_seq = len(row['input_ids'])
    
    token_pos_doc = np.array([[-1] * max_model_input_length] * n_seq)
    for seq in range(n_seq):
        token_pos_doc[seq][1:-1] = np.arange(seq*(max_model_input_length - 2), (seq+1)*(max_model_input_length - 2))
    row['token_pos_doc'] = token_pos_doc
    del token_pos_doc
    
    if n_seq >= 2:
        zero_index = find_first_zero_index(row['input_ids'][-1])
        if zero_index != -1:
        # fix input_ids 
            row['input_ids'][-1][-zero_index+1:] = row['input_ids'][-1][1:zero_index]
            row['input_ids'][-1][1:-zero_index+1] = row['input_ids'][-2][zero_index-1: -1]

        # fix offset_mapping
            row['offset_mapping'][-1][-zero_index+1:] = row['offset_mapping'][-1][1:zero_index]
            row['offset_mapping'][-1][1:-zero_index+1] = row['offset_mapping'][-2][zero_index-1 : -1]    

        # fix attention_mask
            row['attention_mask'][-1] = [1] * max_model_input_length
            
        # fix token_pos_doc
            row['token_pos_doc'][-1][-zero_index:-1] = row['token_pos_doc'][-1][:zero_index-1]
            row['token_pos_doc'][-1][1:-zero_index+1] = row['token_pos_doc'][-2][zero_index-1 : -1]    
    
    return row 



In [12]:
tokenized_ds_wo_predictions = ds_original.map(set_last_sequence, num_proc = 4)

     

#0:   0%|          | 0/341 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/341 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/340 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/340 [00:00<?, ?ex/s]

In [13]:
def mean_prediction(dataframe_wo_predictions, predictions_tensor):
    '''
    PARAMETERS:
        -  dataframe_wo_predictions pd.DataFrame containing one input_id for each row 
        -  predictions_tensor a (n_documents, 512, 13) tensor containing the predictions
    
    OUTPUT:
        -  np.array containing the labels for a single document
    '''
    predictions_tensor = tf.math.softmax(predictions_tensor.predictions, axis=-1)
    predictions_array = np.concatenate(predictions_tensor)
    dataframe_wo_predictions['predictions'] = [row for row in predictions_array]
    
    predictions_df_grouped = dataframe_wo_predictions.groupby(['document', 'token_pos_doc'], group_keys = False)['predictions'].apply('mean').reset_index()
    predictions_df_grouped = predictions_df_grouped.sort_values(by=['document', 'token_pos_doc'])
    predictions_df_grouped = predictions_df_grouped.reset_index()
    
    predicted_labels = np.array([np.array(arr) for arr in predictions_df_grouped['predictions']])
    
    return predicted_labels[1:] # no start/end special tokens
    

In [14]:
from transformers import TrainingArguments, Trainer
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=16, 
    report_to="none",
)
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=data_collator, 
    tokenizer=tokenizer,
)

In [None]:
n_documents = len(tokenized_ds_wo_predictions)
tokenized_ds_wo_predictions.set_format('pandas')
preds_final = []

# predict a single document each time, in this way we are saving in RAM only one document at a time
# while the others stay in the disk
for i in range(n_documents):
    # getting a single document inside RAM memory
    df_document_i = tokenized_ds_wo_predictions[i]
    
    # input_ids is a list of lists so it needs to be exploded in order to make predictions
    df_document_i = df_document_i.drop(columns = ['full_text', 'tokens', 'trailing_whitespace',
     'offset_mapping', 'overflow_to_sample_mapping',
       'token_map', ])
    exploded_df = df_document_i.explode(['input_ids','attention_mask', 'token_pos_doc'])
    
    predictions_document_i = trainer.predict(Dataset.from_pandas(exploded_df))
    
    # we need to explode again in order to make each row corresponding to a single input_id (token)
    exploded_df = exploded_df.drop(columns = ['attention_mask'])
    exploded_df = exploded_df.explode(['input_ids', 'token_pos_doc'])
    predictions_document_i = mean_prediction(exploded_df, predictions_document_i)
    
    preds_final.append(predictions_document_i)

In [17]:
# # getting a single document inside RAM memory
# df_document_i = tokenized_ds_wo_predictions[3]

# # input_ids is a list of lists so it needs to be exploded in order to make predictions
# df_document_i = df_document_i.drop(columns = ['full_text', 'trailing_whitespace',
#  'offset_mapping', 'overflow_to_sample_mapping', 'tokens',
#    'token_map', ])
# exploded_df = df_document_i.explode(['input_ids', 'attention_mask', 'token_pos_doc'])

# # the dataframe is converted into a tf tensor and used to make predictions
# tensor_document_i = (Dataset.from_pandas(exploded_df)).to_tf_dataset(
# columns=['input_ids', 'attention_mask'],
# batch_size=len(tokenized_ds_wo_predictions[-1]),
# shuffle=False,
# collate_fn=data_collator,
# )
# predictions_document_i = model.predict(tensor_document_i)

# # we need to explode again in order to make each row corresponding to a single input_id
# exploded_df = exploded_df.explode(['input_ids', 'token_pos_doc'])
# dataframe_wo_predictions = exploded_df.copy() 
# predictions_tensor = predictions_document_i.copy() 
# predictions_tensor = tf.math.softmax(predictions_tensor.logits, axis=-1)
# predictions_array = np.concatenate(predictions_tensor)
# dataframe_wo_predictions['predictions'] = [row for row in predictions_array]
# predictions_df_grouped = dataframe_wo_predictions.groupby(['document', 'token_pos_doc'], group_keys = False)['predictions'].apply('mean').reset_index()
# predictions_df_grouped = predictions_df_grouped.sort_values(by=['document', 'token_pos_doc'])
# predictions_df_grouped = predictions_df_grouped.reset_index()
# predicted_labels = np.array([np.array(arr) for arr in predictions_df_grouped['predictions']])
# predicted_labels = np.argmax(predicted_labels, axis = -1)



    

In [18]:
import json
from pathlib import Path

config = json.load(open(Path('/kaggle/input/gianni/giannilbert/giannilbert') / "config.json"))
id2label = config["id2label"]  

In [19]:
def fix_offset_mapping(doc):
    '''Remove special tokens (start/end of sequence) from offset mapping column'''
    new_offset = []
    for seq_offset in doc['offset_mapping']:
        reduced_seq_offset = seq_offset[1:-1]
        new_offset.extend(reduced_seq_offset)
    doc['offset_mapping'] = new_offset
    return doc

ds_original = ds_original.map(fix_offset_mapping)
        

  0%|          | 0/1362 [00:00<?, ?ex/s]

In [20]:
# Choose the aggregation strategy in case of conflict:
#     -'first': the prediction for the first subtoken decides how the whole tkoen is classified.
#     -'max': the prediction of the subtoken whose confidence is the highest decides how the whole tkoen is classified.
#     -'average': the prediction for the whole is decided by performing the mean of the predictions for all the subtokens
aggregation_strategy = 'average'

In [21]:
def max_prob_vct(lists):
    max_list = max(lists, key=max)
    return max_list

In [22]:
ds_original

Dataset({
    features: ['full_text', 'document', 'tokens', 'trailing_whitespace', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'token_map'],
    num_rows: 1362
})

In [23]:
import pandas as pd

# Prepare to plunder the data for valuable triplets!
# document, token, label, token_str = [], [], [], []
all_documents_predictions_df = pd.DataFrame(columns = ['document', 'label', 'token','token_str', 'prob'])
i = 0
# Iterate over the documents (each row in the original_ds is a distinct document)
for p, token_map, offsets, tokens, doc in zip(preds_final, 
                                              ds_original["token_map"], 
                                              ds_original["offset_mapping"], 
                                              ds_original["tokens"], 
                                              ds_original["document"]):

    document_predictions_df = pd.DataFrame(columns = ['document', 'label', 'token','token_str', 'prob'])

    # Iterate through each token prediction in the document
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(np.argmax(np.array(token_pred)))]

        
        
        # If start and end indices sum to zero (special token), continue to the next iteration
        # Since we have removed special tokens for beginning/end of a sequence, we should never
        # enter this.
        if start_idx + end_idx == 0:
            continue

        # If the token mapping at the start index is a whitespace (-1), increment start index
        if token_map[start_idx] == -1:
            start_idx += 1

        # Ignore leading whitespace tokens ("\n\n")
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # If start index exceeds the length of token mapping, break the loop (end of document)
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]  # relative original token
        
        keep = token_pred[-1] < 0.99999
        # Ignore whitespace tokens
        if token_id != -1 and keep:
#         if token_id != -1 :
#             new_prediction = {'document': doc, 'label': label_pred, 'token': token_id, 'token_str': tokens[token_id], 'prob': token_pred  }
            new_prediction = [ doc,label_pred, token_id,  tokens[token_id], token_pred]

            document_predictions_df.loc[len(document_predictions_df)] = new_prediction
#             new_prediction = {'document': doc, 'label': label_pred, 'token': token_id, 'token_str': tokens[token_id], 'prob': token_pred  }
#             document_predictions_df = pd.concat([document_predictions_df,new_prediction], ignore_index=True)
    
#     grouped = document_predictions_df.groupby(['document','token','token_str'])
#     document_predictions_df = grouped.filter(lambda x: (x['label'] != 'O').any())
    
    if aggregation_strategy == 'first':
        document_predictions_df = document_predictions_df.groupby(['document','token','token_str'], group_keys = False, sort = False).first().reset_index()
    elif aggregation_strategy == 'average':
        document_predictions_df = document_predictions_df.groupby(['document','token','token_str', 'label'], group_keys = False, sort = False)['prob'].mean().reset_index()
    elif aggregation_strategy == 'max':
        document_predictions_df = document_predictions_df.groupby(['document','token','token_str'], group_keys = False, sort = False)['prob'].agg(max_prob_vct).reset_index()
    all_documents_predictions_df = pd.concat([all_documents_predictions_df, document_predictions_df], ignore_index=True)
    


In [26]:
all_documents_predictions_df[:60]

Unnamed: 0,document,label,token,token_str,prob
0,11301,B-EMAIL,0,jarviscindy@hotmail.com,"[1.0, 2.2790219e-09, 1.546413e-09, 2.0085469e-..."
1,11301,B-EMAIL,441,jarviscindy@hotmail.com,"[1.0, 2.148762e-09, 1.5796331e-09, 2.286665e-0..."
2,3732,B-NAME_STUDENT,0,Jorge,"[8.0853996e-10, 8.5100954e-10, 1.0, 2.0592634e..."
3,3732,I-NAME_STUDENT,1,Garrido,"[1.2815753e-09, 1.0856567e-09, 1.1890176e-09, ..."
4,5470,B-NAME_STUDENT,10,Alexandra,"[7.0177797e-10, 7.6005435e-10, 1.0, 2.1657924e..."
5,5470,I-NAME_STUDENT,11,Beyer,"[1.1038538e-09, 1.0896252e-09, 1.8227403e-09, ..."
6,22088,B-NAME_STUDENT,17,María,"[5.335832e-10, 7.658418e-10, 1.0, 2.0557904e-0..."
7,22088,I-NAME_STUDENT,18,José,"[1.230865e-09, 1.2614872e-09, 1.4320612e-09, 1..."
8,22088,I-NAME_STUDENT,19,Reyes,"[1.1293931e-09, 1.3400011e-09, 1.3222492e-09, ..."
9,12077,B-NAME_STUDENT,603,Kate,"[9.493654e-10, 8.978064e-10, 1.0, 2.1216033e-0..."


In [33]:
# the original train dataset does not have a column that associates the provided token to a number so here we add it
def add_token_id(row):
    row['token'] = np.arange(len(row['tokens']))
    return row
ds_original_w_labels = ds_original_w_labels.map(add_token_id, num_proc = 4)

     

#0:   0%|          | 0/341 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/341 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/340 [00:00<?, ?ex/s]

 

#3:   0%|          | 0/340 [00:00<?, ?ex/s]

In [34]:
ds_original_w_labels.set_format('pandas')
ds_original_w_labels = ds_original_w_labels[:]
ds_original_w_labels = ds_original_w_labels.explode(['provided_labels', 'token'])
all_positives = ds_original_w_labels[ds_original_w_labels['provided_labels'] != 'O']
all_positives = all_positives.rename(columns = {'provided_labels': 'label'})

In [35]:
total_predictions_df = all_documents_predictions_df.copy()

In [36]:
def assess_performance_threshold(all_documents_predictions_df, threshold):

    matrix_probs = np.array([np.array(row) for row in all_documents_predictions_df['prob']])
    preds_normal = np.argmax(matrix_probs,axis = -1)
    preds_without_O = np.argmax(matrix_probs[:,:12],axis = -1)
    O_preds = matrix_probs[:,-1]
    labels = np.where(O_preds < threshold, preds_without_O , preds_normal)
    labels = np.array([id2label[str(label)] for label in labels])
    all_documents_predictions_df['label'] = labels
    all_documents_predictions_df = all_documents_predictions_df.drop(columns = ['prob'])
    all_documents_predictions_df = all_documents_predictions_df[all_documents_predictions_df['label'] != 'O']
    all_documents_predictions_df = all_documents_predictions_df.reset_index()
    all_documents_predictions_df = all_documents_predictions_df.drop(columns = ['index'])
    all_documents_predictions_df['row_id'] = all_documents_predictions_df.index
    true_positives = pd.merge(all_documents_predictions_df, all_positives, on=['document', 'label', 'token'])
    precision = len(true_positives)/ len(all_documents_predictions_df)
    recall = len(true_positives)/ len(all_positives)
    f5 = (1+25)*(precision*recall/(25*precision + recall))
    return f5

In [47]:
thresholds = 1-np.logspace(-6, -1, num=1000) 
f5_scores = []
for threshold in thresholds:
    f5_scores.append(assess_performance_threshold(total_predictions_df, threshold))


In [48]:
idx = np.argmax(np.array(f5_scores))

In [52]:
thresholds[idx]

0.9999073240669886

In [51]:
f5_scores[idx]

0.9694511676587949