# Import the libraries

In [None]:
import transformers
import json
import numpy as np
import tensorflow as tf
# Re-create the `np.object` attribute on the numpy module before the next import.
# NOTE(review): presumably a compatibility shim for a dependency that still
# references the `np.object` alias — confirm it is still required.
np.object = object
import torch

from datasets import Dataset
# Print the installed transformers version so the run can be reproduced.
print(transformers.__version__)

We also quickly upload some telemetry - this tells us which examples and software versions are getting used so we know where to prioritize our maintenance efforts. We don't collect (or care about) any personally identifiable information, but if you'd prefer not to be counted, feel free to skip this step or delete this cell entirely.

## Import the trained model (model, tokenizer and data collator)

In [None]:
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)

# Token-classification model and its tokenizer, both read from local Kaggle inputs.
model = AutoModelForTokenClassification.from_pretrained(
    "/kaggle/input/distilbert-all-data/giannilbert/giannilbert"
)
tokenizer = AutoTokenizer.from_pretrained(
    "/kaggle/input/distilbert-all-data/giannitokenizer/giannitokenizer"
)
# Collator used to batch examples; batch lengths are padded to a multiple of 16.
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

## Import test dataset

In [None]:
# Load the test documents. Each element is read further down through the keys
# "full_text", "document", "tokens" and "trailing_whitespace".
# The context manager closes the file handle as soon as parsing is done
# (the previous bare open() call left it open).
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as test_file:
    data_test = json.load(test_file)

In [None]:
# Column-oriented view of the test set: one row per document, restricted to the
# four fields used by the rest of the notebook.
ds_original = Dataset.from_dict({
    column: [record[column] for record in data_test]
    for column in ("full_text", "document", "tokens", "trailing_whitespace")
})

## Define the function to tokenize the dataset:
### - The text is reconstructed from the given tokens so that labels correspond exactly to the original tokens (not strictly necessary, but it avoids any discrepancy).
### - token_map maps each character of the reconstructed text to its word_index, i.e. the position of the word in the document (first word in the document -> 0); characters of a trailing whitespace are mapped to -1.
### - The offset mapping is returned too

In [None]:
# Length of one tokenized sequence, special tokens included. set_last_sequence
# uses it to size its per-sequence position table and attention mask, so it must
# match the length that tokenize pads/truncates to (512).
max_model_input_length = 512

In [None]:
def tokenize(example, tokenizer, max_length=512):
    """Rebuild a document's text from its word tokens and tokenize it.

    The text is reassembled from ``example["tokens"]`` and
    ``example["trailing_whitespace"]`` so that character offsets returned by
    the tokenizer can be translated back to word indices through ``token_map``.

    Args:
        example: Mapping with a "tokens" list of strings and a parallel
            "trailing_whitespace" list of truthy/falsy flags.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; its
            result must support ``**`` unpacking.
        max_length: Length every sequence is padded/truncated to. Defaults to
            512, the value that used to be hard-coded here. Keep it equal to
            ``max_model_input_length``, which later steps rely on.

    Returns:
        A dict holding every entry of the tokenizer output plus "token_map":
        one integer per character of the rebuilt text, equal to the index of
        the word the character belongs to, or -1 for a trailing whitespace.
    """
    pieces = []
    token_map = []

    for word_index, (token, has_space) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(token)
        # Every character of the word points back to the word's index.
        token_map.extend([word_index] * len(token))

        if has_space:
            pieces.append(" ")
            # Whitespace does not belong to any word.
            token_map.append(-1)

    # Texts longer than max_length are split into several sequences
    # (return_overflowing_tokens) and each one is padded to max_length.
    tokenized = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_overflowing_tokens=True,
    )

    return {
        **tokenized,
        "token_map": token_map,
    }

## Tokenize the dataset

In [None]:
# Run tokenize on every document using 4 worker processes; the tokenizer is
# handed to the function through fn_kwargs.
ds_original = ds_original.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc = 4)

In [None]:
def find_first_zero_index(lst):
    """Return the index of the first element equal to 0 in *lst*, or -1 when there is none."""
    return lst.index(0) if 0 in lst else -1

In [None]:
def set_last_sequence(row):
    """Add a position table to *row* and left-fill its padded final sequence.

    A 'token_pos_doc' array of shape (n_seq, max_model_input_length) is added:
    the first and last slot of every sequence hold -1, the slots in between hold
    consecutive integers that keep counting from one sequence to the next.

    When the document has at least two sequences and the last one contains a 0
    in its input ids, the entries of the last sequence that precede that 0 are
    moved to the end of the sequence and the freed slots are filled with the
    tail of the previous sequence. 'offset_mapping' and 'token_pos_doc' are
    shifted the same way and the last attention mask is set to all ones.

    NOTE(review): presumably 0 is the padding id of the tokenizer — confirm.

    Returns the same *row* object, modified in place.
    """
    n_seq = len(row['input_ids'])
    
    # Every slot starts at -1; slots 1..-2 of sequence `seq` then receive the
    # running positions seq*(L-2) .. (seq+1)*(L-2)-1, with L = max_model_input_length.
    token_pos_doc = np.array([[-1] * max_model_input_length] * n_seq)
    for seq in range(n_seq):
        token_pos_doc[seq][1:-1] = np.arange(seq*(max_model_input_length - 2), (seq+1)*(max_model_input_length - 2))
    row['token_pos_doc'] = token_pos_doc
    # Only the local name is removed; row['token_pos_doc'] still references the array.
    del token_pos_doc
    
    if n_seq >= 2:
        # Index of the first 0 in the last sequence (-1 when it contains none).
        zero_index = find_first_zero_index(row['input_ids'][-1])
        if zero_index != -1:
            # fix input_ids: move entries 1..zero_index-1 to the end of the
            # sequence, then fill slots 1..-zero_index with the tail of the
            # previous sequence (its last slot excluded).
            row['input_ids'][-1][-zero_index+1:] = row['input_ids'][-1][1:zero_index]
            row['input_ids'][-1][1:-zero_index+1] = row['input_ids'][-2][zero_index-1: -1]

            # fix offset_mapping: same shift as for input_ids
            row['offset_mapping'][-1][-zero_index+1:] = row['offset_mapping'][-1][1:zero_index]
            row['offset_mapping'][-1][1:-zero_index+1] = row['offset_mapping'][-2][zero_index-1 : -1]    

            # fix attention_mask: every slot of the last sequence is now attended to
            row['attention_mask'][-1] = [1] * max_model_input_length
            
            # fix token_pos_doc: the slices are shifted by one compared to
            # input_ids so that the final slot keeps its -1; the leading -1 that
            # is copied along is overwritten by the second assignment.
            row['token_pos_doc'][-1][-zero_index:-1] = row['token_pos_doc'][-1][:zero_index-1]
            row['token_pos_doc'][-1][1:-zero_index+1] = row['token_pos_doc'][-2][zero_index-1 : -1]
            # row['token_type_ids'][-1][-zero_index+1:] = row['token_type_ids'][-1][1:zero_index]
            # row['token_type_ids'][-1][1:-zero_index+1] = row['token_type_ids'][-2][zero_index-1 : -1]
    
    return row 

In [None]:
# Apply set_last_sequence to every document using 2 worker processes. The result
# gets its own name; ds_original itself is not reassigned here.
tokenized_ds_wo_predictions = ds_original.map(set_last_sequence, num_proc = 2)

In [None]:
def mean_prediction(dataframe_wo_predictions, predictions_tensor):
    '''
    Turn logits into probabilities and average them over rows that share the
    same (document, token_pos_doc) pair.

    PARAMETERS:
        -  dataframe_wo_predictions: DataFrame containing one input_id for each row, with
           at least the columns 'document' and 'token_pos_doc'. A 'predictions' column
           is added to it in place.
        -  predictions_tensor: object whose `.predictions` attribute holds the logits
           (according to the original note, shaped (n_sequences, 512, 13) — TODO confirm).
    
    OUTPUT:
        -  np.array with one averaged probability vector per (document, token_pos_doc)
           group, ordered by those keys, without the first group.
    '''
    # Softmax over the last axis converts the logits into class probabilities.
    predictions_tensor = tf.math.softmax(predictions_tensor.predictions, axis=-1)
    # Join the per-sequence blocks along the first axis: one probability vector per row.
    predictions_array = np.concatenate(predictions_tensor)
    dataframe_wo_predictions['predictions'] = [row for row in predictions_array]
    
    # Rows with equal (document, token_pos_doc) are averaged into a single vector.
    predictions_df_grouped = dataframe_wo_predictions.groupby(['document', 'token_pos_doc'], group_keys = False)['predictions'].apply('mean').reset_index()
    predictions_df_grouped = predictions_df_grouped.sort_values(by=['document', 'token_pos_doc'])
    predictions_df_grouped = predictions_df_grouped.reset_index()
    
    predicted_labels = np.array([np.array(arr) for arr in predictions_df_grouped['predictions']])

    
    # The first group after sorting has the smallest token_pos_doc.
    # NOTE(review): this assumes that group is the -1 position shared by the
    # start/end special tokens — confirm a -1 row is always present.
    return predicted_labels[1:] # no start/end special tokens
    

In [None]:
from transformers import TrainingArguments, Trainer
# In this notebook the Trainer is only used through trainer.predict (inference).
# "." is the first positional argument of TrainingArguments; evaluation batches
# hold 16 sequences per device and report_to is set to "none".
args = TrainingArguments(
    ".", 
    per_device_eval_batch_size=16, 
    report_to="none",
)
# Bundle the model, the arguments, the padding collator and the tokenizer.
trainer = Trainer(
    model=model, 
    args=args, 
    data_collator=data_collator, 
    tokenizer=tokenizer,
)

In [None]:
n_documents = len(tokenized_ds_wo_predictions)
# With the 'pandas' format, indexing the dataset yields a DataFrame
# (the loop below calls DataFrame methods on the result).
tokenized_ds_wo_predictions.set_format('pandas')
# One array of averaged per-position probabilities per document, in dataset order.
preds_final = []

# predict a single document each time, in this way we are saving in RAM only one document at a time
# while the others stay in the disk
for i in range(n_documents):
    # getting a single document inside RAM memory
    df_document_i = tokenized_ds_wo_predictions[i]
    
    # input_ids is a list of lists so it needs to be exploded in order to make predictions
    # Columns that are not needed for prediction or for the averaging step are dropped first.
    df_document_i = df_document_i.drop(columns = ['full_text', 'tokens', 'trailing_whitespace',
     'offset_mapping', 'overflow_to_sample_mapping',
       'token_map', ])
    # First explode: one row per sequence of the document.
    exploded_df = df_document_i.explode(['input_ids','attention_mask', 'token_pos_doc',
#                                          'token_type_ids'
                                        ])
    
    predictions_document_i = trainer.predict(Dataset.from_pandas(exploded_df))
    
    # we need to explode again in order to make each row corresponding to a single input_id (token)
    exploded_df = exploded_df.drop(columns = ['attention_mask'])
    exploded_df = exploded_df.explode(['input_ids', 'token_pos_doc'])
    # Average the probabilities of rows sharing a token_pos_doc value.
    predictions_document_i = mean_prediction(exploded_df, predictions_document_i)
    
    preds_final.append(predictions_document_i)

In [None]:
import json
from pathlib import Path

# Read the label mapping stored in the model's config.json. The mapping comes
# straight from JSON, so its keys are strings and it is indexed with
# str(class_index) further down.
# The context manager closes the file handle once parsing is done
# (the previous bare open() call left it open).
config_path = Path('/kaggle/input/distilbert-all-data/giannilbert/giannilbert') / "config.json"
with config_path.open(encoding="utf-8") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

In [None]:
def fix_offset_mapping(doc):
    """Flatten the per-sequence offsets, dropping each sequence's first and last entry.

    Those two entries are the slots of the start/end-of-sequence special
    tokens. The 'offset_mapping' value of *doc* is replaced by the flat list
    and *doc* itself is returned.
    """
    doc['offset_mapping'] = [
        pair
        for sequence in doc['offset_mapping']
        for pair in sequence[1:-1]
    ]
    return doc

# Replace ds_original's per-sequence offset_mapping with one flat list per document.
ds_original = ds_original.map(fix_offset_mapping)
        

In [None]:
# Choose the aggregation strategy used when the subtokens of one token disagree:
#     -'first': the prediction for the first subtoken decides how the whole token is classified.
#     -'max': the prediction of the subtoken whose confidence is the highest decides how the whole token is classified.
#     -'average': the prediction for the whole token is the mean of the predictions of all its subtokens.
aggregation_strategy = 'average'

In [None]:
def max_prob_vct(lists):
    """Return the vector of *lists* that contains the largest single value.

    When several vectors share that largest value, the first one is returned.
    """
    return max(lists, key=lambda vector: max(vector))

# Set the threshold and make predictions

In [None]:
# Cut-off compared with the probability of the last class (named O_preds where it
# is used): when that probability is below the threshold, the most likely of the
# remaining classes is chosen instead of the overall argmax.
threshold =  0.9998688828828829

In [None]:
import pandas as pd

# Accumulator for the aggregated predictions of every document.
# NOTE(review): it grows through one pd.concat per document and the per-document
# frame grows through one .loc append per subtoken; both copy data repeatedly.
# Collecting rows in a list and building the frames once would be cheaper.
all_documents_predictions_df = pd.DataFrame(columns = ['document', 'label', 'token','token_str', 'prob'])

# Iterate over the documents (each row in ds_original is a distinct document)
for p, token_map, offsets, tokens, doc in zip(preds_final, 
                                              ds_original["token_map"], 
                                              ds_original["offset_mapping"], 
                                              ds_original["tokens"], 
                                              ds_original["document"]):
    
    document_predictions_df = pd.DataFrame(columns = ['document', 'label', 'token','token_str', 'prob'])

    # Iterate through each token prediction in the document
    # (zip stops at the shorter of the predictions and the offsets).
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        # id2label has string keys, hence the str() around the argmax.
        label_pred = id2label[str(np.argmax(np.array(token_pred)))]
        
        # If start and end indices sum to zero, the entry has a (0, 0) offset: skip it.
        # NOTE(review): only the first and last slot of each sequence were removed
        # from offset_mapping, so (0, 0) entries may still occur here (e.g. padded
        # slots) — confirm before treating this branch as unreachable.
        if start_idx + end_idx == 0:
            continue

        # If the token mapping at the start index is a whitespace (-1), increment start index
        if token_map[start_idx] == -1:
            start_idx += 1

        # Ignore leading whitespace tokens ("\n\n")
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # If start index exceeds the length of token mapping, break the loop (end of document)
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]  # relative original token
#         keep = label_pred != 'O'
        # Keep only subtokens whose last-class probability is below 0.99999.
        keep = token_pred[-1] < 0.99999
        # Ignore whitespace tokens
        if token_id != -1 and keep:
            new_prediction = [ doc,label_pred, token_id,  tokens[token_id], token_pred]

            # Append the row at the next integer position.
            document_predictions_df.loc[len(document_predictions_df)] = new_prediction

    
    # Collapse the subtoken rows of each original token into a single row.
    # The 'average' and 'max' branches select only 'prob', so the 'label' column
    # is not carried over; labels are assigned again after this loop.
    if aggregation_strategy == 'first':
        document_predictions_df = document_predictions_df.groupby(['document','token','token_str'], group_keys = False, sort = False).first().reset_index()
    elif aggregation_strategy == 'average':
        document_predictions_df = document_predictions_df.groupby(['document','token','token_str'], group_keys = False, sort = False)['prob'].mean().reset_index()
    elif aggregation_strategy == 'max':
        document_predictions_df = document_predictions_df.groupby(['document','token','token_str'], group_keys = False, sort = False)['prob'].agg(max_prob_vct).reset_index()
    all_documents_predictions_df = pd.concat([all_documents_predictions_df, document_predictions_df], ignore_index=True)
    


In [None]:
# One row of class probabilities per aggregated token.
matrix_probs = np.array([np.array(row) for row in all_documents_predictions_df['prob']])

# Two candidate class indices per token: the overall argmax, and the argmax
# restricted to the first 12 classes.
preds_normal = matrix_probs.argmax(axis=-1)
preds_without_O = matrix_probs[:, :12].argmax(axis=-1)

# Probability of the last class; below the threshold the restricted argmax wins.
O_preds = matrix_probs[:, -1]
labels = np.where(O_preds < threshold, preds_without_O, preds_normal)
labels = np.array([id2label[str(label)] for label in labels])
all_documents_predictions_df['label'] = labels

# Drop the probabilities, keep only the non-'O' rows and renumber them from 0.
all_documents_predictions_df = (
    all_documents_predictions_df
    .drop(columns=['prob'])
    .loc[lambda frame: frame['label'] != 'O']
    .reset_index(drop=True)
)
all_documents_predictions_df['row_id'] = all_documents_predictions_df.index

In [None]:
all_documents_predictions_df

In [None]:
# Write the remaining predictions to submission.csv without the DataFrame index.
all_documents_predictions_df.to_csv("submission.csv", index=False)