In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from datasets import Value, ClassLabel, Features, Sequence
from collections import defaultdict
from huggingface_hub import login

This notebook converts the raw JSON files of the PIE and MAGPIE datasets into the HuggingFace Dataset format for the NER task.

In [None]:
# Locations of the raw corpora, relative to the notebook's working directory.
PATH_PIE = '../data/raw/PIE_annotations_all_v2.json'
PATH_MAGPIE = '../data/raw/magpie-corpus.jsonl'

# Random seed passed to both train_test_split calls so the splits are reproducible.
SEED = 42

## Read data

In [None]:
# MAGPIE is stored as JSON Lines (one record per line), hence lines=True;
# the PIE file is read as a regular JSON document.
pie_df = pd.read_json(PATH_PIE)
magpie_df = pd.read_json(PATH_MAGPIE, lines=True)

In [None]:
# Column names, dtypes and non-null counts of the raw PIE frame.
pie_df.info()

In [None]:
# Column names, dtypes and non-null counts of the raw MAGPIE frame.
magpie_df.info()

## Prepare PIE data

In [None]:
def tokenize_text(text: str, offsets: list[list[int]]=None) -> tuple[list[str], list[str]]:
    '''
    Split a text into whitespace-separated word tokens and tag each token in IOB format.

    Each token in the source line is assumed to be separated by a space. All offsets are
    treated as parts of a single expression: the first tagged token receives 'B-PIE',
    every following tagged token receives 'I-PIE' and all remaining tokens receive 'O'.

    Args:
        text (str): Input text.
        offsets (list[list[int]]): list of [start, end] character indices of each tagged
            span in `text`. Spans are processed in order of their start index, so they
            do not have to be pre-sorted. None, an empty list or a numeric placeholder
            such as NaN means "no tagged spans".
    Returns:
        word_tokens (list[str]): list of word tokens
        pie_tokens (list[str]): list of corresponding NER labels, same length as word_tokens

    Example:
    text: 'The deal was negotiated behind closed doors .'
    offsets: [[24, 30], [31, 37], [38, 44]]

    Example idiom is 'behind closed doors' and function output will be
    ['The', 'deal', 'was', 'negotiated', 'behind', 'closed', 'doors', '.']
    ['O', 'O', 'O', 'O', 'B-PIE', 'I-PIE', 'I-PIE', 'O']
    '''
    # A missing value may arrive as None or as a float NaN; NaN is truthy, so a plain
    # `not offsets` check would let it through and fail on iteration.
    if offsets is None or isinstance(offsets, (int, float)) or len(offsets) == 0:
        word_tokens = text.split()
        return word_tokens, ['O'] * len(word_tokens)

    word_tokens = []
    pie_tokens = []
    start = 0
    is_first_pie_token = True

    # Walk the spans left to right; out-of-order spans would otherwise drop the text
    # lying between them.
    for offset in sorted(offsets, key=lambda span: span[0]):
        offset_start = offset[0]
        offset_end = offset[1]

        # Tokens between the previous span and the current one are outside the expression
        outside_tokens = text[start:offset_start].split()
        word_tokens.extend(outside_tokens)
        pie_tokens.extend(['O'] * len(outside_tokens))

        # Tokens covered by the current span
        span_tokens = text[offset_start:offset_end].split()
        word_tokens.extend(span_tokens)

        span_labels = ['I-PIE'] * len(span_tokens)
        # A span may contain no token at all (e.g. whitespace only); in that case the
        # 'B-PIE' label is kept for the first token that is actually tagged.
        if span_labels and is_first_pie_token:
            span_labels[0] = 'B-PIE'
            is_first_pie_token = False
        pie_tokens.extend(span_labels)
        start = offset_end

    # Add the substring after the last offset
    tail_tokens = text[start:].split()
    word_tokens.extend(tail_tokens)
    pie_tokens.extend(['O'] * len(tail_tokens))

    return word_tokens, pie_tokens

Remove PIE offsets from negative examples, i.e. sentences labelled as not containing the PIE. From the docs:<br>
`PIE_label: label indicating whether this sentence contains the PIE in question ('y') or not ('n')`

For example, the sentence 'They do this even though they may break the conventions from time to time .'
doesn't contain the 'break even' idiom.

In [None]:
# Number of examples per PIE_label value.
pie_df['PIE_label'].value_counts()

In [None]:
# Clear offsets for every row that is not labelled 'y'. The same condition decides
# `is_pie` below, so a row flagged as non-PIE can never keep B-/I- tags, even if the
# label column holds a value other than 'y' or 'n'.
pie_df.loc[pie_df['PIE_label'] != 'y', 'offsets'] = None

Extract NERs

In [None]:
# Take only the 3rd context sentence (position 2), the one with the NER inside.
# `.str[2]` indexes every list element-wise and keeps pie_df's own index, so the
# result cannot be misaligned the way a freshly built, RangeIndex-ed frame could be.
pie_df['context_pie_only'] = pie_df['context'].str[2]

In [None]:
# Tokenize every PIE sentence once; tokenize_text yields a (tokens, tags) pair per row.
pie_tagged = [
    tokenize_text(sentence, spans)
    for sentence, spans in zip(pie_df['context_pie_only'], pie_df['offsets'])
]
pie_df['tokens'] = [tokens for tokens, _ in pie_tagged]
pie_df['ner_tags'] = [tags for _, tags in pie_tagged]

Select only relevant features for final view 

In [None]:
# Boolean target: True only for sentences labelled as containing the PIE ('y').
pie_df['is_pie'] = pie_df['PIE_label'].eq('y')

In [None]:
# Keep only the columns that make up the final dataset schema.
pie_final_columns = ['idiom', 'is_pie', 'tokens', 'ner_tags']
pie_df = pie_df.loc[:, pie_final_columns]

In [None]:
# Preview the prepared PIE frame.
pie_df.head()

## Prepare MAGPIE data

In [None]:
# Preview the first two raw MAGPIE records.
magpie_df.head(2)

In [None]:
# Histogram of annotation confidence, used to choose the confidence threshold.
fig, ax = plt.subplots(figsize=(10, 3))
sns.histplot(magpie_df['confidence'], edgecolor='black', bins=20, ax=ax)
ax.set_title('MAGPIE labels confidence distribution')
ax.set_xlabel('Annotation confidence level')
ax.set_ylabel('Objects amount')
plt.show()

Following the authors of the corpus, we use a confidence threshold of 75%.
By analogy with the PIE corpus, all examples whose confidence does not exceed the threshold (`confidence <= 0.75`) receive a False label.

In [None]:
# An example counts as a PIE only when its annotation confidence is strictly above 0.75.
is_confident = magpie_df['confidence'].gt(0.75)
magpie_df['is_pie'] = is_confident
# Rows below the bar lose their offsets, so tokenize_text tags all their tokens with 'O'.
magpie_df.loc[~is_confident, 'offsets'] = None

In [None]:
# Keep only the context sentence at position 2, the one passed to tokenize_text
# together with the offsets. `.str[2]` indexes every list element-wise and preserves
# magpie_df's own index, avoiding index-alignment against a temporary frame.
magpie_df['context_pie_only'] = magpie_df['context'].str[2]

In [None]:
# Tokenize every MAGPIE sentence once; tokenize_text yields a (tokens, tags) pair per row.
magpie_tagged = [
    tokenize_text(sentence, spans)
    for sentence, spans in zip(magpie_df['context_pie_only'], magpie_df['offsets'])
]
magpie_df['tokens'] = [tokens for tokens, _ in magpie_tagged]
magpie_df['ner_tags'] = [tags for _, tags in magpie_tagged]

In [None]:
# Keep only the columns that make up the final dataset schema.
magpie_final_columns = ['idiom', 'is_pie', 'tokens', 'ner_tags']
magpie_df = magpie_df.loc[:, magpie_final_columns]

In [None]:
# Preview the prepared MAGPIE frame.
magpie_df.head()

## Convert DataFrames to HuggingFace Dataset

In [None]:
# Stack both corpora into one frame; ignore_index=True renumbers rows from 0.
df = pd.concat([pie_df, magpie_df], ignore_index=True)

Remove duplicates

In [None]:
# Rows are compared by their tokens concatenated without a separator, so sentences
# that differ only in token boundaries are counted as duplicates as well.
is_repeated = df['tokens'].str.join(sep='').duplicated()
dupl_amount = is_repeated.sum()
print(f'Duplicates amount:{dupl_amount}')
print(f'Duplicates ratio {dupl_amount/len(df) :.3f}')


In [None]:
# Keep the first occurrence of every sentence (tokens concatenated without a separator).
sentence_keys = df['tokens'].str.join(sep='')
first_occurrence = ~sentence_keys.duplicated()
df = df.loc[first_occurrence, :].reset_index(drop=True)

In [None]:
# Preview the merged, de-duplicated frame.
df.head()

In [None]:
# Persist the merged frame as CSV (the row index is written as the first column).
# NOTE(review): 'tokens' and 'ner_tags' hold Python lists, which CSV stores as their
# string representation — they need parsing when the file is read back.
df.to_csv('../data/processed/final_data.csv')

In [None]:
# Bar chart of positive (True) vs negative (False) example counts.
pie_counts = df['is_pie'].value_counts()

fig, ax = plt.subplots(figsize=(10, 3))
pie_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_title('True vs False PIEs amount')
ax.bar_label(ax.containers[-1], label_type='edge')
# Leave headroom for the bar labels relative to the tallest bar; a fixed upper limit
# (previously 60000) would clip bars and labels once a class grows beyond it.
ax.set_ylim(top=pie_counts.max() * 1.15)
plt.show()

Apply stratified split

In [None]:
# Two-stage stratified split: 80% train, then the remaining 20% is halved into
# validation and test. Both stages stratify on `is_pie` and share the same seed.
split_options = {'shuffle': True, 'random_state': SEED}

train, valid_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['is_pie'],
    **split_options,
)

valid, test = train_test_split(
    valid_test,
    test_size=0.5,
    stratify=valid_test['is_pie'],
    **split_options,
)

In [None]:
# Sizes of the train / validation / test splits.
len(train), len(valid), len(test)

In [None]:
# Class balance of each split, in train / validation / test order.
for split_frame in (train, valid, test):
    print(split_frame['is_pie'].value_counts())

Define feature schema and create DatasetDict

In [None]:
# IOB label set; a tag's position in this list is its ClassLabel id.
ner_label_names = ['O', 'B-PIE', 'I-PIE']

features = Features({
    "idiom": Value("string"),
    "is_pie": Value("bool"),
    "tokens": Sequence(Value('string')),
    "ner_tags": Sequence(ClassLabel(num_classes=len(ner_label_names), names=ner_label_names)),
})
                     

In [None]:
# Convert every pandas split into a Dataset with the shared schema; the pandas index
# is not stored as a column.
split_frames = {'train': train, 'validation': valid, 'test': test}
dataset_dict = DatasetDict({
    split_name: Dataset.from_pandas(split_frame, features=features, preserve_index=False)
    for split_name, split_frame in split_frames.items()
})

In [None]:
# Show the resulting DatasetDict.
dataset_dict

In [None]:
# Check the feature schema attached to the train split.
dataset_dict['train'].features

Save locally and push to hub

In [None]:
# Save all splits to a local directory.
dataset_dict.save_to_disk('../data/processed/pie_dataset')

In [None]:
# login()
# dataset_dict.push_to_hub("Gooogr/pie_idioms")