In [1]:
import json 
import re

import numpy as np 
import pandas as pd
from tqdm.auto import tqdm

In [2]:
with open('data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

text_df = []
for doc_i, doc in enumerate(tqdm(data)):
    for j in range(len(doc['tokens'])):
        text_df.append([doc_i, j, doc['document'], doc['tokens'][j], doc['labels'][j]])
text_df = pd.DataFrame(text_df, columns=['doc_i', 'token_i', 'document', 'token', 'label'])

  0%|          | 0/6807 [00:00<?, ?it/s]

In [3]:
# url PII
url_regex = re.compile(
    r'tps?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?', re.IGNORECASE
)
perc_url = re.compile(r'.htm?|.php\b|.jsp\b|.asp\b|[\./]linkedin|[\./]youtu|[\./]facebook|[\./]tate|[\./]moore')
text_df['is_personal_url'] = text_df.token.apply(lambda x: False if (perc_url.search(x) is None) | (url_regex.search(x) is None) else True)
url_df = text_df.loc[text_df['is_personal_url'], ['document', 'token_i', 'token', 'label']].reset_index(drop=True)

In [4]:
# email PII
email_regex = re.compile(r'[^@]+@[\w]+\.[\w]+') # email reg res
text_df['is_email'] = text_df.token.apply(lambda x: False if email_regex.search(x) is None else True)
email_df = text_df.loc[text_df['is_email'], ['document', 'token_i', 'token', 'label']].reset_index(drop=True)

In [14]:
# phone PII
phone_regex = re.compile(
    r'\(\d{3}\)\d{3}-\d{4}'
    r'x?\d{0,3}|'
    r'\d{3}\.\d{3}.\d{4}'
)

phone_labeling = []
token_pointers = []

for i, doc in enumerate(data):
    full_text = doc['full_text']
    tokens = doc['tokens']
    doc_i = doc['document']

    reg_result = phone_regex.search(full_text)
    if reg_result is None:
        continue 
    
    phone_st, phone_ed = reg_result.span()
    pointer = 0

    for tok_i, it in enumerate(tokens):
        token_len = len(it)
        token_pointers.append([pointer, pointer+token_len, it, doc['labels'][tok_i]])

        if token_len >= len(full_text):
            break

        if pointer > phone_ed:
            reg_result = phone_regex.search(full_text)
            if reg_result is None:
                break
            phone_st, phone_ed = reg_result.span()
            phone_st += pointer
            phone_ed += pointer

        if (pointer == phone_st):
            phone_labeling.append([doc_i, tok_i, it, 'B-PHONE_NUM'])

        elif (pointer < phone_ed) & (pointer+token_len >= phone_ed):
            phone_labeling.append([doc_i, tok_i, it, 'I-PHONE_NUM'])   

        elif (pointer > phone_st) & (pointer+token_len <= phone_ed):
            phone_labeling.append([doc_i, tok_i, it, 'I-PHONE_NUM'])   
    
        if full_text[token_len] == ' ':
            token_len += 1
            if token_len >= len(full_text):
                break

        pointer += token_len
        full_text = full_text[token_len:]

phone_df = pd.DataFrame(phone_labeling, columns=['document', 'token_i', 'token', 'label'])

In [23]:
reg_df = pd.concat((url_df, email_df, phone_df)).sort_values('document').reset_index(drop=True)

In [25]:
reg_df.label.unique()

array(['B-URL_PERSONAL', 'B-EMAIL', 'O', 'I-PHONE_NUM', 'B-PHONE_NUM'],
      dtype=object)

In [27]:
reg_df.loc[reg_df.label == 'O'].shape

(59, 4)

In [28]:
reg_df.shape

(229, 4)

In [29]:
(229 - 59) / 229

0.74235807860262

In [33]:
url_df.loc[url_df.label == 'O'].shape

(58, 4)