In [1]:
import json 
import re

import numpy as np 
import pandas as pd
from tqdm.auto import tqdm

NAME_STUDENT - BI-LSTM
EMAIL - REGEX
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - REGEX
URL_PERSONAL - REGEX
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

In [2]:
# Read the training documents; every entry is accessed below through its
# 'document', 'tokens' and 'labels' keys.
with open('data/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten to one row per token: position of the document inside `data`,
# the document's own 'document' value, the token, and the token's label.
rows = []
for doc_i, doc in enumerate(tqdm(data)):
    for j, token in enumerate(doc['tokens']):
        rows.append([doc_i, doc['document'], token, doc['labels'][j]])
text_df = pd.DataFrame(rows, columns=['doc_i', 'document', 'token', 'label'])

  0%|          | 0/6807 [00:00<?, ?it/s]

In [4]:
# url PII
# Scheme tail + host (+ optional port). Because the pattern is used with
# search(), the leading 'tps?://' hits the end of http://, https://, ftp://
# and ftps:// alike.
url_regex = re.compile(
    r'tps?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?', re.IGNORECASE  # optional port
)
# Substrings that must also be present for a URL token to be flagged.
# The extension dots are escaped: an unescaped '.' matches any character, so
# the former '.htm?' fired on any "<char>ht" sequence (e.g. "ght" in a host name).
# NOTE(review): 'tate' and 'moore' look dataset-specific -- confirm they are intended.
perc_url = re.compile(
    r'\.html?|\.php\b|\.jsp\b|\.asp\b|'
    r'[\./]linkedin|[\./]youtu|[\./]facebook|[\./]tate|[\./]moore'
)
# A token is flagged only when it matches both patterns.
text_df['is_personal_url'] = text_df.token.apply(
    lambda x: perc_url.search(x) is not None and url_regex.search(x) is not None
)

# email PII
# <anything but '@'> @ <domain label> . <domain label>
# '-' is accepted inside the domain labels; with the former [\w]+ class an
# address such as user@my-school.edu could not match at all. Every token the
# old pattern matched is still matched.
email_regex = re.compile(r'[^@]+@[\w-]+\.[\w-]+') # email reg res
text_df['is_email'] = text_df.token.apply(lambda x: email_regex.search(x) is not None)

In [9]:
# Rows whose token matched the email pattern.
text_df[text_df['is_email']]

Unnamed: 0,doc_i,document,token,label,is_personal_url,is_email
20978,24,379,djones@gmail.com,B-EMAIL,False,True
71156,86,2769,matthew72@hotmail.com,B-EMAIL,False,True
106633,134,3709,belindarojas@yahoo.com,B-EMAIL,False,True
106635,134,3709,kennethevans@hotmail.com,B-EMAIL,False,True
138705,171,4227,agood@gmail.com,B-EMAIL,False,True
139228,171,4227,agood@gmail.com,B-EMAIL,False,True
150914,185,4381,hwillis@gmail.com,B-EMAIL,False,True
156553,191,4438,kellyharrison@gmail.com,B-EMAIL,False,True
157119,191,4438,kellyharrison@gmail.com,B-EMAIL,False,True
160579,195,4465,lowetyler@hotmail.com,B-EMAIL,False,True


In [5]:
# phone PII
# Accepted shapes:
#   (ddd)ddd-dddd, optionally followed by 'x' and up to three digits
#   ddd.ddd?dddd, where '?' stands for any single character
# NOTE(review): the second separator of the dotted form is an unescaped '.',
# so it matches any character -- confirm whether r'\.' was intended.
phone_regex = re.compile(
    r'\(\d{3}\)\d{3}-\d{4}'
    r'x?\d{0,3}|'
    r'\d{3}\.\d{3}.\d{4}'
)

phone_labeling = []   # rows: [index in `data`, document id, token index, token, label]
token_pointers = []   # rows: [token start, token end, token, original label]

for i, doc in enumerate(data):
    full_text = doc['full_text']
    tokens = doc['tokens']
    doc_i = doc['document']

    # Documents without any phone-like match are skipped entirely.
    reg_result = phone_regex.search(full_text)
    if reg_result is None:
        continue

    phone_st, phone_ed = reg_result.span()
    # `pointer` is the offset of the current token in the original text;
    # `full_text` is consumed from the front as tokens are walked.
    pointer = 0

    for tok_i, it in enumerate(tokens):
        token_len = len(it)
        token_ed = pointer + token_len
        token_pointers.append([pointer, token_ed, it, doc['labels'][tok_i]])

        # Past the current match: look for the next one in the remaining text
        # and shift its span back to offsets in the original text.
        if pointer > phone_ed:
            reg_result = phone_regex.search(full_text)
            if reg_result is None:
                break
            phone_st, phone_ed = reg_result.span()
            phone_st += pointer
            phone_ed += pointer

        if pointer == phone_st:
            phone_labeling.append([i, doc_i, tok_i, it, 'B-PHONE_NUM'])
        elif (
            (pointer < phone_ed and token_ed >= phone_ed)     # token runs over the match end
            or (pointer > phone_st and token_ed <= phone_ed)  # token lies inside the match
        ):
            phone_labeling.append([i, doc_i, tok_i, it, 'I-PHONE_NUM'])

        # End-of-text guard. It runs AFTER labeling so that a phone number
        # finishing the document still gets its last token labeled; it also
        # protects the index access just below.
        if token_len >= len(full_text):
            break

        # Only a single ' ' after the token is skipped; any other whitespace
        # is presumably present as its own token -- TODO confirm against the data.
        if full_text[token_len] == ' ':
            token_len += 1
            if token_len >= len(full_text):
                break

        pointer += token_len
        full_text = full_text[token_len:]

phone_labeling = pd.DataFrame(phone_labeling, columns=['data_row', 'doc_i', 'token_i', 'token', 'label'])

In [6]:
# Show the token-level phone labels produced by the regex pass above.
phone_labeling

Unnamed: 0,data_row,doc_i,token_i,token,label
0,185,4381,18,(,B-PHONE_NUM
1,185,4381,19,320)202,I-PHONE_NUM
2,185,4381,20,-,I-PHONE_NUM
3,185,4381,21,0688x95843,I-PHONE_NUM
4,219,4777,443,(,B-PHONE_NUM
5,219,4777,444,223)392,I-PHONE_NUM
6,219,4777,445,-,I-PHONE_NUM
7,219,4777,446,2765,I-PHONE_NUM
8,350,6243,11,(,B-PHONE_NUM
9,350,6243,12,820)913,I-PHONE_NUM
