In [174]:
import json 
import re

import numpy as np 
import pandas as pd
from tqdm.auto import tqdm

PII label types, each followed by the detection approach chosen so far (or, where no approach is noted yet, by the label's definition):

NAME_STUDENT - BI-LSTM
EMAIL - REGEX
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - REGEX
URL_PERSONAL - REGEX
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

In [231]:
# Read the annotated training documents from disk.
with open('data/train.json', mode='r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the documents into one row per token. Each row carries the position
# of the document inside `data`, the document's own id, the token text and the
# label stored at the same index as the token.
token_rows = []
for doc_i, doc in enumerate(tqdm(data)):
    for j, token in enumerate(doc['tokens']):
        token_rows.append([doc_i, doc['document'], token, doc['labels'][j]])
text_df = pd.DataFrame(token_rows, columns=['doc_i', 'document', 'token', 'label'])

  0%|          | 0/6807 [00:00<?, ?it/s]

In [230]:
# Write the raw text of the first document whose id is 9854 to a file for
# manual inspection, then stop scanning. `full_text` is left holding that
# document's text.
for doc in data:
    if doc['document'] != 9854:
        continue
    full_text = doc['full_text']
    with open('text_with_email.txt', mode='w', encoding='utf-8') as f:
        f.write(full_text)
    break

In [232]:
# url PII
# A token is flagged as a personal URL only when BOTH patterns below match it.
#
# url_regex: the tail of a scheme ("tp://" or "tps://", so http, https, ftp and
# ftps all qualify) followed by a domain name, "localhost" or a dotted IPv4
# address, with an optional ":port".
url_regex = re.compile(
    r'tps?://'
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
    r'(?::\d+)?', re.IGNORECASE
)
# perc_url: markers that single out the URLs of interest -- a page extension
# (.htm/.html, .php, .jsp, .asp) or one of the listed names directly after a
# "." or "/".
# The dots in front of the extensions are escaped: an unescaped "." matches any
# character, which made ".htm?" fire on any character followed by "ht" (for
# example the "ght" in "insights") instead of on a literal ".htm".
perc_url = re.compile(r'\.html?|\.php\b|\.jsp\b|\.asp\b|[\./]linkedin|[\./]youtu|[\./]facebook|[\./]tate|[\./]moore')
text_df['is_personal_url'] = text_df.token.apply(
    lambda x: perc_url.search(x) is not None and url_regex.search(x) is not None
)

# email PII
# Something before "@", then a domain label, a dot and a further label.
# The first domain label may contain "-" so that addresses such as
# "user@my-domain.com" are detected as well.
email_regex = re.compile(r'[^@]+@[\w-]+\.[\w]+') # email reg res
text_df['is_email'] = text_df.token.apply(lambda x: email_regex.search(x) is not None)

In [423]:
# phone PII
# Two formats are recognised:
#   (ddd)ddd-dddd, optionally followed by "x" and/or up to three more digits
#   ddd.ddd.dddd
# Both separators of the dotted format are escaped; an unescaped "." would
# accept any character between the last two digit groups.
phone_regex = re.compile(
    r'\(\d{3}\)\d{3}-\d{4}'
    r'x?\d{0,3}|'
    r'\d{3}\.\d{3}\.\d{4}'
)

# Rows of [position in data, document id, token index, token, label] for every
# token that is aligned with a phone-number match.
phone_labeling = []
# Trace of [start offset, end offset, token, original label] for every token
# visited; it accumulates across all documents that contain a match.
token_pointers = []

for i, doc in enumerate(data):
    # `full_text` is consumed from the left as tokens are walked, so it always
    # holds the part of the document that starts at `pointer`.
    full_text = doc['full_text']
    tokens = doc['tokens']
    doc_i = doc['document']

    # Documents without any phone number are skipped entirely.
    reg_result = phone_regex.search(full_text)
    if reg_result is None:
        continue

    # Span of the current match, in offsets relative to the whole document.
    phone_st, phone_ed = reg_result.span()
    # Offset of the current token relative to the whole document.
    pointer = 0

    for tok_i, it in enumerate(tokens):
        token_len = len(it)
        token_pointers.append([pointer, pointer + token_len, it, doc['labels'][tok_i]])

        # Once the walk has moved past the current match, look for the next one
        # in the remaining text; its span is relative to that remainder, so it
        # is shifted by `pointer` to make it document-relative again.
        if pointer > phone_ed:
            reg_result = phone_regex.search(full_text)
            if reg_result is None:
                break
            phone_st, phone_ed = reg_result.span()
            phone_st += pointer
            phone_ed += pointer

        if pointer == phone_st:
            # Token starts exactly where the match starts.
            phone_labeling.append([i, doc_i, tok_i, it, 'B-PHONE_NUM'])

        elif pointer < phone_ed and pointer + token_len >= phone_ed:
            # Token starts before the end of the match and reaches or passes it.
            phone_labeling.append([i, doc_i, tok_i, it, 'I-PHONE_NUM'])

        elif pointer > phone_st and pointer + token_len <= phone_ed:
            # Token lies strictly inside the match.
            phone_labeling.append([i, doc_i, tok_i, it, 'I-PHONE_NUM'])

        # Stop when this token reaches the end of the remaining text. The check
        # sits after the labelling so that a phone number ending the document
        # still gets its final token labelled; it also protects the index
        # lookup below.
        if token_len >= len(full_text):
            break

        # A single space following the token is consumed together with it.
        if full_text[token_len] == ' ':
            token_len += 1
            if token_len >= len(full_text):
                break

        pointer += token_len
        full_text = full_text[token_len:]

phone_labeling = pd.DataFrame(phone_labeling, columns=['data_row', 'doc_i', 'token_i', 'token', 'label'])

In [541]:
# ID parsing
# Pattern: one character other than "/", an optional ":", then 8 to 12 digits
# at the end of the token. Because `[^/]` consumes a character, a token made of
# exactly 8 digits does not match.
# The pattern is a raw string so that `\d` reaches the regex engine unchanged;
# in a normal string literal it is an invalid escape sequence, which newer
# Python versions warn about.
id_regex = re.compile(r'[^/]:?\d{8,12}$')
text_df['is_id'] = text_df.token.apply(lambda x: id_regex.search(x) is not None)

In [552]:
# Show every token row whose label is the street-address continuation tag.
street_address_mask = text_df['label'] == 'I-STREET_ADDRESS'
text_df.loc[street_address_mask]

Unnamed: 0,doc_i,document,token,label,is_personal_url,is_email,is_id
861200,1103,9854,Smith,I-STREET_ADDRESS,False,False,False
861201,1103,9854,Centers,I-STREET_ADDRESS,False,False,False
861202,1103,9854,Apt,I-STREET_ADDRESS,False,False,False
861203,1103,9854,.,I-STREET_ADDRESS,False,False,False
861204,1103,9854,656,I-STREET_ADDRESS,False,False,False
861205,1103,9854,\n,I-STREET_ADDRESS,False,False,False
861206,1103,9854,Joshuamouth,I-STREET_ADDRESS,False,False,False
861207,1103,9854,",",I-STREET_ADDRESS,False,False,False
861208,1103,9854,RI,I-STREET_ADDRESS,False,False,False
861209,1103,9854,95963,I-STREET_ADDRESS,False,False,False


In [553]:
# Save the raw text of the document at position 1103 of `data` to a file for
# manual inspection.
with open('text.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(data[1103]['full_text'])

In [554]:
# Display the raw text of the document at position 1103 of `data` inline.
inspected_doc = data[1103]
inspected_doc['full_text']

"Waseem Mabunda  591 Smith Centers Apt. 656\nJoshuamouth, RI 95963 ( The Netherlands)  410.526.1667  vpi@mn.nl\n\nMind Mapping,      Challenge:     For several years I have been working for an Asset manager in the Netherlands. During this period I have been involved in many  projects. Certainly in the world of asset management, much has changed in recent years in the area of Law and Regulations.  What I mainly experience in these projects is that all departments have a different interest in starting a new project. This  certainly does not benefit the project. How do you get everyone to complete a project in the common interest and how do you  motivate everyone who participate in the project?    Selection:    An improvement project can be approached in different ways. The most common way is the scrum approach. We work in  multidisciplinary teams that work in short sprints, with a fixed length of 1 to 4 weeks. Cooperation is very important and  everyone must be able to respond quickly to