In [49]:
%%capture
!pip install -U -q presidio_analyzer --no-index --find-links=file:///kaggle/input/presidio-wheels/presidio

In [50]:
import json
import re

import pandas as pd
import polars as pl
from tqdm import tqdm
from dateutil import parser

from presidio_analyzer.recognizer_registry import RecognizerRegistry
from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [51]:
train_df = json.load(open("/kaggle/input/pii-detection-removal-from-educational-data/train.json"))
test_df = json.load(open('/kaggle/input/pii-detection-removal-from-educational-data/test.json'))
print(f"Length of train data: {len(train_df)}, keys: {train_df[0].keys()}")
print("-" * 100)
labels = set()
for i in range(len(train_df)):
    labels.update(train_df[i]['labels'])
print(f"Data labels: {labels}")

Length of train data: 6807, keys: dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
----------------------------------------------------------------------------------------------------
Data labels: {'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-EMAIL', 'I-URL_PERSONAL', 'B-PHONE_NUM', 'B-ID_NUM', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-USERNAME', 'I-PHONE_NUM', 'O', 'I-ID_NUM', 'I-STREET_ADDRESS'}


In [52]:
df_train = pl.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')
df_test = pl.read_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')
df_sample_submission = pl.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')

In [53]:
df_train_exploded = df_train.select('tokens', 'trailing_whitespace', 'labels').explode(pl.all())

In [54]:
for label in labels:
    display(df_train_exploded.filter(pl.col('labels') == label))

tokens,trailing_whitespace,labels
str,bool,str
"""591""",True,"""B-STREET_ADDRESS"""
"""743""",True,"""B-STREET_ADDRESS"""


tokens,trailing_whitespace,labels
str,bool,str
"""https://www.jackson.com/list/explorehomepage.htm""",false,"""B-URL_PERSONAL"""
"""https://www.linkedin.com/in/mmartinez""",true,"""B-URL_PERSONAL"""
"""https://youtu.be/rFD2lJuvace""",true,"""B-URL_PERSONAL"""
"""https://www.hall.biz/wp-contenthome.html""",false,"""B-URL_PERSONAL"""
"""http://www.burns-lopez.com/categories/appabout.asp""",false,"""B-URL_PERSONAL"""
"""http://jacobs-fisher.com/listpost.html""",false,"""B-URL_PERSONAL"""
"""https://www.youtube.com/watch?v=n-ajTPJ1h-J""",false,"""B-URL_PERSONAL"""
"""tps://www.facebook.com/bclark""",false,"""B-URL_PERSONAL"""
"""https://www.youtube.com/channel/UC1ElAcppeuhfet""",true,"""B-URL_PERSONAL"""
"""https://oconnell-townsend.com/wp-content/categorieshomepage.html""",false,"""B-URL_PERSONAL"""


tokens,trailing_whitespace,labels
str,bool,str
"""djones@gmail.com""",True,"""B-EMAIL"""
"""matthew72@hotmail.com""",False,"""B-EMAIL"""
"""belindarojas@yahoo.com""",True,"""B-EMAIL"""
"""kennethevans@hotmail.com""",False,"""B-EMAIL"""
"""agood@gmail.com""",False,"""B-EMAIL"""
"""agood@gmail.com""",False,"""B-EMAIL"""
"""hwillis@gmail.com""",True,"""B-EMAIL"""
"""kellyharrison@gmail.com""",True,"""B-EMAIL"""
"""kellyharrison@gmail.com""",True,"""B-EMAIL"""
"""lowetyler@hotmail.com""",False,"""B-EMAIL"""


tokens,trailing_whitespace,labels
str,bool,str
"""nYZqnhEXw""",False,"""I-URL_PERSONAL"""


tokens,trailing_whitespace,labels
str,bool,str
"""(""",False,"""B-PHONE_NUM"""
"""(""",False,"""B-PHONE_NUM"""
"""(""",False,"""B-PHONE_NUM"""
"""(""",False,"""B-PHONE_NUM"""
"""(""",False,"""B-PHONE_NUM"""
"""410.526.1667""",True,"""B-PHONE_NUM"""


tokens,trailing_whitespace,labels
str,bool,str
"""860632713425""",True,"""B-ID_NUM"""
"""530670102508""",True,"""B-ID_NUM"""
"""530670102508""",True,"""B-ID_NUM"""
"""875673967537""",True,"""B-ID_NUM"""
"""860632713425""",True,"""B-ID_NUM"""
"""557349702179""",True,"""B-ID_NUM"""
"""784372734211""",True,"""B-ID_NUM"""
"""054176622314""",True,"""B-ID_NUM"""
"""674915248960""",True,"""B-ID_NUM"""
"""932353568953""",True,"""B-ID_NUM"""


tokens,trailing_whitespace,labels
str,bool,str
"""Nathalie""",true,"""B-NAME_STUDENT"""
"""Nathalie""",true,"""B-NAME_STUDENT"""
"""Nathalie""",true,"""B-NAME_STUDENT"""
"""Diego""",true,"""B-NAME_STUDENT"""
"""Diego""",true,"""B-NAME_STUDENT"""
"""Gilberto""",true,"""B-NAME_STUDENT"""
"""Sindy""",true,"""B-NAME_STUDENT"""
"""Nadine""",true,"""B-NAME_STUDENT"""
"""Eladio""",true,"""B-NAME_STUDENT"""
"""Silvia""",true,"""B-NAME_STUDENT"""


tokens,trailing_whitespace,labels
str,bool,str
"""Sylla""",false,"""I-NAME_STUDENT"""
"""Sylla""",false,"""I-NAME_STUDENT"""
"""Sylla""",false,"""I-NAME_STUDENT"""
"""Estrada""",false,"""I-NAME_STUDENT"""
"""Estrada""",false,"""I-NAME_STUDENT"""
"""Gamboa""",false,"""I-NAME_STUDENT"""
"""Samaca""",false,"""I-NAME_STUDENT"""
"""Born""",false,"""I-NAME_STUDENT"""
"""Amaya""",false,"""I-NAME_STUDENT"""
"""Villalobos""",false,"""I-NAME_STUDENT"""


tokens,trailing_whitespace,labels
str,bool,str
"""castanedagabriel""",True,"""B-USERNAME"""
"""fdixon""",True,"""B-USERNAME"""
"""fdixon""",True,"""B-USERNAME"""
"""meyermichelle""",False,"""B-USERNAME"""
"""jacob59""",False,"""B-USERNAME"""
"""holmespatrick""",False,"""B-USERNAME"""


tokens,trailing_whitespace,labels
str,bool,str
"""320)202""",False,"""I-PHONE_NUM"""
"""-""",False,"""I-PHONE_NUM"""
"""0688x95843""",True,"""I-PHONE_NUM"""
"""223)392""",False,"""I-PHONE_NUM"""
"""-""",False,"""I-PHONE_NUM"""
"""2765""",False,"""I-PHONE_NUM"""
"""820)913""",False,"""I-PHONE_NUM"""
"""-""",False,"""I-PHONE_NUM"""
"""3241x894""",False,"""I-PHONE_NUM"""
"""820)913""",False,"""I-PHONE_NUM"""


tokens,trailing_whitespace,labels
str,bool,str
"""Design""",true,"""O"""
"""Thinking""",true,"""O"""
"""for""",true,"""O"""
"""innovation""",true,"""O"""
"""reflexion""",false,"""O"""
"""-""",false,"""O"""
"""Avril""",true,"""O"""
"""2021""",false,"""O"""
"""-""",false,"""O"""
""" """,false,"""O"""


tokens,trailing_whitespace,labels
str,bool,str
"""30407059""",False,"""I-ID_NUM"""


tokens,trailing_whitespace,labels
str,bool,str
"""Smith""",True,"""I-STREET_ADDRESS"""
"""Centers""",True,"""I-STREET_ADDRESS"""
"""Apt""",False,"""I-STREET_ADDRESS"""
""".""",True,"""I-STREET_ADDRESS"""
"""656""",False,"""I-STREET_ADDRESS"""
""" """,False,"""I-STREET_ADDRESS"""
"""Joshuamouth""",False,"""I-STREET_ADDRESS"""
""",""",True,"""I-STREET_ADDRESS"""
"""RI""",True,"""I-STREET_ADDRESS"""
"""95963""",True,"""I-STREET_ADDRESS"""


In [55]:
nlp_config = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
}

In [56]:
# NLP engine
provider = NlpEngineProvider(nlp_configuration=nlp_config)
nlp_engine = provider.create_engine()

# Address recognizer
address_regex = r'\b\d+\s+\w+(\s+\w+)*\s+((st(\.)?)|(ave(\.)?)|(rd(\.)?)|(blvd(\.)?)|(ln(\.)?)|(ct(\.)?)|(dr(\.)?))\b'
address_pattern = Pattern(name="address", regex=address_regex, score=0.5)
address_recognizer = PatternRecognizer(supported_entity="ADDRESS_CUSTOM",
                                       patterns=[address_pattern], context=["st", "Apt"])

# Email recognizer 
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email_pattern = Pattern(name="email address", regex=email_regex, score=0.5)
email_recognizer = PatternRecognizer(supported_entity="EMAIL_CUSTOM",
                                     patterns=[email_pattern])

# URL recognizer 
url_regex = "https?://\S+|www\.\S+"
url_pattern = Pattern(name="url", regex=url_regex, score=0.5)
url_recognizer = PatternRecognizer(supported_entity="URL_CUSTOM",
                                   patterns=[url_pattern])

registry = RecognizerRegistry()
registry.load_predefined_recognizers()
registry.add_recognizer(address_recognizer)
registry.add_recognizer(email_recognizer)
registry.add_recognizer(url_recognizer)

In [57]:
# Pass the parameters to AnalyzerEngine
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine, 
    supported_languages=["en"],
    registry=registry,
    context_aware_enhancer=LemmaContextAwareEnhancer(context_similarity_factor=0.8,
                                                     min_score_with_context_similarity=0.3))

In [58]:
analyzer_results = analyzer.analyze(train_df[0]['full_text'], language='en')
for res in analyzer_results:
    for i in range(2):
        print(train_df[i]['full_text'].replace('\n', ' ')[res.start:res.end])
        print(res)
        print('*' * 50)

commitment
type: IN_PAN, start: 3315, end: 3325, score: 0.8500000000000001
**************************************************

type: IN_PAN, start: 3315, end: 3325, score: 0.8500000000000001
**************************************************
Nathalie Sylla  
type: PERSON, start: 52, end: 68, score: 0.85
**************************************************
tion Tool  Chall
type: PERSON, start: 52, end: 68, score: 0.85
**************************************************
Buzan T.
type: PERSON, start: 263, end: 271, score: 0.85
**************************************************
eeded.  
type: PERSON, start: 263, end: 271, score: 0.85
**************************************************
Buzan B.
type: PERSON, start: 276, end: 284, score: 0.85
**************************************************
 learnin
type: PERSON, start: 276, end: 284, score: 0.85
**************************************************
1999
type: DATE_TIME, start: 286, end: 290, score: 0.85
******************************************

In [59]:
def tokens2index(row): 
    tokens = row['tokens']
    start_index = []
    end_index = []
    prev_index = 0
    
    for token in tokens:  
        start = prev_index + row['full_text'][prev_index:].index(token)
        end = start + len(token)
        start_index.append(start)
        end_index.append(end)
        prev_index = end
    return start_index, end_index

def search_position(arr, target):
    low = 0
    high = len(arr) - 1

    while low <= high:
        mid = (low + high) // 2

        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            low = mid + 1
        else:
            high = mid - 1
    return low

def count_trailing_whitespaces(word):
    return len(word) - len(word.rstrip())

In [60]:
black_list = ["wiki", "coursera", ".pdf", ".PDF",
              "article", ".png", ".gov", ".work", ".ai",
              ".firm", ".arts", ".store", ".rec", ".travel",
              'miro', 'trello' '.edu']

In [61]:
NAME_STUDENT, URL_PERSONAL, EMAIL, STREET_ADDRESS, ID_NUM, USERNAME = [], [], [], [], [], []

preds = []
# Find the starting and ending position of each word after segmentation
for i in tqdm(range(len(test_df)), desc="Processing tokens2index"):
    start, end = tokens2index(test_df[i])
    # Add starting and ending position of each word into json
    test_df[i]['start'] = start
    test_df[i]['end'] = end
    
for i, d in tqdm(enumerate(test_df), total=len(test_df), desc="Analyzing entities"):
    results = analyzer.analyze(text=d['full_text'],
                               entities=[
                                         "PERSON", 
                                         "URL_CUSTOM",
                                         "EMAIL_ADDRESS",
                                         "EMAIL_CUSTOM", 
                                         "ADDRESS_CUSTOM",
                                         "US_SSN",
                                         "US_ITIN",
                                         "US_PASSPORT",
                                         "US_BANK_NUMBER",
                                         "USERNAME"],
                               language='en'
                              )
    pre_preds = []
    for r in results: 
        s = search_position(d['start'], r.start) 
        end = r.end 
        word = d['full_text'][r.start:r.end]
        end = end - count_trailing_whitespaces(word)
        temp_preds = [s]
        try:
            while d['end'][s+1] <= end:
                temp_preds.append(s+1)
                s += 1
        except:
            pass
        
        # Find out what the entity is, and give it the corresponding label
        tmp = False
        
        if r.entity_type == 'USERNAME':
            label =  'USERNAME'
            USERNAME.append(d['full_text'][r.start:r.end])
            
        if r.entity_type == 'PERSON':
            label =  'NAME_STUDENT'
            NAME_STUDENT.append(d['full_text'][r.start:r.end])
            
        if r.entity_type == 'ADDRESS_CUSTOM':
            label = 'STREET_ADDRESS'
            STREET_ADDRESS.append(d['full_text'][r.start:r.end])
            
        if r.entity_type == 'US_SSN' or r.entity_type == 'US_ITIN' or r.entity_type == 'US_PASSPORT' or r.entity_type == 'US_BANK_NUMBER':
            label = 'ID_NUM'
            ID_NUM.append(d['full_text'][r.start:r.end])
            
        if r.entity_type == 'EMAIL_ADDRESS' or r.entity_type == 'EMAIL_CUSTOM':
            label = 'EMAIL'
            EMAIL.append(d['full_text'][r.start:r.end])
            
        if r.entity_type == 'URL_CUSTOM':
            # Remove tags from blacklist
            for w in black_list:
                if w in word:
                    tmp = True
                    break
            
            label = 'URL_PERSONAL'
            URL_PERSONAL.append(d['full_text'][r.start:r.end])
            
        if tmp:
            continue
        
        for p in temp_preds:
            if len(pre_preds) > 0:
                if pre_preds[-1]['rlabel'] == r.entity_type and (p - pre_preds[-1]['token'] == 1):
                    label_f = "I-" + label # Middle position of an entity
                else:
                    label_f = "B-" + label # Otherwise the start of next entity
            else:
                label_f = "B-" + label
            pre_preds.append(({
                    "document": d['document'],
                    "token": p,
                    "label": label_f,
                    "rlabel": r.entity_type
                }))
    preds.extend(pre_preds)

Processing tokens2index: 100%|██████████| 10/10 [00:00<00:00, 666.11it/s]
Analyzing entities: 100%|██████████| 10/10 [00:01<00:00,  5.02it/s]


In [62]:
submission = pd.DataFrame(preds).iloc[:, :-1].reset_index()
submission.columns = ['row_id','document', 'token', 'label']
submission.to_csv('submission.csv', index=False)
submission.head(50)

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
2,2,7,52,B-NAME_STUDENT
3,3,7,53,I-NAME_STUDENT
4,4,7,55,B-NAME_STUDENT
5,5,7,56,I-NAME_STUDENT
6,6,7,60,B-NAME_STUDENT
7,7,7,61,I-NAME_STUDENT
8,8,7,62,I-NAME_STUDENT
9,9,7,63,I-NAME_STUDENT
