### Parse CRAPII dataset

##### Load the JSON file as a DataFrame

In [25]:
!pip3 install pandas
# if using homebrew might need to use the commented command below.
# !pip3 install pandas --break-system-packages

from typing import *
import pandas as pd

def read_file(filepath: str):
    """Load a records-oriented JSON file into a pandas DataFrame.

    Args:
        filepath: Location of the JSON file; handed unchanged to
            ``pd.read_json``.

    Returns:
        The DataFrame built by ``pd.read_json`` with ``orient="records"``.
    """
    frame = pd.read_json(filepath, orient="records")
    return frame

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try brew install
[31m   [0m xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a Python library that isn't in Homebrew,
[31m   [0m use a virtual environment:
[31m   [0m 
[31m   [0m python3 -m venv path/to/venv
[31m   [0m source path/to/venv/bin/activate
[31m   [0m python3 -m pip install xyz
[31m   [0m 
[31m   [0m If you wish to install a Python application that isn't in Homebrew,
[31m   [0m it may be easiest to use 'pipx install xyz', which will manage a
[31m   [0m virtual environment for you. You can install pipx with
[31m   [0m 
[31m   [0m brew install pipx
[31m   [0m 
[31m   [0m You may restore the old behavior of pip by passing
[31m   [0m the '--break-system-packages' flag to pip, or by adding
[31m   [0m 'break-system-packag

In [26]:
# Load the obfuscated essay data. The path is relative, so it is resolved
# against the notebook's working directory. `df` is the frame that the cells
# below parse and index into.
df = read_file("obfuscated_data_06.json")

##### Return all PII entities from a dataframe

In [27]:
# Return all PII entities in the given dataframe.
#
# Each entity is a tuple (essay_index, entity_text, label, (start, end)),
# e.g. (3, "Michael", "NAME", (10, 16)).
# Both start and end are inclusive character offsets into the text rebuilt
# from the tokens and their trailing whitespace.
#
# A plain assignment is used for the alias (rather than the `type` statement)
# so that this cell also runs on Python 3.11 and older.
pii_entity = Tuple[int, str, str, Tuple[int, int]]


def _entity_type(label: str) -> str:
    """Return the entity type of a token label, without its "B-"/"I-" prefix."""
    if label[:2] in ("B-", "I-"):
        return label[2:]
    return label


def _finish_entity(essay_index, text: str, entity_type: str, start: int):
    """Build the entity tuple for an open entity, dropping one trailing space."""
    # Only the space contributed by the last token's trailing whitespace is
    # removed; whitespace between the entity's tokens is kept.
    if text.endswith(" "):
        text = text[:-1]
    return (essay_index, text, entity_type, (start, start + len(text) - 1))


def parse_essays(df: pd.DataFrame) -> List[pii_entity]:
    """Collect every labelled PII entity from the essays in ``df``.

    Consecutive tokens whose label is not "O" form one entity. An entity ends
    at the next "O" token, at a token whose label starts with "B-", at a token
    whose entity type differs from the open entity, or at the end of the
    essay (so an entity on the very last token is still reported).

    Args:
        df: One essay per row, with the parallel sequence columns ``tokens``,
            ``trailing_whitespace`` and ``labels``.

    Returns:
        A sorted list of ``(essay_index, entity_text, label, (start, end))``
        tuples. ``essay_index`` is the row's index label, ``label`` has its
        "B-"/"I-" prefix removed, and ``start``/``end`` are inclusive
        character offsets. Sorting is plain tuple ordering, i.e. by essay
        index, then entity text, then label, then span.
    """
    # list to store all pii entities
    pii_entities: List[pii_entity] = []

    for i, row in df.iterrows():
        offset = 0        # character offset of the current token
        open_type = None  # type of the entity being built, None when idle
        open_text = ""
        open_start = 0

        # iterate through the tokens in the essay
        for token, ws, label in zip(
            row["tokens"], row["trailing_whitespace"], row["labels"]
        ):
            piece = f"{token} " if ws else token

            if open_type is not None:
                entity_ends = (
                    label == "O"
                    or label.startswith("B-")
                    or _entity_type(label) != open_type
                )
                if entity_ends:
                    pii_entities.append(
                        _finish_entity(i, open_text, open_type, open_start)
                    )
                    open_type = None

            if label != "O":
                if open_type is None:
                    # first token of a new entity
                    open_type = _entity_type(label)
                    open_text = ""
                    open_start = offset
                open_text += piece

            offset += len(piece)

        # An essay may end while an entity is still open; emit it as well.
        if open_type is not None:
            pii_entities.append(
                _finish_entity(i, open_text, open_type, open_start)
            )

    return sorted(pii_entities)

##### Confirm the function is working as intended

In [46]:
# Parse the first 100 essays and print one entity tuple per line.
entities = parse_essays(df.head(100))
for item in entities:
    print(item)

(4, 'Henry Acosta', 'NAME_STUDENT', (36, 47))
(7, 'Nathalie Sylla', 'NAME_STUDENT', (52, 65))
(7, 'Nathalie Sylla', 'NAME_STUDENT', (2281, 2294))
(7, 'Nathalie Sylla', 'NAME_STUDENT', (3648, 3661))
(8, 'Vanesa Chan', 'NAME_STUDENT', (83, 93))
(10, 'Diego Estrada', 'NAME_STUDENT', (0, 12))
(10, 'Diego Estrada', 'NAME_STUDENT', (2386, 2398))
(11, 'James Cook', 'NAME_STUDENT', (46, 55))
(16, 'Gilberto Gamboa', 'NAME_STUDENT', (22, 36))
(19, 'Alessandro', 'NAME_STUDENT', (262, 271))
(19, 'Alessandro', 'NAME_STUDENT', (420, 429))
(20, 'Sindy Samaca', 'NAME_STUDENT', (32, 43))
(22, 'Gianni', 'NAME_STUDENT', (934, 939))
(56, 'Nadine Born', 'NAME_STUDENT', (53, 63))
(80, 'Karol Ferreira', 'NAME_STUDENT', (2, 15))
(80, 'Karol Ferreira', 'NAME_STUDENT', (2304, 2317))
(80, 'Karol Ferreira', 'NAME_STUDENT', (2322, 2335))
(86, 'Eladio Amaya', 'NAME_STUDENT', (37, 48))
(88, 'Rakesh Singh', 'NAME_STUDENT', (61, 72))
(93, 'Silvia Villalobos', 'NAME_STUDENT', (0, 16))
(99, 'Francesco Boscolo', 'NAME_ST

In [49]:
# Spot-check a few spans by eye: each slice should be exactly the entity text,
# and the characters at the two inclusive boundaries should not be whitespace.
print("First few examples:")
for essay_id, start, end in [(4, 36, 47), (7, 52, 65), (7, 2281, 2294)]:
    essay_text = df.loc[essay_id].full_text
    print(f"Characters {start}-{end} in essay {essay_id}:", essay_text[start:end + 1])
    # confirm no starting or trailing whitespace
    first = essay_text[start]
    last = essay_text[end]
    print("First and last characters are:", first, last)
    print()

# Test if indices are working properly: every entity must equal the slice of
# its essay's full text at the reported inclusive span.
for i, entity_text, label, (start, end) in entities:
    full_text = df.loc[i].full_text
    assert entity_text == full_text[start:end + 1], (
        f"essay {i}: expected {entity_text!r} at ({start}, {end}), "
        f"found {full_text[start:end + 1]!r}"
    )
print("All good")

First few examples:
Characters 36-47 in essay 4: Henry Acosta
First and last characters are: H a

Characters 52-65 in essay 7: Nathalie Sylla
First and last characters are: N a

Characters 2281-2294 in essay 7: Nathalie Sylla
First and last characters are: N a

All good


In [47]:
# Repeat the check on a random sample of essays. The sample size is capped at
# the number of rows (sampling more rows than exist raises ValueError) and the
# seed is fixed so that re-running the notebook reproduces this output.
sample_size = min(100, len(df))
entities = parse_essays(df.sample(sample_size, random_state=0))
for item in entities:
    print(item)

# Test if indices are working properly: every entity must equal the slice of
# its essay's full text at the reported inclusive span.
for i, entity_text, label, (start, end) in entities:
    full_text = df.loc[i].full_text
    assert entity_text == full_text[start:end + 1], (
        f"essay {i}: expected {entity_text!r} at ({start}, {end}), "
        f"found {full_text[start:end + 1]!r}"
    )
print("All good")

(3391, 'Diana Vazquez', 'NAME_STUDENT', (27, 39))
(3661, 'Julio Emad', 'NAME_STUDENT', (3943, 3952))
(6187, 'Gino Schiavo', 'NAME_STUDENT', (0, 11))
(7137, 'Jamal Wahab', 'NAME_STUDENT', (126, 136))
(7222, 'Dharmendra Shakya', 'NAME_STUDENT', (6, 22))
(7729, 'Oliver Falkenberg', 'NAME_STUDENT', (4539, 4555))
(11083, 'https://www.youtube.com/watch?v=Mo-fuWQb8oK', 'URL_PERSONAL', (6211, 6253))
(11451, 'Margarita Bonilla', 'NAME_STUDENT', (111, 127))
(11644, 'Nicole Horvath', 'NAME_STUDENT', (41, 54))
(13971, 'Vino Jan', 'NAME_STUDENT', (80, 87))
(14108, 'Fortuna', 'NAME_STUDENT', (106, 112))
(14108, 'Fortuna', 'NAME_STUDENT', (2111, 2117))
(14108, 'david00@yahoo.com', 'EMAIL', (2156, 2172))
(14108, 'jenniferwright@yahoo.com', 'EMAIL', (205, 228))
(14265, 'Mohan Ullah', 'NAME_STUDENT', (14, 24))
(14265, 'Mohan Ullah', 'NAME_STUDENT', (2247, 2257))
(14265, 'davislori@yahoo.com', 'EMAIL', (73, 91))
(14682, 'Liam Singh', 'NAME_STUDENT', (13, 22))
(14956, 'Carlos Valencia', 'NAME_STUDENT', (9