In [1]:
import pandas as pd
import numpy as np 
import json 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import itertools
import re
import plotly
import plotly.graph_objects as go
import plotly.express as px
from itertools import chain
import ast
from IPython.display import HTML
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span
spacy_nlp = spacy.load('en_core_web_sm')
from spacy.lang.en import English
import string
eng_tokenizer = English().tokenizer
from sklearn.preprocessing import MultiLabelBinarizer



## Load train data

In [2]:
# Load data in dictionary and dataframe format
with open('../data/train.json', 'r') as f:
    data = json.load(f)

data_df = pd.read_json('../data/train.json')

In [4]:
data_df.head(2)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."


In [None]:
print('Number of documents:',len(data))

Number of documents: 6807


In [27]:
#preprocess data
data_df = data_df.rename(columns={'full_text':'text'})
data_df['llm_generated'] = False
data_df['prompt_id'] = -1

## Load llm generated data 

In [9]:
# Load more data (llm generated)
pii_dataset = pd.read_csv('../data/pii_dataset.csv')

In [10]:
pii_dataset.head(2)

Unnamed: 0,document,text,tokens,trailing_whitespace,labels,prompt,prompt_id,name,email,phone,job,address,username,url,hobby,len
0,1073d46f-2241-459b-ab01-851be8d26436,"My name is Aaliyah Popova, and I am a jeweler ...","['My', 'name', 'is', 'Aaliyah', 'Popova,', 'an...","[True, True, True, True, True, True, True, Tru...","['O', 'O', 'O', 'B-NAME_STUDENT', 'I-NAME_STUD...",\n Aaliyah Popova is a jeweler with 13 year...,1,Aaliyah Popova,aaliyah.popova4783@aol.edu,(95) 94215-7906,jeweler,97 Lincoln Street,,,Podcasting,363
1,5ec717a9-17ee-48cd-9d76-30ae256c9354,"My name is Konstantin Becker, and I'm a develo...","['My', 'name', 'is', 'Konstantin', 'Becker,', ...","[True, True, True, True, True, True, True, Tru...","['O', 'O', 'O', 'B-NAME_STUDENT', 'I-NAME_STUD...",\n Konstantin Becker is a developer with 2 ...,1,Konstantin Becker,konstantin.becker@gmail.com,0475 4429797,developer,826 Webster Street,,,Quilting,255


In [11]:
print('Number of documents:',len(pii_dataset))

Number of documents: 4434


In [12]:
# preprocess data
pii_dataset['llm_generated'] = True

# Convert string to list
pii_dataset[["tokens", "trailing_whitespace", "labels"]] = pii_dataset[["tokens", "trailing_whitespace", "labels"]].map(ast.literal_eval)
pii_dataset["document"] = pii_dataset["document"].astype("category").cat.codes + (data_df.document.max() + 1) # make sure document id is unique and changing to int


#### Tokenize using spacy (to have same tokenizer as the original data)

In [13]:
def tokenize_with_spacy(text, tokenizer=eng_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    trailing_whitespace = [bool(token.whitespace_) for token in tokenized_text]
    return {'tokens': tokens, 'trailing_whitespace': trailing_whitespace}

def create_new_tokens_labels(row):
    
    tokens, labels, trailing_whitespace = row.tokens, row.labels, row.trailing_whitespace
    new_tokens, new_labels, new_trailing_whitespaces = [], [], []
    labels = [l.split("-")[1] if l != "O" else l for l in labels]
    
    for i in range(len(tokens)):
        t = tokens[i]
        l = labels[i]
        ws = trailing_whitespace[i]
        
        prev_l = labels[i - 1] if i > 0 else "O"
        next_l = labels[i + 1] if (i + 1) < len(labels) else "O"
        
        # Found a PHONE_NUM token mislabed as STREET_ADDRESS:
        if l != "O" and re.search(r'\+\d+', t):
            l = "PHONE_NUM"
        
        # Found STREET_ADDRESS between 2 PHONE_NUM:
        if l == "STREET_ADDRESS" and prev_l == "PHONE_NUM" and prev_l == next_l:
            l = "PHONE_NUM"
            
        # Found individual mislabeled STREET_ADDRESS tokens:
        elif l == "STREET_ADDRESS" and l != next_l and l != prev_l:
            l = "O"
        
        # Create spacy tokens and their labels
        tok_ = tokenize_with_spacy(t)
        spacy_tokens = tok_["tokens"]
        new_tokens.extend(spacy_tokens)
        
        new_labels.extend([l if st not in string.punctuation else "O" for st in spacy_tokens])
        
        new_trailing_whitespaces.extend(tok_["trailing_whitespace"])
        new_trailing_whitespaces[-1] = ws  
        
    return pd.Series({"document": row.document, "tokens": new_tokens, "trailing_whitespace": new_trailing_whitespaces, "labels": new_labels})


def update_labels(row):    
    tokens = row.tokens
    labels = row.labels
    new_labels = ["O"] * len(labels)
    for i, (t, l) in enumerate(zip(tokens, labels)):
        prev_l = new_labels[i - 1] if i > 0 else "O"
        next_l = labels[i + 1] if i + 1 < len(labels) else "O"
    
        if (prev_l == "NAME_STUDENT" or prev_l == "O") and t == "'s":
            new_labels[i] = "O"

        elif t == "(" and next_l == "PHONE_NUM":
            new_labels[i] = next_l

        elif t == ")" and prev_l == "PHONE_NUM":
            new_labels[i] = prev_l
        
        elif (t in string.punctuation) and (prev_l == next_l) and (prev_l != "O"):
            
            if t == "," and prev_l != "STREET_ADDRESS":
                new_labels[i] = "O"
            elif t == "." and prev_l == "NAME_STUDENT":
                new_labels[i] = "O"
            else:
                new_labels[i] = prev_l
        
        else:
            new_labels[i] = l
    
    return new_labels

def create_bio_labels(labels):
    new_labels = ["O"]*len(labels)
    prev_l = "O"
    for i, l in enumerate(labels):
        if l != "O":
            if l != prev_l:
                new_labels[i] = "B-" + l
            elif l == prev_l:
                new_labels[i] = "I-" + l
        prev_l = l
    return new_labels

In [14]:
new_pii_dataset = pii_dataset.apply(create_new_tokens_labels, axis=1)
new_pii_dataset["labels"] = new_pii_dataset.apply(update_labels, axis=1).apply(create_bio_labels)

In [15]:
new_pii_dataset['text'] = new_pii_dataset['tokens'].apply(lambda x: ' '.join(x))
new_pii_dataset[['prompt_id', 'llm_generated']] = pii_dataset[['prompt_id', 'llm_generated']]


In [16]:
new_pii_dataset.head(4)

Unnamed: 0,document,tokens,trailing_whitespace,labels,text,prompt_id,llm_generated
0,22968,"[My, name, is, Aaliyah, Popova, ,, and, I, am,...","[True, True, True, True, False, True, True, Tr...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O...","My name is Aaliyah Popova , and I am a jeweler...",1,True
1,24398,"[My, name, is, Konstantin, Becker, ,, and, I, ...","[True, True, True, True, False, True, True, Fa...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O...","My name is Konstantin Becker , and I 'm a deve...",1,True
2,23632,"[As, Mieko, Mitsubishi, ,, an, account, manage...","[True, True, False, True, True, True, True, Tr...","[O, B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O...","As Mieko Mitsubishi , an account manager at a ...",3,True
3,25282,"[My, name, is, Kazuo, Sun, ,, and, I, 'm, an, ...","[True, True, True, True, False, True, True, Fa...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O, O...","My name is Kazuo Sun , and I 'm an air traffic...",1,True


## Preprocessing

In [28]:
# combine the two datasets
df = pd.concat([data_df, new_pii_dataset], ignore_index=True)

In [59]:
df.head(2)

Unnamed: 0,document,text,tokens,trailing_whitespace,labels,llm_generated,prompt_id
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",False,-1
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",False,-1


In [60]:
def encode_labels(df):
    df = df.copy()
    df["unique_labels"] = df["labels"].apply(lambda x: set(
        [l.split('-')[1] if l != 'O' else l for l in x]
         ))

    mlb = MultiLabelBinarizer()
    one_hot_encoded = mlb.fit_transform(df['unique_labels'])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=mlb.classes_)
    df = pd.concat([df, one_hot_df], axis=1)
    
    # add 'OTHER' column which is only true when we have no other label in text
    df['OTHER'] = df['unique_labels'].apply(lambda x: 1 if len(x - {"O"}) == 0 else 0)
    
    return df, list(mlb.classes_) + ['OTHER']

In [61]:
df, label_classes = encode_labels(df)

In [62]:
df.head(2)

Unnamed: 0,document,text,tokens,trailing_whitespace,labels,llm_generated,prompt_id,unique_labels,EMAIL,ID_NUM,NAME_STUDENT,O,PHONE_NUM,STREET_ADDRESS,URL_PERSONAL,USERNAME,OTHER
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",False,-1,"{NAME_STUDENT, O}",0,0,1,1,0,0,0,0,0
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",False,-1,"{NAME_STUDENT, O}",0,0,1,1,0,0,0,0,0


# EDA

## Target distribution plot

In [None]:
labels = df['labels'].tolist()

# Flatten the list of lists
flattened_labels = list(itertools.chain.from_iterable(labels))

# Count the occurrences of each label
label_counts = Counter(flattened_labels)

# Separate the labels and their counts for plotting
labels, counts = zip(*label_counts.items())

In [None]:
# Create the bar plot
fig = go.Figure([go.Bar(x=labels[1:], y=counts[1:])])

# Customize the layout
fig.update_layout(title_text='Frequency of Each Label', xaxis_title='Labels', yaxis_title='Frequency')

# Show the plot
fig.show()

## Unique targets in each document

In [72]:
df['unique_labels'] = df['labels'].apply(lambda x: list(set(x)))
df['num_labels'] = df['unique_labels'].apply(len)

In [None]:
#histogram of number of labels per document
fig = px.histogram(df, x='num_labels', nbins=20, title='Histogram of number of unique labels per document')

# Show the plot
fig.show()

## Distribution of tokens in each document (with whitespace) 

In [None]:
df['num_tokens'] = df['tokens'].apply(len)

In [None]:
#histogram of number of tokens per document
fig = px.histogram(df, x='num_tokens', nbins=500, title='Histogram of number of tokens per document')

# Show the plot
fig.show()

## Distribution of text length

In [None]:
df['len_text'] = df['text'].apply(len)
fig = px.histogram(df, x='len_text', nbins=500, title='Histogram of length of text per document')

# Show the plot
fig.show()

## Distribution of documents without labels(only "o")

In [None]:
df_non_outer = df[df['labels'].apply(lambda x: len(set(x)) > 1)] #with labels
df_outer = df[df['labels'].apply(lambda x: 'O' in x and len(set(x)) == 1)] #without labels 

In [None]:
df_non_outer['Label Type'] = 'With Labels'
df_outer['Label Type'] = 'Without Labels'

# Calculate text length
df['len_text'] = df['text'].apply(len)
df_non_outer['len_text'] = df_non_outer['text'].apply(len)
df_outer['len_text'] = df_outer['text'].apply(len)

# Combine the dataframes
combined_df = pd.concat([df_non_outer, df_outer])

# Plotting
fig = px.histogram(combined_df, x='len_text', color='Label Type', barmode='overlay',
                   nbins=500, title='Histogram of Length of Text per Document')

# Show the plot
fig.show()

In [None]:
# Combine the dataframes
combined_df = pd.concat([df_non_outer, df_outer])
combined_df['len_tokens'] = combined_df['tokens'].apply(len)
# Plotting
fig = px.histogram(combined_df, x='len_tokens', color='Label Type', barmode='overlay',
                   nbins=500, title='Histogram of Length of Tokens per Document')

# Show the plot
fig.show()

## Labels Pos

In [None]:
df["labels_pos"] = df["labels"].apply(lambda labels: np.arange(1, len(labels) + 1) / len(labels))
exp_df = df.explode(["tokens", "labels", "labels_pos"])
exp_df["labels"] = pd.Categorical(exp_df["labels"], categories=labels, ordered=True)
exp_df = exp_df.sort_values(by="labels", ascending=False)
label_tokens = exp_df.groupby("labels", observed=False).agg(list)
label_tokens["counts"] = label_tokens["tokens"].apply(len)

In [None]:
fig = px.scatter(exp_df, x='labels_pos', y='labels', title='Scatter Plot of Labels in Documents',)

fig.update_layout(
    xaxis_title='X Axis Label',
    yaxis_title='Y Axis Label',
    legend_title='Legend'
)
fig.show()

In [None]:
# word cloud for each target 
# wordcloud for surronding words of each target


In [None]:
# from wordcloud import WordCloud
# wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(df_train['full_text']))
# plt.figure(figsize=(10, 6))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.title('Word Cloud of Essays')
# plt.show()

## NER Visualization with Spacy of train data

In [21]:
def convert_to_spacy_format(text, tokens, labels, trailing_whitespace):
    ents = []  # To store entity dictionaries
    start = 0  # Position tracker for the start of each token in the text
    
    for i, (token, label, space) in enumerate(zip(tokens, labels, trailing_whitespace)):
        if label.startswith('B-') or label.startswith('I-'):
            label_type = label[2:]  # Extract entity type from label
            token_start = text.find(token, start)  # Find the start index of the token in text
            token_end = token_start + len(token)  # Calculate the end index of the token
            
            # If it's a 'B-' label or the first 'I-' label following non-matching or 'O' labels, start a new entity
            if label.startswith('B-') or (label.startswith('I-') and (i == 0 or not labels[i-1].endswith(label_type))):
                ents.append({"start": token_start, "end": token_end, "label": label_type})
            # If it's an 'I-' label continuing an entity, extend the last entity's end index
            elif label.startswith('I-') and ents and ents[-1]["label"] == label_type:
                ents[-1]["end"] = token_end
            
            start = token_end + (1 if space == 'True' else 0)  # Update start position for next token
    
    return [{"text": text, "ents": ents, "title": None}]



In [66]:
def visualize_ner(row):

    display_text = row['text'].values[0] 
    display_labels = row['labels'].values[0] 
    trailing_whitespace = row['trailing_whitespace'].values[0] 
    tokens = row['tokens'].values[0] 

    display_text = display_text.replace("\n\n", "\r\n")
    ex = convert_to_spacy_format(display_text, tokens, display_labels, trailing_whitespace)

    custom_css = """
                <style>    
                    /* Customizing entity appearance */
                    .entities {
                        font-size: 11px !important;
                        font-family: Verdana !important;
                        line-height: 1.25 !important;
                        border-radius: 10px !important; /* Rounded corners */
                        background-color: #f9f9f9 !important; /* Very light gray background */
                        padding: 20px 15px !important; /* Adjust padding */
                    }
                    /* Customizing entity appearance */
                    .entity {
                        font-size: 10px !important;
                        padding: 0.2em 0.4em !important;
                        font-family: Verdana !important;
                        font-weight: bold !important;
                        
                    }
                </style>
                """

    options = {"colors": {"NAME_STUDENT": "#748CAB", "URL_PERSONAL": "#FFFC31", 
                        "ID_NUM": "#E94F37", "EMAIL": "#F8B195", "STREET_ADDRESS": "#BDBF09", "PHONE_NUM": "#D96C06", "USERNAME": "#2292A4"}}

    # Inject custom CSS
    display(HTML(custom_css))

    spacy.displacy.render(ex, style="ent", manual=True, jupyter=True, options=options)


In [80]:
visualize_ner(df[df['document'] == 9854])
visualize_ner(df.sort_values(by=["unique_labels"], ascending=True).reset_index(drop=True).iloc[0:1])
visualize_ner(df.sort_values(by=["num_labels"], ascending=False).reset_index(drop=True).iloc[0:1])

## Word surronding

In [None]:
# pii_data = {
#     'EMAIL':{},
#     'ID_NUM':{},
#     'NAME_STUDENT':{},
#     'PHONE_NUM':{},
#     'STREET_ADDRESS':{},
#     'URL_PERSONAL':{},
#     'USERNAME':{},
# }

# def update_pii_data(pii_type, token, tokens, i):
#     if pii_type not in pii_data:
#         pii_data[pii_type] = {}
    
#     # Extracting the token and surrounding context
#     token_text = tokens[i]
#     surrounding_tokens = tokens[max(0, i-2):i] + tokens[i+1:min(len(tokens), i+3)]
#     sentence = ' '.join(tokens)  # Simplified; consider a more accurate sentence detection
    
#     # Assuming `tokens` is a list of all tokens in the document and `i` is the index of the current token
#     sentence_boundaries = [j for j, token in enumerate(tokens) if token in '.!?'] + [len(tokens)-1]
#     sentence_start = max([boundary for boundary in sentence_boundaries if boundary < i]+[0])
#     sentence_end = min([boundary for boundary in sentence_boundaries if boundary > i]+[len(tokens)-1])
#     sentence_context = tokens[sentence_start:sentence_end+1]
    
#     # Determine PII position in the sentence
#     position_in_sentence = "middle"
#     if i == sentence_start:
#         position_in_sentence = "beginning"
#     elif i == sentence_end:
#         position_in_sentence = "end"
    
#     # PII Token Type
#     if token_text.isalpha():
#         token_type = "alphabetic"
#     elif token_text.isdigit():
#         token_type = "numeric"
#     else:
#         token_type = "alphanumeric" if any(char.isalpha() for char in token_text) else "other"
    
#     # PII Format Pattern
#     format_pattern = ''.join(['d' if char.isdigit() else 'l' if char.isalpha() else char for char in token_text])
    
#     # Capitalization
#     capitalization = "lowercase"
#     if token_text.isupper():
#         capitalization = "uppercase"
#     elif token_text.istitle():
#         capitalization = "titlecase"
    
#     # Special Characters
#     special_chars = any(not char.isalnum() for char in token_text)
    
#     details = {
#         'token_text': token_text,
#         'surrounding_words': surrounding_tokens,
#         'location_in_essay': i,
#         'sentence_context': ' '.join(sentence_context),
#         'pii_length': len(token_text),
#         'position_in_sentence': position_in_sentence,
#         'token_type': token_type,
#         'format_pattern': format_pattern,
#         'capitalization': capitalization,
#         'special_chars': special_chars,
#         # Add more details as needed
#     }
    
#     pii_token_key = f"{token_text}_{i}"  # Unique key for each PII instance
#     if pii_token_key not in pii_data[pii_type]:
#         pii_data[pii_type][pii_token_key] = []
#     pii_data[pii_type][pii_token_key].append(details)

# # Example of how to call this function within your iteration over the DataFrame
# for index, row in tqdm(train_df.iterrows(), total=len(train_df)):
#     tokens = row['tokens']
#     labels = row['labels']
#     for i, label in enumerate(labels):
#         if label != 'O':  # If the label indicates PII
#             pii_type = label[2:]  # Extract PII type (removing the B- or I- prefix)
#             update_pii_data(pii_type, tokens[i], tokens, i)
            
            
# # Initialize an empty list to hold each PII instance as a dictionary
# flat_pii_data = []

# # Iterate through the pii_data dictionary
# for pii_type, pii_instances in pii_data.items():
#     for instance_key, details_list in pii_instances.items():
#         for details in details_list:
#             # Create a flat dictionary for each PII instance
#             flat_instance = details.copy()  # Start with the existing details
#             flat_instance['pii_type'] = pii_type  # Add the PII type
#             flat_instance['instance_key'] = instance_key  # Add the instance key for reference
            
#             # Append this flat dictionary to our list
#             flat_pii_data.append(flat_instance)

# # Store and viz
# train_pii_df = pd.DataFrame(flat_pii_data)
# print("\n... PII DATAFRAME ...\n")
# display(train_pii_df)

# # Count the number of instances for each PII type
# pii_type_counts = train_pii_df['pii_type'].value_counts()
# print("\n... PII TYPE COUNTS ...\n")
# display(pii_type_counts.to_frame().T)

# # Or aggregate to find the average PII length by type
# average_pii_length_by_type = train_pii_df.groupby('pii_type')['pii_length'].mean()
# print("\n\n... AVERAGE PII LENGTH (CHARS) BY TYPE ...\n")
# display(average_pii_length_by_type.to_frame().T)