In [28]:
import re
import pandas as pd
import typing
from typing import List,Tuple, Optional
import tiktoken


# Column position of the document text; passed to df.iloc in get_train_example.
FULL_TEXT = 0

# Entity type -> (opening marker, closing marker) wrapped around each entity.
D = {
    'NAME_STUDENT': ('@@@', '###'),
    'URL_PERSONAL': ('&&&', '$$$'),
    'EMAIL': ('QQQ', '^^^'),
    'PHONE_NUM': ('%%%', '~~~'),
}

# One alternative per entry of D, in the same order; capture group k holds the
# text found between the k-th marker pair.
PATTERN = r'@@@(.*?)###|&&&(.*?)\$\$\$|QQQ(.*?)\^\^\^|%%%(.*?)~~~'

# Entity type names in D's insertion order, so IndexTable[k-1] names capture group k.
IndexTable = list(D)


In [29]:
def read_csv(path = 'data/pii_true_entities.csv'):
    """Read the UTF-8 CSV file at `path` and return it as a pandas DataFrame."""
    return pd.read_csv(path, encoding='utf-8')

def read_json(path = 'data/obfuscated_data_06.json'):
    """Read the UTF-8, records-oriented JSON file at `path` into a pandas DataFrame."""
    reader_options = {"orient": "records", "encoding": 'utf-8'}
    return pd.read_json(path, **reader_options)

# Entity table loaded from read_csv's default path; passed to get_all_labels by later cells.
label = read_csv()

# Documents loaded from read_json's default path; get_train_example reads column FULL_TEXT of this frame.
df = read_json()

In [30]:
#function to create training example and parse return
def count_special_token(text:str)->int:
    """Count the characters of `text` that belong to entity markers.

    params: text: string of annotated text
    output: total number of marker characters in `text` (int). Each
            occurrence of a marker from D contributes that marker's own
            length, so the result stays correct if the markers in D are
            ever changed to a width other than three.

    Note: occurrences are counted with str.count, so marker-like sequences
    that were already present in the raw text are counted as well.
    """
    acc = 0
    for start_mark, end_mark in D.values():
        acc += text.count(start_mark) * len(start_mark)
        acc += text.count(end_mark) * len(end_mark)
    return acc


def mkTrainingExample(text: str, L: List[Tuple[str,str, int, int]]) -> str:
    """Wrap every entity of `text` in the marker pair of its entity type.

    params: text: string of unlabeled text
    params: L: list of (entity name, entity type, start, end); the name is
            not used, the type must be a key of D, and start:end is the
            slice of the entity in the *unlabeled* text
    output: string of labeled text

    Entities are processed in ascending start order regardless of the order
    of `L` (the sort is stable and `L` itself is not modified), because the
    running offset below is only valid when insertions move left to right.
    """
    offset = 0
    for _, entity_type, start, end in sorted(L, key=lambda entity: entity[2]):
        start_mark, end_mark = D[entity_type]
        # Shift the original positions by the markers inserted so far.
        start += offset
        end += offset
        text = text[:start] + start_mark + text[start:end] + end_mark + text[end:]
        # Derive the shift from the real marker widths instead of assuming 3 + 3.
        offset += len(start_mark) + len(end_mark)
    return text

def parse_return(text:str) -> List[Tuple[str,str,int,int]]:
    """Recover the labeled entities from annotated text.

    param: text: string of annotated text
    output: list of [entity text, entity type, start, end] lists, in order of
            appearance, where start:end are positions with the marker
            characters that precede the entity subtracted out
    """
    found = []
    for hit in re.finditer(PATTERN, text):
        # Exactly the capture group of the alternative that matched is not None.
        for group_no, captured in enumerate(hit.groups(), start=1):
            if captured is None:
                continue
            entity_type = IndexTable[group_no - 1]
            # Marker characters in front of the capture, including this hit's own opening marker.
            shift = count_special_token(text[:hit.start(group_no)])
            found.append([
                captured,
                entity_type,
                hit.start(group_no) - shift,
                hit.end(group_no) - shift,
            ])
    return found


def cmp(l1):
    """Sort key: the third field of an entity record, converted to int."""
    third_field = l1[2]
    return int(third_field)

def get_all_labels(file_idx:int,df :pd.DataFrame) -> List[Tuple[str,str,int,int]]:
    """Collect the entity records of one document, ordered by start position.

    params: file_idx: index of the unannotated text
    params: df: entity table (the loaded .csv); needs a 'file_idx' column
    output: list of [entity name, entity type, start, end] lists sorted by start

    Each matching row loses its first column, and its last column -- a string
    whose first and last characters are stripped and whose remainder is split
    on ',' -- is replaced by the two integers it contains.
    """
    matching_rows = df[df['file_idx'] == file_idx].values.tolist()

    records = []
    for row in matching_rows:
        fields = row[1:]
        span_text = fields[-1]
        bounds = span_text[1:-1].split(',')
        records.append(fields[:-1] + [int(bounds[0]), int(bounds[1])])

    records.sort(key=cmp)
    return records

def get_train_example(idx:int)-> str:
    """Return the cell at row position `idx`, column position FULL_TEXT, of the global `df`."""
    row_pos, col_pos = idx, FULL_TEXT
    return df.iloc[row_pos, col_pos]

In [9]:
# Row indices whose annotate -> parse round trip does not give back the original labels.
problematic_index_list = []

In [10]:
for i in range(len(df)):
    L = get_all_labels(i,label)
    # Round-trip check: annotating the text and parsing the annotation back
    # must reproduce the label list exactly. A plain comparison is used
    # instead of `assert`, so the check still runs under `python -O`.
    try:
        round_trip_ok = parse_return(mkTrainingExample(get_train_example(i),L)) == L
    except Exception:
        # A failure while annotating or parsing marks the row as problematic;
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        round_trip_ok = False
    if not round_trip_ok:
        problematic_index_list.append(i)
        

In [11]:
# Show the indices that failed the round-trip check.
print(problematic_index_list)

[11699, 14003, 19360]


### Estimate cost for Fine-tuned GPT-4o-mini only method

In [23]:
import copy
#cost estimation function
def count_tokens(text, model="gpt-4o-mini"):
    """Return the number of tokens `text` encodes to with the tiktoken encoding for `model`."""
    return len(tiktoken.encoding_for_model(model).encode(text))

def count_price(input_cost,output_cost,system_prmopt = "You are an expert in labeling personally identifiable information",path = 'data/test_indices.txt',num_epoches = 1):
    """Estimate the cost of labeling every document listed in an index file.

    params: input_cost: cost of 1 million input tokens
    params: output_cost: cost of 1 million output tokens
    params: system_prmopt: system prompt string (the misspelled parameter
            name is kept so existing keyword callers keep working)
    params: path: text file holding the document indices as a bracketed,
            comma-separated list such as "[3, 17, 42]"
    params: num_epoches: number of epochs run; leave at 1 when not
            fine-tuning. The total cost is multiplied by this value.
    output: estimated cost (float)

    For a fine-tuning estimate, set both input_cost and output_cost to the
    training price of 1 million tokens.

    Relies on the notebook globals `label`, get_all_labels,
    get_train_example, mkTrainingExample and count_tokens.
    """
    with open(path,'r') as file:
        raw_indices = file.read()

    ins_prompt = 'Label the entity of the following text: @@@,### to label student name; &&&,$$$ to label personal URL; QQQ,^^^ to label personal email; %%%,~~~ to label phone number\n'

    # Tolerate surrounding whitespace / a trailing newline around the brackets
    # and an empty list; blindly dropping the first and last character did not.
    listed = raw_indices.strip().strip('[](){}')
    file_indices = [int(token) for token in listed.split(',') if token.strip()]

    input_tokens = 0
    output_tokens = 0
    for file_idx in file_indices:
        entities = get_all_labels(file_idx,label)
        document = get_train_example(file_idx)  # fetched once, used for both counts
        input_tokens += count_tokens(system_prmopt + ins_prompt + document)
        output_tokens += count_tokens(mkTrainingExample(document,entities))

    input_cost = input_tokens / 1000000 * input_cost
    output_cost = output_tokens / 1000000 * output_cost

    return (input_cost + output_cost) * num_epoches


In [24]:
# Estimate with input priced at 0.3 and output at 1.2 per million tokens, default index file.
print(count_price(input_cost = 0.3, output_cost = 1.2 ))

19.7120709


### Estimate cost for prompting only method

In [25]:
# Estimate cost for the prompting-only method
import copy
#cost estimation function
def count_tokens(text, model="gpt-4o-mini"):
    """Token count of `text` under the tiktoken encoding selected for `model`."""
    encoder = tiktoken.encoding_for_model(model)
    return len(encoder.encode(text))

def count_price(input_cost,output_cost,system_prompt = "You are an expert in labeling Personally Identifiable Information. Start your response rightaway without adding any prefix(such as Response:) and suffix",path = 'data/test_indices.txt',num_epoches = 1):
    """Estimate token usage and cost of labeling every document in an index file.

    params: input_cost: cost of 1 million input tokens
    params: output_cost: cost of 1 million output tokens
    params: system_prompt: system prompt string
    params: path: text file holding the document indices as a bracketed,
            comma-separated list such as "[3, 17, 42]"
    params: num_epoches: number of epochs run; leave at 1 when not
            fine-tuning. The cost (not the token totals) is multiplied by it.
    output: (total input tokens, total output tokens, estimated cost)

    For a fine-tuning estimate, set both input_cost and output_cost to the
    training price of 1 million tokens.

    Relies on the notebook globals `label`, get_all_labels,
    get_train_example, mkTrainingExample and count_tokens.
    """
    with open(path,'r') as file:
        raw_indices = file.read()

    ins_prompt = '''Label the entity of the following text: @@@,### to label student name;
&&&,$$$ to label personal URL; QQQ,^^^ to label personal email; %%%,~~~ to label phone number\n
Ensure that the rest of the text remains unchanged, word for word.
Maintain the original punctuation, quotation marks, spaces, and line breaks. 
If the text does not contain any PII, return it as is.
For example, if the input is:
COURSERA - University of Virginia, Darden School of Business - Design Thinking Assignment    Dharmendra Asiri    Washington,DC / March 8, 2019    email djones@gmail.com  linkedIn https://www.linkedin.com/in/mmartinez
The output should be:
COURSERA - University of Virginia, Darden School of Business - Design Thinking Assignment    @@@Dharmendra Asiri###    Washington,DC / March 8, 2019    email QQQdjones@gmail.com^^^  linkedIn &&&https://www.linkedin.com/in/mmartinez$$$
Another example:
I do conclude, my assignment by thanking Lecturers, University of Virginia and other  partners who contributed to this online courses.\n\nMay God bless you.\n\nEslam Abo Fatma\n\nRwanda- Africa\n\nEmail: murraythomas@gmail.com\n\nTel: (223)392-2765\n\n'
The output should be:
I do conclude, my assignment by thanking Lecturers, University of Virginia and other  partners who contributed to this online courses.\n\nMay God bless you.\n\n@@@Eslam Abo Fatma###\n\nRwanda- Africa\n\nEmail: QQQmurraythomas@gmail.com^^^\n\nTel: %%%(223)392-2765~~~\n\n'
Another example:
An article was published which  described one of the most successful entrepreneurs in the world, Jeff Bezos. It was mentioned  that Bezos insists that no PPTs are shown during the board meetings but stories are told.
The output should be exactly the same as input:
An article was published which  described one of the most successful entrepreneurs in the world, Jeff Bezos. It was mentioned  that Bezos insists that no PPTs are shown during the board meetings but stories are told.
Please repeat this process with the following file:\n
'''

    # Tolerate surrounding whitespace / a trailing newline around the brackets
    # and an empty list; blindly dropping the first and last character did not.
    listed = raw_indices.strip().strip('[](){}')
    file_indices = [int(token) for token in listed.split(',') if token.strip()]

    input_tokens = 0
    output_tokens = 0
    for file_idx in file_indices:
        entities = get_all_labels(file_idx,label)
        document = get_train_example(file_idx)  # fetched once, used for both counts
        input_tokens += count_tokens(system_prompt + ins_prompt + document)
        output_tokens += count_tokens(mkTrainingExample(document,entities))

    input_cost = input_tokens / 1000000 * input_cost
    output_cost = output_tokens / 1000000 * output_cost

    return input_tokens, output_tokens, (input_cost + output_cost) * num_epoches
        

In [26]:
# Prints (input tokens, output tokens, cost) with input priced at 0.3 and output at 1.2 per million tokens.
print(count_price(input_cost = 0.3, output_cost = 1.2 ))

(21338307, 12959655, 21.9530781)


In [27]:
# Total tokens: the input and output token sums printed by the previous cell, added together.
21338307 + 12959655

34297962

### Estimate cost for Presidio + Fine-tuned GPT-4o-mini method

In [None]:
import copy
#cost estimation function
def count_tokens(text, model="gpt-4o-mini"):
    """Number of tokens produced when `text` is encoded with tiktoken's encoding for `model`."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

def count_price(input_cost,output_cost,system_prompt = "You are an expert in labeling Personally Identifiable Information. Start your response rightaway without adding any prefix(such as Response:) and suffix",path = 'data/test_indices.txt',num_epoches = 1):
    """Estimate token usage and cost of labeling every document in an index file.

    params: input_cost: cost of 1 million input tokens
    params: output_cost: cost of 1 million output tokens
    params: system_prompt: system prompt string
    params: path: text file holding the document indices as a bracketed,
            comma-separated list such as "[3, 17, 42]"
    params: num_epoches: number of epochs run; leave at 1 when not
            fine-tuning. The cost (not the token totals) is multiplied by it.
    output: (total input tokens, total output tokens, estimated cost)

    For a fine-tuning estimate, set both input_cost and output_cost to the
    training price of 1 million tokens.

    Relies on the notebook globals `label`, get_all_labels,
    get_train_example, mkTrainingExample and count_tokens.
    """
    with open(path,'r') as file:
        raw_indices = file.read()

    ins_prompt = '''Label the entity of the following text: @@@,### to label student name;
&&&,$$$ to label personal URL; QQQ,^^^ to label personal email; %%%,~~~ to label phone number\n
Ensure that the rest of the text remains unchanged, word for word.
Maintain the original punctuation, quotation marks, spaces, and line breaks. 
If the text does not contain any PII, return it as is.
For example, if the input is:
COURSERA - University of Virginia, Darden School of Business - Design Thinking Assignment    Dharmendra Asiri    Washington,DC / March 8, 2019    email djones@gmail.com  linkedIn https://www.linkedin.com/in/mmartinez
The output should be:
COURSERA - University of Virginia, Darden School of Business - Design Thinking Assignment    @@@Dharmendra Asiri###    Washington,DC / March 8, 2019    email QQQdjones@gmail.com^^^  linkedIn &&&https://www.linkedin.com/in/mmartinez$$$
Another example:
I do conclude, my assignment by thanking Lecturers, University of Virginia and other  partners who contributed to this online courses.\n\nMay God bless you.\n\nEslam Abo Fatma\n\nRwanda- Africa\n\nEmail: murraythomas@gmail.com\n\nTel: (223)392-2765\n\n'
The output should be:
I do conclude, my assignment by thanking Lecturers, University of Virginia and other  partners who contributed to this online courses.\n\nMay God bless you.\n\n@@@Eslam Abo Fatma###\n\nRwanda- Africa\n\nEmail: QQQmurraythomas@gmail.com^^^\n\nTel: %%%(223)392-2765~~~\n\n'
Another example:
An article was published which  described one of the most successful entrepreneurs in the world, Jeff Bezos. It was mentioned  that Bezos insists that no PPTs are shown during the board meetings but stories are told.
The output should be exactly the same as input:
An article was published which  described one of the most successful entrepreneurs in the world, Jeff Bezos. It was mentioned  that Bezos insists that no PPTs are shown during the board meetings but stories are told.
Please repeat this process with the following file:\n
'''

    # Tolerate surrounding whitespace / a trailing newline around the brackets
    # and an empty list; blindly dropping the first and last character did not.
    listed = raw_indices.strip().strip('[](){}')
    file_indices = [int(token) for token in listed.split(',') if token.strip()]

    input_tokens = 0
    output_tokens = 0
    for file_idx in file_indices:
        entities = get_all_labels(file_idx,label)
        document = get_train_example(file_idx)  # fetched once, used for both counts
        input_tokens += count_tokens(system_prompt + ins_prompt + document)
        output_tokens += count_tokens(mkTrainingExample(document,entities))

    input_cost = input_tokens / 1000000 * input_cost
    output_cost = output_tokens / 1000000 * output_cost

    return input_tokens, output_tokens, (input_cost + output_cost) * num_epoches


In [57]:
import random

import pandas as pd
# Entity type -> (opening marker, closing marker); repeats the mapping defined in the notebook's first cell.
D = {'NAME_STUDENT': ('@@@','###'), 
      'URL_PERSONAL':('&&&','$$$'),
      'EMAIL':('QQQ','^^^'),
      'PHONE_NUM':(r'%%%',r'~~~'),
     }

def stratify(csv_path = 'data/train_set.csv', seed = None):
    """Return the first-column values of the CSV, grouped by entity type and shuffled within each group.

    params: csv_path: CSV file with a 'type' column; the values returned
            come from the file's first column (presumably the file index --
            confirm against the CSV)
    params: seed: optional seed for a private random generator, making the
            result reproducible; with the default None the global `random`
            module state is used
    output: list of values ordered PHONE_NUM, URL_PERSONAL, EMAIL,
            NAME_STUDENT, each group in random order. Values are not
            de-duplicated, so one that appears on several rows is
            returned several times.

    The size of each group is printed in the order listed above. A type with
    no rows contributes nothing, where sampling one item from an empty list
    would raise ValueError.
    """
    df = pd.read_csv(csv_path,encoding='utf-8')
    rng = random if seed is None else random.Random(seed)

    train_idxs = []
    for entity_type in ('PHONE_NUM', 'URL_PERSONAL', 'EMAIL', 'NAME_STUDENT'):
        group = df[df['type'] == entity_type].iloc[:, 0].tolist()
        print(len(group))
        # Sampling the full length is a shuffle and is valid for an empty group.
        train_idxs += rng.sample(group, len(group))
    return train_idxs


# Trial run on the default CSV: prints the four group sizes and displays the sampled list.
stratify()

3
76
29
1091


[10773,
 20211,
 4777,
 8088,
 16654,
 11083,
 13792,
 3709,
 8688,
 6007,
 14255,
 14255,
 15091,
 12898,
 14255,
 6754,
 13278,
 19609,
 12774,
 6257,
 9138,
 14255,
 14284,
 14255,
 17292,
 14255,
 6426,
 9635,
 2700,
 14255,
 6426,
 17100,
 14255,
 6209,
 15765,
 13315,
 17292,
 14151,
 8088,
 20211,
 14255,
 15745,
 13473,
 8276,
 19055,
 15787,
 5358,
 12336,
 5923,
 1309,
 16654,
 17100,
 6784,
 15765,
 1478,
 15765,
 379,
 3709,
 12465,
 8011,
 14197,
 21975,
 5923,
 14284,
 3709,
 9371,
 15787,
 3709,
 14255,
 6732,
 8245,
 19046,
 15321,
 15745,
 17100,
 11049,
 6257,
 16369,
 14255,
 14982,
 9146,
 379,
 4777,
 15091,
 14857,
 2897,
 11954,
 2897,
 13792,
 3709,
 10773,
 13949,
 11107,
 9150,
 3709,
 17317,
 20234,
 2897,
 8344,
 14982,
 11954,
 12895,
 20211,
 3796,
 9143,
 9146,
 11954,
 20234,
 3885,
 13764,
 13243,
 11077,
 18992,
 86,
 19209,
 21959,
 18206,
 20667,
 16527,
 6888,
 8505,
 17775,
 5484,
 7846,
 7427,
 7786,
 14197,
 11277,
 7662,
 14757,
 3427,
 4808,
 4

In [58]:
# Sampled values kept in `d`; later cells check this list and build train.jsonl from it.
d = stratify()

3
76
29
1091


In [64]:
# Sanity check: the indices that failed the round-trip test (hard-coded from that cell's printed output) must not be in `d`.
for i in [11699, 14003, 19360]:
    assert( i not in d)

In [65]:
import json
def structure_training_set(inpath,outpath,system_prompt = ""):
    """Write a JSONL chat dataset for the documents listed in an index file.

    params: inpath: text file holding the document indices as a bracketed,
            comma-separated list such as "[3, 17, 42]"
    params: outpath: destination .jsonl file (UTF-8, overwritten)
    params: system_prompt: content of the system message of every record
    output: None

    Each output line is one JSON object {"messages": [system, user,
    assistant]}: the user message is the raw document text and the assistant
    message is the same text annotated by mkTrainingExample.

    Relies on the notebook globals `label`, get_all_labels,
    get_train_example and mkTrainingExample.
    """
    with open(inpath,'r') as file:
        raw_indices = file.read()

    # Tolerate surrounding whitespace / a trailing newline around the brackets
    # and an empty list; blindly dropping the first and last character did not.
    listed = raw_indices.strip().strip('[](){}')
    file_indices = [int(token) for token in listed.split(',') if token.strip()]

    # Build every record before opening the output, so a failure while
    # annotating does not leave a truncated file behind.
    records = []
    for file_idx in file_indices:
        entities = get_all_labels(file_idx,label)
        document = get_train_example(file_idx)
        system_msg = {"role": "system", "content": system_prompt}
        user_msg = {"role": "user", "content": document}
        assistant_msg = {"role": "assistant", "content": mkTrainingExample(document,entities)}
        records.append({"messages": [system_msg, user_msg, assistant_msg]})

    with open(outpath, 'w',encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    return None


# Instruction text that structure_training_set_from_list prepends to every user message.
ins_prompt ='''Label the entity of the following text: @@@,### to label student name; &&&,$$$ to label personal URL; QQQ,^^^ to label personal email; %%%,~~~ to label phone number\n'''
def structure_training_set_from_list(d,outpath,system_prompt = 'You are an expert in labeling personally identifiable information'):
    """Write a JSONL chat dataset for the document indices in `d`.

    params: d: sequence of document indices
    params: outpath: destination .jsonl file (UTF-8, overwritten)
    params: system_prompt: content of the system message of every record
    output: None

    Each line is {"messages": [system, user, assistant]}; the user message is
    the global ins_prompt followed by the raw text, the assistant message is
    the text annotated by mkTrainingExample.
    """
    def _conversation(file_idx):
        # One chat record for a single document.
        entities = get_all_labels(file_idx,label)
        return {"messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": ins_prompt + get_train_example(file_idx)},
            {"role": "assistant", "content": mkTrainingExample(get_train_example(file_idx),entities)},
        ]}

    conversations = [_conversation(d[position]) for position in range(len(d))]

    with open(outpath, 'w',encoding='utf-8') as f:
        for conversation in conversations:
            f.write(json.dumps(conversation, ensure_ascii=False) + '\n')

    return None


In [66]:
# Write one chat record per entry of `d` to train.jsonl, using the default system prompt.
structure_training_set_from_list(d,'train.jsonl')