In [1]:
import re
import pandas as pd
import typing
from typing import List,Tuple, Optional, NoReturn
import tiktoken
from openai import OpenAI
import json

# Read the OpenAI API key from a local JSON file so the key itself never
# appears in the notebook, then build the client used by get_gpt_response.
with open('input/secret.json', 'r', encoding='UTF-8') as file:
    secret_json = json.load(file)
api_key = secret_json['OPENAI_API_KEY']
client = OpenAI(api_key=api_key)

# Column position passed to df.iloc[...] by get_train_example.
FULL_TEXT = 0
# Entity type -> (start marker, end marker) wrapped around an entity in
# annotated text. Every marker here is three characters long.
D = {'NAME_STUDENT': ('@@@','###'), 
      'URL_PERSONAL':('&&&','$$$'),
      'EMAIL':('QQQ','^^^'),
      'PHONE_NUM':(r'%%%',r'~~~'),
    }
# One alternative per entry of D, written in the same order as D; capture
# group i holds the text found between the i-th marker pair.
PATTERN = r'@@@(.*?)###|&&&(.*?)\$\$\$|QQQ(.*?)\^\^\^|%%%(.*?)~~~'

# Entity type names in D's insertion order; parse_return maps capture group i
# to IndexTable[i-1], so D and PATTERN must list the types in the same order.
IndexTable =[key for key in D.keys()]

In [2]:
def read_csv(path = 'data/pii_true_entities.csv'):
    """Read a UTF-8 encoded CSV file into a DataFrame.

    Args:
        path: location of the CSV file.

    Returns:
        The DataFrame produced by pandas for that file.
    """
    return pd.read_csv(path, encoding='utf-8')

def read_json(path = 'data/obfuscated_data_06.json'):
    """Read a UTF-8 encoded, records-oriented JSON file into a DataFrame.

    Args:
        path: location of the JSON file (a list of record objects).

    Returns:
        The DataFrame produced by pandas, one row per record.
    """
    return pd.read_json(path, orient="records", encoding='utf-8')


# Entity table loaded from read_csv's default path.
label = read_csv()

# Documents loaded from read_json's default path; get_train_example reads
# this module-level frame by position.
df = read_json()

In [3]:
#function to create training example and parse return
def count_special_token(text:str)->int:
    """Count the characters of `text` that belong to annotation markers.

    Every non-overlapping occurrence of any start or end marker listed in D
    contributes that marker's own length, so the result stays correct if a
    marker of a different length is ever added to D.

    Args:
        text: annotated text.

    Returns:
        Total number of marker characters found in `text`.
    """
    return sum(
        text.count(marker) * len(marker)
        for markers in D.values()
        for marker in markers
    )


def mkTrainingExample(text: str, L: List[Tuple[str,str, int, int]]) -> str:
    """Wrap each listed entity of `text` in its type's start/end markers.

    Args:
        text: unlabeled text.
        L: entities as (entity text, entity type, start, end). The entity text
            is not used; the type must be a key of D; start/end index into the
            ORIGINAL `text`. Entities must be given in ascending position
            order and must not overlap, because every insertion shifts all
            later entities by the running offset.

    Returns:
        The text with markers inserted around every entity.

    Raises:
        KeyError: if an entity type is not a key of D.
    """
    offset = 0
    for _, label, start, end in L:
        start_mark, end_mark = D[label]
        # Translate original-text indices into indices of the partially
        # labeled text built so far.
        start += offset
        end += offset
        text = ''.join((text[:start], start_mark, text[start:end], end_mark, text[end:]))
        # Advance by the real marker lengths rather than a fixed 6 so the
        # bookkeeping follows D.
        offset += len(start_mark) + len(end_mark)
    return text

def parse_return(text:str) -> List[Tuple[str,str,int,int]]: 
    """Extract the marked entities from annotated text.

    Args:
        text: annotated text containing marker pairs matched by PATTERN.

    Returns:
        One 4-element list per entity, in order of appearance:
        [entity text, entity type, start, end]. start/end are positions in the
        text with marker characters discounted: the raw match positions minus
        count_special_token() of everything before the entity.
    """
    found = []
    for hit in re.finditer(PATTERN, text):
        # Only the capture group of the alternative that matched is not None.
        for group_no, captured in enumerate(hit.groups(), start=1):
            if captured is None:
                continue
            entity_type = IndexTable[group_no - 1]
            raw_start = hit.start(group_no)
            # Marker characters that precede the entity (its own start marker
            # included) do not exist in the unannotated text.
            shift = count_special_token(text[:raw_start])
            found.append([captured, entity_type, raw_start - shift, hit.end(group_no) - shift])
    return found


def cmp(l1): 
    """Sort key: the element at index 2 of `l1`, converted to int."""
    third = l1[2]
    return int(third)

def get_all_labels(file_idx:int,df :pd.DataFrame) -> List[Tuple[str,str,int,int]]: 
    """Collect the entities recorded for one file, ordered by start index.

    Args:
        file_idx: value to match against the 'file_idx' column.
        df: entity table. The first column of each row is dropped and the last
            column is read as a position string such as "(12, 20)".

    Returns:
        One list per matching row: the middle columns followed by the start
        and end positions as ints, sorted with `cmp` (element at index 2).
    """
    rows = df[df['file_idx'] == file_idx].values.tolist()

    entities = []
    for row in rows:
        payload = row[1:]                # drop the leading column
        positions = payload[-1]
        bounds = positions[1:-1].split(',')   # strip the surrounding brackets

        record = payload[:-1]
        record.append(int(bounds[0]))
        record.append(int(bounds[1]))
        entities.append(record)
    return sorted(entities, key=cmp)

def get_train_example(idx:int)-> str: 
    """Return the cell of the module-level `df` at row position `idx`, column position FULL_TEXT."""
    column = FULL_TEXT
    return df.iloc[idx, column]



def get_gpt_response(input_text:str) -> str:
    """Ask the chat model to annotate PII in `input_text` and return its reply.

    Sends one chat completion request through the module-level `client`,
    consisting of a system message and a user message made of the instruction
    prompt followed directly by `input_text`.

    Args:
        input_text: text to be annotated.

    Returns:
        The message content of the first choice in the response.
    """

    system_prompt = "You are an expert in labeling Personally Identifiable Information. Start your response rightaway without adding any prefix(such as Response:) and suffix"

    # Few-shot instructions that describe the marker pairs the model should
    # insert. This is not a raw string, so every `\n` written inside the
    # literal becomes a real newline in the prompt.
    ins_prompt = '''Label the entity of the following text: @@@,### to label student name;
&&&,$$$ to label personal URL; QQQ,^^^ to label personal email; %%%,~~~ to label phone number\n
Ensure that the rest of the text remains unchanged, word for word.
Maintain the original punctuation, quotation marks, spaces, and line breaks. 
If the text does not contain any PII, return it as is.
For example, if the input is:
COURSERA - University of Virginia, Darden School of Business - Design Thinking Assignment    Dharmendra Asiri    Washington,DC / March 8, 2019    email djones@gmail.com  linkedIn https://www.linkedin.com/in/mmartinez
The output should be:
COURSERA - University of Virginia, Darden School of Business - Design Thinking Assignment    @@@Dharmendra Asiri###    Washington,DC / March 8, 2019    email QQQdjones@gmail.com^^^  linkedIn &&&https://www.linkedin.com/in/mmartinez$$$
Another example:
I do conclude, my assignment by thanking Lecturers, University of Virginia and other  partners who contributed to this online courses.\n\nMay God bless you.\n\nEslam Abo Fatma\n\nRwanda- Africa\n\nEmail: murraythomas@gmail.com\n\nTel: (223)392-2765\n\n'
The output should be:
I do conclude, my assignment by thanking Lecturers, University of Virginia and other  partners who contributed to this online courses.\n\nMay God bless you.\n\n@@@Eslam Abo Fatma###\n\nRwanda- Africa\n\nEmail: QQQmurraythomas@gmail.com^^^\n\nTel: %%%(223)392-2765~~~\n\n'
Another example:
An article was published which  described one of the most successful entrepreneurs in the world, Jeff Bezos. It was mentioned  that Bezos insists that no PPTs are shown during the board meetings but stories are told.
The output should be exactly the same as input:
An article was published which  described one of the most successful entrepreneurs in the world, Jeff Bezos. It was mentioned  that Bezos insists that no PPTs are shown during the board meetings but stories are told.
Please repeat this process with the following file:\n
'''

    # temperature = 0 is passed to the API for every request.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": ins_prompt + input_text}],
        temperature = 0)
    return response.choices[0].message.content 


# NOTE(review): this re-defines get_train_example with the same body as the
# definition earlier in this cell; being later, it is the one left bound.
def get_train_example(idx:int)-> str: 
    """Return the cell of the module-level `df` at row position `idx`, column position FULL_TEXT."""
    return df.iloc[idx,FULL_TEXT]

def get_testing_list(path = 'data/test_indices.txt')-> List[int]:
    """Read a bracketed, comma-separated list of integers from a text file.

    The file is expected to look like "[3, 14, 15]". Surrounding whitespace,
    including a trailing newline, is ignored before the enclosing bracket
    characters are removed. An empty list ("[]") yields [].

    Args:
        path: location of the text file.

    Returns:
        The integers in file order.

    Raises:
        ValueError: if an item between the brackets is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as file:
        raw = file.read()
    # Strip whitespace first so a trailing newline is not mistaken for the
    # closing bracket, then drop the first and last character (the brackets).
    inner = raw.strip()[1:-1]
    if not inner.strip():
        return []
    return [int(item) for item in inner.split(',')]



In [18]:
# import random

# testing_index_list = get_testing_list()

# test_idx = 21275 #random.sample(testing_index_list,1)[0]

# text = get_train_example(test_idx)

# response = get_gpt_response(text)

In [20]:
# print(response)

Ana Kane    Design Thinking Final assignment      2017-06-24

@@@Ana Kane###, 2017-06-24 - Desing Thinking Final assignment

Reflection – Visualization

In this Final assignment I’m presenting my application of Visualization that was a very powerful tool to address the  below challenge.

Challenge

I work for a big international industrial company as a bid manager, based in London. My job consists of delivering  the high quality competitive bids to the client in response to their Request For Proposals resulting in winning the  contracts. The bid preparation process consists of capturing client’s requirement, addressing them to the relevant  internal experts and external consultants, then challenging and consolidating their final responses in the delivery  pack. These activities require a lot of clear and efficient communication, coordination and flexibility due to a  limited availability of the bid contributors and tight submission schedule.

In order to prepare the current bid, I lead

In [4]:
def initialize_csv(csv_path): 
    """Write a header-only CSV (file_idx, entity_text, type, positions) to `csv_path`.

    The file is written from an empty DataFrame, so existing content at that
    path is replaced.
    """
    header_only = pd.DataFrame(columns=['file_idx','entity_text','type','positions'])
    header_only.to_csv(csv_path, index=False, encoding='utf-8')

def load_csv_iteratively(file_idx:int, csv_path:str,entity_list)-> None: 
    """Append the entities found in one file to an existing results CSV.

    The CSV at `csv_path` is read, one row per entity is added, and the whole
    table is written back to the same path.

    Args:
        file_idx: identifier stored in the 'file_idx' column of every new row.
        csv_path: path of an existing CSV with the columns
            file_idx, entity_text, type, positions.
        entity_list: entities as [entity text, type, start, end]; lists and
            tuples are both accepted and are not modified.

    Returns:
        None.

    Raises:
        FileNotFoundError: if `csv_path` does not exist.
    """
    existing = pd.read_csv(csv_path,encoding = 'utf-8')

    rows = []
    for entity in entity_list:
        # Unpacking works for tuples as well as lists and leaves the caller's
        # object untouched.
        *fields, start, end = entity
        # Positions are stored as one "(start, end)" string.
        rows.append([file_idx, *fields, f'({start}, {end})'])
    new_df = pd.DataFrame(rows, columns=['file_idx', 'entity_text', 'type', 'positions'])
    updated_df = pd.concat([existing, new_df], ignore_index=True)
    updated_df.to_csv(csv_path, index=False, encoding='utf-8')
In [28]:
import os
import random

# Results file shared by the initialisation guard and the loop below.
OUTPUT_CSV = 'output/foo.csv'

testing_index_list = get_testing_list()

# load_csv_iteratively reads the CSV before appending, so the file must exist.
# Create it (header only) on first use; an existing file is left untouched so
# results accumulated by earlier runs are kept.
if not os.path.exists(OUTPUT_CSV):
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
    initialize_csv(OUTPUT_CSV)

# Files to annotate. To process random test files instead, sample from
# testing_index_list, e.g. random.sample(testing_index_list, 10).
text_idx_lst = [10]
text_lst = []
for text_idx in text_idx_lst:
    print(f'Pocessing file {text_idx}')
    text = get_train_example(text_idx)
    # `text` is rebound to the model's annotated version of the document.
    text = get_gpt_response(text)
    text_lst.append(text)
    L_p = parse_return(text)
    load_csv_iteratively(text_idx, OUTPUT_CSV, L_p)


Pocessing file 10


In [26]:
# Show the most recently annotated document. The cell that builds text_lst
# appends one item per processed file, so a fixed index of 3 raises IndexError
# whenever fewer than four files were processed.
print(text_lst[-1])

@@@Diego Estrada### 

Design Thinking Assignment 

Visualization Tool 

Challenge & Selection 

The elderly were having a hard time adapting to the changes we brought in our bank. As a result of a poorly implemented linear solution, a more customer centric approach was needed. 

After learning about design thinking in this course, we decided to apply it to solve this problem. The visualization tool allowed the team to create a dynamic presentation using diagrams, figures and drawings on the go that really resonated among the stakeholders. Previous to this change, none of our solutions seemed to be adequate for them, but the new implementation created a different type of connection with them that helped them understand the problem in the way the team and I did. 

Application 

The process starts in the prep time. The team uses a series of tools and software to develop a presentation using the surveys gathered during research and the solutions we created during the process. The use of gr

In [24]:
# Compare two candidate spans for the same entity in the document at row 10.
# Labelled row under inspection:
# 10,Diego Estrada,NAME_STUDENT,"(2375, 2388)"
# The second slice starts 11 characters after the first.
print(df.iloc[10].full_text[2375:2388]) # True
print("----------------------")
print(df.iloc[10].full_text[2386:2399]) # Detected

gether to

Di
----------------------
Diego Estrada


In [10]:
# Print whatever `text` currently holds; the processing loop rebinds it to the
# model's annotated response for the last file it handled.
print(text)

MY BRILLIANT TEAM

Challenge: My name is @@@Khaled### and I am working as an IT Director for a big food retailer in Austria.  I am responsible for software, software development and hardware in our food stores.  One of my most interesting section at work is working as a business expert in several  IT projects with my whole team (my team = 7 IT- & business managers). In these  projects, we are part of the big project team and our main task is to represent the customer (in our case the customers/end-users are our store managers). For this part  on the one hand it is very important to empatize with the store managers and their  real problems. On the other hand an open mindset and an honest feedback culture in  our team is essential for success. Without good teamwork, it is not possible to  represent the store managers in an open and honest way.

Based on this facts I set up a "one day creative workshop" with my team. We worked  on the following challenge: "What are the most important valu

In [6]:
# Echo the list of file indices selected for processing.
# NOTE(review): the saved output lists ten indices, while the cell that
# assigns text_idx_lst sets a single one; the output looks like it comes from
# an earlier run. Confirm and re-run.
text_idx_lst

[1793, 5584, 10896, 17781, 2192, 10952, 7652, 15093, 12961, 13212]

In [21]:
# error_log = []
# false_positive_log = [] 
# for i in range(10):
#     print(f'Pocessing {i}')
#     text_idx = random.sample(testing_index_list,1)[0]
#     text = get_train_example(text_idx)
#     text = get_gpt_response(text)
#     L_p = parse_return(text)
#     load_csv_iteratively(text_idx,'foo.csv',L_p)

Pocessing 0
Pocessing 1
Pocessing 2
Pocessing 3
Pocessing 4
Pocessing 5
Pocessing 6
Pocessing 7
Pocessing 8
Pocessing 9


In [24]:
# print(text_idx)


893


In [25]:
# text = get_train_example(text_idx)
# text = get_gpt_response(text)
# L_p = parse_return(text)


In [26]:
# print(text)

Example Reflection – Learning Launch

Challenge

During master's degree, I began to volunteer as part of a student social organization. Having started work  as an activist and helping with organizing and conducting events, I quickly plunged into work and gained  an understanding of the processes. And soon I was invited to join the organization’s leaders as an HR  manager. I agreed, because I loved what I was doing. And we continued our work.

The main goal of the organization is the transfer of knowledge about the industry among other students  and senior colleagues, the transfer of accumulated experience between generations. But this is in theory.  In practice, the transfer of experience was not conducted well enough, since there was no clearly  structured system, and the demand among the team members was constantly growing.

Selection

While still an active member of the team, I faced a similar problem face to face. And after that, I began to  think about what the organization lacks,