In [None]:
import os
import random
import time
from getpass import getpass

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from openai import OpenAI
from tqdm import tqdm

In [None]:
# Load the training split from JSON into a DataFrame.
df = pd.read_json("../kaggle_dataset/train_split.json")
# Preview the first rows as the cell output.
df.head()

In [None]:
def all_O(labels):
    """Tell whether every entry of ``labels`` is the 'O' tag.

    Returns True when no entry differs from 'O' (so an empty sequence also
    yields True) and False as soon as any other label is encountered.
    """
    for label in labels:
        if label != 'O':
            return False
    return True

# Mark the rows whose labels are exclusively 'O', keep only the remaining rows
# (those with at least one non-'O' label) and renumber them from 0.
df['is_all_O'] = df['labels'].apply(all_O)
df = df[df['is_all_O'] == 0].reset_index(drop=True)
df.shape

In [None]:
# Do not paste the API key into the notebook. If OPENAI_API_KEY is already set
# in the environment it is left untouched; otherwise the key is requested with
# getpass, which prompts without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')
client = OpenAI()

In [None]:
def generate_text(model_input, idx, max_attempts=5, retry_delay=1):
    """Send ``model_input`` to the chat model and return the generated text.

    The sampling temperature (uniform in 0.0-1.0) and top_p (uniform in
    0.5-1.0) are drawn once per call and reused for every retry of that call.

    Args:
        model_input: Prompt sent as the single user message.
        idx: Identifier used only in the progress/failure log lines.
        max_attempts: Maximum number of API calls before giving up.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The message content of the first choice of the first successful
        response, or ``None`` when every attempt raised an exception.
    """
    messages = [{'role': 'user', 'content': model_input}]

    random_temperature = random.uniform(0.0, 1.0)
    random_top_p = random.uniform(0.5, 1.0)

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model='gpt-3.5-turbo-16k',
                messages=messages,
                max_tokens=4096,
                frequency_penalty=1.12,
                temperature=random_temperature,
                top_p=random_top_p
            )
            output = response.choices[0].message.content
        except Exception as e:
            # Deliberately best-effort: any failure is logged and retried.
            print(f'Attempt {attempt + 1} on Idx-{idx} Failed: {e}')
            # Waiting is only useful when another attempt will follow.
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
            continue

        print(f'Generated Successfully on Idx-{idx}!!!')
        return output

    print(f'Idx-{idx} Failed at Last')
    return None

In [None]:
def generate_label(model_input, idx, max_attempts=5, retry_delay=1):
    """Send a labelling prompt to the chat model and return its raw reply.

    The request uses temperature 0.0 and top_p 0.95 with at most 512
    completion tokens.

    Args:
        model_input: Prompt sent as the single user message.
        idx: Identifier used only in the progress/failure log lines.
        max_attempts: Maximum number of API calls before giving up.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The message content of the first choice of the first successful
        response, or ``None`` when every attempt raised an exception.
    """
    messages = [{'role': 'user', 'content': model_input}]

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model='gpt-3.5-turbo-16k',
                messages=messages,
                max_tokens=512,
                frequency_penalty=1.12,
                temperature=0.0,
                top_p=0.95
            )
            output = response.choices[0].message.content
        except Exception as e:
            # Deliberately best-effort: any failure is logged and retried.
            print(f'Attempt {attempt + 1} on Idx-{idx} Failed: {e}')
            # Waiting is only useful when another attempt will follow.
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
            continue

        print(f'Generated Successfully on Idx-{idx}!!!')
        return output

    print(f'Idx-{idx} Failed at Last')
    return None

In [None]:
def split_list(original_list, chunk_size=64):
    """Lazily yield consecutive slices of ``original_list``.

    Each slice holds up to ``chunk_size`` items; the final one is shorter when
    the length is not a multiple of ``chunk_size``. Nothing is yielded for an
    empty input.
    """
    total = len(original_list)
    for start in range(0, total, chunk_size):
        # Slicing already stops at the end of the sequence, so the last chunk
        # needs no explicit clamping.
        yield original_list[start:start + chunk_size]

In [None]:
# The tokenizer and the instruction prefix are identical for every essay, so
# they are created once instead of on every iteration.
# spacy.blank('en') is used here only to split the rewritten text into tokens.
nlp = spacy.blank('en')

# Fixed part of the labelling prompt; the token chunk and the trailing
# 'label list: ' cue are appended per chunk below.
prompt = (
    "It is known that there are currently seven main types of PII (Personally Identifiable Information): \n"
    "(1)NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names. \n"
    "(2)EMAIL - A student’s email address. \n"
    "(3)USERNAME - A student's username on any platform. \n"
    "(4)ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number. \n"
    "(5)PHONE_NUM - A phone number associated with a student. \n"
    "(6)URL_PERSONAL - A URL that might be used to identify a student. \n"
    "(7)STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address. \n\n"

    "At the same time,token labels are presented in BIO (Beginning, Inner, Outer) format. The PII type is prefixed with 'B-' when it is the beginning of an entity. If the token is a continuation of an entity, it is prefixed with 'I-'. Tokens that are not PII are labeled 'O', which means labels are like 'B-NAME_STUDENT', 'I-USERNAME'. \n"
    "Thus, we have 15 kinds of labels in total, they are: 'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-EMAIL', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-USERNAME', 'O'. \n\n"

    "Here is an example: \n"
    "token list: ['Design','Thinking','for','innovation','reflexion','-','Avril','2021','-','Nathalie','Sylla','\\n\\n','Challenge','&','selection','\\n\\n','The','tool','I','use','to','help','all','stakeholders','finding','their','way','through','the','complexity','of','a','project','is','the','','mind','map','.','\\n\\n','What','exactly','is','a','mind','map','?'] \n"
    "label list: ['O','O','O','O','O','O','O','O','O','B-NAME_STUDENT','I-NAME_STUDENT','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O','O'] \n\n"

    "Now you should label the following token list for me and return a complete list of labels just like token list format. Don't be lazy and don't include any extra information. Do not include any labels other than the 15 types mentioned above. \n"
    "token list: "
)

for idx, row in tqdm(df.iterrows(), total=len(df)):
    original_text = row['full_text']
    model_input_for_new_text_generation = "Rewrite the following article, completely altering its structure, but do not change the details and main idea of the article. You only need to return the text you have written.\n\n" + original_text
    new_text = generate_text(model_input=model_input_for_new_text_generation, idx=idx)
    if new_text is None:
        # generate_text returns None once its retries are exhausted; calling
        # .strip() on that would raise AttributeError and abort the whole run.
        print(f'Idx-{idx} Skipped: no rewritten text was generated')
        continue
    new_text = new_text.strip()
    print(new_text)

    doc = nlp(new_text)
    tokens = [token.text for token in doc]

    # Label the rewritten essay chunk by chunk (split_list's default chunk size).
    split_token_lists = list(split_list(original_list=tokens))
    for i, token_chunk in enumerate(split_token_lists):
        chunk_repr = str(token_chunk)
        print(chunk_repr)
        model_input_for_label_generation = prompt + chunk_repr + '\n' + 'label list: '
        print(model_input_for_label_generation)
        # idx here is the chunk position within this essay, not the row index.
        labels = generate_label(model_input_for_label_generation, idx=i)
        print(labels)

    # Stop after the first essay that was processed (single-sample trial run).
    break