In [1]:
import os
import pandas as pd
import string
import random
import ast
import json
import string
from faker import Faker
from spacy.lang.en import English
from tqdm.auto import tqdm
from openai import OpenAI

# Never commit a real key here. Keep whatever key is already exported in the
# environment and only fall back to an empty placeholder when none is set, so
# running this cell can no longer wipe a valid key.
os.environ.setdefault("OPENAI_API_KEY", "")




In [None]:
# Suffix of the output file name written at the end of the run
# (./gpt-faker/more_data{VERSION}.json). The file is opened in "w" mode, so
# re-using a value overwrites the earlier file.
VERSION = 0

In [2]:
# Faker instance for the "en_US" locale; every synthetic PII value is drawn from it.
fake = Faker("en_US")
# Tokenizer of a bare `English()` language object; default tokenizer of tokenize_with_spacy.
en_tokenizer = English().tokenizer


def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    """Split ``text`` into tokens and record which tokens are followed by whitespace.

    Args:
        text: The string to tokenize.
        tokenizer: Callable that returns an iterable of token objects exposing
            ``.text`` and ``.whitespace_``. Defaults to the module-level
            ``en_tokenizer``.

    Returns:
        dict with two parallel lists: ``"tokens"`` holds the token strings and
        ``"trailing_whitespace"`` holds a bool per token that is True when the
        token's ``whitespace_`` is non-empty.
    """
    words = []
    followed_by_space = []
    for tok in tokenizer(text):
        words.append(tok.text)
        followed_by_space.append(bool(tok.whitespace_))
    return {"tokens": words, "trailing_whitespace": followed_by_space}


# def pj_to_pj(row):
#     labeldict = {}
#     labels_ = row["labeldict"]

#     for k, v in labels_.items():
#         v_processed = []

#         for x in v:
#             # This splits entities such as "Aaron Smith" to "Aaron", "Smith"
#             x = x.split(" ")
#             for n in x:
#                 while n[-1] in string.punctuation:
#                     n = n[:-1]
#                 v_processed += [n]

#             # Sadly, we need a special rule for some phone numbers containing "-"
#             # Some numbers, such as +1-725-834-5654x4787, are split on "-"
#             if k == "PHONE_NUM":
#                 for x in v:
#                     x = x.split("-")
#                     for n in x:
#                         while n[-1] in string.punctuation:
#                             n = n[:-1]
#                         v_processed += [n]
#         labeldict[k] = v_processed
#     return labeldict

def pj_to_pj(row):
    """Break every entity string of a row into cleaned tokens, grouped by label.

    Args:
        row: Mapping with a ``"labeldict"`` entry that maps each label name to
            a list of entity strings.

    Returns:
        dict mapping each label to a flat list of tokens. Each entity string is
        tokenized with ``tokenize_with_spacy``; trailing punctuation is removed
        from every token and tokens that end up empty are dropped.
    """
    # All punctuation except "(", which might be an opening token in phone numbers.
    strippable = string.punctuation.replace("(", "")

    labeldict = {}
    for label, pii in row["labeldict"].items():
        cleaned = []
        for info in pii:
            for token in tokenize_with_spacy(info)["tokens"]:
                token = token.rstrip(strippable)
                if token:
                    cleaned.append(token)
        labeldict[label] = cleaned

    return labeldict


def generate_fake_social_media_urls(num_urls=1):
    """Build fake profile URLs on random social platforms and return the first one.

    For each of the ``num_urls`` iterations a user name is taken from the
    module-level ``fake`` object and a platform prefix is picked with
    ``random.choice``. All URLs are built, but only the first is returned.

    Args:
        num_urls: Number of URLs to generate.

    Returns:
        str: The first generated URL, e.g. ``https://<platform prefix><user name>``.

    Raises:
        IndexError: If ``num_urls`` is smaller than 1, because no URL is built.
    """
    # URL prefixes, one per supported platform.
    profile_prefixes = [
        "linkedin.com/in/",  # LinkedIn
        "youtube.com/c/",    # YouTube
        "instagram.com/",    # Instagram
        "github.com/",       # GitHub
        "facebook.com/",     # Facebook
        "twitter.com/",      # Twitter
        "pinterest.com/",    # Pinterest
        "snapchat.com/",     # Snapchat
        "tiktok.com/",       # TikTok
        "reddit.com/",       # Reddit
        "tumblr.com/",       # Tumblr
        "whatsapp.com/",     # WhatsApp
        "discord.com/",      # Discord
        "twitch.tv/",        # Twitch
        "vimeo.com/",        # Vimeo
        "flickr.com/",       # Flickr
        "telegram.org/",     # Telegram
        "line.me/",          # Line
        "viber.com/",        # Viber
    ]

    urls = []
    for _ in range(num_urls):
        # The user name is drawn before the platform, once per URL.
        user = fake.user_name()
        prefix = random.choice(profile_prefixes)
        urls.append("https://" + prefix + user)

    return urls[0]


def write_essay(data, client, model="gpt-3.5-turbo-0125", temperature=0.9):
    """Ask a chat model to write an essay that embeds the given personal information.

    Args:
        data: Dictionary of personal information (label -> list of values).
            Its ``str()`` representation is sent as the user message.
        client: Object exposing ``chat.completions.create(...)``, such as the
            ``OpenAI`` client created by the caller.
        model: Name of the chat model to query. Defaults to the model that was
            previously hard-coded.
        temperature: Sampling temperature forwarded to the API. Defaults to the
            value that was previously hard-coded.

    Returns:
        The ``message.content`` of the first choice in the response.

    Any exception raised by the client call is propagated unchanged.
    """

    prompt = """
Given some personal information about the student, like name, email, etc as a dictionary. Write an essay in first person or third person that includes all the given information somewhere in the essay.

The essay should detail experience of applying a specific tool or approach to address a complex challenge, and should not only narrate the process but also critically analyze the effectiveness of the chosen tool or approach, reflecting on its strengths and potential limitations.

Every time you write this essay, you need to think of a new experience, use your imagination, but at the same time, it also needs to be consistent with reality.

Given information has the following keys:
NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

Each of these will have a list of values associated with them. 

Below are the requirements for the essay:
(1) Ensure that you include each and every one of given student's personal information in the essay, do not miss out any. 
(2) The entire essay needs to ensure coherence and logic. 
(3) The distribution of given information within the essay should ideally be even and personal information must not appear adjacent in the essay.
(4) The essay should contain at least 900 words.
    """

    # System message carries the task description; user message carries the data.
    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": "Personal Information: " + "\n" + str(data)
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )

    return response.choices[0].message.content

In [3]:
if __name__ == "__main__":
    NUM = 1000
    fake_identities = []
    texts_ = []
    more_data = []
    client = OpenAI()

    # One entry per PII category, in the order the categories are drawn and
    # stored: (label, possible counts, weights of those counts, value factory).
    # A count of 0 has weight 0.00 everywhere, so each category gets at least
    # one value.
    pii_plan = [
        (
            "NAME_STUDENT", [0, 1, 2, 3], [0.00, 0.85, 0.10, 0.05],
            # mostly full names, sometimes only a first or last name
            lambda: random.choices(
                [fake.name, fake.first_name, fake.last_name],
                weights=[0.80, 0.10, 0.10],
                k=1
            )[0](),
        ),
        ("EMAIL", [0, 1, 2], [0.00, 0.9, 0.10], lambda: fake.ascii_free_email()),
        ("USERNAME", [0, 1, 2, 3], [0.00, 0.85, 0.10, 0.05], lambda: fake.user_name()),
        (
            "ID_NUM", [0, 1, 2], [0.00, 0.90, 0.10],
            # several identifier formats stand in for a student ID
            lambda: random.choices(
                [fake.ssn, fake.passport_number, fake.bban, fake.iban, fake.license_plate],
                weights=[0.30, 0.30, 0.10, 0.10, 0.20],
                k=1
            )[0](),
        ),
        ("PHONE_NUM", [0, 1, 2], [0.00, 0.90, 0.10], lambda: fake.phone_number()),
        ("URL_PERSONAL", [0, 1, 2], [0.00, 0.90, 0.10], lambda: generate_fake_social_media_urls(1)),
        ("STREET_ADDRESS", [0, 1, 2], [0.00, 0.90, 0.10], lambda: fake.address()),
    ]

    for i in tqdm(range(NUM), desc="Generating Data: "):
        temp_d = {}
        for pii_label, counts, count_weights, make_value in pii_plan:
            # draw how many values this category gets, then generate them
            how_many = random.choices(counts, weights=count_weights, k=1)[0]
            temp_d[pii_label] = [make_value() for _ in range(how_many)]

        # keep the identity and the essay written for it at the same position
        fake_identities.append(temp_d)
        generated_text = write_essay(temp_d, client)
        texts_.append(generated_text)

    # One row per generated essay together with the identity it was written for.
    df = pd.DataFrame({"text": texts_, "labeldict": fake_identities})

    # Per document: label -> cleaned entity tokens.
    labels_ = [pj_to_pj(row) for _, row in df.iterrows()]

    # Per document: essay tokens plus their trailing-whitespace flags.
    tokens = [tokenize_with_spacy(essay) for essay in df["text"]]

    # First labelling pass: give every essay token the label of the entity token
    # it equals exactly, or "O" when it matches none.
    new_ = []
    for tok, l in zip(tokens, labels_):

        # forwarded unchanged to the result
        t = tok["tokens"]
        ws = tok["trailing_whitespace"]

        # Entity token -> label. Entries are written in label order, so a token
        # listed under several labels keeps the label that comes last.
        label_of = {}
        for ent_type, ent_list in l.items():
            for ent_ in ent_list:
                label_of[ent_] = ent_type

        # Every occurrence of an entity token is tagged, so this assumes
        # entities do not contain common words such as "the".
        new_labels = [label_of.get(x, "O") for x in t]

        new_.append({
            "tokens": t,
            "trailing_whitespace": ws,
            "labels": new_labels
        })

    # Second pass: the first pass only labelled entity words, so punctuation and
    # connector tokens between them are resolved here from the neighbouring labels.
    new_2 = []
    punctuation = list(string.punctuation)  # not referenced in this pass
    for r, labeldict in zip(new_, labels_):

        # labels after which a comma may stay inside the entity
        sandwich_on_comma = ["STREET_ADDRESS"]
        # forwarded unchanged
        t = r["tokens"]
        ws = r["trailing_whitespace"]
        # first-pass labels; neighbours are always read from these, never from the output
        label = r["labels"]

        # first-pass labels shifted one position each way, padded with "O" at the edges
        before = ["O"] + label[:-1]
        after = label[1:] + ["O"]

        new_labels = []
        for word, l, prior_label, next_label in zip(t, label, before, after):
            forced_outside = (
                # unlabelled list words that separate entities
                (l == "O" and word in ("and", "or"))
                # "(" only survives as the opening of a phone number that continues,
                # which excludes cases like "my name is Valentin (Valle)"
                or (word == "(" and (l != "PHONE_NUM" or next_label != "PHONE_NUM"))
                or (prior_label == "EMAIL" and word == "to")
                # only street addresses keep commas; this keeps lists such as
                # "Valentin Werner, Thomas Müller, and Manuel Neuer" as separate entities
                or (word == "," and prior_label not in sandwich_on_comma)
                or (word == "'s" and prior_label in ("NAME_STUDENT", "O"))
            )

            if forced_outside:
                new_labels.append("O")
            elif prior_label == next_label and prior_label != "O":
                # sandwich ("LABEL"-"O"-"LABEL", such as "Berlin" - "," - "Germany")
                new_labels.append(prior_label)
            else:
                # keep the first-pass label, which may be "O"
                new_labels.append(l)

        new_2.append({
            "tokens": t,
            "trailing_whitespace": ws,
            "labels": new_labels
        })

    # Turn the plain labels into BIO labels: "B-" on the first token of a run of
    # equal labels, "I-" on the following tokens, "O" left untouched.
    new_bio = []
    for doc_idx, r in enumerate(new_2):

        # forwarded unchanged
        t = r["tokens"]
        ws = r["trailing_whitespace"]
        raw_labels = r["labels"]

        # Build a fresh list instead of overwriting the labels stored in new_2.
        bio_labels = []
        # previous plain label, used to decide between B- and I-
        last_label = "O"
        for l in raw_labels:
            if l == "O":
                bio_labels.append("O")
            elif l == last_label:
                bio_labels.append("I-" + l)
            else:
                bio_labels.append("B-" + l)
            last_label = l

        new_bio.append({
            # The token loop has its own variable, so this is the document
            # position and not the index of the last token.
            "doc_prelim": doc_idx,
            "tokens": t,
            "trailing_whitespace": ws,
            "labels": bio_labels
        })

    # One row per token: the three parallel list columns are exploded together.
    new_ = pd.DataFrame(new_bio).explode(["tokens", "trailing_whitespace", "labels"])

    # Exploded rows keep the index of the document they came from; expose it as
    # "doc_" and derive the public document id from it.
    new_ = new_.reset_index().rename(columns={"index": "doc_"})
    new_["document"] = new_.doc_.apply(lambda x: f"pj_{x}")

    # Attach the full essay text of the matching document to every token row.
    essay_texts = df.reset_index().rename(columns={"index": "doc_"})[["doc_", "text"]]
    new_ = (
        pd.merge(new_, essay_texts, on="doc_", how="left")
        .drop(columns=["doc_prelim", "doc_"])
        .rename(columns={"text_x": "text"})
    )
    new_ = new_[["document", "text", "tokens", "trailing_whitespace", "labels"]]
    print(new_.head(5))

    # Sanity checks: compare the labels that were produced with the expected tag list.
    target = [
        "B-EMAIL", "B-ID_NUM", "B-NAME_STUDENT", "B-PHONE_NUM",
        "B-STREET_ADDRESS", "B-URL_PERSONAL", "B-USERNAME", "I-ID_NUM",
        "I-NAME_STUDENT", "I-PHONE_NUM", "I-STREET_ADDRESS", "I-URL_PERSONAL"
    ]
    observed = new_.labels.unique()

    # Produced labels missing from `target`; "O" is not listed there, so it is
    # reported whenever it occurs.
    unexpected = [l for l in observed if l not in target]
    # Expected labels that never occur in the generated data.
    missing = [l for l in target if l not in observed]

    for l in unexpected + missing:
        print(l)

    # Collapse the per-token rows back into one row per document (technically
    # this step can be skipped and the exploded frame saved instead).
    fixed = (
        new_.groupby("document")
        .agg(
            {
                # Every token row of a document carries the same essay text, so
                # take the first copy. "first" is a real reduction and returns
                # the string directly, with no follow-up indexing step.
                "text": "first",
                "tokens": lambda x: x.tolist(),
                "trailing_whitespace": lambda x: x.tolist(),
                "labels": lambda x: x.tolist(),
            }
        )
        .reset_index()
    )
    # One JSON record per document; the essay is stored under "full_text".
    json_format = [
        {
            "document": row["document"],
            "full_text": row["text"],
            "tokens": row["tokens"],
            "trailing_whitespace": row["trailing_whitespace"],
            "labels": row["labels"]
        }
        for _, row in fixed.iterrows()
    ]

    out_path = f"./gpt-faker/more_data{VERSION}.json"
    # open() does not create missing folders; create the target directory first
    # so the generated data is not lost at the very last step.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    # The context manager closes the file even when json.dump raises.
    with open(out_path, "w") as out_file:
        json.dump(json_format, out_file)

Generating Data:   0%|          | 0/1000 [00:00<?, ?it/s]

APIConnectionError: Connection error.