In [1]:
import pandas as pd
import regex as re
import random
import os
from faker import Faker
from tqdm import tqdm
from spacy.lang.en import English
fake = Faker("en_US")




In [2]:
def concat_csv(directory_path):
    dataframes = []

    for filename in os.listdir(directory_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory_path, filename)
            df = pd.read_csv(file_path)
            dataframes.append(df)

    concatenated_df = pd.concat(dataframes, ignore_index=True)
    concatenated_df.reset_index(drop=True, inplace=True)

    return concatenated_df

directory_path = "./gpt4-faker/"
df = concat_csv(directory_path)

df.shape

(13994, 1)

In [3]:
final_df = pd.DataFrame(
    columns=["document", "full_text", "tokens", "trailing_whitespace", "labels"]
)
final_df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels


In [4]:
df.head()

Unnamed: 0,full_text
0,Phones\n\nModern humans today are always on th...
1,This essay will explain if drivers should or s...
2,Driving while the use of cellular devices\n\nT...
3,Phones & Driving\n\nDrivers should not be able...
4,Cell Phone Operation While Driving\n\nThe abil...


In [5]:
def convert_chinese_to_english(text):
    punctuations = {
        ord('，'): ', ', ord('。'): '.', ord('！'): '!', 
        ord('？'): '?', ord('：'): ':', ord('；'): ';',
        ord('“'): '"', ord('”'): '"', ord('‘'): "'", 
        ord('’'): "'", ord('（'): '(', ord('）'): ')',
        ord('【'): '[', ord('】'): ']', ord('《'): '<', 
        ord('》'): '>'
    }
    text = text.translate(punctuations)

    def full_to_half(match):
        return chr(ord(match.group(0)) - 0xFEE0)

    text = re.sub(r'[Ａ-Ｚａ-ｚ０-９]', full_to_half, text)
    
    return text


def format_text(input_text):
    pattern = r'\[.*?([A-Z_]+).*?\].*?\[.*?\1.*?\]'
    replaced_text = re.sub(pattern, lambda m: m.group(1), input_text)
    replaced_text = re.sub(r'([.?!])(?![\s])', r'\1 ', replaced_text)
    return replaced_text

input_text1 = "[+NAME_STUDENT]Emily Thompson[NAME_STUDENT], He is a great student."
input_text2 = "[USERNAME]TravelBuddy22[-USERNAME]"
input_text3 = "[EMAIL]emilythompson@example.com[*EMAIL]"

print(format_text(input_text1))  
print(format_text(input_text2))  
print(format_text(input_text3))

NAME_STUDENT, He is a great student. 
USERNAME
EMAIL


In [6]:
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    tokenized_text = tokenizer(text)
    tokens = [token.text for token in tokenized_text]
    trailing_whitespace = [bool(token.whitespace_) for token in tokenized_text]
    return {"tokens": tokens, "trailing_whitespace": trailing_whitespace}

print(tokenize_with_spacy("(682) 387-3863x23756 and James Kitty"))

{'tokens': ['(', '682', ')', '387', '-', '3863x23756', 'and', 'James', 'Kitty'], 'trailing_whitespace': [False, False, True, False, False, True, True, True, False]}


In [7]:
def generate_fake_social_media_urls(num_urls=1):
    social_media_platforms = {
        "LinkedIn": "linkedin.com/in/",
        "YouTube": "youtube.com/c/",
        "Instagram": "instagram.com/",
        "GitHub": "github.com/",
        "Facebook": "facebook.com/",
        "Twitter": "twitter.com/",
        "Pinterest": "pinterest.com/",
        "Snapchat": "snapchat.com/",
        "TikTok": "tiktok.com/",
        "Reddit": "reddit.com/",
        "Tumblr": "tumblr.com/",
        "WhatsApp": "whatsapp.com/",
        "Discord": "discord.com/",
        "Twitch": "twitch.tv/",
        "Vimeo": "vimeo.com/",
        "Flickr": "flickr.com/",
        "Telegram": "telegram.org/",
        "Line": "line.me/",
        "Viber": "viber.com/"
    }

    fake_social_media_urls = []

    for _ in range(num_urls):
        fake_user_name = fake.user_name()
        platform, domain = random.choice(list(social_media_platforms.items()))
        fake_url = f"https://{domain}{fake_user_name}"
        fake_social_media_urls.append(fake_url)

    return fake_social_media_urls[0]

In [8]:
# df = df[:10]

for idx, row in tqdm(df.iterrows(), total=len(df)):
    full_text = row["full_text"]
    # full_text = convert_text(full_text)
    try:
        full_text = convert_chinese_to_english(full_text)
        full_text = format_text(full_text)
    except:
        # print(full_text)
        continue

    # print(full_text)

    tokens = [token.text for token in en_tokenizer(full_text)]
    trailing_whitespace = [bool(token.whitespace_) for token in en_tokenizer(full_text)]

    label_names = ["NAME_STUDENT", "EMAIL", "USERNAME", "ID_NUM", "PHONE_NUM", "URL_PERSSONAL", "STREET_ADDRESS"]

    new_tokens = []
    new_labels = []
    new_trailing_whitespace = []

    for i in range(len(tokens)):
        if tokens[i] not in label_names:
            new_tokens.append(tokens[i])
            new_labels.append("O")
            new_trailing_whitespace.append(trailing_whitespace[i])
        else:
            # print(1)
            if tokens[i] == "NAME_STUDENT":
                name = random.choices(
                    [fake.name, fake.first_name, fake.last_name],
                    weights=[0.80, 0.10, 0.10],
                    k=1
                )[0]()
                # print(name)
                name_tokens = [name_token.text for name_token in en_tokenizer(name)]
                tmp_trailing_whitespace = [bool(name_token.whitespace_) for name_token in en_tokenizer(name)]
                if len(name_tokens) > 1:
                    new_tokens.extend(name_tokens)
                    new_labels.extend(["B-NAME_STUDENT"] + ["I-NAME_STUDENT"] * (len(name_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(name_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(name_tokens)
                    new_labels.extend(["B-NAME_STUDENT"])
                    new_trailing_whitespace.append(trailing_whitespace[i])

            elif tokens[i] == "EMAIL":
                email = fake.ascii_free_email()
                # print(email)
                email_tokens = [email_token.text for email_token in en_tokenizer(email)]
                tmp_trailing_whitespace = [bool(email_token.whitespace_) for email_token in en_tokenizer(email)]
                if len(email_tokens) > 1:
                    new_tokens.extend(email_tokens)
                    new_labels.extend(["B-EMAIL"] + ["I-EMAIL"] * (len(email_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(email_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(email_tokens)
                    new_labels.extend(["B-EMAIL"])
                    new_trailing_whitespace.append(trailing_whitespace[i])

            elif tokens[i] == "USERNAME":
                username = fake.user_name()
                # print(username)
                username_tokens = [username_token.text for username_token in en_tokenizer(username)]
                tmp_trailing_whitespace = [bool(username_token.whitespace_) for username_token in en_tokenizer(username)]
                if len(username_tokens) > 1:
                    new_tokens.extend(username_tokens)
                    new_labels.extend(["B-USERNAME"] + ["I-USERNAME"] * (len(username_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(username_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(username_tokens)
                    new_labels.extend(["B-USERNAME"])
                    new_trailing_whitespace.append(trailing_whitespace[i])

            elif tokens[i] == "ID_NUM":
                id_num = random.choices(
                    [fake.ssn, fake.passport_number, fake.bban, fake.iban, fake.license_plate],
                    weights=[0.20, 0.20, 0.20, 0.20, 0.20],
                    k=1
                )[0]()
                # print(id_num)
                id_num_tokens = [id_num_token.text for id_num_token in en_tokenizer(id_num)]
                tmp_trailing_whitespace = [bool(id_num_token.whitespace_) for id_num_token in en_tokenizer(id_num)]
                if len(id_num_tokens) > 1:
                    new_tokens.extend(id_num_tokens)
                    new_labels.extend(["B-ID_NUM"] + ["I-ID_NUM"] * (len(id_num_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(id_num_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(id_num_tokens)
                    new_labels.extend(["B-ID_NUM"])
                    new_trailing_whitespace.append(trailing_whitespace[i])

            elif tokens[i] == "PHONE_NUM":
                phone_number = fake.phone_number()
                # print(phone_number)
                phone_number_tokens = [phone_number_token.text for phone_number_token in en_tokenizer(phone_number)]
                tmp_trailing_whitespace = [bool(phone_number_token.whitespace_) for phone_number_token in en_tokenizer(phone_number)]
                if len(phone_number_tokens) > 1:
                    new_tokens.extend(phone_number_tokens)

                    # tmp_new_labels = []
                    # flag = False
                    # for j in range(len(phone_number_tokens)):
                    #     if phone_number_tokens[j] in ["(", ")", "-", ",", "+", "."]:
                    #         tmp_new_labels.append("O")
                    #     else:
                    #         if not flag:
                    #             tmp_new_labels.append("B-PHONE_NUM")
                    #             flag = True
                    #         else:
                    #             tmp_new_labels.append("I-PHONE_NUM")

                    # new_labels.extend(tmp_new_labels)

                    new_labels.extend(["B-PHONE_NUM"] + ["I-PHONE_NUM"] * (len(phone_number_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(phone_number_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(phone_number_tokens)
                    new_labels.extend(["B-PHONE_NUM"])
                    new_trailing_whitespace.append(trailing_whitespace[i])

            elif tokens[i] == "URL_PERSONAL":
                url_personal = generate_fake_social_media_urls(1)
                # print(url_personal)
                url_personal_tokens = [url_personal_token.text for url_personal_token in en_tokenizer(url_personal)]
                tmp_trailing_whitespace = [bool(url_personal_token.whitespace_) for url_personal_token in en_tokenizer(url_personal)]
                if len(url_personal_tokens) > 1:
                    new_tokens.extend(url_personal_tokens)

                    # tmp_new_labels = []
                    # flag = False
                    # for j in range(len(url_personal_tokens)):
                    #     if url_personal_tokens[j] in ["(", ")", "-", ","]:
                    #         tmp_new_labels.append("O")
                    #     else:
                    #         if not flag:
                    #             tmp_new_labels.append("B-URL_PERSONAL")
                    #             flag = True
                    #         else:
                    #             tmp_new_labels.append("I-URL_PERSONAL")

                    new_labels.extend(["B-URL_PERSONAL"] + ["I-URL_PERSONAL"] * (len(url_personal_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(url_personal_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(url_personal_tokens)
                    new_labels.extend(["B-URL_PERSONAL"])
                    new_trailing_whitespace.extend(trailing_whitespace[i])

            elif tokens[i] == "STREET_ADDRESS":
                street_address = fake.address()
                # print(street_address)
                street_address_tokens = [street_address_token.text for street_address_token in en_tokenizer(street_address)]
                tmp_trailing_whitespace = [bool(street_address_token.whitespace_) for street_address_token in en_tokenizer(street_address)]

                if len(street_address_tokens) > 1:
                    new_tokens.extend(street_address_tokens)

                    # tmp_new_labels = []
                    # flag = False
                    # for j in range(len(street_address_tokens)):
                    #     if street_address_tokens[j] in ["(", ")", "-", ","]:
                    #         tmp_new_labels.append("O")
                    #     else:
                    #         if not flag:
                    #             tmp_new_labels.append("B-STREET_ADDRESS")
                    #             flag = True
                    #         else:
                    #             tmp_new_labels.append("I-STREET_ADDRESS")

                    new_labels.extend(["B-STREET_ADDRESS"] + ["I-STREET_ADDRESS"] * (len(street_address_tokens) - 1))
                    new_trailing_whitespace.extend(tmp_trailing_whitespace[:len(street_address_tokens) - 1])
                    new_trailing_whitespace.append(trailing_whitespace[i])
                else:
                    new_tokens.extend(street_address_tokens)
                    new_labels.extend(["B-STREET_ADDRESS"])
                    new_trailing_whitespace.extend(trailing_whitespace[i])

    if len(new_labels) != len(new_trailing_whitespace) or len(new_labels) != len(new_tokens):
        print("The lengths of different columns are not equal!!!")
    # else:
    #     print(len(new_labels))

    new_full_text = "".join([token + " " * space for token, space in zip(new_tokens, new_trailing_whitespace)])

    new_row = pd.DataFrame({
        "document": [str(idx)],
        "full_text": [new_full_text],
        "tokens": [new_tokens],
        "trailing_whitespace": [new_trailing_whitespace],
        "labels": [new_labels]
    })

    final_df = pd.concat([final_df, new_row], ignore_index=True)
    final_df.to_json("lzc_persuade_2.0_based_0218.json", orient="records", force_ascii=False)
    
    # break

 80%|████████  | 11229/13994 [1:49:03<26:51,  1.72it/s]  


KeyboardInterrupt: 

In [None]:
final_df.shape