In [1]:
import pandas as pd
import random
from tqdm import tqdm
from spacy.lang.en import English

In [None]:
# Tokenizer taken from a spaCy `English` language object; it is the default
# tokenizer of `tokenize_with_spacy` below.
en_tokenizer = English().tokenizer

def tokenize_with_spacy(text, tokenizer=en_tokenizer):
    """Tokenize ``text`` and record whether whitespace follows each token.

    Args:
        text: The string handed to ``tokenizer``.
        tokenizer: Callable that returns an iterable of token objects
            exposing ``.text`` and ``.whitespace_``. Defaults to the
            module-level ``en_tokenizer``.

    Returns:
        A dict with two index-aligned lists:
        ``"tokens"`` holds each token's ``.text`` and
        ``"trailing_whitespace"`` holds ``bool(token.whitespace_)``.
    """
    token_texts = []
    followed_by_space = []
    for tok in tokenizer(text):
        token_texts.append(tok.text)
        # `whitespace_` is an empty string when nothing follows the token.
        followed_by_space.append(bool(tok.whitespace_))
    return {"tokens": token_texts, "trailing_whitespace": followed_by_space}

# Quick check: print how a long URL-like string is tokenized.
sample_url = "http://www.youtube.com/lizhecheng/hdkahdkahdikagk4784279hdkadhdsadadadaXilfshfl"
print(tokenize_with_spacy(sample_url))

In [None]:
# Load the source dataset; later cells read its "document", "tokens",
# "trailing_whitespace" and "labels" columns.
source_json_path = "./lzc_persuade_2.0_based.json"
df = pd.read_json(source_json_path)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Start from an empty frame that carries the column schema of the augmented dataset.
aug_df = pd.DataFrame(columns=["document", "full_text", "tokens", "trailing_whitespace", "labels"])

In [None]:
# Labels of the PII entities whose tokens may be wrapped in parentheses.
label_lists = ["B-USERNAME", "B-ID_NUM", "B-EMAIL", "B-URL_PERSONAL"]

# Cumulative cut-offs applied to one uniform(0, 1) draw per candidate token:
#   draw <= 0.01          -> "( token )"  (parentheses separated by spaces)
#   0.01 < draw <= 0.03   -> "(token)"    (parentheses with no spaces)
#   otherwise             -> token is kept exactly as it was
SPACED_PAREN_CUTOFF = 0.01
TIGHT_PAREN_CUTOFF = 0.03

# Rows are collected as plain dicts and turned into a DataFrame once after the
# loop. Calling pd.concat on every iteration re-copies the whole frame each
# time, and appending to a pre-existing `aug_df` duplicated rows on re-run.
augmented_records = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    tokens = row["tokens"]
    trailing_whitespace = row["trailing_whitespace"]
    labels = row["labels"]

    if not (len(tokens) == len(trailing_whitespace) == len(labels)):
        # Misaligned rows cannot be augmented position by position; skip them.
        print("The lengths of different columns are not equal!")
        continue

    new_tokens = []
    new_trailing_whitespace = []
    new_labels = []

    total_len = len(tokens)
    for i in range(total_len):
        token, space, label = tokens[i], trailing_whitespace[i], labels[i]

        # A token is a wrapping candidate only when it carries one of the
        # target labels and the next token is labelled "O".
        is_candidate = (
            label in label_lists
            and i + 1 < total_len
            and labels[i + 1] == "O"
        )

        # None -> keep unchanged; True/False -> wrap, with/without inner spaces.
        paren_spacing = None
        if is_candidate:
            random_float = random.uniform(0, 1)
            if random_float <= SPACED_PAREN_CUTOFF:
                paren_spacing = True
            elif random_float <= TIGHT_PAREN_CUTOFF:
                paren_spacing = False

        if paren_spacing is None:
            # Default path. This also covers candidates that were not selected
            # by the random draw and a target-labelled token in the last
            # position; both were previously dropped from the output.
            new_tokens.append(token)
            new_trailing_whitespace.append(space)
            new_labels.append(label)
        else:
            # The parentheses are labelled "O"; the closing one inherits the
            # original token's trailing whitespace.
            new_tokens.extend(["(", token, ")"])
            new_trailing_whitespace.extend([paren_spacing, paren_spacing, space])
            new_labels.extend(["O", label, "O"])

    # Every branch above appends the same number of items to all three lists.
    assert len(new_tokens) == len(new_trailing_whitespace) == len(new_labels), \
        "The lengths of different columns are not equal!"

    # Rebuild the text from the tokens and their whitespace flags.
    new_full_text = "".join(
        tok + (" " if has_space else "")
        for tok, has_space in zip(new_tokens, new_trailing_whitespace)
    )

    augmented_records.append({
        "document": row["document"],
        "full_text": new_full_text,
        "tokens": new_tokens,
        "trailing_whitespace": new_trailing_whitespace,
        "labels": new_labels,
    })

aug_df = pd.DataFrame(
    augmented_records,
    columns=["document", "full_text", "tokens", "trailing_whitespace", "labels"],
)

aug_df.shape

In [None]:
# Write the augmented frame to a JSON file in the working directory,
# using the "records" orientation (one JSON object per row).
aug_df.to_json("lzc_persuade_2.0_based_augmented.json", orient="records")