**Генерация датасета**

В отличие от файла *create_dataset_env.ipynb*, здесь генерируются данные по среде формата сохранения логов с одинаковым
числом логов для каждой подсреды (json, xml и т. д.).

Здесь меньше вариативность в датасете: все персональные данные будут на английском языке, и уменьшено число вариантов для форматов, выдаваемых по типу сервера.

In [None]:
import pandas as pd
from faker import Faker
import random
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer

# Number of samples generated for each entry of FORMAT_CONFIG.
NUM_SAMPLES_PER_FORMAT = 1000
# Fraction of the shuffled rows that is marked 'train'; the remainder is 'test'.
TRAIN_TEST_SPLIT_RATIO = 0.8
# Path of the CSV file the finished dataset is written to.
OUTPUT_FILE = 'generated_pii_dataset_structured.csv'

# Single Faker instance using the US-English locale; every gen_* helper draws from it.
fake = Faker('en_US')

# --- Load the tokenizer ---
# Hugging Face model identifier whose tokenizer performs the subword splitting.
MODEL_NAME = "iiiorg/piiranha-v1-detect-personal-information"
print(f"Загрузка токенизатора: {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Токенизатор успешно загружен.")


def create_labels(text: str, base_label: str) -> tuple[str, str]:
    """Pair ``text`` with a BIO label string, one label per whitespace-separated word.

    Args:
        text: The value to label. It is returned unchanged.
        base_label: Entity type without a BIO prefix (e.g. ``"EMAIL"``), or
            ``"O"`` for text that is not an entity.

    Returns:
        ``(text, labels)`` where ``labels`` is a space-separated string with
        ``B-<base_label>`` for the first word and ``I-<base_label>`` for the
        rest. For ``base_label == "O"`` every word is labelled plain ``"O"``.
        Text without any words yields ``("", "")``.
    """
    words = text.split()
    if not words:
        return "", ""
    if base_label == "O":
        # "O" is the outside tag of the BIO scheme and never takes a B-/I-
        # prefix; prefixing it would produce the bogus classes "B-O"/"I-O".
        labels = ["O"] * len(words)
    else:
        labels = [f"B-{base_label}"] + [f"I-{base_label}"] * (len(words) - 1)
    return text, " ".join(labels)

def gen_account_num():
    """Return a Faker ``bban()`` value with ACCOUNTNUM labels."""
    value = fake.bban()
    return create_labels(value, "ACCOUNTNUM")


def gen_building_num():
    """Return a Faker ``building_number()`` value with BUILDINGNUM labels."""
    value = fake.building_number()
    return create_labels(value, "BUILDINGNUM")


def gen_city():
    """Return a Faker ``city()`` value with CITY labels."""
    value = fake.city()
    return create_labels(value, "CITY")


def gen_credit_card():
    """Return a Faker ``credit_card_number()`` value with CREDITCARDNUMBER labels."""
    value = fake.credit_card_number()
    return create_labels(value, "CREDITCARDNUMBER")


def gen_dob():
    """Return a Faker ``date_of_birth()`` in ISO format with DATEOFBIRTH labels."""
    value = fake.date_of_birth().isoformat()
    return create_labels(value, "DATEOFBIRTH")


def gen_driver_license():
    """Return a Faker ``license_plate()`` value with DRIVERLICENSENUM labels."""
    # NOTE(review): a licence plate stands in for a driver licence number here.
    value = fake.license_plate()
    return create_labels(value, "DRIVERLICENSENUM")


def gen_email():
    """Return a Faker ``email()`` value with EMAIL labels."""
    value = fake.email()
    return create_labels(value, "EMAIL")


def gen_given_name():
    """Return a Faker ``first_name()`` value with GIVENNAME labels."""
    value = fake.first_name()
    return create_labels(value, "GIVENNAME")


def gen_id_card():
    """Return a Faker ``ssn()`` value with IDCARDNUM labels."""
    value = fake.ssn()
    return create_labels(value, "IDCARDNUM")


def gen_password():
    """Return a Faker ``password()`` value with PASSWORD labels."""
    value = fake.password()
    return create_labels(value, "PASSWORD")


def gen_social_num():
    """Return a Faker ``ssn()`` value with SOCIALNUM labels."""
    value = fake.ssn()
    return create_labels(value, "SOCIALNUM")


def gen_street():
    """Return a Faker ``street_name()`` value with STREET labels."""
    value = fake.street_name()
    return create_labels(value, "STREET")


def gen_surname():
    """Return a Faker ``last_name()`` value with SURNAME labels."""
    value = fake.last_name()
    return create_labels(value, "SURNAME")


def gen_tax_num():
    """Return a Faker ``itin()`` value with TAXNUM labels."""
    value = fake.itin()
    return create_labels(value, "TAXNUM")


def gen_telephone_num():
    """Return a Faker ``phone_number()`` value with TELEPHONENUM labels."""
    value = fake.phone_number()
    return create_labels(value, "TELEPHONENUM")


def gen_username():
    """Return a Faker ``user_name()`` value with USERNAME labels."""
    value = fake.user_name()
    return create_labels(value, "USERNAME")


def gen_zipcode():
    """Return a Faker ``zipcode()`` value with ZIPCODE labels."""
    value = fake.zipcode()
    return create_labels(value, "ZIPCODE")

def gen_full_name():
    """Return a Faker ``name()`` value with PERSON labels."""
    value = fake.name()
    return create_labels(value, "PERSON")


def gen_address():
    """Return a Faker ``address()`` flattened onto one line, with ADDRESS labels."""
    # Faker separates the address lines with '\n'; turn them into spaces.
    single_line = fake.address().replace('\n', ' ')
    return create_labels(single_line, "ADDRESS")


def gen_phone_number():
    """Return a Faker ``phone_number()`` value with PHONE labels."""
    value = fake.phone_number()
    return create_labels(value, "PHONE")


def gen_ipv4():
    """Return a Faker ``ipv4()`` value with IP_ADDRESS labels."""
    value = fake.ipv4()
    return create_labels(value, "IP_ADDRESS")


def gen_url():
    """Return a Faker ``url()`` value with URL labels."""
    value = fake.url()
    return create_labels(value, "URL")


def gen_timestamp():
    """Return a Faker ``iso8601()`` timestamp, passing base label "O" to create_labels."""
    value = fake.iso8601()
    return create_labels(value, "O")


def gen_status():
    """Return a randomly chosen status word, passing base label "O" to create_labels."""
    statuses = ["SUCCESS", "FAILURE", "PENDING"]
    return create_labels(random.choice(statuses), "O")


def gen_request_method():
    """Return a randomly chosen HTTP method, passing base label "O" to create_labels."""
    methods = ["GET", "POST", "PUT", "DELETE"]
    return create_labels(random.choice(methods), "O")

def generate_user_profile_content():
    """Build a user-profile record.

    Returns:
        A dict mapping each field name to the ``(text, labels)`` pair produced
        by its generator, in the order the fields are listed below.
    """
    field_generators = (
        ("user_name", gen_full_name),
        ("contact_email", gen_email),
        ("phone", gen_phone_number),
        ("last_login_ip", gen_ipv4),
        ("home_address", gen_address),
    )
    # Generators are invoked in listing order, which is also the dict order.
    return {field: make_value() for field, make_value in field_generators}

def generate_server_log_content():
    """Build a server-log record.

    Returns:
        A dict mapping each log field name to the ``(text, labels)`` pair
        produced by its generator, in insertion order.
    """
    record = {}
    record["timestamp"] = gen_timestamp()
    record["client_ip"] = gen_ipv4()
    record["request"] = gen_request_method()
    record["requested_url"] = gen_url()
    record["user_email"] = gen_email()
    record["status"] = gen_status()
    return record

def generate_full_pii_profile():
    """Build a record with one value for every fine-grained PII label.

    Returns:
        A dict keyed by the label name (e.g. ``"EMAIL"``) whose values are the
        ``(text, labels)`` pairs returned by the matching generator.
    """
    generators_by_label = (
        ("ACCOUNTNUM", gen_account_num),
        ("BUILDINGNUM", gen_building_num),
        ("CITY", gen_city),
        ("CREDITCARDNUMBER", gen_credit_card),
        ("DATEOFBIRTH", gen_dob),
        ("DRIVERLICENSENUM", gen_driver_license),
        ("EMAIL", gen_email),
        ("GIVENNAME", gen_given_name),
        ("IDCARDNUM", gen_id_card),
        ("PASSWORD", gen_password),
        ("SOCIALNUM", gen_social_num),
        ("STREET", gen_street),
        ("SURNAME", gen_surname),
        ("TAXNUM", gen_tax_num),
        ("TELEPHONENUM", gen_telephone_num),
        ("USERNAME", gen_username),
        ("ZIPCODE", gen_zipcode),
    )
    # Generators run in listing order, which is also the resulting dict order.
    return {label: make_value() for label, make_value in generators_by_label}


def render_as_json(content: dict) -> tuple[list, list]:
    """Render a record as a word-level token sequence shaped like a JSON object.

    Args:
        content: Mapping of field name to ``(value_text, value_labels)``, where
            ``value_labels`` holds one space-separated label per word of
            ``value_text``.

    Returns:
        ``(tokens, labels)``: two lists of equal length. Structural tokens
        (braces, quotes, colons, commas) and keys are labelled ``"O"``; value
        words carry the labels supplied in ``content``.
    """
    tokens, labels = ["{"], ["O"]
    last_index = len(content) - 1
    for i, (key, (value_text, value_labels)) in enumerate(content.items()):
        # "key": followed by the opening quote of the value.
        tokens.extend(['"', key, '"', ':', '"'])
        labels.extend(["O"] * 5)
        # The whole value sits inside ONE pair of quotes. Quoting every word
        # separately would turn "John Smith" into "John" "Smith", which is
        # not valid JSON.
        for word, word_label in zip(value_text.split(), value_labels.split()):
            tokens.append(word)
            labels.append(word_label)
        tokens.append('"')
        labels.append("O")
        if i < last_index:
            tokens.append(",")
            labels.append("O")
    tokens.append("}")
    labels.append("O")
    return tokens, labels

def render_as_xml(content: dict) -> tuple[list, list]:
    """Render a record as word-level tokens shaped like a ``<data>`` XML element.

    Args:
        content: Mapping of field name to ``(value_text, value_labels)``.

    Returns:
        ``(tokens, labels)``. Every field becomes ``< key > words... < / key >``
        inside a ``< data > ... < / data >`` wrapper. Markup tokens are
        labelled ``"O"``; value words keep the labels given in ``content``.
    """
    out_tokens = ["<", "data", ">"]
    out_labels = ["O"] * 3
    for tag, (text, label_string) in content.items():
        words = text.split()
        word_labels = label_string.split()
        # Opening tag, the value words, then the closing tag.
        out_tokens += ["<", tag, ">", *words, "<", "/", tag, ">"]
        out_labels += ["O", "O", "O", *word_labels, "O", "O", "O", "O"]
    out_tokens += ["<", "/", "data", ">"]
    out_labels += ["O"] * 4
    return out_tokens, out_labels

def render_as_csv(content: dict) -> tuple[list, list]:
    """Render a record as word-level tokens for a CSV header line plus one data line.

    Args:
        content: Mapping of column name to ``(value_text, value_labels)``.

    Returns:
        ``(tokens, labels)`` of equal length. The first token is the
        comma-joined header, the second is a ``"\\n"`` line break, and the
        rest are the row's cells separated by ``","`` tokens. Each cell is a
        single token (inner spaces replaced by ``"_"``) carrying the first
        label of its value; header, line break and commas are labelled
        ``"O"``. An empty ``content`` yields ``([], [])``.
    """
    if not content:
        return [], []

    # Keep the line break as its own token. Building the header with
    # `(header + " \n ").split()` silently drops it, because str.split()
    # treats "\n" as whitespace, and header and data then share one line.
    tokens = [",".join(content), "\n"]
    labels = ["O", "O"]

    last_index = len(content) - 1
    for i, (value_text, value_labels) in enumerate(content.values()):
        # A cell is a single token, so spaces inside the value are replaced.
        tokens.append(value_text.replace(" ", "_"))
        cell_labels = value_labels.split()
        # An empty value has no labels; fall back to "O" rather than raising.
        labels.append(cell_labels[0] if cell_labels else "O")
        if i < last_index:
            tokens.append(",")
            labels.append("O")
    return tokens, labels

def render_as_key_value(content: dict) -> tuple[list, list]:
    """Render a record as a flat ``key = value`` word-level token sequence.

    Args:
        content: Mapping of key to ``(value_text, value_labels)``.

    Returns:
        ``(tokens, labels)``. For every entry the key and ``"="`` are emitted
        with label ``"O"``, followed by the value's words and their labels.
        Nothing is inserted between consecutive entries.
    """
    out_tokens = []
    out_labels = []
    for name, (text, label_string) in content.items():
        out_tokens += [name, "=", *text.split()]
        out_labels += ["O", "O", *label_string.split()]
    return out_tokens, out_labels


# Maps each output format name to the function that renders a record into
# word-level tokens/labels ("renderer") and to the record builders that may
# supply its content ("content_generators"). The generation loop iterates the
# formats in this insertion order and picks one content generator at random
# per sample.
FORMAT_CONFIG = {
    "json_log": {
        "renderer": render_as_json,
        "content_generators": [generate_user_profile_content, generate_server_log_content, generate_full_pii_profile]
    },
    # No server-log content for XML records.
    "xml_record": {
        "renderer": render_as_xml,
        "content_generators": [generate_user_profile_content, generate_full_pii_profile]
    },
    "csv_line": {
        "renderer": render_as_csv,
        "content_generators": [generate_server_log_content, generate_user_profile_content, generate_full_pii_profile]
    },
    # No user-profile content for key=value configs.
    "key_value_config": {
        "renderer": render_as_key_value,
        "content_generators": [generate_server_log_content, generate_full_pii_profile]
    }
}

def tokenize_and_align_labels(tokens: list, labels: list, tokenizer: AutoTokenizer):
    """Split word-level tokens into subwords and expand the labels to match.

    Args:
        tokens: Word-level tokens.
        labels: One label per entry of ``tokens``.
        tokenizer: Object whose ``tokenize(str)`` returns the list of subword
            pieces for a single word.

    Returns:
        ``(subword_tokens, aligned_labels)`` of equal length. For a ``B-X``
        word the first piece keeps ``B-X`` and the remaining pieces get
        ``I-X``; any other label is repeated for every piece. Words for which
        the tokenizer returns no pieces are dropped together with their label.

    Raises:
        AssertionError: If ``tokens`` and ``labels`` differ in length.
    """
    assert len(tokens) == len(labels), f"Tokens and labels length mismatch: {len(tokens)} vs {len(labels)}"
    pieces_out = []
    labels_out = []
    for word, tag in zip(tokens, labels):
        pieces = tokenizer.tokenize(word)
        if pieces:
            pieces_out += pieces
            if tag.startswith("B-"):
                # Only the first piece begins the entity; the rest continue it.
                continuation = "I-" + tag[2:]
                labels_out += [tag] + [continuation] * (len(pieces) - 1)
            else:
                labels_out += [tag] * len(pieces)
    return pieces_out, labels_out

# Accumulates one dict per generated sample; turned into a DataFrame afterwards.
all_samples = []
print("\nНачало генерации структурированного датасета...")

for format_name, config in FORMAT_CONFIG.items():
    renderer = config["renderer"]
    content_generators = config["content_generators"]

    # The context manager closes the progress bar even when an assertion or
    # an interrupt aborts the loop, so no stale, half-finished bar is left
    # behind in the notebook output.
    with tqdm(total=NUM_SAMPLES_PER_FORMAT, desc=f"Формат: {format_name}") as pbar:
        for _ in range(NUM_SAMPLES_PER_FORMAT):
            # Pick the record type at random, then render it in this format.
            content_generator = random.choice(content_generators)
            content_pack = content_generator()
            word_tokens, word_labels = renderer(content_pack)
            assert len(word_tokens) == len(word_labels), f"Mismatch before subword tokenization: {len(word_tokens)} vs {len(word_labels)}"
            # Expand word-level labels so there is one label per subword piece.
            subword_tokens, aligned_labels = tokenize_and_align_labels(word_tokens, word_labels, tokenizer)
            assert len(subword_tokens) == len(aligned_labels), f"Mismatch after subword tokenization: {len(subword_tokens)} vs {len(aligned_labels)}"
            all_samples.append({
                "text": " ".join(word_tokens),
                "mbert_tokens": " ".join(subword_tokens),
                "mbert_token_classes": " ".join(aligned_labels),
                "format_type": format_name,
            })
            pbar.update(1)

print(f"\nГенерация завершена. Всего создано {len(all_samples)} семплов.")


if all_samples:
    # Shuffle with a fixed seed, then renumber rows so the index can drive the split.
    df = pd.DataFrame(all_samples).sample(frac=1, random_state=42).reset_index(drop=True)

    # The first TRAIN_TEST_SPLIT_RATIO share of the shuffled rows is 'train'.
    split_index = int(len(df) * TRAIN_TEST_SPLIT_RATIO)
    df['split'] = np.where(df.index < split_index, 'train', 'test')

    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nДатасет успешно сохранен в файл: {OUTPUT_FILE}")

    # Per-format sample counts.
    print("\nСтатистика по сгенерированным данным:")
    print(df['format_type'].value_counts())

    # Show the first json_log row as an example.
    print("\n--- Пример сгенерированной строки (json_log) ---")
    sample_row = df.loc[df['format_type'] == 'json_log'].iloc[0]
    print(f"\nТокены:\n{sample_row['mbert_tokens']}")
    print(f"\nМетки:\n{sample_row['mbert_token_classes']}")
    print("-" * 50)
else:
    print("Не удалось сгенерировать ни одного семпла.")

Загрузка токенизатора: iiiorg/piiranha-v1-detect-personal-information...
Токенизатор успешно загружен.

Начало генерации структурированного датасета...


Формат: xml_record:   0%|          | 1/1000 [09:00<150:06:28, 540.93s/it]
Формат: json_log: 100%|██████████| 1000/1000 [00:05<00:00, 187.03it/s]
Формат: xml_record: 100%|██████████| 1000/1000 [00:05<00:00, 176.77it/s]
Формат: csv_line: 100%|██████████| 1000/1000 [00:02<00:00, 413.31it/s]
Формат: key_value_config: 100%|██████████| 1000/1000 [00:03<00:00, 318.18it/s]



Генерация завершена. Всего создано 4000 семплов.

Датасет успешно сохранен в файл: generated_pii_dataset_structured.csv

Статистика по сгенерированным данным:
format_type
json_log            1000
key_value_config    1000
csv_line            1000
xml_record          1000
Name: count, dtype: int64

--- Пример сгенерированной строки (json_log) ---

Токены:
▁{ ▁" ▁user _ name ▁" ▁ : ▁" ▁ Jennifer ▁" ▁" ▁Adams ▁" ▁ , ▁" ▁contact _ email ▁" ▁ : ▁" ▁ wend y 37 @ example . net ▁" ▁ , ▁" ▁phone ▁" ▁ : ▁" ▁205 263 2143 ▁" ▁ , ▁" ▁last _ login _ ip ▁" ▁ : ▁" ▁140 . 248. 225 . 132 ▁" ▁ , ▁" ▁home _ address ▁" ▁ : ▁" ▁68 184 ▁" ▁" ▁ Ricardo ▁" ▁" ▁ Through way ▁" ▁" ▁Thompson berg , ▁" ▁" ▁SD ▁" ▁" ▁90 102 ▁" ▁}

Метки:
O O O O O O O O O B-PERSON I-PERSON O O I-PERSON O O O O O O O O O O O B-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL I-EMAIL O O O O O O O O O B-PHONE I-PHONE I-PHONE O O O O O O O O O O O O O B-IP_ADDRESS I-IP_ADDRESS I-IP_ADDRESS I-IP_ADDRESS I-IP_ADDRESS I-IP_ADDRESS O