**Генерация датасета**

Этот генератор способен выдавать данные с большим числом сред (по языку, формату, длине в токенах, типу сервера), но у него есть недостаток:
количество примеров для каждой подсреды не идеально равно и немного варьируется; кроме того, такая большая вариативность ухудшает качество обучения.
В первую очередь на это влияет изменение длины логов в токенах.

In [None]:
import json
import csv
import random
import re
from datetime import datetime
from faker import Faker
from datasets import load_dataset
from transformers import AutoTokenizer
import xml.etree.ElementTree as ET
from collections import defaultdict
import pandas as pd
from io import StringIO
import os

class LogGenerator:
    def __init__(self):
        """Prepare locale-specific fakers, field aliases, generators and external resources.

        Loads the malicious-log corpus (placeholder strings are used when the
        dataset cannot be loaded or has no usable column), the server log
        templates and the multilingual BERT tokenizer.
        """
        # One Faker instance per supported language code.
        locale_by_code = {
            'en': 'en_US',
            'fr': 'fr_FR',
            'de': 'de_DE',
            'pt': 'pt_BR',
            'it': 'it_IT',
            'es': 'es_ES',
            'sv': 'sv_SE',
        }
        self.faker_providers = {code: Faker(locale) for code, locale in locale_by_code.items()}

        # Human-readable language names keyed by language code.
        self.languages = {
            'en': 'english',
            'fr': 'french',
            'de': 'german',
            'pt': 'portuguese',
            'it': 'italian',
            'es': 'spanish',
            'sv': 'swedish',
        }

        # Field names under which each personal-data type may appear in a log.
        self.key_mapping = {
            'full_name': ['fullName', 'user_fullname', 'client_name', 'name', 'firstName', 'lastName'],
            'email': ['email', 'user_email', 'contact_email', 'eMail', 'mail', 'emailAddress'],
            'phone': ['phone', 'mobile', 'contact_phone', 'phoneNumber', 'mobilePhone', 'telephone'],
            'credit_card': ['creditCard', 'payment_card', 'card_number', 'ccNumber', 'cardNum'],
            'password': ['password', 'pass', 'secret_key', 'pwd', 'passphrase', 'secretCode'],
            'address': ['address', 'billing_address', 'home_address', 'street', 'residence'],
            'ssn': ['ssn', 'social_security', 'tax_id', 'socialSecurityNumber', 'ssNumber'],
            'username': ['username', 'login', 'user_id', 'accountName', 'userLogin'],
            'iban': ['iban', 'bank_account', 'account_number', 'bankAccountNumber'],
            'license_plate': ['license_plate', 'car_number', 'vehicle_id', 'plateNumber'],
        }

        # Value factories: each takes a language code and returns a fake value.
        self.generators = {
            'full_name': lambda lang: self._get_faker(lang).name(),
            'email': lambda lang: self._get_faker(lang).email(),
            # Whitespace runs in phone numbers are replaced by dashes.
            'phone': lambda lang: re.sub(r'\s+', '-', self._get_faker(lang).phone_number()),
            'credit_card': lambda lang: self._get_faker(lang).credit_card_number(),
            'password': lambda lang: self._get_faker(lang).password(length=12),
            # Multi-line addresses are flattened onto a single line.
            'address': lambda lang: self._get_faker(lang).address().replace('\n', ', '),
            'ssn': lambda lang: self._get_faker(lang).ssn(),
            'username': lambda lang: self._get_faker(lang).user_name(),
            'iban': lambda lang: self._get_faker(lang).iban(),
            'license_plate': lambda lang: self._get_faker(lang).license_plate(),
        }

        try:
            dataset = load_dataset("u-haru/malicious_logs", split='train')
            # Prefer the 'log' column, then 'text'.
            column = next((name for name in ('log', 'text') if name in dataset.features), None)
            if column is None:
                self.malicious_logs = ["Error: 'log' or 'text' column not found in malicious_logs dataset"] * 100
            else:
                self.malicious_logs = dataset[column]
        except Exception as e:
            print(f"Error loading malicious logs dataset: {e}")
            self.malicious_logs = ["Error accessing malicious logs database"] * 100

        self.server_templates = self._load_server_templates()

        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
        
    def _get_faker(self, lang_code):
        """Возвращает экземпляр Faker для указанного языка"""
        return self.faker_providers.get(lang_code, Faker('en_US')) # По умолчанию английский
    
    def _load_server_templates(self):
        """Загрузить шаблоны серверов из JSON файла"""
        templates_file = 'servers.json'
        shablon_file = 'shablon.json'
        
        if os.path.exists(templates_file):
            try:
                with open(templates_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                print(f"Error reading {templates_file}: {e}. Falling back to {shablon_file}.")

        if os.path.exists(shablon_file):
            try:
                with open(shablon_file, 'r', encoding='utf-8') as f:
                    # Если shablon.json содержит список, преобразуем его в словарь по первому ключу
                    data = json.load(f)
                    if isinstance(data, list):
                        return {'generic': data} 
                    return data
            except Exception as e:
                print(f"Error reading {shablon_file}: {e}. Generating default templates.")
        
        print("Generating default server templates.")
        return self._generate_default_server_templates()
    
    def _generate_default_server_templates(self):
        """Генерация стандартного списка шаблонов серверов, если файлы не найдены"""
        return {
            "generic": [
                "[{timestamp}] INFO: Processing request: {method} {url} from {ip}",
                "[{timestamp}] {level}: Service {service} encountered an issue: {message}",
                "[{timestamp_with_microseconds}] {component} ({host}): {message} (PID: {pid})"
            ]
        }
    
    def _generate_personal_data(self, lang='en'):
        """Генерация персональных данных"""
        data_types = list(self.generators.keys())
        personal_data = []
        
        num_data = random.randint(5, 15)
        
        for _ in range(num_data):
            data_type = random.choice(data_types)
            try:
                if data_type in self.generators:
                    value = self.generators[data_type](lang)
                    personal_data.append({
                        'type': data_type,
                        'value': value
                    })
                else:
                    personal_data.append({
                        'type': 'full_name',
                        'value': self.generators['full_name'](lang)
                    })
            except Exception as e:
                print(f"Error generating data for type '{data_type}' in lang '{lang}': {e}")
                personal_data.append({
                    'type': 'full_name',
                    'value': self.generators['full_name'](lang)
                })
                
        return personal_data
    
    def _insert_malicious_log(self, log_text):
        """Вставка фрагмента злонамеренного лога"""
        if self.malicious_logs and random.random() < 0.3:  
            malicious_log = random.choice(self.malicious_logs)
            
            tokens = self.tokenizer.tokenize(malicious_log)
            if len(tokens) > 20:
                tokens = tokens[:20]
                malicious_log = self.tokenizer.convert_tokens_to_string(tokens)
            
            insert_pos = random.randint(0, len(log_text))
            log_text = log_text[:insert_pos] + " " + malicious_log + " " + log_text[insert_pos:]
            
        return log_text
    
    def _generate_json_log(self, personal_data, lang='en'):
        """Генерация JSON лога"""
        log_data = {}
        
        log_data['timestamp'] = datetime.now().isoformat()
        log_data['log_level'] = random.choice(['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL'])
        log_data['service'] = f"service_{random.randint(1, 100)}"
        
        for data in personal_data:
            key = random.choice(self.key_mapping.get(data['type'], [data['type']])) # Используем data['type'] если нет маппинга
            log_data[key] = data['value']
        
        extra_fields_count = random.randint(3, 8)
        for _ in range(extra_fields_count):
            field_name = f"field_{random.randint(1, 1000)}"
            field_value = random.choice([
                random.randint(1, 1000),
                f"value_{random.randint(1, 1000)}",
                round(random.uniform(0, 100), 2),
                random.choice([True, False])
            ])
            log_data[field_name] = field_value
        
        return json.dumps(log_data, indent=2, ensure_ascii=False) # ensure_ascii=False для корректного отображения не-ASCII символов
    
    def _generate_xml_log(self, personal_data, lang='en'):
        """Генерация XML лога"""
        root = ET.Element("log")
        
        ET.SubElement(root, "timestamp").text = datetime.now().isoformat()
        ET.SubElement(root, "log_level").text = random.choice(['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL'])
        ET.SubElement(root, "service").text = f"service_{random.randint(1, 100)}"
        
        personal_data_elem = ET.SubElement(root, "personal_data")
        for data in personal_data:
            key = random.choice(self.key_mapping.get(data['type'], [data['type']]))
            elem = ET.SubElement(personal_data_elem, key)
            elem.text = str(data['value']) # Убедимся, что значение - строка
        
        extra_elem = ET.SubElement(root, "extra")
        extra_fields_count = random.randint(3, 8)
        for _ in range(extra_fields_count):
            field_name = f"field_{random.randint(1, 1000)}"
            field_value = str(random.choice([
                random.randint(1, 1000),
                f"value_{random.randint(1, 1000)}",
                round(random.uniform(0, 100), 2),
                random.choice([True, False])
            ]))
            elem = ET.SubElement(extra_elem, field_name)
            elem.text = field_value
        
        # Преобразование XML дерева в строку
        return ET.tostring(root, encoding='unicode', method='xml')
    
    def _generate_csv_log(self, personal_data, lang='en'):
        """Генерация CSV лога"""

        headers = ['timestamp', 'log_level', 'service']
        for data in personal_data:
            headers.append(random.choice(self.key_mapping.get(data['type'], [data['type']])))
        
        extra_fields_count = random.randint(3, 8)
        for _ in range(extra_fields_count):
            headers.append(f"field_{random.randint(1, 1000)}")
        
        values = [datetime.now().isoformat()]
        values.append(random.choice(['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL']))
        values.append(f"service_{random.randint(1, 100)}")
        
        # Добавляем персональные данные
        for data in personal_data:
            values.append(str(data['value'])) # Преобразуем в строку
        
        # Добавляем значения для случайных полей
        for _ in range(extra_fields_count):
            values.append(str(random.choice([
                random.randint(1, 1000),
                f"value_{random.randint(1, 1000)}",
                round(random.uniform(0, 100), 2),
                random.choice([True, False])
            ])))
        
        output = StringIO()
        writer = csv.writer(output)
        writer.writerow(headers)
        writer.writerow(values)
        
        return output.getvalue()
    
    def _generate_raw_text_log(self, personal_data, lang='en'):
        """Генерация raw text лога"""
        log_text = ""
        if random.random() < 0.7 and self.server_templates and any(self.server_templates.values()):
            server_type = random.choice(list(self.server_templates.keys()))
            # Убедимся, что выбранный тип сервера имеет шаблоны
            if self.server_templates[server_type]:
                template = random.choice(self.server_templates[server_type])
            else: # Если у типа сервера нет шаблонов, используем generic или создаем свой
                template = None
                if 'generic' in self.server_templates and self.server_templates['generic']:
                    template = random.choice(self.server_templates['generic'])
                else:
                    template = "[{timestamp}] {level}: {message}" # Запасной вариант
        else:
            template = None
        
        if template is None:
            template = "[{timestamp}] {level}: Processing request for user {user}. Client IP: {ip}. Session ID: sess_{session_id}. {message}"
        
        # Генерация данных для форматирования шаблона
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        timestamp_with_microseconds = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        
        # Создаем словарь со всеми возможными плейсхолдерами
        format_dict = {
            'timestamp': timestamp,
            'timestamp_with_microseconds': timestamp_with_microseconds,
            'pid': random.randint(1000, 99999),
            'ip': f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'message': "User authentication request", # Базовое сообщение
            'version': f"{random.randint(1, 10)}.{random.randint(0, 9)}.{random.randint(0, 9)}",
            'method': random.choice(['GET', 'POST', 'PUT', 'DELETE', 'PATCH', 'HEAD', 'OPTIONS']),
            'url': f"/api/v{random.randint(1, 3)}/resource/{random.randint(1, 1000)}",
            'http_version': f"{random.randint(1, 2)}.{random.randint(0, 1)}",
            'status': random.choice([200, 201, 400, 401, 403, 404, 500, 503]),
            'size': random.randint(50, 50000),
            'path': f"/var/log/app/{random.randint(1, 1000)}.log",
            'level': random.choice(['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL', 'TRACE']),
            'component': random.choice(['core', 'network', 'database', 'security', 'storage', 'ui', 'api']),
            'host': f"server{random.randint(1, 100)}.internal.corp",
            'connection': random.randint(1, 999999),
            'server': f"web{random.randint(1, 10)}.internal.corp",
            'request': f"GET /api/v1/users HTTP/1.1",
            'ssl_version': f"TLSv1.{random.randint(1, 3)}",
            'event_id': random.randint(1000, 99999),
            'source': random.choice(['Application', 'System', 'Security', 'Network', 'Database']),
            'category': random.choice(['Logon/Logoff', 'Object Access', 'Privilege Use', 'System', 'Network Activity', 'Data Access']),
            'date': datetime.now().strftime('%m/%d/%Y'),
            'time': datetime.now().strftime('%H:%M:%S'),
            'user': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f"user_{random.randint(1, 1000)}"]),
            'computer': f"workstation{random.randint(1, 100)}.corp.local",
            'container_id': f"{random.randint(1000000000, 9999999999):x}",
            'image': f"nginx:{random.randint(1, 20)}.{random.randint(0, 9)}.{random.randint(0, 9)}",
            'pod': f"pod-{random.randint(1, 100)}-{random.randint(1000, 9999)}",
            'thread': f"Thread-{random.randint(1, 50)}",
            'logger': f"com.example.service.{random.choice(['UserService', 'OrderService', 'PaymentService', 'AuthService'])}",
            'session_id': f"sess_{random.randint(100000, 999999)}",
            'database': f"db_{random.randint(1, 100)}",
            'table': f"table_{random.randint(1, 1000)}",
            'port': random.randint(1024, 65535),
            'referer': f"https://app.example.com/page{random.randint(1, 100)}",
            'user_agent': f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/{random.randint(80, 120)}.0.0.0 Safari/537.36",
            'elapsed': f"{random.randint(0, 999)}.{random.randint(0, 999):03d}",
            'result': random.choice(['TCP_MISS', 'TCP_HIT', 'TCP_DENIED', 'TCP_REDIRECT']),
            'hierarchy': random.choice(['DIRECT', 'NONE', 'PARENT', 'SIBLING']),
            'peer': f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'mime_type': random.choice(['text/html', 'application/json', 'image/png', 'application/pdf', 'text/plain', 'application/xml']),
            'frontend': f"frontend_{random.randint(1, 10)}",
            'backend': f"backend_{random.randint(1, 20)}",
            'server_name': f"server{random.randint(1, 50)}",
            'timer': f"{random.randint(1, 999)}/{random.randint(1, 999)}/{random.randint(1, 999)}/{random.randint(1, 999)}/{random.randint(1, 999)}",
            'bytes': random.randint(100, 50000),
            'req_bytes': random.randint(50, 2000),
            'priority': random.randint(1, 191),
            'version_code': random.randint(1, 3),
            'hostname': f"host{random.randint(1, 100)}.local",
            'app_name': random.choice(['nginx', 'apache', 'mysql', 'postgresql', 'redis', 'java', 'python']),
            'process': f"process_{random.randint(1, 100)}",
            'timezone': random.choice(['UTC', 'EST', 'PST', 'GMT', 'CET', 'CST', 'IST']),
            'thread_id': random.randint(1, 9999),
            'context': f"context-{random.randint(1, 100)}",
            'error': f"Error {random.randint(1000, 9999)}",
            'signal': random.randint(1, 30),
            'fd': random.randint(1, 1024),
            'op': random.randint(1, 3),
            'event': random.randint(1, 999999),
            'reusable': random.choice(['1', '0']),
            'password_status': random.choice(['YES', 'NO', 'CHANGED']),
            'error_code': random.randint(1000, 9999),
            'timeout': random.randint(30, 300),
            'violation': random.choice(['CSRF', 'XSS', 'SQL Injection', 'Rate Limit', 'Authentication Failure']),
            'counter': f"counter.{random.choice(['requests', 'errors', 'latency', 'connections'])}",
            'ssl_port': random.choice([443, 8443, 9443, 433]),
            'local_ip': f"127.0.0.{random.randint(1, 254)}",
            'local_port': random.randint(1024, 65535),
            'module': random.choice(['mod_ssl', 'mod_rewrite', 'mod_auth', 'core', 'logging']),
            'service': random.choice(['httpd', 'nginx', 'mysql', 'postgresql', 'redis', 'sshd', 'tomcat']),
            'time_ms': random.randint(1, 5000),
            'session': random.randint(1000000, 9999999),
            'exception': random.choice(['NullPointerException', 'IOException', 'SQLException', 'KeyError', 'ValueError', 'TypeError']),
            'protocol': random.choice(['HTTP/1.1', 'HTTP/2', 'HTTPS', 'SSH', 'FTP']),
            'socket': f"/var/run/{random.choice(['nginx', 'apache', 'mysql'])}.sock",
            'limit': random.randint(1000, 65536),
            'layer_id': f"sha256:{''.join(random.choices('0123456789abcdef', k=64))}",
            'volume': f"vol_{random.randint(1, 100)}",
            'path_mount': f"/mnt/volume{random.randint(1, 10)}",
            'driver': random.choice(['overlay2', 'aufs', 'btrfs', 'zfs', 'ext4']),
            'network': f"net_{random.randint(1, 100)}",
            'node': f"node{random.randint(1, 10)}",
            'endpoint': f"endpoint-{random.randint(1, 100)}",
            'namespace': f"ns-{random.randint(1, 50)}",
            'container': f"container-{random.randint(1, 1000)}",
            'member': f"member-{random.randint(1, 100)}",
            'location': f"/error/{random.randint(400, 599)}.html",
            'error_code_http': random.randint(400, 599),
            'build': f"build-{random.randint(1, 1000)}",
            'dbpath': f"/var/lib/mysql/db{random.randint(1, 100)}",
            'connection_id': random.randint(1, 999999),
            'elapsed_time': f"{random.randint(0, 999)}.{random.randint(0, 999):03d}",
            'peer_ip': f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'mime': random.choice(['text/html', 'application/json', 'image/jpeg', 'application/pdf', 'text/plain', 'application/xml']),
            'hierarchy_info': random.choice(['DIRECT', 'NONE', 'PARENT', 'SIBLING']),
            'commit_id': f"{random.randint(1000000, 9999999):x}",
            'group': f"group_{random.randint(1, 100)}",
            'state': random.choice(['PreparingRebalance', 'Stable', 'CompletingRebalance', 'Active', 'Idle']),
            'topic': f"topic_{random.randint(1, 100)}",
            'partition': random.randint(0, 99),
            'leader': random.randint(1, 10),
            'broker_id': random.randint(1, 100),
            'sid_history': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000, 1999)}",
            'domain': f"{random.choice(['corp', 'internal', 'company'])}.local",
            'logon_type': random.randint(2, 11),
            'process_id': random.randint(1000, 99999),
            'process_name': f"C:\\Windows\\System32\\{random.choice(['svchost.exe', 'explorer.exe', 'chrome.exe', 'firefox.exe', 'cmd.exe'])}",
            'workstation': f"PC{random.randint(1, 999)}",
            'source_ip': f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'source_port': random.randint(1024, 65535),
            'logon_process': random.choice(['Kerberos', 'NTLM', 'Negotiate', 'RemoteInteractive']),
            'auth_package': random.choice(['Kerberos', 'NTLM', 'Negotiate', 'Digest', 'Certificate']),
            'transited_services': " -",
            'package_name': "-",
            'key_length': random.choice([0, 128, 256]),
            'sam_name': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f"user_{random.randint(1, 1000)}"]),
            'display_name': random.choice([data['value'] for data in personal_data if data['type'] == 'full_name'] or [f"User {random.randint(1, 1000)}"]),
            'upn': f"{random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}'])}@{random.choice(['corp.local', 'company.com', 'internal.net'])}",
            'home_dir': f"C:\\Users\\{random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}'])}",
            'home_drive': "C:",
            'script_path': f"\\\\{random.choice(['dc1', 'dc2', 'fileserver'])}\\netlogon\\{random.choice(['default', 'custom'])}.bat",
            'profile_path': f"C:\\Users\\{random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}'])}",
            'workstations': "-", # Поле, которое может быть пустым или содержать список
            'pwd_last_set': datetime.now().strftime('%m/%d/%Y %H:%M:%S %p'),
            'account_expires': "Never",
            'primary_group_id': 513,
            'allowed_delegation': "-",
            'old_uac': 512,
            'new_uac': 512,
            'uac_flags': "Normal Account",
            'user_params': "-",
            'logon_hours': "(value not set)",
            'subject_sid': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000, 1999)}",
            'subject_name': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'admin_{random.randint(1, 100)}']),
            'subject_domain': random.choice(['CORP', 'INTERNAL', 'COMPANY']),
            'logon_id': f"0x{random.randint(100000, 999999):x}",
            'new_sid': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(2000, 2999)}",
            'new_name': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}']),
            'new_domain': random.choice(['CORP', 'INTERNAL', 'COMPANY']),
            'target_sid': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(2000, 2999)}",
            'target_name': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}']),
            'target_domain': random.choice(['CORP', 'INTERNAL', 'COMPANY']),
            'locked_sid': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(2000, 2999)}",
            'locked_name': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}']),
            'caller_computer': f"PC{random.randint(1, 999)}",
            'group_sid': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(3000, 3999)}",
            'group_name': f"Group{random.randint(1, 100)}",
            'group_domain': random.choice(['CORP', 'INTERNAL', 'COMPANY']),
            'member_sid': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(4000, 4999)}",
            'member_name': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}']),
            'member_domain': random.choice(['CORP', 'INTERNAL', 'COMPANY']),
            'auth_package_loaded': random.choice(['Kerberos', 'NTLM', 'Negotiate', 'Digest', 'Certificate']),
            'trust_direction': random.choice(['Bidirectional', 'Inbound', 'Outbound']),
            'trust_type': random.choice(['Windows 2000', 'MIT', 'DCE']),
            'trust_attributes': random.choice(['Within Forest', 'Forest Transitive', 'Treat as External']),
            'target_domain_trust': random.choice(['CORP', 'PARTNER', 'EXTERNAL']),
            'target_sid_trust': f"S-1-5-21-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}",
            'account_name_kerb': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f'user_{random.randint(1, 1000)}']),
            'account_domain_kerb': random.choice(['CORP', 'INTERNAL', 'COMPANY']),
            'logon_guid': f"{{{random.randint(10000000, 99999999)}-{random.randint(1000, 9999)}-{random.randint(1000, 9999)}-{random.randint(1000, 9999)}-{random.randint(100000000000, 999999999999)}}}",
            'service_name': f"{random.choice(['HTTP', 'host', 'ldap'])}/{random.choice(['web1', 'web2', 'dc1', 'dc2'])}.{random.choice(['corp.local', 'company.com', 'internal.net'])}",
            'service_id': f"S-1-5-80-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}-{random.randint(1000000000, 1999999999)}",
            'client_address': f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}",
            'client_port': random.randint(1024, 65535),
            'ticket_options': random.choice(['0x40810010', '0x40810000', '0x60810000']),
            'result_code_kerb': random.choice([0, 16, 32, 64]),
            'preauth_type': random.randint(0, 15),
            'realm_name': random.choice(['CORP.LOCAL', 'INTERNAL.NET', 'COMPANY.COM']),
            'database_name': f"Database{random.randint(1, 100)}",
            'check_number': random.randint(1, 1000),
            'error_code_ds': random.randint(1000, 9999),
            'error_description': random.choice(['Successful', 'Failed', 'Warning', 'Information']),
            
            'mode': random.choice(['debug', 'release', 'test', 'verbose', 'maintenance', 'normal']),
            'reason': random.choice(['User action required', 'System maintenance', 'Configuration change', 'Security alert', 'Failed login', 'Data corruption']),
            'channel': random.choice(['email', 'sms', 'api', 'web', 'ssh', 'ftp', 'cli', 'gui']),
            'affected_user': random.choice([data['value'] for data in personal_data if data['type'] == 'username'] or [f"user_{random.randint(1, 1000)}"]),
            'subject': random.choice(['Login attempt', 'File access', 'System update', 'User registration', 'Password reset', 'Resource access', 'Configuration update']),
            'file': f"/path/to/file/{random.choice(['config.yml', 'log.txt', 'data.json', 'script.sh', 'readme.md', 'error.log'])}",
            'proxy': f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}:{random.randint(1000, 65535)}",
            'action': random.choice(['read', 'write', 'delete', 'execute', 'login', 'logout', 'create', 'update', 'view', 'change', 'disable', 'enable']),
            'rule': f"RULE_{random.randint(1, 500)}_v{random.randint(1,5)}"
        }

        try:
            log_text = template.format(**format_dict)
        except KeyError as e:

            print(f"KeyError during template formatting: {e}. Template: '{template[:100]}...'. Generating fallback log.")
            log_text = f"[{timestamp}] ERROR: Failed to format log template. Missing key: {e}. Fallback message."
        except Exception as e:
            print(f"Unexpected error during template formatting: {e}. Generating fallback log.")
            log_text = f"[{timestamp}] ERROR: Unexpected error during log formatting. Fallback message."


        added_personal_data_count = 0
        for data in personal_data:
            if random.random() < 0.7:  # 70% шанс добавить  персональные данные
                if str(data['value']) not in log_text:
                    log_text += f" {data['type'].replace('_', ' ').title()}: {data['value']}."
                    added_personal_data_count += 1
        

        if added_personal_data_count == 0 and personal_data:
             log_text += f" Data: {personal_data[0]['type'].replace('_', ' ').title()}: {personal_data[0]['value']}."

        return log_text
    
    def generate_log_sample(self, format_type='json', lang='en'):
        """Генерация одного sample лога"""
        personal_data = self._generate_personal_data(lang)
        
        if format_type == 'json':
            log_text = self._generate_json_log(personal_data, lang)
        elif format_type == 'xml':
            log_text = self._generate_xml_log(personal_data, lang)
        elif format_type == 'csv':
            log_text = self._generate_csv_log(personal_data, lang)
        elif format_type == 'raw_text':
            log_text = self._generate_raw_text_log(personal_data, lang)
        else:
            raise ValueError(f"Unsupported format type: {format_type}")
        
        log_text = self._insert_malicious_log(log_text)
        
        personal_data_positions = []
        for data in personal_data:
            try:
                start = log_text.lower().find(str(data['value']).lower())
                if start != -1:
                    end = start + len(str(data['value']))
                    personal_data_positions.append({
                        'type': data['type'],
                        'value': data['value'], 
                        'start': start,
                        'end': end
                    })
            except Exception as e:
                print(f"Error finding position for value '{data['value']}': {e}")
                
        return log_text, personal_data_positions

class DatasetGenerator:
    def __init__(self):
        """Create the log generator and load the multilingual BERT tokenizer."""
        tokenizer_name = 'bert-base-multilingual-cased'
        self.log_generator = LogGenerator()
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def tokenize_and_process(self, text, personal_data_positions):
        """Токенизация текста и создание меток классов токенов"""
        tokens = self.tokenizer.tokenize(text)
        
        token_classes = ['O'] * len(tokens)
        
        char_to_token_map = [-1] * len(text)
        current_char_idx = 0
        for token_idx, token in enumerate(tokens):
            # Убираем спецсимволы токенизатора (например, '##' для subwords)
            clean_token = token.replace('##', '')
            token_len = len(clean_token)
            
            # Заполняем карту для каждого символа токена
            for _ in range(token_len):
                if current_char_idx < len(text):
                    char_to_token_map[current_char_idx] = token_idx
                current_char_idx += 1
            while current_char_idx < len(text) and text[current_char_idx].isspace():
                current_char_idx += 1
        
        for data in personal_data_positions:
            start_char = data['start']
            end_char = data['end']
            entity_type = data['type'].upper()
            
            if 0 <= start_char < end_char <= len(text):
                
                start_token_idx = -1
                for i in range(start_char, len(char_to_token_map)):
                    if char_to_token_map[i] != -1:
                        start_token_idx = char_to_token_map[i]
                        break

                end_token_idx = -1
                # Ищем последний не-'-1' индекс начиная с end_char - 1
                for i in range(end_char - 1, -1, -1):
                    if char_to_token_map[i] != -1:
                        end_token_idx = char_to_token_map[i]
                        break
                
                if start_token_idx != -1 and end_token_idx != -1 and start_token_idx <= end_token_idx:
                    for i in range(start_token_idx, min(end_token_idx + 1, len(token_classes))):
                        if i < len(token_classes): 
                            if i == start_token_idx:
                                token_classes[i] = f"B-{entity_type}"
                            else:
                                token_classes[i] = f"I-{entity_type}"
        
        filtered_tokens = []
        filtered_classes = []
        
        for token, label in zip(tokens, token_classes):
            clean_token = token.replace('##', '')
            if clean_token and not all(c in '.,!?;:"\'`~@#$%^&*()[]{}|\\/' for c in clean_token):
                filtered_tokens.append(token)
                filtered_classes.append(label)
        
        return filtered_tokens, filtered_classes
    
    def create_masked_text(self, text, personal_data_positions):
        """Создание маскированного текста"""
        if not personal_data_positions:
            return text
            
        sorted_positions = sorted(personal_data_positions, key=lambda x: x['start'], reverse=True)
        
        type_counts = defaultdict(int)
        for pos in sorted_positions:
            type_counts[pos['type']] += 1
        
        # Создаем словарь замен с индексами
        replacements = {} # Храним замены по начальному индексу
        type_current_count = defaultdict(int)
        
        for pos in sorted_positions:
            type_current_count[pos['type']] += 1
            # Создаем тег вида [*TYPE_N*], где N - порядковый номер этого типа данных
            tag = f"*[{pos['type'].upper()}_{type_current_count[pos['type']]}*]"
            replacements[pos['start']] = (pos['end'], tag)

        masked_text_parts = []
        current_idx = 0
        
        sorted_indices = sorted(replacements.keys(), reverse=True)
        
        for start_idx in sorted_indices:
            end_idx, tag = replacements[start_idx]
            
            # Добавляем часть текста перед текущей заменой
            masked_text_parts.append(text[start_idx:end_idx])
            # Добавляем саму замену (тег)
            masked_text_parts.append(tag)
            
            # Обновляем текущий индекс до начала следующего участка текста
            current_idx = end_idx

        if current_idx < len(text):
             masked_text_parts.append(text[current_idx:])
        
        masked_text = "".join(reversed(masked_text_parts))
            
        return masked_text
    
    def generate_dataset(self, env_type='format', samples_per_subtype=100):
        """Генерация датасета"""
        results = []
        
        # Определение подтипов в зависимости от типа среды
        if env_type == 'format':
            subtypes = ['json', 'xml', 'csv', 'raw_text']
        elif env_type == 'lang':
            subtypes = list(self.log_generator.languages.keys())
        elif env_type == 'len':
            # Делим на диапазоны количества токенов (приблизительно)
            subtypes = ['5-50', '51-100', '101-150', '151-200', '201-256']
        elif env_type == 'type':
            # Берем первые 10 типов серверов, если их больше, иначе все доступные
            if self.log_generator.server_templates:
                subtypes = list(self.log_generator.server_templates.keys())[:10] 
            else:
                subtypes = ['generic'] 
        else:
            raise ValueError(f"Unsupported environment type: {env_type}")
        

        if len(subtypes) > 1:
            test_subtypes = random.sample(subtypes, min(2, len(subtypes)))
        else:
            test_subtypes = subtypes 

        print(f"Starting dataset generation for env_type='{env_type}' with {samples_per_subtype} samples per subtype.")
        
        generated_counts = defaultdict(int)

        for subtype in subtypes:
            print(f"Generating samples for {env_type}: '{subtype}'...")
            
            generated_for_subtype = 0
            attempts = 0 
            max_attempts_per_subtype = samples_per_subtype * 3 

            while generated_for_subtype < samples_per_subtype and attempts < max_attempts_per_subtype:
                attempts += 1
                try:
                    log_text, personal_data_positions = None, None
                    
                    if env_type == 'format':
                        log_text, personal_data_positions = self.log_generator.generate_log_sample(
                            format_type=subtype,
                            lang=random.choice(list(self.log_generator.languages.keys()))
                        )
                    elif env_type == 'lang':
                        log_text, personal_data_positions = self.log_generator.generate_log_sample(
                            format_type=random.choice(['json', 'xml', 'csv', 'raw_text']), # Случайный формат
                            lang=subtype 
                        )
                    elif env_type == 'len':
                        target_min_tokens, target_max_tokens = 0, 256 
                        if subtype == '5-50':
                            target_min_tokens = 5
                            target_max_tokens = 50
                        elif subtype == '51-100':
                            target_min_tokens = 51
                            target_max_tokens = 100
                        elif subtype == '101-150':
                            target_min_tokens = 101
                            target_max_tokens = 150
                        elif subtype == '151-200':
                            target_min_tokens = 151
                            target_max_tokens = 200
                        elif subtype == '201-256':
                            target_min_tokens = 201
                            target_max_tokens = 256
                        
                        current_attempt = 0
                        while current_attempt < 10:
                            current_attempt += 1
                            temp_log_text, temp_personal_data_positions = self.log_generator.generate_log_sample(
                                format_type=random.choice(['json', 'xml', 'csv', 'raw_text']),
                                lang=random.choice(list(self.log_generator.languages.keys()))
                            )
                            
                            temp_tokens = self.tokenizer.tokenize(temp_log_text)
                            
                            if len(temp_tokens) > 256:
                                temp_tokens = temp_tokens[:256]
                                temp_log_text = self.tokenizer.convert_tokens_to_string(temp_tokens)
                                updated_positions = []
                                for pos in temp_personal_data_positions:
                                    if pos['start'] < len(temp_log_text) and pos['end'] <= len(temp_log_text):
                                        updated_positions.append(pos)
                                temp_personal_data_positions = updated_positions

                            token_length = len(self.tokenizer.tokenize(temp_log_text)) 

                            if target_min_tokens <= token_length <= target_max_tokens:
                                log_text, personal_data_positions = temp_log_text, temp_personal_data_positions
                                break 
                        
                        if log_text is None: 
                            print(f"Warning: Could not generate log within length range '{subtype}'. Using a random log.")
                            log_text, personal_data_positions = self.log_generator.generate_log_sample(
                                format_type=random.choice(['json', 'xml', 'csv', 'raw_text']),
                                lang=random.choice(list(self.log_generator.languages.keys()))
                            )
                            tokens = self.tokenizer.tokenize(log_text)
                            if len(tokens) > 256:
                                tokens = tokens[:256]
                                log_text = self.tokenizer.convert_tokens_to_string(tokens)
                                updated_positions = []
                                for pos in personal_data_positions:
                                    if pos['start'] < len(log_text) and pos['end'] <= len(log_text):
                                        updated_positions.append(pos)
                                personal_data_positions = updated_positions

                    elif env_type == 'type':
                        # Для типа сервера используем raw_text формат с соответствующими шаблонами
                        log_text, personal_data_positions = self.log_generator.generate_log_sample(
                            format_type='raw_text',
                            lang=random.choice(list(self.log_generator.languages.keys()))
                        )
                    
                    if log_text is None:
                         raise ValueError("Log text generation failed unexpectedly.")

                    # Обрезаем текст до 256 токенов, если он превышает этот лимит
                    tokens = self.tokenizer.tokenize(log_text)
                    if len(tokens) > 256:
                        tokens = tokens[:256]
                        log_text = self.tokenizer.convert_tokens_to_string(tokens)
                        
                        updated_positions = []
                        for pos in personal_data_positions:
                            if pos['start'] < len(log_text) and pos['end'] <= len(log_text):
                                updated_positions.append(pos)
                        personal_data_positions = updated_positions

                    # Токенизация и создание меток классов токенов
                    mbert_tokens, mbert_token_classes = self.tokenize_and_process(log_text, personal_data_positions)
                    
                    masked_text = self.create_masked_text(log_text, personal_data_positions)
                    

                    split = 'test' if subtype in test_subtypes else 'train'
                    
                    token_length = len(mbert_tokens)
                    
                    # Собираем данные для одного сэмпла
                    sample_data = {
                        'source_text': log_text,
                        'format_type': subtype if env_type == 'format' else random.choice(['json', 'xml', 'csv', 'raw_text']),
                        'mbert_tokens': mbert_tokens, 
                        'mbert_token_classes': mbert_token_classes, # Список меток классов
                        'masked_text': masked_text,
                        'split': split,
                        'len': token_length,
                        'type': subtype if env_type == 'type' else random.choice(list(self.log_generator.server_templates.keys())[:10] if self.log_generator.server_templates else ['generic']), # Случайный тип или тип из env_type
                        'lang': random.choice(list(self.log_generator.languages.keys())) # Случайный язык
                    }
                    results.append(sample_data)
                    generated_for_subtype += 1
                    generated_counts[subtype] += 1

                except Exception as e:
                    print(f"Error generating sample {generated_for_subtype + 1} (attempt {attempts}) for {env_type}='{subtype}': {e}")

            print(f"Finished generating samples for '{subtype}'. Generated: {generated_for_subtype}/{samples_per_subtype}.")

        # Проверка на одинаковое количество сэмплов для каждой подсреды
        print("\n--- Dataset Generation Summary ---")
        all_counts_equal = True
        if subtypes: # Проверяем, если есть подтипы
            print("Sample counts per subtype:")
            for subtype in subtypes:
                count = generated_counts.get(subtype, 0)
                print(f"- {subtype}: {count}")
                if count != samples_per_subtype:
                    all_counts_equal = False
            
            if all_counts_equal:
                print("All subtypes have the requested number of samples.")
            else:
                print(f"Warning: Not all subtypes have the requested {samples_per_subtype} samples. Check logs for errors.")
        else:
            print("No subtypes defined for generation.")

        print(f"Total samples generated: {len(results)}")
        print("----------------------------------\n")

        return results

def save_dataset_to_csv(dataset, filename='generated_logs_dataset.csv'):
    """Write the dataset to a CSV file and return it as a DataFrame.

    The 'mbert_tokens' and 'mbert_token_classes' columns are flattened from
    lists into space-separated strings before saving. Returns None when the
    dataset is empty; a failed write is reported and the DataFrame is still
    returned.
    """
    if not dataset:
        print("No data to save to CSV.")
        return None

    def _join_if_list(cell):
        # Lists become one space-separated string; other values pass through.
        if isinstance(cell, list):
            return ' '.join(cell)
        return cell

    frame = pd.DataFrame(dataset)
    for column in ('mbert_tokens', 'mbert_token_classes'):
        frame[column] = frame[column].apply(_join_if_list)

    try:
        frame.to_csv(filename, index=False, encoding='utf-8')
    except Exception as e:
        print(f"Error saving dataset to CSV file {filename}: {e}")
    else:
        print(f"Dataset saved successfully to {filename}")
    return frame


# Script entry point: build the "by format" dataset and store it as CSV.
if __name__ == "__main__":
    # Requested number of samples for each subtype (here: each log format).
    NUM_SAMPLES_PER_SUBTYPE = 1000 
    
    # Constructing the generator also creates the log generator and loads
    # the tokenizer.
    dataset_generator = DatasetGenerator()

    print("--- Generating dataset by format ---")
    format_dataset = dataset_generator.generate_dataset('format', NUM_SAMPLES_PER_SUBTYPE)
    
    save_dataset_to_csv(format_dataset, 'generated_logs_dataset_by_format_large.csv')
    

    print("\nAll dataset generation tasks completed.")


Using the latest cached version of the dataset since u-haru/malicious_logs couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'access_log' at /home/master/.cache/huggingface/datasets/u-haru___malicious_logs/access_log/0.0.0/7f1ef0b5907d72fee2db66d2b86c1022e896c727 (last modified on Mon Jul 21 13:18:18 2025).


--- Generating dataset by format ---
Starting dataset generation for env_type='format' with 1000 samples per subtype.
Generating samples for format: 'json'...


Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors


Finished generating samples for 'json'. Generated: 1000/1000.
Generating samples for format: 'xml'...


Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


Finished generating samples for 'xml'. Generated: 1000/1000.
Generating samples for format: 'csv'...
Finished generating samples for 'csv'. Generated: 1000/1000.
Generating samples for format: 'raw_text'...
KeyError during template formatting: 'options'. Template: '{timestamp} I CONTROL  [initandlisten] options: {options}...'. Generating fallback log.
KeyError during template formatting: 'app'. Template: '<{priority}>{version} {timestamp} {hostname} {app_name}[{pid}]: {app} {version} starting...'. Generating fallback log.
KeyError during template formatting: 'creator_process_id'. Template: 'Event Type: {level} Event Source: {source} Event Category: {category} Event ID: {event_id} Date: {da...'. Generating fallback log.
KeyError during template formatting: 'object_server'. Template: 'Event Type: {level} Event Source: {source} Event Category: {category} Event ID: {event_id} Date: {da...'. Generating fallback log.
KeyError during template formatting: 'agent'. Template: '{timestamp} {level