In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
from datetime import datetime
import logging
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import queue
import threading


# Logging configuration: every record is written both to a UTF-8 log file
# and to the console stream.
logging.basicConfig(
    handlers=(
        logging.FileHandler('lenta_parser.log', encoding='utf-8'),
        logging.StreamHandler(),
    ),
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class TatarInformParser:
    def __init__(self, max_workers=5, max_pages=150, timeout=30, delay_range=(1.0, 3.0)):
        self.max_workers = max_workers
        self.max_pages = max_pages
        self.timeout = timeout
        self.delay_range = delay_range
        self.session = self.get_session()
        self.lock = threading.Lock()
        self.processed_pages = 0
        self.total_news = 0
        self.failed_pages = 0


    def get_session(self):
        """Build the ``requests`` session used for every fetch.

        The session carries browser-like headers with a User-Agent chosen at
        random once, when the session is created, and an HTTP adapter that
        retries failed requests with exponential backoff.

        Returns:
            requests.Session: the configured session.
        """
        session = requests.Session()

        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1'
        ]

        # Accept-Encoding is intentionally NOT overridden: the session default
        # advertises only encodings the installed libraries can decode, while
        # a hard-coded "br" produces undecodable bodies (and therefore "empty"
        # pages) on machines without a brotli decoder.
        headers = {
            'User-Agent': random.choice(user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            # The Referer must belong to the crawled site, otherwise it
            # contradicts "Sec-Fetch-Site: same-origin" below.
            'Referer': 'https://tatar-inform.tatar/',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin'
        }

        session.headers.update(headers)

        retry_strategy = requests.adapters.Retry(
            total=5,  # number of retry attempts
            backoff_factor=1.0,  # growing pause between attempts
            # 404 is not retried: a missing page does not appear on a later
            # attempt, so retrying only adds delay; callers already handle
            # any non-200 status themselves.
            status_forcelist=[429, 500, 502, 503, 504, 403],
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy, pool_connections=20, pool_maxsize=20)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session


    def parse_news_page(self, page_url):
        """Fetch one listing page and extract its news teasers.

        Args:
            page_url: URL of the listing page to fetch.

        Returns:
            list[dict]: one dict per teaser with the keys ``title``, ``url``,
            ``time``, ``full_date``, ``page_url`` and ``timestamp``. An empty
            list is returned, and ``failed_pages`` is incremented, when the
            request fails, the status is not 200, the page has no teasers or
            any other error occurs.
        """
        # Function-scope import keeps the method self-contained.
        from urllib.parse import urljoin

        try:
            # Random pause before each request.
            delay = random.uniform(*self.delay_range)
            logger.info(f"–ó–∞–¥–µ—Ä–∂–∫–∞ {delay:.2f} —Å–µ–∫ –ø–µ—Ä–µ–¥ –∑–∞–ø—Ä–æ—Å–æ–º –∫ {page_url}")
            time.sleep(delay)

            logger.info(f"–ü–∞—Ä—Å–∏–º —Å—Ç—Ä–∞–Ω–∏—Ü—É: {page_url}")

            response = self.session.get(page_url, timeout=self.timeout)

            if response.status_code != 200:
                logger.warning(f"–û—à–∏–±–∫–∞ –¥–æ—Å—Ç—É–ø–∞ –∫ {page_url}: {response.status_code}")
                with self.lock:
                    self.failed_pages += 1
                return []

            soup = BeautifulSoup(response.content, 'html.parser')
            news_items = soup.find_all('li', class_='newsList__item')

            # A page without any teaser is counted as failed.
            if not news_items:
                logger.warning(f"–ü—É—Å—Ç–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞: {page_url}")
                with self.lock:
                    self.failed_pages += 1
                return []

            parsed_news = []
            # NOTE: the date part of ``full_date`` is the crawl date, not a
            # date read from the page.
            current_date = datetime.now().strftime("%Y/%m/%d")

            for item in news_items:
                item_classes = item.get('class', [])
                if 'banner' in item_classes or '_more' in item_classes:
                    continue

                news_link = item.find('a', class_='newsList__item-text')
                if not news_link:
                    continue

                title_elem = news_link.find('h2', class_='newsList__item-title')
                if not title_elem:
                    continue

                href = news_link.get('href', '')
                if not href:
                    # Without a link the article cannot be fetched later.
                    continue

                # Standard URL resolution: absolute links are kept, links
                # starting with "/" resolve against the site root and bare
                # relative links resolve inside /news/. Gluing the href onto
                # ".../news" duplicated the path for "/news/..." links and
                # dropped the separator for bare relative ones.
                full_url = urljoin('https://tatar-inform.tatar/news/', href)

                time_elem = news_link.find('a', class_='newsList__item-date')
                time_text = time_elem.get_text(strip=True) if time_elem else ''

                parsed_news.append({
                    'title': title_elem.get_text(strip=True),
                    'url': full_url,
                    'time': time_text if time_elem else '–ù–µ —É–∫–∞–∑–∞–Ω–æ',
                    'full_date': f"{current_date} {time_text}",
                    'page_url': page_url,
                    'timestamp': datetime.now().isoformat()
                })

            with self.lock:
                self.processed_pages += 1
                self.total_news += len(parsed_news)
                logger.info(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ —Å—Ç—Ä–∞–Ω–∏—Ü: {self.processed_pages}, –≤—Å–µ–≥–æ –Ω–æ–≤–æ—Å—Ç–µ–π: {self.total_news}")

            return parsed_news

        except requests.exceptions.RequestException as e:
            logger.error(f"–°–µ—Ç–µ–≤–∞—è –æ—à–∏–±–∫–∞ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ {page_url}: {e}")
            with self.lock:
                self.failed_pages += 1
            return []
        except Exception as e:
            # Best effort: one broken page must not abort the whole crawl.
            logger.error(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ {page_url}: {e}")
            with self.lock:
                self.failed_pages += 1
            return []


    def generate_page_urls(self):
        """–ì–µ–Ω–µ—Ä–∏—Ä—É–µ—Ç URL –≤—Å–µ—Ö —Å—Ç—Ä–∞–Ω–∏—Ü –¥–ª—è –ø–∞—Ä—Å–∏–Ω–≥–∞"""
        urls = ["https://tatar-inform.tatar/news"]

        # –ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º URL –¥–ª—è —Å—Ç—Ä–∞–Ω–∏—Ü —Å–æ 2 –ø–æ max_pages
        for page_num in range(2, self.max_pages + 1):
            urls.append(f"https://tatar-inform.tatar/news?page={page_num}/")

        return urls


    def parse_all_news_pages(self):
        """Crawl every listing page with a thread pool and merge the teasers.

        The URLs from ``generate_page_urls`` are processed in batches of
        ``max_workers * 2``; each batch gets its own thread pool and batches
        are separated by a random 5-10 second pause.

        Returns:
            list[dict]: teasers from all pages, in completion order.
        """
        logger.info(f"–ù–∞—á–∏–Ω–∞–µ–º –º–Ω–æ–≥–æ–ø–æ—Ç–æ—á–Ω—ã–π –ø–∞—Ä—Å–∏–Ω–≥ {self.max_pages} —Å—Ç—Ä–∞–Ω–∏—Ü...")

        urls = self.generate_page_urls()
        logger.info(f"–í—Å–µ–≥–æ URL –¥–ª—è –ø–∞—Ä—Å–∏–Ω–≥–∞: {len(urls)}")

        all_news = []

        # Split the URL list into batches to bound the number of queued requests.
        batch_size = self.max_workers * 2
        batches = []
        for start in range(0, len(urls), batch_size):
            batches.append(urls[start:start + batch_size])

        for batch_num, batch_urls in enumerate(batches, 1):
            logger.info(f"–û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º –±–∞—Ç—á {batch_num}/{len(batches)} ({len(batch_urls)} URL)")

            with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
                # Remember which URL each future belongs to.
                pending = {}
                for batch_url in batch_urls:
                    pending[pool.submit(self.parse_news_page, batch_url)] = batch_url

                for finished in as_completed(pending):
                    url = pending[finished]
                    try:
                        page_news = finished.result()
                        if not page_news:
                            logger.warning(f"–ü—É—Å—Ç–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞ –∏–ª–∏ –æ—à–∏–±–∫–∞: {url}")
                        else:
                            all_news.extend(page_news)
                            logger.info(f"–£—Å–ø–µ—à–Ω–æ: {url} - {len(page_news)} –Ω–æ–≤–æ—Å—Ç–µ–π")
                    except Exception as e:
                        logger.error(f"–ò—Å–∫–ª—é—á–µ–Ω–∏–µ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ {url}: {e}")
                        with self.lock:
                            self.failed_pages += 1

            # Pause between batches; no pause after the last one.
            if batch_num >= len(batches):
                continue
            batch_delay = random.uniform(5.0, 10.0)
            logger.info(f"–ü–∞—É–∑–∞ –º–µ–∂–¥—É –±–∞—Ç—á–∞–º–∏: {batch_delay:.2f} —Å–µ–∫")
            time.sleep(batch_delay)

        return all_news


    def parse_news_page_item(self, news):
        """Fetch the article page referenced by a teaser and extract its content.

        Args:
            news: teaser dict; ``news['url']`` is the page that gets fetched
                and ``news['title']``, when present, is the fallback title.

        Returns:
            dict: ``title``, ``url``, ``rubric``, ``text`` and ``date`` of the
            article. An empty dict is returned, and ``failed_pages`` is
            incremented, when the request fails, the status is not 200, the
            article body cannot be found or any other error occurs.
        """
        page_url = news['url']
        try:
            # Random pause before each request.
            delay = random.uniform(*self.delay_range)
            logger.info(f"–ó–∞–¥–µ—Ä–∂–∫–∞ {delay:.2f} —Å–µ–∫ –ø–µ—Ä–µ–¥ –∑–∞–ø—Ä–æ—Å–æ–º –∫ {page_url}")
            time.sleep(delay)

            logger.info(f"–ü–∞—Ä—Å–∏–º —Å—Ç—Ä–∞–Ω–∏—Ü—É: {page_url}")

            response = self.session.get(page_url, timeout=self.timeout)

            if response.status_code != 200:
                logger.warning(f"–û—à–∏–±–∫–∞ –¥–æ—Å—Ç—É–ø–∞ –∫ {page_url}: {response.status_code}")
                with self.lock:
                    self.failed_pages += 1
                # Every failure path returns the same empty dict.
                return {}

            soup = BeautifulSoup(response.content, 'html.parser')
            news_item = soup.find('div', class_='main__news-col')
            text_div = None
            if news_item is not None:
                text_div = news_item.find('div', class_='page-main__text')

            # Without the article container or its body there is nothing to
            # keep, so the page counts as failed.
            if text_div is None:
                logger.warning(f"–ü—É—Å—Ç–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞: {page_url}")
                with self.lock:
                    self.failed_pages += 1
                return {}

            title_elem = news_item.find('h1', class_='main__news-title')
            rubric_elem = news_item.find('a', class_='main__rubric')
            date_elem = news_item.find('a', class_='main__date')

            # Title, rubric and date are optional: a missing element must not
            # discard an article whose body was extracted successfully.
            if title_elem is not None:
                title = title_elem.get_text(strip=True)
            else:
                title = news.get('title', '')

            return {
                'title': title,
                'url': page_url,
                'rubric': rubric_elem.get_text(strip=True) if rubric_elem is not None else '',
                'text': text_div.get_text(separator=' ', strip=True),
                'date': date_elem.get_text(strip=True) if date_elem is not None else ''
            }
        except requests.exceptions.RequestException as e:
            logger.error(f"–°–µ—Ç–µ–≤–∞—è –æ—à–∏–±–∫–∞ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ {page_url}: {e}")
            with self.lock:
                self.failed_pages += 1
            return {}
        except Exception as e:
            # Best effort: one broken article must not abort the whole crawl.
            logger.error(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–∞—Ä—Å–∏–Ω–≥–µ {page_url}: {e}")
            with self.lock:
                self.failed_pages += 1
            return {}


    def parse_all_news_page_items(self):
        """–ü–æ–ª—É—á–µ–Ω–∏–µ —Ç–µ–∫—Å—Ç–∞ –ø–æ –≤—Å–µ–º –Ω–æ–≤–æ—Å—Ç—è–º"""
        all_news = self.parse_all_news_pages()
        full_news = []

        for news in all_news:
          news = self.parse_news_page_item(news)
          full_news.append(news)
          print(news)

        return full_news


    def analyze_results(self, news_list):
        """–ê–Ω–∞–ª–∏–∑–∏—Ä—É–µ—Ç –∏ –≤—ã–≤–æ–¥–∏—Ç —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫—É —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤"""
        if not news_list:
            logger.warning("–ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞")
            return

        categories = {}
        pages = {}
        words = 0
        for news in news_list:
            category = news['rubric']
            page = news['url']
            categories[category] = categories.get(category, 0) + 1
            pages[page] = pages.get(page, 0) + 1
            words += len(news['text'].split(' '))

        print("\n" + "="*70)
        print("–°–¢–ê–¢–ò–°–¢–ò–ö–ê –ú–ù–û–ì–û–ü–û–¢–û–ß–ù–û–ì–û –ü–ê–†–°–ò–ù–ì–ê TATAR-INFORM.RU")
        print("="*70)
        print(f"–í—Å–µ–≥–æ –Ω–æ–≤–æ—Å—Ç–µ–π: {len(news_list)}")
        print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ —Å—Ç—Ä–∞–Ω–∏—Ü: {self.processed_pages}")
        print(f"–ù–µ—É–¥–∞—á–Ω—ã—Ö —Å—Ç—Ä–∞–Ω–∏—Ü: {self.failed_pages}")
        print(f"–í—Å–µ–≥–æ –∫–∞—Ç–µ–≥–æ—Ä–∏–π: {len(categories)}")
        print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Ç–æ–∫–æ–≤: {self.max_workers}")
        print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è —Å—Ç—Ä–∞–Ω–∏—Ü–∞: {self.max_pages}")
        print(f"–ß–∏—Å–ª–æ —Å–ª–æ–≤: {words}")

        print("\n–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –Ω–æ–≤–æ—Å—Ç–µ–π –ø–æ –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º (—Ç–æ–ø-10):")
        for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {category}: {count} –Ω–æ–≤–æ—Å—Ç–µ–π")

        print(f"\n–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–∞–Ω–∏—Ü —Å –Ω–æ–≤–æ—Å—Ç—è–º–∏: {len(pages)}")


    def convert_to_jsonl(self, data_list, filename):
        """
        –ö–æ–Ω–≤–µ—Ä—Ç–∏—Ä—É–µ—Ç —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤–∞—Ä–µ–π –≤ —Ñ–æ—Ä–º–∞—Ç JSONL

        Args:
            data_list: —Å–ø–∏—Å–æ–∫ —Å–ª–æ–≤–∞—Ä–µ–π —Å –¥–∞–Ω–Ω—ã–º–∏
            filename: –∏–º—è —Ñ–∞–π–ª–∞ –¥–ª—è —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è
        """
        with open(filename, 'w', encoding='utf-8') as f:
            for item in data_list:
                json_line = json.dumps(item, ensure_ascii=False)
                f.write(json_line + '\n')


def main():
    """Entry point: crawl the news, print the statistics and save a JSONL file."""
    print("üöÄ –ó–∞–ø—É—Å–∫ –º–Ω–æ–≥–æ–ø–æ—Ç–æ—á–Ω–æ–≥–æ –ø–∞—Ä—Å–µ—Ä–∞ Tatar-Inform.ru –¥–æ 11 —Å—Ç—Ä–∞–Ω–∏—Ü—ã")
    print("="*60)

    # Crawl settings: thread count, last listing page, request timeout and
    # the bounds of the random pause between requests.
    settings = dict(
        max_workers=8,
        max_pages=11,
        timeout=45,
        delay_range=(1.5, 4.0),
    )

    try:
        parser = TatarInformParser(**settings)

        # Listing pages first, then every article.
        all_news = parser.parse_all_news_page_items()

        if all_news:
            # Print the statistics.
            parser.analyze_results(all_news)

            # Save as JSONL with a timestamped file name.
            jsonl_filename = f"lenta_news_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
            parser.convert_to_jsonl(all_news, jsonl_filename)
        else:
            print("‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å –Ω–æ–≤–æ—Å—Ç–∏")
    except KeyboardInterrupt:
        print("\n‚èπÔ∏è  –ü–∞—Ä—Å–∏–Ω–≥ –ø—Ä–µ—Ä–≤–∞–Ω –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–º")
    except Exception as e:
        logger.error(f"–ö—Ä–∏—Ç–∏—á–µ—Å–∫–∞—è –æ—à–∏–±–∫–∞ –≤ main: {e}")
        print(f"‚ùå –ü—Ä–æ–∏–∑–æ—à–ª–∞ –∫—Ä–∏—Ç–∏—á–µ—Å–∫–∞—è –æ—à–∏–±–∫–∞: {e}")

if __name__ == "__main__":
    main()

üöÄ –ó–∞–ø—É—Å–∫ –º–Ω–æ–≥–æ–ø–æ—Ç–æ—á–Ω–æ–≥–æ –ø–∞—Ä—Å–µ—Ä–∞ Tatar-Inform.ru –¥–æ 11 —Å—Ç—Ä–∞–Ω–∏—Ü—ã

‚èπÔ∏è  –ü–∞—Ä—Å–∏–Ω–≥ –ø—Ä–µ—Ä–≤–∞–Ω –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–º


### Этап 2. Предварительная обработка и очистка текста

In [None]:
import re
import html
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class TextCleaner:
    def __init__(self, lowercase=True, remove_stopwords=True, language='russian'):
        """
        –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –æ—á–∏—Å—Ç–∏—Ç–µ–ª—è —Ç–µ–∫—Å—Ç–∞

        Args:
            lowercase: –ü—Ä–∏–≤–æ–¥–∏—Ç—å —Ç–µ–∫—Å—Ç –∫ –Ω–∏–∂–Ω–µ–º—É —Ä–µ–≥–∏—Å—Ç—Ä—É
            remove_stopwords: –£–¥–∞–ª—è—Ç—å —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
            language: –Ø–∑—ã–∫ –¥–ª—è —Å—Ç–æ–ø-—Å–ª–æ–≤ ('russian', 'tatar', 'english')
        """
        self.lowercase = lowercase
        self.remove_stopwords = remove_stopwords
        self.language = language

        # –ó–∞–≥—Ä—É–∂–∞–µ–º —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
        if self.remove_stopwords:
            self.stop_words = self._load_stopwords(language)

    def _load_stopwords(self, language):
        """–ó–∞–≥—Ä—É–∑–∫–∞ —Å—Ç–æ–ø-—Å–ª–æ–≤ –¥–ª—è —Ä–∞–∑–Ω—ã—Ö —è–∑—ã–∫–æ–≤"""
        if language == 'russian':
            try:
                return set(stopwords.words('russian'))
            except:
                # –†–µ–∑–µ—Ä–≤–Ω—ã–π —Å–ø–∏—Å–æ–∫ —Ä—É—Å—Å–∫–∏—Ö —Å—Ç–æ–ø-—Å–ª–æ–≤
                return self._get_russian_stopwords()

        elif language == 'tatar':
            # –¢–∞—Ç–∞—Ä—Å–∫–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
            return self._get_tatar_stopwords()

        elif language == 'english':
            return set(stopwords.words('english'))

        else:
            print(f"–Ø–∑—ã–∫ '{language}' –Ω–µ –ø–æ–¥–¥–µ—Ä–∂–∏–≤–∞–µ—Ç—Å—è. –ò—Å–ø–æ–ª—å–∑—É–µ–º —Ä—É—Å—Å–∫–∏–µ —Å—Ç–æ–ø-—Å–ª–æ–≤–∞.")
            return self._get_russian_stopwords()

    def _get_russian_stopwords(self):
        """Return the built-in set of Russian stop words.

        Used by ``_load_stopwords`` as the fallback when the NLTK list cannot
        be loaded and for unsupported languages.
        """
        return {
            '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ', '–∫–∞–∫', '–∞',
            '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞', '—Ç—ã', '–∫', '—É', '–∂–µ',
            '–≤—ã', '–∑–∞', '–±—ã', '–ø–æ', '—Ç–æ–ª—å–∫–æ', '–µ–µ', '–º–Ω–µ', '–±—ã–ª–æ', '–≤–æ—Ç', '–æ—Ç',
            '–º–µ–Ω—è', '–µ—â–µ', '–Ω–µ—Ç', '–æ', '–∏–∑', '–µ–º—É', '—Ç–µ–ø–µ—Ä—å', '–∫–æ–≥–¥–∞', '–¥–∞–∂–µ',
            '–Ω—É', '–≤–¥—Ä—É–≥', '–ª–∏', '–µ—Å–ª–∏', '—É–∂–µ', '–∏–ª–∏', '–Ω–∏', '–±—ã—Ç—å', '–±—ã–ª', '–Ω–µ–≥–æ',
            '–¥–æ', '–≤–∞—Å', '–Ω–∏–±—É–¥—å', '–æ–ø—è—Ç—å', '—É–∂', '–≤–∞–º', '–≤–µ–¥—å', '—Ç–∞–º', '–ø–æ—Ç–æ–º',
            '—Å–µ–±—è', '–Ω–∏—á–µ–≥–æ', '–µ–π', '–º–æ–∂–µ—Ç', '–æ–Ω–∏', '—Ç—É—Ç', '–≥–¥–µ', '–µ—Å—Ç—å', '–Ω–∞–¥–æ',
            '–Ω–µ–π', '–¥–ª—è', '–º—ã', '—Ç–µ–±—è', '–∏—Ö', '—á–µ–º', '–±—ã–ª–∞', '—Å–∞–º', '—á—Ç–æ–±', '–±–µ–∑',
            '–±—É–¥—Ç–æ', '—á–µ–≥–æ', '—Ä–∞–∑', '—Ç–æ–∂–µ', '—Å–µ–±–µ', '–ø–æ–¥', '–±—É–¥–µ—Ç', '–∂', '—Ç–æ–≥–¥–∞',
            '–∫—Ç–æ', '—ç—Ç–æ—Ç', '—Ç–æ–≥–æ', '–ø–æ—Ç–æ–º—É', '—ç—Ç–æ–≥–æ', '–∫–∞–∫–æ–π', '—Å–æ–≤—Å–µ–º', '–Ω–∏–º',
            '–∑–¥–µ—Å—å', '—ç—Ç–æ–º', '–æ–¥–∏–Ω', '–ø–æ—á—Ç–∏', '–º–æ–π', '—Ç–µ–º', '—á—Ç–æ–±—ã', '–Ω–µ–µ', '—Å–µ–π—á–∞—Å',
            '–±—ã–ª–∏', '–∫—É–¥–∞', '–∑–∞—á–µ–º', '–≤—Å–µ—Ö', '–Ω–∏–∫–æ–≥–¥–∞', '–º–æ–∂–Ω–æ', '–ø—Ä–∏', '–Ω–∞–∫–æ–Ω–µ—Ü',
            '–¥–≤–∞', '–æ–±', '–¥—Ä—É–≥–æ–π', '—Ö–æ—Ç—å', '–ø–æ—Å–ª–µ', '–Ω–∞–¥', '–±–æ–ª—å—à–µ', '—Ç–æ—Ç', '—á–µ—Ä–µ–∑',
            '—ç—Ç–∏', '–Ω–∞—Å', '–ø—Ä–æ', '–≤—Å–µ–≥–æ', '–Ω–∏—Ö', '–∫–∞–∫–∞—è', '–º–Ω–æ–≥–æ', '—Ä–∞–∑–≤–µ', '—Ç—Ä–∏',
            '—ç—Ç—É', '–º–æ—è', '–≤–ø—Ä–æ—á–µ–º', '—Ö–æ—Ä–æ—à–æ', '—Å–≤–æ—é', '—ç—Ç–æ–π', '–ø–µ—Ä–µ–¥', '–∏–Ω–æ–≥–¥–∞',
            '–ª—É—á—à–µ', '—á—É—Ç—å', '—Ç–æ–º', '–Ω–µ–ª—å–∑—è', '—Ç–∞–∫–æ–π', '–∏–º', '–±–æ–ª–µ–µ', '–≤—Å–µ–≥–¥–∞',
            '–∫–æ–Ω–µ—á–Ω–æ', '–≤—Å—é', '–º–µ–∂–¥—É'
        }

    def _get_tatar_stopwords(self):
        """Return the built-in set of Tatar stop words.

        A few entries are listed twice in the literal; this is harmless
        because the result is a set.
        """
        return {
            '–≤”ô', '“ª”ô–º', '–±–µ–ª”ô–Ω', '”©—á–µ–Ω', '—Ç—É—Ä—ã–Ω–¥–∞', '–∫–∞—Ä–∞—Ç–∞', '–±—É–µ–Ω—á–∞', '–∞—Ä–∫—ã–ª—ã',
            '–º–∏–Ω', '—Å–∏–Ω', '—É–ª', '–±–µ–∑', '—Å–µ–∑', '–∞–ª–∞—Ä', '–º–æ–Ω–¥–∞', '–∞–Ω–¥–∞', '—à—É–Ω–¥–∞',
            '–Ω–∏', '–Ω”ô—Ä—Å”ô', '–∫–µ–º', '–∫–∞–π—Å—ã', '–Ω–∏—á–µ–∫', '–Ω–∏–∫–∞–¥”ô—Ä', '–∫–∞–π—á–∞–Ω', '–Ω–∏—à–ª”ô–ø',
            '”ô–ª–µ', '–∏–Ω–¥–µ', '“ª–∞–º–∞–Ω', '—Ç–∞–≥—ã–Ω', '–∫“Ø–±—Ä”ô–∫', '–∞–∑—Ä–∞–∫', '–±–∏–∫', '–±–∏–≥–µ—Ä”ô–∫',
            '—Ç”ô', '–¥”ô', '–º—ã', '–º–µ', '–±—ã', '–±–µ', '–≥—ã', '–≥–µ', '–∫–∞', '–∫”ô',
            '—É–∫', '–≥–µ–Ω”ô', '–≥—ã–Ω–∞', '—á–∏–∫', '—Ö”ô—Ç—Ç–∞', '”ô–ª–ª”ô', '–≥–æ–º–µ—Ä', '–∫”©–Ω', '–µ–ª',
            '—è–∫–∏', '—è–≥—ä–Ω–∏', '—è–∏—Å”ô', '”ô–≥”ô—Ä', '—á–∏“£', '—à—É–ª', '–±—É', '–±–µ—Ä', '–∏–∫–µ', '”©—á',
            '“Ø–∑', '–±–∞—à–∫–∞', '–±–∞—Ä–ª—ã–∫', '–±”©—Ç–µ–Ω', '“ª–∏—á', '–±–µ—Ä–Ω–∏–Ω–¥–∏', '–±–µ—Ä–∫–∞–π—á–∞–Ω',
            '–±–∞—Ä', '—é–∫', '”ô–π–µ', '—é–∫', '—à—É–ª–∞–π', '—Ç“Ø–≥–µ–ª', '–º”©–º–∫–∏–Ω', '–∫–∏—Ä”ô–∫', '–±—É–ª–∞',
            '–¥–∏–ø', '–¥–∏', '–¥”ô', '—Ç”ô', '–º—ã–Ω–∏', '”ô–π—Ç”ô', '–∫“Ø—Ä—Å”ô—Ç”ô', '–∞–ª—ã–Ω–∞', '–±–∏—Ä–µ–ª–µ–ø'
        }

    def clean_html(self, text):
        """–£–¥–∞–ª–µ–Ω–∏–µ HTML-—Ä–∞–∑–º–µ—Ç–∫–∏"""
        if not text:
            return ""

        text = html.unescape(text)
        soup = BeautifulSoup(text, 'html.parser')
        clean_text = soup.get_text(separator=' ')
        return clean_text

    def remove_special_characters(self, text):
        """–£–¥–∞–ª–µ–Ω–∏–µ —Å–ª—É–∂–µ–±–Ω—ã—Ö —Å–∏–º–≤–æ–ª–æ–≤"""
        if not text:
            return ""

        # –£–¥–∞–ª—è–µ–º email –∏ URL
        text = re.sub(r'\S*@\S*\s?', '', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)

        # –î–ª—è —Ç–∞—Ç–∞—Ä—Å–∫–æ–≥–æ —Å–æ—Ö—Ä–∞–Ω—è–µ–º —Å–ø–µ—Ü–∏—Ñ–∏—á–µ—Å–∫–∏–µ —Å–∏–º–≤–æ–ª—ã
        if self.language == 'tatar':
            text = re.sub(r'[^\w\s\.\,\!\?\-\:\(\)”ô”©“Ø“ó“£“ª]', '', text)
        else:
            text = re.sub(r'[^\w\s\.\,\!\?\-\:\(\)]', '', text)

        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def normalize_whitespace(self, text):
        """–°—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏—è –ø—Ä–æ–±–µ–ª—å–Ω—ã—Ö —Å–∏–º–≤–æ–ª–æ–≤"""
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def remove_stopwords_func(self, text):
        """–§–∏–ª—å—Ç—Ä–∞—Ü–∏—è —Å—Ç–æ–ø-—Å–ª–æ–≤"""
        if not text or not self.remove_stopwords:
            return text

        # –ü—Ä–æ—Å—Ç–∞—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è (–º–æ–∂–Ω–æ –∑–∞–º–µ–Ω–∏—Ç—å –Ω–∞ –±–æ–ª–µ–µ —Å–ª–æ–∂–Ω—É—é)
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in self.stop_words]

        return ' '.join(filtered_words)

    def clean_text(self, text, **kwargs):
        """–û—Å–Ω–æ–≤–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –æ—á–∏—Å—Ç–∫–∏ —Ç–µ–∫—Å—Ç–∞"""
        if not text:
            return ""

        cleaned_text = text

        # –ü—Ä–∏–º–µ–Ω—è–µ–º –æ—á–∏—Å—Ç–∫—É HTML
        if kwargs.get('clean_html', True):
            cleaned_text = self.clean_html(cleaned_text)

        # –£–¥–∞–ª—è–µ–º —Å–ø–µ—Ü–∏–∞–ª—å–Ω—ã–µ —Å–∏–º–≤–æ–ª—ã
        if kwargs.get('remove_special_chars', True):
            cleaned_text = self.remove_special_characters(cleaned_text)

        # –ù–æ—Ä–º–∞–ª–∏–∑—É–µ–º –ø—Ä–æ–±–µ–ª—ã
        if kwargs.get('normalize_whitespace', True):
            cleaned_text = self.normalize_whitespace(cleaned_text)

        # –ü—Ä–∏–≤–æ–¥–∏–º –∫ –Ω–∏–∂–Ω–µ–º—É —Ä–µ–≥–∏—Å—Ç—Ä—É
        if kwargs.get('lowercase', self.lowercase):
            cleaned_text = cleaned_text.lower()

        # –£–¥–∞–ª—è–µ–º —Å—Ç–æ–ø-—Å–ª–æ–≤–∞
        if kwargs.get('remove_stopwords', self.remove_stopwords):
            cleaned_text = self.remove_stopwords_func(cleaned_text)

        return cleaned_text


# Usage example with Tatar text
if __name__ == "__main__":
    # Tatar sample containing stray <p> tags
    tatar_text = "–ú–∏–Ω <p> “ª”ô–º —Å–∏–Ω –±—É –∫”©–Ω–Ω–µ –±–∏–∫ —è—Ö—à—ã –≤–∞–∫—ã—Ç “Ø—Ç–∫”ô—Ä–¥–µ–∫<p>. –£–ª –±–µ–∑–≥”ô –∫–∏–ª”ô—á”ô–∫."

    # The same sentence is cleaned with the Russian and the Tatar settings.
    cleaner_ru = TextCleaner(language='russian')
    cleaner_tt = TextCleaner(language='tatar')

    print("–¢–∞—Ç–∞—Ä—Å–∫–∏–π —Ç–µ–∫—Å—Ç:", tatar_text, sep="\n")
    print("\n–û—á–∏—Å—Ç–∫–∞ –∫–∞–∫ —Ä—É—Å—Å–∫–∏–π —Ç–µ–∫—Å—Ç:", cleaner_ru.clean_text(tatar_text), sep="\n")
    print("\n–û—á–∏—Å—Ç–∫–∞ –∫–∞–∫ —Ç–∞—Ç–∞—Ä—Å–∫–∏–π —Ç–µ–∫—Å—Ç:", cleaner_tt.clean_text(tatar_text), sep="\n")

–¢–∞—Ç–∞—Ä—Å–∫–∏–π —Ç–µ–∫—Å—Ç:
–ú–∏–Ω <p> “ª”ô–º —Å–∏–Ω –±—É –∫”©–Ω–Ω–µ –±–∏–∫ —è—Ö—à—ã –≤–∞–∫—ã—Ç “Ø—Ç–∫”ô—Ä–¥–µ–∫<p>. –£–ª –±–µ–∑–≥”ô –∫–∏–ª”ô—á”ô–∫.

–û—á–∏—Å—Ç–∫–∞ –∫–∞–∫ —Ä—É—Å—Å–∫–∏–π —Ç–µ–∫—Å—Ç:
–º–∏–Ω “ª”ô–º —Å–∏–Ω –±—É –∫”©–Ω–Ω–µ –±–∏–∫ —è—Ö—à—ã –≤–∞–∫—ã—Ç “Ø—Ç–∫”ô—Ä–¥–µ–∫ . —É–ª –±–µ–∑–≥”ô –∫–∏–ª”ô—á”ô–∫.

–û—á–∏—Å—Ç–∫–∞ –∫–∞–∫ —Ç–∞—Ç–∞—Ä—Å–∫–∏–π —Ç–µ–∫—Å—Ç:
–∫”©–Ω–Ω–µ —è—Ö—à—ã –≤–∞–∫—ã—Ç “Ø—Ç–∫”ô—Ä–¥–µ–∫ . –±–µ–∑–≥”ô –∫–∏–ª”ô—á”ô–∫.


In [None]:
import json

def read_jsonl_basic(filename):
    """Read a JSON Lines file into a list.

    Every non-blank line of the UTF-8 file is parsed with ``json.loads``;
    blank lines are skipped.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        stripped_lines = (raw.strip() for raw in source)
        return [json.loads(entry) for entry in stripped_lines if entry]

# Usage: load the crawled articles and attach a cleaned copy of every text
all_news = read_jsonl_basic('/content/lenta_news_20251006_192459.jsonl')

cleaner = TextCleaner(language='tatar')

# The records are updated in place with a new 'text_clean' field.
for news_item in all_news:
    news_item['text_clean'] = cleaner.clean_text(news_item['text'])

# Same record objects as in all_news, in the same order.
cleaned_all_news = list(all_news)

# for item in cleaned_all_news:
#   print(item)

# NOTE(review): the corpus is built from the raw 'text' field, not from
# 'text_clean' - confirm this is intended.
corpus = [item["text"] for item in cleaned_all_news]
print(corpus)

['–†–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à–ª–∞–¥—ã, –∞–Ω–¥–∞ –±–∞–Ω–∞–Ω “Ø—Å—Ç–µ—Ä”ô—á”ô–∫–ª”ô—Ä. –ë—É —Ö–∞–∫—Ç–∞ –†–æ—Å—Å–∏—è –∞–≤—ã–ª —Ö—É“ó–∞–ª—ã–≥—ã –º–∏–Ω–∏—Å—Ç—Ä—ã –û–∫—Å–∞–Ω–∞ –õ—É—Ç ¬´–ë–∏–æ–ø—Ä–æ–º¬ª —Ñ–æ—Ä—É–º—ã–Ω—ã“£ —Ç”©–ø —Å—Ç—Ä–∞—Ç–µ–≥–∏–∫ —Å–µ—Å—Å–∏—è—Å–µ–Ω–¥”ô —Ö”ô–±”ô—Ä –∏—Ç—Ç–µ . ¬´–ë–µ–∑–¥”ô —Ö”ô–∑–µ—Ä –±–∞–Ω–∞–Ω “Ø—Å—Ç–µ—Ä“Ø ”©—á–µ–Ω –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à–ª—ã–π¬ª, ‚Äì –¥–∏–ø –º–∏–Ω–∏—Å—Ç—Ä —Å“Ø–∑–ª”ô—Ä–µ–Ω –∫–∏—Ç–µ—Ä”ô –¢–ê–°–°. –£–∑–≥–∞–Ω –µ–ª–Ω—ã“£ –Ω–æ—è–±—Ä–µ–Ω–¥”ô –õ—É—Ç —Ö”ô–±”ô—Ä –∏—Ç–∫”ô–Ω—á”ô, –†–æ—Å—Å–∏—è –§–µ–¥–µ—Ä–∞—Ü–∏—è—Å–µ–Ω–¥”ô, –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω–¥–∞–≥—ã –∫–µ–±–µ–∫, –±–∞–Ω–∞–Ω “Ø—Å—Ç–µ—Ä–µ—Ä–≥”ô –ø–ª–∞–Ω–ª–∞—à—Ç—ã—Ä–∞–ª–∞—Ä. 2025 –µ–ª–Ω—ã“£ –∏—é–ª–µ–Ω–¥”ô –†–æ—Å—Å–∏—è –•”©–∫“Ø–º”ô—Ç–µ –±–∞–Ω–∞–Ω–Ω–∞—Ä–Ω—ã –∏–ª–Ω–µ“£ –∞–≤—ã–ª —Ö—É“ó–∞–ª—ã–≥—ã –ø—Ä–æ–¥—É–∫—Ü–∏—è—Å–µ –∏—Å–µ–º–ª–µ–≥–µ–Ω”ô –∫–µ—Ä—Ç—Ç–µ. –õ—É—Ç —Å”©–π–ª”ô–≥”ô–Ω—á”ô, —ç–ª–µ–≥—Ä”ô–∫ –º–∏–Ω–∏—Å—Ç—Ä–ª—ã–∫–∫–∞ —Ç”©—Ä–ª–µ “Ø—Å–µ–º–ª–µ–∫–ª”ô—

### Этап 3. Проектирование универсального модуля предобработки

In [None]:
import re
import json
from typing import Dict, List, Any, Optional

class UniversalPreprocessor:
    """
    –£–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω—ã–π –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–æ—Ä –¥–ª—è –ø—Ä–∏–≤–µ–¥–µ–Ω–∏—è —Ç–µ–∫—Å—Ç–∞ –∫ –µ–¥–∏–Ω–æ–º—É —Å—Ç–∞–Ω–¥–∞—Ä—Ç—É
    """

    def __init__(self, config_path: Optional[str] = None):
        """
        –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –ø—Ä–µ–ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–∞

        Args:
            config_path: –ø—É—Ç—å –∫ JSON —Ñ–∞–π–ª—É —Å –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–µ–π
        """
        self.default_config = {
            # –°—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏—è –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏
            'normalize_punctuation': True,
            'normalize_whitespace': True,

            # –ó–∞–º–µ–Ω–∞ –Ω–∞ —Ç–æ–∫–µ–Ω—ã
            'replace_numbers': True,
            'replace_urls': True,
            'replace_emails': True,
            'replace_currencies': True,
            'replace_phone_numbers': True,

            # –û–±—Ä–∞–±–æ—Ç–∫–∞ —Å–æ–∫—Ä–∞—â–µ–Ω–∏–π
            'expand_abbreviations': True,
            'expand_special_abbreviations': True,

            # –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω—ã–µ –Ω–∞—Å—Ç—Ä–æ–π–∫–∏
            'preserve_sentence_endings': True,
            'remove_extra_spaces': True
        }

        # –ó–∞–≥—Ä—É–∂–∞–µ–º –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é
        self.config = self.default_config.copy()
        if config_path:
            self.load_config(config_path)

        # –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∏—Ä—É–µ–º –ø—Ä–∞–≤–∏–ª–∞
        self._init_patterns()
        self._init_abbreviations()

    def load_config(self, config_path: str) -> None:
        """–ó–∞–≥—Ä—É–∑–∫–∞ –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏ –∏–∑ JSON —Ñ–∞–π–ª–∞"""
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                user_config = json.load(f)
            self.config.update(user_config)
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –∑–∞–≥—Ä—É–∑–∫–∏ –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏: {e}. –ò—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è –Ω–∞—Å—Ç—Ä–æ–π–∫–∏ –ø–æ —É–º–æ–ª—á–∞–Ω–∏—é.")

    def save_config(self, config_path: str) -> None:
        """–°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ç–µ–∫—É—â–µ–π –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏ –≤ —Ñ–∞–π–ª"""
        try:
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(self.config, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏: {e}")

    def _init_patterns(self) -> None:
        """–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Ä–µ–≥—É–ª—è—Ä–Ω—ã—Ö –≤—ã—Ä–∞–∂–µ–Ω–∏–π"""

        # –ß–∏—Å–ª–∏—Ç–µ–ª—å–Ω—ã–µ
        self.number_patterns = [
            # –¶–µ–ª—ã–µ —á–∏—Å–ª–∞ —Å —Ä–∞–∑–¥–µ–ª–∏—Ç–µ–ª—è–º–∏
            (r'\b\d{1,3}(?:[ ,]\d{3})+\b', '<NUM>'),  # 1,000, 10 000
            # –î–µ—Å—è—Ç–∏—á–Ω—ã–µ –¥—Ä–æ–±–∏
            (r'\b\d+[.,]\d+\b', '<NUM>'),  # 3.14, 2,5
            # –ü—Ä–æ—Å—Ç—ã–µ —á–∏—Å–ª–∞
            (r'\b\d+\b', '<NUM>'),  # 123, 45
        ]

        # URL –∏ email
        self.url_pattern = (r'https?://[^\s]+|www\.[^\s]+', '<URL>')
        self.email_pattern = (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '<EMAIL>')

        # –ù–æ–º–µ—Ä–∞ —Ç–µ–ª–µ—Ñ–æ–Ω–æ–≤
        self.phone_patterns = [
            (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '<PHONE>'),  # 123-456-7890
            (r'\b\d{1,2}[-.]?\d{3}[-.]?\d{2}[-.]?\d{2}\b', '<PHONE>'),  # 8-912-34-56
            (r'\b\+?[\d\s\-\(\)]{7,}\b', '<PHONE>'),  # –ú–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–µ —Ñ–æ—Ä–º–∞—Ç—ã
        ]

        # –í–∞–ª—é—Ç—ã
        self.currency_pattern = (r'\b\d+[.,]?\d*\s*[‚ÇΩ$‚Ç¨¬£¬•]\b|\b[‚ÇΩ$‚Ç¨¬£¬•]\s*\d+[.,]?\d*\b', '<CURRENCY>')

        # –ü—É–Ω–∫—Ç—É–∞—Ü–∏—è
        self.punctuation_replacements = {
            '‚Ä¶': '...',
            '¬´': '"',
            '¬ª': '"',
            '‚Äû': '"',
            '‚Äú': '"',
            '‚Äù': '"',
            '‚Äò': "'",
            '‚Äô': "'",
            '‚Äì': '-',
            '‚Äî': '-',
        }

    def _init_abbreviations(self) -> None:
        """–ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è —Å–ª–æ–≤–∞—Ä—è —Å–æ–∫—Ä–∞—â–µ–Ω–∏–π"""

        # –û–±—â–µ—è–∑—ã–∫–æ–≤—ã–µ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è (—Ä—É—Å—Å–∫–∏–π)
        self.common_abbreviations = {
            '—Ç.–µ.': '—Ç–æ –µ—Å—Ç—å',
            '—Ç.–¥.': '—Ç–∞–∫ –¥–∞–ª–µ–µ',
            '—Ç.–ø.': '—Ç–æ–º—É –ø–æ–¥–æ–±–Ω–æ–µ',
            '–∏ —Ç.–¥.': '–∏ —Ç–∞–∫ –¥–∞–ª–µ–µ',
            '–∏ —Ç.–ø.': '–∏ —Ç–æ–º—É –ø–æ–¥–æ–±–Ω–æ–µ',
            '–∏ –¥—Ä.': '–∏ –¥—Ä—É–≥–∏–µ',
            '–∏ –ø—Ä.': '–∏ –ø—Ä–æ—á–∏–µ',
            '—Ç.–∫.': '—Ç–∞–∫ –∫–∞–∫',
            '—Ç.–Ω.': '—Ç–∞–∫ –Ω–∞–∑—ã–≤–∞–µ–º—ã–π',
            '—Ç.–æ.': '—Ç–∞–∫–∏–º –æ–±—Ä–∞–∑–æ–º',
            '—Å.–≥.': '—Å–µ–≥–æ –≥–æ–¥–∞',
            '–Ω.—ç.': '–Ω–∞—à–µ–π —ç—Ä—ã',
            '–¥–æ –Ω.—ç.': '–¥–æ –Ω–∞—à–µ–π —ç—Ä—ã',
            '–≥.': '–≥–æ–¥',
            '–≥–≥.': '–≥–æ–¥—ã',
            '–≤–≤.': '–≤–µ–∫–∞',
            '—Å–º.': '—Å–º–æ—Ç—Ä–∏',
            '—Å—Ç—Ä.': '—Å—Ç—Ä–∞–Ω–∏—Ü–∞',
            '—Ä–∏—Å.': '—Ä–∏—Å—É–Ω–æ–∫',
            '–Ω–∞–ø—Ä.': '–Ω–∞–ø—Ä–∏–º–µ—Ä',
            '–º–∏–Ω.': '–º–∏–Ω—É—Ç',
            '—Å–µ–∫.': '—Å–µ–∫—É–Ω–¥',
            '—á.': '—á–∞—Å',
            '–∫–≥.': '–∫–∏–ª–æ–≥—Ä–∞–º–º',
            '—Å–º.': '—Å–∞–Ω—Ç–∏–º–µ—Ç—Ä',
            '–º.': '–º–µ—Ç—Ä',
            '–∫–º.': '–∫–∏–ª–æ–º–µ—Ç—Ä',
            '—Ä—É–±.': '—Ä—É–±–ª—å',
            '–¥–æ–ª–ª.': '–¥–æ–ª–ª–∞—Ä',
            '–µ–≤—Ä–æ.': '–µ–≤—Ä–æ',
        }

        # –°–ø–µ—Ü–∏–∞–ª—å–Ω—ã–µ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è (–º–æ–∂–Ω–æ —Ä–∞—Å—à–∏—Ä–∏—Ç—å)
        self.special_abbreviations = {
            '–°–®–ê': '–°–æ–µ–¥–∏–Ω–µ–Ω–Ω—ã–µ –®—Ç–∞—Ç—ã –ê–º–µ—Ä–∏–∫–∏',
            '–†–§': '–†–æ—Å—Å–∏–π—Å–∫–∞—è –§–µ–¥–µ—Ä–∞—Ü–∏—è',
            '–°–°–°–†': '–°–æ—é–∑ –°–æ–≤–µ—Ç—Å–∫–∏—Ö –°–æ—Ü–∏–∞–ª–∏—Å—Ç–∏—á–µ—Å–∫–∏—Ö –†–µ—Å–ø—É–±–ª–∏–∫',
            '–û–û–ù': '–û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è –û–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã—Ö –ù–∞—Ü–∏–π',
            '–ù–ê–¢–û': '–û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏—è –°–µ–≤–µ—Ä–æ–∞—Ç–ª–∞–Ω—Ç–∏—á–µ—Å–∫–æ–≥–æ –¥–æ–≥–æ–≤–æ—Ä–∞',
        }

    def normalize_punctuation(self, text: str) -> str:
        """–°—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏—è –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏"""
        if not self.config['normalize_punctuation']:
            return text

        for old, new in self.punctuation_replacements.items():
            text = text.replace(old, new)

        return text

    def normalize_whitespace(self, text: str) -> str:
        """–°—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏—è –ø—Ä–æ–±–µ–ª—å–Ω—ã—Ö —Å–∏–º–≤–æ–ª–æ–≤"""
        if not self.config['normalize_whitespace']:
            return text

        # –ó–∞–º–µ–Ω—è–µ–º –≤—Å–µ –ø—Ä–æ–±–µ–ª—å–Ω—ã–µ —Å–∏–º–≤–æ–ª—ã –Ω–∞ –æ–±—ã—á–Ω—ã–µ –ø—Ä–æ–±–µ–ª—ã
        text = re.sub(r'\s+', ' ', text)

        # –£–±–∏—Ä–∞–µ–º –ø—Ä–æ–±–µ–ª—ã –≤–æ–∫—Ä—É–≥ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏
        text = re.sub(r'\s+([.,!?;:])', r'\1', text)
        text = re.sub(r'([(])\s+', r'\1', text)
        text = re.sub(r'\s+([)])', r'\1', text)

        # –î–æ–±–∞–≤–ª—è–µ–º –ø—Ä–æ–±–µ–ª—ã –ø–æ—Å–ª–µ –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏, –µ—Å–ª–∏ –Ω—É–∂–Ω–æ
        if self.config['preserve_sentence_endings']:
            text = re.sub(r'([.!?])([–ê-–ØA-Z])', r'\1 \2', text)

        return text.strip()

    def replace_with_tokens(self, text: str) -> str:
        """–ó–∞–º–µ–Ω–∞ —á–∏—Å–ª–∏—Ç–µ–ª—å–Ω—ã—Ö, URL, email –Ω–∞ —Ç–æ–∫–µ–Ω—ã"""

        # URL
        if self.config['replace_urls']:
            pattern, replacement = self.url_pattern
            text = re.sub(pattern, replacement, text)

        # Email
        if self.config['replace_emails']:
            pattern, replacement = self.email_pattern
            text = re.sub(pattern, replacement, text)

        # –ß–∏—Å–ª–∏—Ç–µ–ª—å–Ω—ã–µ
        if self.config['replace_numbers']:
            for pattern, replacement in self.number_patterns:
                text = re.sub(pattern, replacement, text)

        # –ù–æ–º–µ—Ä–∞ —Ç–µ–ª–µ—Ñ–æ–Ω–æ–≤
        if self.config['replace_phone_numbers']:
            for pattern, replacement in self.phone_patterns:
                text = re.sub(pattern, replacement, text)

        # –í–∞–ª—é—Ç—ã
        if self.config['replace_currencies']:
            pattern, replacement = self.currency_pattern
            text = re.sub(pattern, replacement, text)

        return text

    def expand_abbreviations(self, text: str) -> str:
        """–†–∞—Å–∫—Ä—ã—Ç–∏–µ —Å–æ–∫—Ä–∞—â–µ–Ω–∏–π"""
        if not self.config['expand_abbreviations']:
            return text

        # –û–±—â–µ—è–∑—ã–∫–æ–≤—ã–µ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è
        for abbrev, expansion in self.common_abbreviations.items():
            # –ò—Å–ø–æ–ª—å–∑—É–µ–º –≥—Ä–∞–Ω–∏—Ü—ã —Å–ª–æ–≤–∞ –¥–ª—è —Ç–æ—á–Ω–æ–≥–æ —Å–æ–≤–ø–∞–¥–µ–Ω–∏—è
            pattern = r'\b' + re.escape(abbrev) + r'\b'
            text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)

        # –°–ø–µ—Ü–∏–∞–ª—å–Ω—ã–µ —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è
        if self.config['expand_special_abbreviations']:
            for abbrev, expansion in self.special_abbreviations.items():
                pattern = r'\b' + re.escape(abbrev) + r'\b'
                text = re.sub(pattern, expansion, text)

        return text

    def preprocess_text(self, text: str, **kwargs) -> str:
        """
        –û—Å–Ω–æ–≤–Ω–∞—è —Ñ—É–Ω–∫—Ü–∏—è –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∏ —Ç–µ–∫—Å—Ç–∞

        Args:
            text: –∏—Å—Ö–æ–¥–Ω—ã–π —Ç–µ–∫—Å—Ç
            **kwargs: –≤—Ä–µ–º–µ–Ω–Ω—ã–µ –Ω–∞—Å—Ç—Ä–æ–π–∫–∏ (–ø–µ—Ä–µ–æ–ø—Ä–µ–¥–µ–ª—è—é—Ç –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é)

        Returns:
            –æ–±—Ä–∞–±–æ—Ç–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç
        """
        if not text:
            return ""

        # –°–æ—Ö—Ä–∞–Ω—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—É—é –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é
        original_config = self.config.copy()

        try:
            # –í—Ä–µ–º–µ–Ω–Ω–æ –ø—Ä–∏–º–µ–Ω—è–µ–º –ø–µ—Ä–µ–¥–∞–Ω–Ω—ã–µ –Ω–∞—Å—Ç—Ä–æ–π–∫–∏
            for key, value in kwargs.items():
                if key in self.config:
                    self.config[key] = value

            # –ü—Ä–∏–º–µ–Ω—è–µ–º –≤—Å–µ —ç—Ç–∞–ø—ã –æ–±—Ä–∞–±–æ—Ç–∫–∏
            processed_text = text

            # 1. –°—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏—è –ø—É–Ω–∫—Ç—É–∞—Ü–∏–∏
            processed_text = self.normalize_punctuation(processed_text)

            # 2. –ó–∞–º–µ–Ω–∞ –Ω–∞ —Ç–æ–∫–µ–Ω—ã
            processed_text = self.replace_with_tokens(processed_text)

            # 3. –†–∞—Å–∫—Ä—ã—Ç–∏–µ —Å–æ–∫—Ä–∞—â–µ–Ω–∏–π
            processed_text = self.expand_abbreviations(processed_text)

            # 4. –°—Ç–∞–Ω–¥–∞—Ä—Ç–∏–∑–∞—Ü–∏—è –ø—Ä–æ–±–µ–ª–æ–≤
            processed_text = self.normalize_whitespace(processed_text)

            return processed_text

        finally:
            # –í–æ—Å—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—É—é –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—é
            self.config = original_config

    def batch_preprocess(self, texts: List[str], **kwargs) -> List[str]:
        """–ü–∞–∫–µ—Ç–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞ —Å–ø–∏—Å–∫–∞ —Ç–µ–∫—Å—Ç–æ–≤"""
        return [self.preprocess_text(text, **kwargs) for text in texts]


# Convenience wrapper for one-off preprocessing: the caller does not have to create a processor instance
def quick_preprocess(text: str, **kwargs) -> str:
    """Preprocess a single text with a freshly created ``UniversalPreprocessor``.

    A new processor is constructed on every call, so for many texts it is
    cheaper to create one processor and call its methods directly.

    Args:
        text: source text.
        **kwargs: processing settings forwarded to ``preprocess_text``.

    Returns:
        The processed text.
    """
    return UniversalPreprocessor().preprocess_text(text, **kwargs)


# Usage example and smoke test
if __name__ == "__main__":
    # Sample inputs: prices/URLs/e-mails, abbreviations and a phone number,
    # quote styles and numbers, acronyms with a date, and a Tatar sentence.
    test_texts = [
        "–¶–µ–Ω–∞: 1,000.50‚ÇΩ. –°–∞–π—Ç: https://example.com. Email: test@mail.ru",
        "–¢–µ–∫—Å—Ç —Å —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è–º–∏: —Ç.–µ., –∏ —Ç.–¥., —Å–º. —Å—Ç—Ä. 5. –ù–æ–º–µ—Ä: 8-912-345-67-89",
        "–†–∞–∑–Ω—ã–µ –∫–∞–≤—ã—á–∫–∏ ¬´–ø—Ä–∏–º–µ—Ä¬ª –∏ ‚Äú–ø—Ä–∏–º–µ—Ä‚Äù. –ß–∏—Å–ª–∞: 3.14, 2,5, 1000000",
        "–û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏: –°–®–ê, –û–û–ù, –ù–ê–¢–û. –î–∞—Ç–∞: 2023 –≥.",
        "–ú–∏–Ω “ª”ô–º —Å–∏–Ω –±—É –∫”©–Ω–Ω–µ –±–∏–∫ —è—Ö—à—ã –≤–∞–∫—ã—Ç “Ø—Ç–∫”ô—Ä–¥–µ–∫. –£–ª –±–µ–∑–≥”ô –∫–∏–ª”ô—á”ô–∫. 5 —Ç–∞–ø–∫—ã—Ä."
    ]

    # One processor instance is shared by all samples.
    processor = UniversalPreprocessor()
    separator = "-" * 50

    print("=== –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ Universal Preprocessor ===\n")

    # Show every sample before and after preprocessing.
    for sample_number, sample in enumerate(test_texts, start=1):
        print(f"–ü—Ä–∏–º–µ—Ä {sample_number}:")
        print(f"–î–æ: {sample}")
        print(f"–ü–æ—Å–ª–µ: {processor.preprocess_text(sample)}")
        print(separator)

    # Dump the processor's effective configuration.
    print("\n=== –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è ===")
    for option, setting in processor.config.items():
        print(f"{option}: {setting}")

    # Example configuration to be written out as a JSON file.
    config_example = {
        'normalize_punctuation': True,
        'normalize_whitespace': True,
        'replace_numbers': True,
        'replace_urls': True,
        'replace_emails': True,
        'expand_abbreviations': True,
        'expand_special_abbreviations': False  # special abbreviations are left unexpanded
    }

    # Persist the example configuration.
    with open('preprocessor_config.json', 'w', encoding='utf-8') as config_file:
        json.dump(config_example, config_file, ensure_ascii=False, indent=2)

    print("\n–ü—Ä–∏–º–µ—Ä –∫–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω –≤ 'preprocessor_config.json'")

=== –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ Universal Preprocessor ===

–ü—Ä–∏–º–µ—Ä 1:
–î–æ: –¶–µ–Ω–∞: 1,000.50‚ÇΩ. –°–∞–π—Ç: https://example.com. Email: test@mail.ru
–ü–æ—Å–ª–µ: –¶–µ–Ω–∞: <NUM>.<NUM>‚ÇΩ. –°–∞–π—Ç: <URL> Email: <EMAIL>
--------------------------------------------------
–ü—Ä–∏–º–µ—Ä 2:
–î–æ: –¢–µ–∫—Å—Ç —Å —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è–º–∏: —Ç.–µ., –∏ —Ç.–¥., —Å–º. —Å—Ç—Ä. 5. –ù–æ–º–µ—Ä: 8-912-345-67-89
–ü–æ—Å–ª–µ: –¢–µ–∫—Å—Ç —Å —Å–æ–∫—Ä–∞—â–µ–Ω–∏—è–º–∏: —Ç.–µ., –∏ —Ç.–¥., —Å–º. —Å—Ç—Ä. <NUM>. –ù–æ–º–µ—Ä: <NUM>-<NUM>-<NUM>-<NUM>-<NUM>
--------------------------------------------------
–ü—Ä–∏–º–µ—Ä 3:
–î–æ: –†–∞–∑–Ω—ã–µ –∫–∞–≤—ã—á–∫–∏ ¬´–ø—Ä–∏–º–µ—Ä¬ª –∏ ‚Äú–ø—Ä–∏–º–µ—Ä‚Äù. –ß–∏—Å–ª–∞: 3.14, 2,5, 1000000
–ü–æ—Å–ª–µ: –†–∞–∑–Ω—ã–µ –∫–∞–≤—ã—á–∫–∏ "–ø—Ä–∏–º–µ—Ä" –∏ "–ø—Ä–∏–º–µ—Ä". –ß–∏—Å–ª–∞: <NUM>, <NUM>, <NUM>
--------------------------------------------------
–ü—Ä–∏–º–µ—Ä 4:
–î–æ: –û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏: –°–®–ê, –û–û–ù, –ù–ê–¢–û. –î–∞—Ç–∞: 2023 –≥.
–ü–æ—Å–ª–µ: –û—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏: –°–æ–µ–¥–∏–

### Этап 4. Сравнительный анализ методов токенизации и нормализации


In [None]:
# Build preprocessed copies of the cleaned news records.
# - One processor is created up front instead of one per record
#   (quick_preprocess constructs a new UniversalPreprocessor on every call).
# - Each record is shallow-copied, so cleaned_all_news keeps its cleaned text
#   and re-running this cell does not preprocess already-preprocessed text.
news_preprocessor = UniversalPreprocessor()

preprocessed_data = [
    dict(record, text=news_preprocessor.preprocess_text(record['text']))
    for record in cleaned_all_news
]

In [None]:
import pandas as pd
import numpy as np
import time
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
import re
import os

# Core libraries for the tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers import normalizers

import sentencepiece as spm

class SubwordModelComparator:
    """Train several subword tokenizers on one corpus and compare their metrics.

    Supported models: BPE, WordPiece and Unigram through the Hugging Face
    ``tokenizers`` package, and Unigram through ``sentencepiece``. Every
    evaluated (model, vocabulary size) pair appends one metrics dict to
    ``self.results``.

    Attributes:
        corpus: the texts used both for training and for evaluation.
        results: metrics dicts accumulated by ``run_comparison`` (the list is
            not reset between calls).
    """

    # Largest vocabulary requested from SentencePiece ("safe limit"): larger
    # sizes are clamped when training and skipped by run_comparison.
    MAX_SENTENCEPIECE_VOCAB = 3000

    def __init__(self, corpus: List[str]):
        self.corpus = corpus
        self.results = []

    # ------------------------------------------------------------------ helpers

    def _non_blank_texts(self) -> List[str]:
        """Return the corpus texts that contain non-whitespace characters."""
        return [text for text in self.corpus if text.strip()]

    def _encode_corpus(self, tokenizer) -> List[List[str]]:
        """Tokenize every non-blank corpus text with a Hugging Face tokenizer."""
        return [tokenizer.encode(text).tokens for text in self._non_blank_texts()]

    def _aligned_pairs(self, tokenized_texts: List[List[str]]) -> list:
        """Pair each original text with its own token list.

        The training methods return one token list per NON-BLANK text, so the
        lists are normally zipped with the non-blank texts. When exactly one
        list per corpus entry is supplied, they are zipped with the full corpus
        instead. Pairs with a blank text or an empty token list are dropped
        TOGETHER, so the remaining originals and token lists stay aligned.
        """
        if len(tokenized_texts) == len(self.corpus):
            texts = self.corpus
        else:
            texts = self._non_blank_texts()
        return [
            (text, tokens)
            for text, tokens in zip(texts, tokenized_texts)
            if text.strip() and tokens
        ]

    def prepare_corpus_file(self):
        """Write the non-blank corpus texts, one per line, to a temporary file.

        Returns:
            The file name (``'temp_corpus.txt'``), used as sentencepiece input.
        """
        with open('temp_corpus.txt', 'w', encoding='utf-8') as f:
            for text in self.corpus:
                if text.strip():  # skip blank texts
                    f.write(text.strip() + '\n')
        return 'temp_corpus.txt'

    # ------------------------------------------------------------------ metrics

    def calculate_fragmentation(self, tokenized_texts: List[List[str]]) -> float:
        """Return the percentage of tokens classified as fragments.

        A token counts as a fragment when it starts with ``'##'``, contains the
        SentencePiece marker, or is shorter than 3 characters.
        NOTE(review): the SentencePiece marker is attached to word-initial
        pieces, so this heuristic also counts them as fragments - confirm
        that this is the intended definition.

        Returns:
            Fragment share in percent, or ``0`` when there are no tokens.
        """
        total_words = 0
        fragmented_words = 0

        for tokens in tokenized_texts:
            total_words += len(tokens)
            fragmented_words += sum(
                1
                for token in tokens
                if token.startswith('##') or '‚ñÅ' in token or len(token) < 3
            )

        return (fragmented_words / total_words * 100) if total_words > 0 else 0

    def calculate_compression_ratio(self, original_texts: List[str], tokenized_texts: List[List[str]]) -> float:
        """Return subword tokens per whitespace-separated word.

        Returns:
            ``total subword tokens / total words``; ``1`` when the originals
            contain no words.
        """
        total_original_tokens = sum(len(text.split()) for text in original_texts if text.strip())
        total_subword_tokens = sum(len(tokens) for tokens in tokenized_texts)

        return total_subword_tokens / total_original_tokens if total_original_tokens > 0 else 1

    def calculate_reconstruction_accuracy(self, original_texts: List[str], reconstructed_texts: List[str]) -> float:
        """Return the share of texts whose reconstruction matches the original.

        Both sides are passed through ``normalize_text`` before comparison.
        Pairs in which either side is blank are skipped but still counted in
        the denominator. The first mismatches are printed for debugging.

        Returns:
            Accuracy in percent over ``min(len(original), len(reconstructed))``.
        """
        correct = 0
        total = min(len(original_texts), len(reconstructed_texts))

        for i, (orig, rec) in enumerate(zip(original_texts, reconstructed_texts)):
            if not orig.strip() or not rec.strip():
                continue

            orig_norm = self.normalize_text(orig)
            rec_norm = self.normalize_text(rec)

            if orig_norm == rec_norm:
                correct += 1
            elif i < 2 and total > 1:
                # Debug aid: show the difference for the first texts only.
                print(f"üîç –û—Ç–ª–∏—á–∏–µ –≤ —Ç–µ–∫—Å—Ç–µ {i}:")
                print(f"   –û—Ä–∏–≥–∏–Ω–∞–ª: '{orig_norm[:80]}...'")
                print(f"   –í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π: '{rec_norm[:80]}...'")
                print(f"   –°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: {orig_norm == rec_norm}")

        accuracy = (correct / total * 100) if total > 0 else 0
        print(f"   –¢–æ—á–Ω–æ—Å—Ç—å —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏: {accuracy:.1f}% ({correct}/{total})")
        return accuracy

    def normalize_text(self, text: str) -> str:
        """Lower-case the text and canonicalize spacing around punctuation.

        Removes whitespace before ``. , ! ? ; :``, after ``(``, before ``)``
        and on both sides of ``-``, then collapses whitespace runs to a single
        space.
        """
        text = text.lower().strip()

        # No whitespace before punctuation marks.
        text = re.sub(r'\s+([.,!?;:])', r'\1', text)

        # No whitespace just inside parentheses.
        text = re.sub(r'\(\s+', '(', text)
        text = re.sub(r'\s+\)', ')', text)

        # No whitespace around hyphens.
        text = re.sub(r'\s*-\s*', '-', text)

        # Collapse multiple spaces.
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def reconstruct_text_for_model(self, tokens: List[str], model_name: str) -> str:
        """Rebuild a text from its tokens using the model's joining convention.

        Args:
            tokens: tokens of one text.
            model_name: ``"Unigram_SP"``, ``"WordPiece"``, ``"BPE"`` or
                ``"Unigram_HF"``; any other name falls back to a space join.

        Returns:
            The reconstructed text passed through ``normalize_text``, or ``""``
            for an empty token list.
        """
        if not tokens:
            return ""

        if model_name == "Unigram_SP":
            # SentencePiece: the marker denotes the start of a word.
            text = ''.join(tokens).replace('‚ñÅ', ' ').strip()

        elif model_name == "WordPiece":
            # WordPiece: '##' denotes the continuation of a word.
            pieces = [tokens[0]]
            for token in tokens[1:]:
                pieces.append(token[2:] if token.startswith('##') else ' ' + token)
            text = ''.join(pieces)

        elif model_name in ("BPE", "Unigram_HF"):
            # Space join, dropping possible continuation-marker artifacts.
            text = ' '.join(tokens).replace(' ##', '')

        else:
            # Fallback
            text = ' '.join(tokens)

        # The reconstructed text is normalized before being returned.
        return self.normalize_text(text)

    def debug_tokenization(self, model_name: str, tokenized_texts: List[List[str]], num_examples: int = 1):
        """Print tokenization examples (original, tokens, reconstruction).

        Originals are paired with their own token lists, and the printed match
        flag uses the same normalization as the accuracy metric.
        """
        print(f"   üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ ({model_name}):")

        for original, tokens in self._aligned_pairs(tokenized_texts)[:max(num_examples, 0)]:
            reconstructed = self.reconstruct_text_for_model(tokens, model_name)

            # Only the beginning of each text is shown, for brevity.
            orig_short = original[:50] + "..." if len(original) > 50 else original
            rec_short = reconstructed[:50] + "..." if len(reconstructed) > 50 else reconstructed

            print(f"     –û—Ä–∏–≥–∏–Ω–∞–ª: '{orig_short}'")
            print(f"     –¢–æ–∫–µ–Ω—ã: {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
            print(f"     –í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π: '{rec_short}'")
            print(f"     –°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: {self.normalize_text(original) == reconstructed}")
            print()

    # ----------------------------------------------------------------- training
    # Return annotations that name third-party types are quoted so they are not
    # evaluated when the class body is executed.

    def train_bpe(self, vocab_size: int, min_frequency: int) -> "Tuple[Tokenizer, List[List[str]]]":
        """Train a BPE model and tokenize the corpus with it.

        Returns:
            ``(tokenizer, token lists of the non-blank corpus texts)``.
        """
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

        # Whitespace pre-tokenizer is used instead of ByteLevel for Cyrillic text.
        tokenizer.pre_tokenizer = Whitespace()

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
            show_progress=False,  # no progress bar, keeps the output clean
        )

        tokenizer.train_from_iterator(self.corpus, trainer)

        return tokenizer, self._encode_corpus(tokenizer)

    def train_wordpiece(self, vocab_size: int, min_frequency: int) -> "Tuple[Tokenizer, List[List[str]]]":
        """Train a WordPiece model and tokenize the corpus with it.

        Returns:
            ``(tokenizer, token lists of the non-blank corpus texts)``.
        """
        tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()

        trainer = WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
            show_progress=False,
            continuing_subword_prefix="##"
        )

        tokenizer.train_from_iterator(self.corpus, trainer)

        return tokenizer, self._encode_corpus(tokenizer)

    def train_unigram_sentencepiece(self, vocab_size: int, min_frequency: int) -> "Tuple[spm.SentencePieceProcessor, List[List[str]]]":
        """Train a Unigram model with sentencepiece and tokenize the corpus.

        The requested vocabulary size is clamped to
        ``MAX_SENTENCEPIECE_VOCAB``. ``min_frequency`` is accepted for
        signature parity with the other trainers and is not used. The corpus
        file and the model/vocab files are removed afterwards (best effort).

        Returns:
            ``(loaded processor, token lists of the non-blank corpus texts)``.
        """
        corpus_file = self.prepare_corpus_file()

        actual_vocab_size = min(vocab_size, self.MAX_SENTENCEPIECE_VOCAB)
        model_prefix = f"unigram_model_{actual_vocab_size}"

        try:
            spm.SentencePieceTrainer.train(
                input=corpus_file,
                model_prefix=model_prefix,
                vocab_size=actual_vocab_size,
                model_type='unigram',
                character_coverage=0.9995,
                pad_id=0,
                unk_id=1,
                bos_id=2,
                eos_id=3,
                pad_piece='[PAD]',
                unk_piece='[UNK]',
                bos_piece='[CLS]',
                eos_piece='[SEP]',
                user_defined_symbols=['[MASK]'],
                split_by_whitespace=True,
                max_sentence_length=10000,
            )

            # Load the trained model.
            sp = spm.SentencePieceProcessor()
            sp.load(f"{model_prefix}.model")

            tokenized_texts = [sp.encode_as_pieces(text) for text in self._non_blank_texts()]

            return sp, tokenized_texts

        finally:
            # Remove temporary files; a failed removal must not mask the result
            # or the original error, but only file-system errors are ignored.
            for file in [corpus_file, f"{model_prefix}.model", f"{model_prefix}.vocab"]:
                if os.path.exists(file):
                    try:
                        os.remove(file)
                    except OSError:
                        pass

    def train_unigram_huggingface(self, vocab_size: int, min_frequency: int) -> "Tuple[Tokenizer, List[List[str]]]":
        """Train a Unigram model through Hugging Face (no vocabulary clamp).

        ``min_frequency`` is accepted for signature parity with the other
        trainers and is not passed to the trainer.

        Returns:
            ``(tokenizer, token lists of the non-blank corpus texts)``.
        """
        tokenizer = Tokenizer(Unigram())
        tokenizer.pre_tokenizer = Whitespace()

        trainer = UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
            unk_token="[UNK]",
            max_piece_length=16,
        )

        tokenizer.train_from_iterator(self.corpus, trainer)

        return tokenizer, self._encode_corpus(tokenizer)

    # --------------------------------------------------------------- evaluation

    def analyze_corpus(self):
        """Print basic corpus statistics.

        Returns:
            The number of unique lower-cased whitespace-separated words.
        """
        print("\nüîç –ê–ù–ê–õ–ò–ó –ö–û–†–ü–£–°–ê:")
        print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–∫—Å—Ç–æ–≤: {len(self.corpus)}")

        # Text length statistics (in words).
        text_lengths = [len(text.split()) for text in self.corpus if text.strip()]
        if text_lengths:
            print(f"–°—Ä–µ–¥–Ω—è—è –¥–ª–∏–Ω–∞ —Ç–µ–∫—Å—Ç–∞: {np.mean(text_lengths):.1f} —Å–ª–æ–≤")

        # Unique word statistics.
        all_words = []
        for text in self.corpus:
            if text.strip():
                all_words.extend(text.lower().split())

        unique_words = set(all_words)
        print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤ –≤ –∫–æ—Ä–ø—É—Å–µ: {len(unique_words)}")
        print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ª–æ–≤: {len(all_words)}")

        return len(unique_words)

    def evaluate_model(self, model_name: str, tokenized_texts: List[List[str]],
                      processing_time: float, vocab_size: int) -> Dict:
        """Compute all metrics for one trained model.

        Args:
            model_name: model label, also selects the reconstruction rule.
            tokenized_texts: token lists produced for the corpus.
            processing_time: training + tokenization time in seconds.
            vocab_size: requested vocabulary size.

        Returns:
            A dict with the keys ``model``, ``vocab_size``,
            ``actual_vocab_size``, ``fragmentation_rate``,
            ``compression_ratio``, ``reconstruction_accuracy``,
            ``processing_time_sec`` and ``avg_token_length``.
        """
        # Originals and token lists are filtered as pairs so they stay aligned.
        pairs = self._aligned_pairs(tokenized_texts)
        valid_original = [text for text, _ in pairs]
        valid_tokenized = [tokens for _, tokens in pairs]

        if not valid_tokenized:
            return {
                'model': model_name,
                'vocab_size': vocab_size,
                'actual_vocab_size': 0,
                'fragmentation_rate': 0,
                'compression_ratio': 1,
                'reconstruction_accuracy': 0,
                'processing_time_sec': round(processing_time, 2),
                'avg_token_length': 0
            }

        fragmentation = self.calculate_fragmentation(valid_tokenized)
        compression_ratio = self.calculate_compression_ratio(valid_original, valid_tokenized)

        reconstructed_texts = [
            self.reconstruct_text_for_model(tokens, model_name)
            for tokens in valid_tokenized
        ]
        reconstruction_accuracy = self.calculate_reconstruction_accuracy(valid_original, reconstructed_texts)

        # Token statistics.
        all_tokens = [token for tokens in valid_tokenized for token in tokens]
        avg_token_length = np.mean([len(token) for token in all_tokens]) if all_tokens else 0
        actual_vocab_size = len(set(all_tokens))

        return {
            'model': model_name,
            'vocab_size': vocab_size,
            'actual_vocab_size': actual_vocab_size,
            'fragmentation_rate': round(fragmentation, 2),
            'compression_ratio': round(compression_ratio, 3),
            'reconstruction_accuracy': round(reconstruction_accuracy, 2),
            'processing_time_sec': round(processing_time, 2),
            'avg_token_length': round(avg_token_length, 2)
        }

    def run_comparison(self, vocab_sizes: List[int] = None, min_frequency: int = 2) -> pd.DataFrame:
        """Train and evaluate every model for every vocabulary size.

        Args:
            vocab_sizes: vocabulary sizes to try. When ``None``, up to three
                sizes are derived from the number of unique corpus words and
                sizes below 1000 are discarded.
            min_frequency: minimum frequency forwarded to the trainers.

        Returns:
            A DataFrame built from ``self.results``. A model that fails to
            train is reported on stdout and skipped.
        """
        if vocab_sizes is None:
            # Pick suitable sizes automatically.
            unique_words = self.analyze_corpus()
            vocab_sizes = [
                min(5000, unique_words),       # small
                min(10000, unique_words * 2),  # medium
                min(20000, unique_words * 3)   # large
            ]
            vocab_sizes = [size for size in vocab_sizes if size >= 1000]
            print(f"üéØ –ê–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏ –≤—ã–±—Ä–∞–Ω—ã —Ä–∞–∑–º–µ—Ä—ã —Å–ª–æ–≤–∞—Ä—è: {vocab_sizes}")

        print("üöÄ –ó–ê–ü–£–°–ö –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê –ü–û–î–°–õ–û–í–ù–´–• –ú–û–î–ï–õ–ï–ô")
        print("=" * 70)

        for vocab_size in vocab_sizes:
            print(f"\nüìä –†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: {vocab_size}")
            print("-" * 50)

            # (progress banner, model name, training method)
            candidates = [
                ("–û–±—É—á–∞–µ—Ç—Å—è BPE...", "BPE", self.train_bpe),
                ("–û–±—É—á–∞–µ—Ç—Å—è WordPiece...", "WordPiece", self.train_wordpiece),
                ("–û–±—É—á–∞–µ—Ç—Å—è Unigram (HF)...", "Unigram_HF", self.train_unigram_huggingface),
            ]
            # SentencePiece Unigram is run for small vocabularies only.
            if vocab_size <= self.MAX_SENTENCEPIECE_VOCAB:
                candidates.append(
                    ("–û–±—É—á–∞–µ—Ç—Å—è Unigram (SP)...", "Unigram_SP", self.train_unigram_sentencepiece)
                )

            for banner, model_name, train in candidates:
                print(banner, end=" ")
                start_time = time.time()
                try:
                    _, tokenized = train(vocab_size, min_frequency)
                    elapsed = time.time() - start_time
                    self.debug_tokenization(model_name, tokenized)
                    self.results.append(
                        self.evaluate_model(model_name, tokenized, elapsed, vocab_size)
                    )
                    print("‚úì")
                except Exception as e:
                    # One failing model must not abort the whole comparison.
                    print(f"‚úó –û—à–∏–±–∫–∞: {e}")

        return pd.DataFrame(self.results)

def main():
    """Run the subword-model comparison on the preprocessed corpus and report it.

    Builds the corpus from the ``"text"`` entry of every item in the
    module-level ``preprocessed_data``, runs the comparison for three
    vocabulary sizes, writes the metrics table to
    ``subword_models_comparison.csv``, prints it, and finally prints the best
    model per metric among the rows with more than 100 distinct tokens.
    """
    # `preprocessed_data` is not defined in this function; it must already exist
    # at module level when main() is called.
    corpus = [item["text"] for item in preprocessed_data]
    print(f"üìö –†–∞–∑–º–µ—Ä –∫–æ—Ä–ø—É—Å–∞: {len(corpus)} —Ç–µ–∫—Å—Ç–æ–≤")

    comparator = SubwordModelComparator(corpus)
    results_df = comparator.run_comparison(vocab_sizes=[2000, 5000, 10000], min_frequency=2)

    # Persist the metrics before printing anything derived from them.
    results_df.to_csv('subword_models_comparison.csv', index=False, encoding='utf-8')

    rule = "=" * 80
    print("\n" + rule)
    print("–†–ï–ó–£–õ–¨–¢–ê–¢–´ –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê")
    print(rule)
    print(results_df.to_string(index=False))

    # Nothing to rank when no model produced a result row.
    if results_df.empty:
        return

    print("\n" + rule)
    print("–ö–õ–Æ–ß–ï–í–´–ï –í–´–í–û–î–´")
    print(rule)

    # Only rows with more than 100 distinct tokens take part in the ranking.
    successful_models = results_df[results_df['actual_vocab_size'] > 100]
    if successful_models.empty:
        return

    best_fragmentation = successful_models.loc[successful_models['fragmentation_rate'].idxmin()]
    best_compression = successful_models.loc[successful_models['compression_ratio'].idxmin()]
    best_reconstruction = successful_models.loc[successful_models['reconstruction_accuracy'].idxmax()]

    print(f"–õ—É—á—à–∞—è –ø–æ –º–∏–Ω–∏–º–∏–∑–∞—Ü–∏–∏ —Ñ—Ä–∞–≥–º–µ–Ω—Ç–∞—Ü–∏–∏: {best_fragmentation['model']} ({best_fragmentation['fragmentation_rate']}%)")
    print(f"–õ—É—á—à–∞—è –ø–æ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏ —Å–∂–∞—Ç–∏—è: {best_compression['model']} ({best_compression['compression_ratio']})")
    print(f"–õ—É—á—à–∞—è –ø–æ —Ç–æ—á–Ω–æ—Å—Ç–∏ —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏: {best_reconstruction['model']} ({best_reconstruction['reconstruction_accuracy']}%)")

# Run the comparison only when this code is executed as the main module.
if __name__ == "__main__":
    main()

üìö –†–∞–∑–º–µ—Ä –∫–æ—Ä–ø—É—Å–∞: 330 —Ç–µ–∫—Å—Ç–æ–≤
üöÄ –ó–ê–ü–£–°–ö –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê –ü–û–î–°–õ–û–í–ù–´–• –ú–û–î–ï–õ–ï–ô

üìä –†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: 2000
--------------------------------------------------
–û–±—É—á–∞–µ—Ç—Å—è BPE...    üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ (BPE):
     –û—Ä–∏–≥–∏–Ω–∞–ª: '–†–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à–ª–∞–¥—ã, –∞–Ω–¥–∞ –±–∞–Ω–∞...'
     –¢–æ–∫–µ–Ω—ã: ['–†–æ—Å—Å–∏—è–¥”ô', '–±–µ—Ä–µ–Ω—á–µ', '—Ç–µ–ø–ª–∏—Ü–∞', '—Ç”©–∑–µ–ª”ô', '–±–∞—à', '–ª–∞–¥—ã', ',', '–∞–Ω–¥–∞', '–±–∞–Ω–∞–Ω', '“Ø—Å—Ç–µ—Ä']...
     –í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π: '—Ä–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à –ª–∞–¥—ã, –∞–Ω–¥–∞ –±–∞–Ω...'
     –°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: False

üîç –û—Ç–ª–∏—á–∏–µ –≤ —Ç–µ–∫—Å—Ç–µ 0:
   –û—Ä–∏–≥–∏–Ω–∞–ª: '—Ä–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à–ª–∞–¥—ã, –∞–Ω–¥–∞ –±–∞–Ω–∞–Ω “Ø—Å—Ç–µ—Ä”ô—á”ô–∫–ª”ô—Ä. –±—É —Ö–∞–∫—Ç–∞ —Ä–æ—Å—Å–∏...'
   –í–æ—Å—Å—Ç–

In [None]:
import pandas as pd
import numpy as np
import time
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple
import re
import os
import json

# –û—Å–Ω–æ–≤–Ω—ã–µ –±–∏–±–ª–∏–æ—Ç–µ–∫–∏ –¥–ª—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–æ–≤
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers import normalizers

import sentencepiece as spm

class SubwordModelComparator:
    def __init__(self, corpus: List[str]):
        self.corpus = corpus
        self.results = []
        self.trained_models = {}  # üÜï –•—Ä–∞–Ω–∏–º –æ–±—É—á–µ–Ω–Ω—ã–µ –º–æ–¥–µ–ª–∏

    def prepare_corpus_file(self):
        """–°–æ—Ö—Ä–∞–Ω—è–µ—Ç –∫–æ—Ä–ø—É—Å –≤–æ –≤—Ä–µ–º–µ–Ω–Ω—ã–π —Ñ–∞–π–ª –¥–ª—è sentencepiece"""
        with open('temp_corpus.txt', 'w', encoding='utf-8') as f:
            for text in self.corpus:
                if text.strip():  # –ü—Ä–æ–ø—É—Å–∫–∞–µ–º –ø—É—Å—Ç—ã–µ —Å—Ç—Ä–æ–∫–∏
                    f.write(text.strip() + '\n')
        return 'temp_corpus.txt'

    # üÜï –ú–ò–ù–ò–ú–ê–õ–¨–ù–û–ï –î–û–ë–ê–í–õ–ï–ù–ò–ï: –ú–µ—Ç–æ–¥—ã –¥–ª—è —Ä–∞–±–æ—Ç—ã —Å –æ–±—É—á–µ–Ω–Ω—ã–º–∏ –º–æ–¥–µ–ª—è–º–∏
    def save_model(self, model, model_name: str, vocab_size: int):
        """–°–æ—Ö—Ä–∞–Ω—è–µ—Ç –æ–±—É—á–µ–Ω–Ω—É—é –º–æ–¥–µ–ª—å –≤ —Ñ–∞–π–ª"""
        try:
            model_key = f"{model_name}_vocab_{vocab_size}"

            if model_name == "Unigram_SP":
                # SentencePiece –º–æ–¥–µ–ª—å
                model.save(f"saved_{model_key}.model")
                print(f"‚úÖ –ú–æ–¥–µ–ª—å {model_key} —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞")
            else:
                # Hugging Face —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä—ã
                model.save(f"saved_{model_key}")
                print(f"‚úÖ –ú–æ–¥–µ–ª—å {model_key} —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞")

            # –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å –≤ —Å–ª–æ–≤–∞—Ä—å
            self.trained_models[model_key] = model
            return True

        except Exception as e:
            print(f"‚ùå –û—à–∏–±–∫–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏—è {model_name}: {e}")
            return False

    def get_trained_model(self, model_name: str, vocab_size: int):
        """–í–æ–∑–≤—Ä–∞—â–∞–µ—Ç –æ–±—É—á–µ–Ω–Ω—É—é –º–æ–¥–µ–ª—å –ø–æ –∏–º–µ–Ω–∏ –∏ —Ä–∞–∑–º–µ—Ä—É —Å–ª–æ–≤–∞—Ä—è"""
        model_key = f"{model_name}_vocab_{vocab_size}"
        return self.trained_models.get(model_key)

    def tokenize_text(self, text: str, model_name: str, vocab_size: int):
        """–¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ—Ç —Ç–µ–∫—Å—Ç —Å –ø–æ–º–æ—â—å—é –æ–±—É—á–µ–Ω–Ω–æ–π –º–æ–¥–µ–ª–∏"""
        model = self.get_trained_model(model_name, vocab_size)
        if not model:
            print(f"‚ùå –ú–æ–¥–µ–ª—å {model_name} —Å vocab_size={vocab_size} –Ω–µ –Ω–∞–π–¥–µ–Ω–∞")
            return None

        try:
            if model_name == "Unigram_SP":
                return model.encode_as_pieces(text)
            else:
                encoding = model.encode(text)
                return encoding.tokens
        except Exception as e:
            print(f"‚ùå –û—à–∏–±–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏: {e}")
            return None

    # üÜï –ú–ò–ù–ò–ú–ê–õ–¨–ù–û–ï –ò–ó–ú–ï–ù–ï–ù–ò–ï: –î–æ–±–∞–≤–ª—è–µ–º —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤ –º–µ—Ç–æ–¥—ã –æ–±—É—á–µ–Ω–∏—è
    def train_bpe(self, vocab_size: int, min_frequency: int, save_model: bool = True) -> Tuple[Tokenizer, List[List[str]]]:
        """Train a BPE tokenizer on the corpus and tokenize every non-blank text.

        Non-initial subwords are marked with the ``"##"`` continuation prefix.
        The ``"BPE"`` branch of ``reconstruct_text_for_model`` removes
        ``" ##"`` to glue pieces back together; without the prefix, split words
        are rejoined with a space and never match the original text.

        Args:
            vocab_size: Target vocabulary size passed to the trainer.
            min_frequency: Minimum pair frequency passed to the trainer.
            save_model: When true, hand the tokenizer to ``self.save_model``
                under the name ``"BPE"``.

        Returns:
            ``(tokenizer, tokenized_texts)`` where ``tokenized_texts`` holds
            one token list per corpus text that is not whitespace-only.
        """
        tokenizer = Tokenizer(BPE(unk_token="[UNK]", continuing_subword_prefix="##"))

        # Whitespace pre-tokenization instead of ByteLevel (Cyrillic corpus).
        tokenizer.pre_tokenizer = Whitespace()

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
            show_progress=False,
            # Same marker as on the model so word-internal pieces are recognisable.
            continuing_subword_prefix="##",
        )

        tokenizer.train_from_iterator(self.corpus, trainer)

        # Whitespace-only texts are skipped, so the result can be shorter than the corpus.
        tokenized_texts = [
            tokenizer.encode(text).tokens
            for text in self.corpus
            if text.strip()
        ]

        if save_model:
            self.save_model(tokenizer, "BPE", vocab_size)

        return tokenizer, tokenized_texts

    def train_wordpiece(self, vocab_size: int, min_frequency: int, save_model: bool = True) -> Tuple[Tokenizer, List[List[str]]]:
        """Train a WordPiece tokenizer on the corpus and tokenize every non-blank text.

        Args:
            vocab_size: Target vocabulary size passed to the trainer.
            min_frequency: Minimum frequency passed to the trainer.
            save_model: When true, hand the tokenizer to ``self.save_model``
                under the name ``"WordPiece"``.

        Returns:
            ``(tokenizer, tokenized_texts)`` where ``tokenized_texts`` holds
            one token list per corpus text that is not whitespace-only.
        """
        wp_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
        wp_tokenizer.pre_tokenizer = Whitespace()

        wp_trainer = WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
            show_progress=False,
            continuing_subword_prefix="##"
        )
        wp_tokenizer.train_from_iterator(self.corpus, wp_trainer)

        # Whitespace-only texts are skipped, so the result can be shorter than the corpus.
        tokenized_texts = [
            wp_tokenizer.encode(document).tokens
            for document in self.corpus
            if document.strip()
        ]

        if save_model:
            self.save_model(wp_tokenizer, "WordPiece", vocab_size)

        return wp_tokenizer, tokenized_texts

    def train_unigram_sentencepiece(self, vocab_size: int, min_frequency: int, save_model: bool = True) -> Tuple["spm.SentencePieceProcessor", List[List[str]]]:
        """Train a Unigram model with sentencepiece and tokenize every non-blank text.

        The corpus is written to a temporary file, the model is trained from
        it and loaded, and the temporary corpus/model/vocab files are removed
        again whether or not training succeeded.

        Args:
            vocab_size: Requested vocabulary size; capped at 3000 for training.
            min_frequency: Accepted for signature parity with the other
                ``train_*`` methods; it is not passed to the trainer.
            save_model: When true, hand the loaded processor to
                ``self.save_model`` under the name ``"Unigram_SP"``.

        Returns:
            ``(processor, tokenized_texts)`` where ``tokenized_texts`` holds
            one piece list per corpus text that is not whitespace-only.
        """
        corpus_file = self.prepare_corpus_file()

        # Cap the vocabulary size used for SentencePiece training.
        max_sentencepiece_vocab = 3000
        actual_vocab_size = min(vocab_size, max_sentencepiece_vocab)

        model_prefix = f"unigram_model_{actual_vocab_size}"

        try:
            spm.SentencePieceTrainer.train(
                input=corpus_file,
                model_prefix=model_prefix,
                vocab_size=actual_vocab_size,
                model_type='unigram',
                character_coverage=0.9995,
                pad_id=0,
                unk_id=1,
                bos_id=2,
                eos_id=3,
                pad_piece='[PAD]',
                unk_piece='[UNK]',
                bos_piece='[CLS]',
                eos_piece='[SEP]',
                user_defined_symbols=['[MASK]'],
                split_by_whitespace=True,
                max_sentence_length=10000,
            )

            # Load the model that training just wrote to disk.
            sp = spm.SentencePieceProcessor()
            sp.load(f"{model_prefix}.model")

            # Whitespace-only texts are skipped, so the result can be shorter than the corpus.
            tokenized_texts = [
                sp.encode_as_pieces(text)
                for text in self.corpus
                if text.strip()
            ]

            # Registered under the requested size, not the capped one.
            if save_model:
                self.save_model(sp, "Unigram_SP", vocab_size)

            return sp, tokenized_texts

        finally:
            # Best-effort cleanup of the temporary files. Only filesystem
            # errors (including "file not found") are ignored; anything else,
            # e.g. KeyboardInterrupt, still propagates.
            for file in [corpus_file, f"{model_prefix}.model", f"{model_prefix}.vocab"]:
                try:
                    os.remove(file)
                except OSError:
                    pass

    def train_unigram_huggingface(self, vocab_size: int, min_frequency: int, save_model: bool = True) -> Tuple[Tokenizer, List[List[str]]]:
        """Train a Hugging Face Unigram tokenizer and tokenize every non-blank text.

        Args:
            vocab_size: Target vocabulary size passed to the trainer.
            min_frequency: Accepted for signature parity with the other
                ``train_*`` methods; it is not passed to the trainer.
            save_model: When true, hand the tokenizer to ``self.save_model``
                under the name ``"Unigram_HF"``.

        Returns:
            ``(tokenizer, tokenized_texts)`` where ``tokenized_texts`` holds
            one token list per corpus text that is not whitespace-only.
        """
        # NOTE(review): with a Whitespace pre-tokenizer the pieces presumably
        # carry no word-boundary or continuation marker, so split words cannot
        # be glued back together later - confirm whether that is intended.
        unigram_tokenizer = Tokenizer(Unigram())
        unigram_tokenizer.pre_tokenizer = Whitespace()

        unigram_trainer = UnigramTrainer(
            vocab_size=vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
            unk_token="[UNK]",
            max_piece_length=16,
        )
        unigram_tokenizer.train_from_iterator(self.corpus, unigram_trainer)

        # Whitespace-only texts are skipped, so the result can be shorter than the corpus.
        tokenized_texts = [
            unigram_tokenizer.encode(document).tokens
            for document in self.corpus
            if document.strip()
        ]

        if save_model:
            self.save_model(unigram_tokenizer, "Unigram_HF", vocab_size)

        return unigram_tokenizer, tokenized_texts

    # üÜï –ú–ò–ù–ò–ú–ê–õ–¨–ù–û–ï –î–û–ë–ê–í–õ–ï–ù–ò–ï: –ú–µ—Ç–æ–¥ –¥–ª—è –¥–µ–º–æ–Ω—Å—Ç—Ä–∞—Ü–∏–∏ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è
    def demonstrate_usage(self, test_text: str = "–ü—Ä–∏–º–µ—Ä —Ç–µ–∫—Å—Ç–∞ –¥–ª—è —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏"):
        """–î–µ–º–æ–Ω—Å—Ç—Ä–∏—Ä—É–µ—Ç –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏–µ –æ–±—É—á–µ–Ω–Ω—ã—Ö –º–æ–¥–µ–ª–µ–π"""
        print("\n" + "="*60)
        print("üöÄ –î–ï–ú–û–ù–°–¢–†–ê–¶–ò–Ø –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–Ø –û–ë–£–ß–ï–ù–ù–´–• –ú–û–î–ï–õ–ï–ô")
        print("="*60)

        for result in self.results:
            model_name = result['model']
            vocab_size = result['vocab_size']

            tokens = self.tokenize_text(test_text, model_name, vocab_size)
            if tokens:
                print(f"üîπ {model_name} (vocab={vocab_size}):")
                print(f"   –¢–µ–∫—Å—Ç: '{test_text}'")
                print(f"   –¢–æ–∫–µ–Ω—ã: {tokens}")
                print(f"   –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–æ–∫–µ–Ω–æ–≤: {len(tokens)}")
                print()

    # –í—Å–µ –æ—Å—Ç–∞–ª—å–Ω—ã–µ –º–µ—Ç–æ–¥—ã –æ—Å—Ç–∞—é—Ç—Å—è –±–µ–∑ –∏–∑–º–µ–Ω–µ–Ω–∏–π
    def calculate_fragmentation(self, tokenized_texts: List[List[str]]) -> float:
        """Return the percentage of tokens that are counted as fragments.

        A token counts as a fragment when it starts with ``'##'``, contains
        the marker character checked below, or is shorter than three
        characters.

        Args:
            tokenized_texts: Token lists, one per text.

        Returns:
            Fragment share in percent, or ``0`` when there are no tokens.
        """
        all_tokens = [piece for pieces in tokenized_texts for piece in pieces]
        if not all_tokens:
            return 0

        fragment_count = sum(
            1
            for piece in all_tokens
            if piece.startswith('##') or '‚ñÅ' in piece or len(piece) < 3
        )
        return fragment_count / len(all_tokens) * 100

    def calculate_compression_ratio(self, original_texts: List[str], tokenized_texts: List[List[str]]) -> float:
        """Return the number of subword tokens per whitespace-separated word.

        Args:
            original_texts: Source texts; whitespace-only texts contribute no words.
            tokenized_texts: Token lists produced for the texts.

        Returns:
            Total token count divided by total word count, or ``1`` when the
            texts contain no words.
        """
        word_total = 0
        for source in original_texts:
            if source.strip():
                word_total += len(source.split())

        piece_total = sum(map(len, tokenized_texts))

        if word_total > 0:
            return piece_total / word_total
        return 1

    def normalize_text(self, text: str) -> str:
        """Lower-case ``text`` and canonicalise whitespace around punctuation.

        Applied in order: drop whitespace before ``. , ! ? ; :``, after ``(``,
        before ``)`` and around ``-``, then collapse every whitespace run to a
        single space.

        Returns:
            The normalised text without leading or trailing whitespace.
        """
        # Order matters: the generic whitespace collapse has to run last.
        rewrite_rules = (
            (r'\s+([.,!?;:])', r'\1'),
            (r'\(\s+', '('),
            (r'\s+\)', ')'),
            (r'\s*-\s*', '-'),
            (r'\s+', ' '),
        )

        normalized = text.lower().strip()
        for pattern, replacement in rewrite_rules:
            normalized = re.sub(pattern, replacement, normalized)
        return normalized.strip()

    def reconstruct_text_for_model(self, tokens: List[str], model_name: str) -> str:
        """–ü—Ä–∞–≤–∏–ª—å–Ω–∞—è —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏—è —Ç–µ–∫—Å—Ç–∞ –¥–ª—è –∫–∞–∂–¥–æ–π –º–æ–¥–µ–ª–∏"""
        if not tokens:
            return ""

        if model_name == "Unigram_SP":
            text = ''.join(tokens).replace('‚ñÅ', ' ').strip()
        elif model_name == "WordPiece":
            if not tokens:
                return ""
            text = tokens[0]
            for token in tokens[1:]:
                if token.startswith('##'):
                    text += token[2:]
                else:
                    text += ' ' + token
        elif model_name == "BPE":
            text = ' '.join(tokens).replace(' ##', '')
        elif model_name == "Unigram_HF":
            text = ' '.join(tokens).replace(' ##', '')
        else:
            text = ' '.join(tokens)

        return self.normalize_text(text)

    def calculate_reconstruction_accuracy(self, original_texts: List[str], reconstructed_texts: List[str]) -> float:
        """–í—ã—á–∏—Å–ª—è–µ—Ç —Ç–æ—á–Ω–æ—Å—Ç—å —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏ —Å —É–ª—É—á—à–µ–Ω–Ω–æ–π –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏–µ–π –ø—Ä–æ–±–µ–ª–æ–≤"""
        correct = 0
        total = min(len(original_texts), len(reconstructed_texts))

        for i in range(total):
            orig = original_texts[i]
            rec = reconstructed_texts[i]

            if not orig.strip() or not rec.strip():
                continue

            orig_norm = self.normalize_text(orig)
            rec_norm = self.normalize_text(rec)

            if orig_norm == rec_norm:
                correct += 1

        accuracy = (correct / total * 100) if total > 0 else 0
        return accuracy

    def debug_tokenization(self, model_name: str, tokenized_texts: List[List[str]], num_examples: int = 1):
        """–ü–æ–∫–∞–∑—ã–≤–∞–µ—Ç –ø—Ä–∏–º–µ—Ä—ã —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ –¥–ª—è –æ—Ç–ª–∞–¥–∫–∏"""
        print(f"   üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ ({model_name}):")

        for i in range(min(num_examples, len(tokenized_texts))):
            if i < len(self.corpus):
                original = self.corpus[i]
                tokens = tokenized_texts[i]
                reconstructed = self.reconstruct_text_for_model(tokens, model_name)

                orig_short = original[:50] + "..." if len(original) > 50 else original
                rec_short = reconstructed[:50] + "..." if len(reconstructed) > 50 else reconstructed

                print(f"     –û—Ä–∏–≥–∏–Ω–∞–ª: '{orig_short}'")
                print(f"     –¢–æ–∫–µ–Ω—ã: {tokens[:10]}{'...' if len(tokens) > 10 else ''}")
                print(f"     –í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π: '{rec_short}'")
                print(f"     –°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: {original.lower().strip() == reconstructed.lower().strip()}")
                print()

    def analyze_corpus(self):
        """–ê–Ω–∞–ª–∏–∑–∏—Ä—É–µ—Ç –∫–æ—Ä–ø—É—Å –¥–ª—è –ø–æ–Ω–∏–º–∞–Ω–∏—è –ø—Ä–æ–±–ª–µ–º—ã"""
        print("\nüîç –ê–ù–ê–õ–ò–ó –ö–û–†–ü–£–°–ê:")
        print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç–µ–∫—Å—Ç–æ–≤: {len(self.corpus)}")

        text_lengths = [len(text.split()) for text in self.corpus if text.strip()]
        if text_lengths:
            print(f"–°—Ä–µ–¥–Ω—è—è –¥–ª–∏–Ω–∞ —Ç–µ–∫—Å—Ç–∞: {np.mean(text_lengths):.1f} —Å–ª–æ–≤")

        all_words = []
        for text in self.corpus:
            if text.strip():
                all_words.extend(text.lower().split())

        unique_words = set(all_words)
        print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å–ª–æ–≤ –≤ –∫–æ—Ä–ø—É—Å–µ: {len(unique_words)}")
        print(f"–û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–ª–æ–≤: {len(all_words)}")

        return len(unique_words)

    def evaluate_model(self, model_name: str, tokenized_texts: List[List[str]],
                      processing_time: float, vocab_size: int) -> Dict:
        """–í—ã—á–∏—Å–ª—è–µ—Ç –≤—Å–µ –º–µ—Ç—Ä–∏–∫–∏ –¥–ª—è –º–æ–¥–µ–ª–∏ —Å –∏—Å–ø—Ä–∞–≤–ª–µ–Ω–Ω–æ–π —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–µ–π"""

        valid_original = [text for text in self.corpus if text.strip()]
        valid_tokenized = [tokens for tokens in tokenized_texts if tokens]

        if not valid_tokenized:
            return {
                'model': model_name,
                'vocab_size': vocab_size,
                'actual_vocab_size': 0,
                'fragmentation_rate': 0,
                'compression_ratio': 1,
                'reconstruction_accuracy': 0,
                'processing_time_sec': round(processing_time, 2),
                'avg_token_length': 0
            }

        fragmentation = self.calculate_fragmentation(valid_tokenized)
        compression_ratio = self.calculate_compression_ratio(valid_original, valid_tokenized)

        reconstructed_texts = []
        for tokens in valid_tokenized:
            reconstructed = self.reconstruct_text_for_model(tokens, model_name)
            reconstructed_texts.append(reconstructed)

        reconstruction_accuracy = self.calculate_reconstruction_accuracy(valid_original, reconstructed_texts)

        all_tokens = [token for tokens in valid_tokenized for token in tokens]
        avg_token_length = np.mean([len(token) for token in all_tokens]) if all_tokens else 0
        actual_vocab_size = len(set(all_tokens))

        return {
            'model': model_name,
            'vocab_size': vocab_size,
            'actual_vocab_size': actual_vocab_size,
            'fragmentation_rate': round(fragmentation, 2),
            'compression_ratio': round(compression_ratio, 3),
            'reconstruction_accuracy': round(reconstruction_accuracy, 2),
            'processing_time_sec': round(processing_time, 2),
            'avg_token_length': round(avg_token_length, 2)
        }

    def run_comparison(self, vocab_sizes: List[int] = None, min_frequency: int = 2) -> pd.DataFrame:
        """–ó–∞–ø—É—Å–∫–∞–µ—Ç —Å—Ä–∞–≤–Ω–∏—Ç–µ–ª—å–Ω—ã–π –∞–Ω–∞–ª–∏–∑ –º–æ–¥–µ–ª–µ–π —Å –ø—Ä–∞–≤–∏–ª—å–Ω—ã–º–∏ —Ä–∞–∑–º–µ—Ä–∞–º–∏ —Å–ª–æ–≤–∞—Ä—è"""

        if vocab_sizes is None:
            unique_words = self.analyze_corpus()
            vocab_sizes = [
                min(5000, unique_words),
                min(10000, unique_words * 2),
                min(20000, unique_words * 3)
            ]
            vocab_sizes = [size for size in vocab_sizes if size >= 1000]
            print(f"üéØ –ê–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏ –≤—ã–±—Ä–∞–Ω—ã —Ä–∞–∑–º–µ—Ä—ã —Å–ª–æ–≤–∞—Ä—è: {vocab_sizes}")

        print("üöÄ –ó–ê–ü–£–°–ö –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê –ü–û–î–°–õ–û–í–ù–´–• –ú–û–î–ï–õ–ï–ô")
        print("=" * 70)

        for vocab_size in vocab_sizes:
            print(f"\nüìä –†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: {vocab_size}")
            print("-" * 50)

            # BPE –º–æ–¥–µ–ª—å
            print("–û–±—É—á–∞–µ—Ç—Å—è BPE...", end=" ")
            start_time = time.time()
            try:
                bpe_model, bpe_tokens = self.train_bpe(vocab_size, min_frequency)
                bpe_time = time.time() - start_time
                self.debug_tokenization("BPE", bpe_tokens)
                bpe_metrics = self.evaluate_model("BPE", bpe_tokens, bpe_time, vocab_size)
                self.results.append(bpe_metrics)
                print("‚úì")
            except Exception as e:
                print(f"‚úó –û—à–∏–±–∫–∞: {e}")

            # WordPiece –º–æ–¥–µ–ª—å
            print("–û–±—É—á–∞–µ—Ç—Å—è WordPiece...", end=" ")
            start_time = time.time()
            try:
                wp_model, wp_tokens = self.train_wordpiece(vocab_size, min_frequency)
                wp_time = time.time() - start_time
                self.debug_tokenization("WordPiece", wp_tokens)
                wp_metrics = self.evaluate_model("WordPiece", wp_tokens, wp_time, vocab_size)
                self.results.append(wp_metrics)
                print("‚úì")
            except Exception as e:
                print(f"‚úó –û—à–∏–±–∫–∞: {e}")

            # Unigram –º–æ–¥–µ–ª—å —á–µ—Ä–µ–∑ Hugging Face
            print("–û–±—É—á–∞–µ—Ç—Å—è Unigram (HF)...", end=" ")
            start_time = time.time()
            try:
                unigram_model, unigram_tokens = self.train_unigram_huggingface(vocab_size, min_frequency)
                unigram_time = time.time() - start_time
                self.debug_tokenization("Unigram_HF", unigram_tokens)
                unigram_metrics = self.evaluate_model("Unigram_HF", unigram_tokens, unigram_time, vocab_size)
                self.results.append(unigram_metrics)
                print("‚úì")
            except Exception as e:
                print(f"‚úó –û—à–∏–±–∫–∞: {e}")

            # Unigram —á–µ—Ä–µ–∑ SentencePiece
            if vocab_size <= 3000:
                print("–û–±—É—á–∞–µ—Ç—Å—è Unigram (SP)...", end=" ")
                start_time = time.time()
                try:
                    unigram_sp_model, unigram_sp_tokens = self.train_unigram_sentencepiece(vocab_size, min_frequency)
                    unigram_sp_time = time.time() - start_time
                    self.debug_tokenization("Unigram_SP", unigram_sp_tokens)
                    unigram_sp_metrics = self.evaluate_model("Unigram_SP", unigram_sp_tokens, unigram_sp_time, vocab_size)
                    self.results.append(unigram_sp_metrics)
                    print("‚úì")
                except Exception as e:
                    print(f"‚úó –û—à–∏–±–∫–∞: {e}")

        return pd.DataFrame(self.results)

def main():
    """Demonstrate the comparator on a small built-in corpus.

    Trains every model for two vocabulary sizes on 100 repeated sample
    sentences, writes the metrics to ``subword_models_comparison.csv``, prints
    the table, shows how the registered models tokenize a test sentence, and
    finally prints the best model per metric among the rows with more than
    100 distinct tokens.
    """
    # Five sample sentences repeated 20 times -> 100 texts.
    corpus = [
        "–ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ —ç—Ç–æ –≤–∞–∂–Ω—ã–π —Ä–∞–∑–¥–µ–ª –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ –∏–Ω—Ç–µ–ª–ª–µ–∫—Ç–∞",
        "–ù–µ–π—Ä–æ–Ω–Ω—ã–µ —Å–µ—Ç–∏ –≥–ª—É–±–æ–∫–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è –∏—Å–ø–æ–ª—å–∑—É—é—Ç—Å—è –ø–æ–≤—Å–µ–º–µ—Å—Ç–Ω–æ",
        "–¢–æ–∫–µ–Ω–∏–∑–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞ —è–≤–ª—è–µ—Ç—Å—è —Ñ—É–Ω–¥–∞–º–µ–Ω—Ç–∞–ª—å–Ω–æ–π –∑–∞–¥–∞—á–µ–π NLP",
        "–ü–æ–¥—Å–ª–æ–≤–Ω—ã–µ –º–æ–¥–µ–ª–∏ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ –æ–±—Ä–∞–±–∞—Ç—ã–≤–∞—é—Ç —Ä–µ–¥–∫–∏–µ —Å–ª–æ–≤–∞",
        "BPE –∞–ª–≥–æ—Ä–∏—Ç–º —à–∏—Ä–æ–∫–æ –ø—Ä–∏–º–µ–Ω—è–µ—Ç—Å—è –≤ –ø—Ä–µ–¥–æ–±—É—á–µ–Ω–∏–∏ —è–∑—ã–∫–æ–≤—ã—Ö –º–æ–¥–µ–ª–µ–π",
    ] * 20

    print(f"üìö –†–∞–∑–º–µ—Ä –∫–æ—Ä–ø—É—Å–∞: {len(corpus)} —Ç–µ–∫—Å—Ç–æ–≤")

    comparator = SubwordModelComparator(corpus)
    results_df = comparator.run_comparison(vocab_sizes=[2000, 5000], min_frequency=2)

    # Persist the metrics before printing anything derived from them.
    results_df.to_csv('subword_models_comparison.csv', index=False, encoding='utf-8')

    wide_rule = "=" * 80
    narrow_rule = "=" * 60

    print("\n" + wide_rule)
    print("–†–ï–ó–£–õ–¨–¢–ê–¢–´ –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê")
    print(wide_rule)
    print(results_df.to_string(index=False))

    # Show every registered model tokenizing the same sentence.
    test_text = "–ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ –∏ –Ω–µ–π—Ä–æ–Ω–Ω—ã–µ —Å–µ—Ç–∏"
    comparator.demonstrate_usage(test_text)

    print("\n" + narrow_rule)
    print("üéØ –ü–†–ò–ú–ï–† –ò–°–ü–û–õ–¨–ó–û–í–ê–ù–ò–Ø –ö–û–ù–ö–†–ï–¢–ù–û–ô –ú–û–î–ï–õ–ò")
    print(narrow_rule)

    # Address one specific model: BPE registered with vocab_size=2000.
    bpe_tokens = comparator.tokenize_text("–ü—Ä–∏–º–µ—Ä —Ç–µ–∫—Å—Ç–∞ –¥–ª—è –æ–±—Ä–∞–±–æ—Ç–∫–∏", "BPE", 2000)
    if bpe_tokens:
        print(f"BPE –º–æ–¥–µ–ª—å (vocab=2000): {bpe_tokens}")

    # Nothing to rank when no model produced a result row.
    if results_df.empty:
        return

    print("\n" + wide_rule)
    print("–ö–õ–Æ–ß–ï–í–´–ï –í–´–í–û–î–´")
    print(wide_rule)

    # Only rows with more than 100 distinct tokens take part in the ranking.
    successful_models = results_df[results_df['actual_vocab_size'] > 100]
    if successful_models.empty:
        return

    best_fragmentation = successful_models.loc[successful_models['fragmentation_rate'].idxmin()]
    best_compression = successful_models.loc[successful_models['compression_ratio'].idxmin()]
    best_reconstruction = successful_models.loc[successful_models['reconstruction_accuracy'].idxmax()]

    print(f"–õ—É—á—à–∞—è –ø–æ –º–∏–Ω–∏–º–∏–∑–∞—Ü–∏–∏ —Ñ—Ä–∞–≥–º–µ–Ω—Ç–∞—Ü–∏–∏: {best_fragmentation['model']} ({best_fragmentation['fragmentation_rate']}%)")
    print(f"–õ—É—á—à–∞—è –ø–æ —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏ —Å–∂–∞—Ç–∏—è: {best_compression['model']} ({best_compression['compression_ratio']})")
    print(f"–õ—É—á—à–∞—è –ø–æ —Ç–æ—á–Ω–æ—Å—Ç–∏ —Ä–µ–∫–æ–Ω—Å—Ç—Ä—É–∫—Ü–∏–∏: {best_reconstruction['model']} ({best_reconstruction['reconstruction_accuracy']}%)")

# Run the demo only when this code is executed as the main module.
if __name__ == "__main__":
    main()

üìö –†–∞–∑–º–µ—Ä –∫–æ—Ä–ø—É—Å–∞: 100 —Ç–µ–∫—Å—Ç–æ–≤
üöÄ –ó–ê–ü–£–°–ö –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê –ü–û–î–°–õ–û–í–ù–´–• –ú–û–î–ï–õ–ï–ô

üìä –†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: 2000
--------------------------------------------------
–û–±—É—á–∞–µ—Ç—Å—è BPE... ‚úÖ –ú–æ–¥–µ–ª—å BPE_vocab_2000 —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞
   üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ (BPE):
     –û—Ä–∏–≥–∏–Ω–∞–ª: '–ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ —ç—Ç–æ –≤–∞–∂–Ω—ã–π —Ä–∞–∑–¥–µ–ª –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ...'
     –¢–æ–∫–µ–Ω—ã: ['–ú–∞—à–∏–Ω–Ω–æ–µ', '–æ–±—É—á–µ–Ω–∏–µ', '—ç—Ç–æ', '–≤–∞–∂–Ω—ã–π', '—Ä–∞–∑–¥–µ–ª', '–∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ', '–∏–Ω—Ç–µ–ª–ª–µ–∫—Ç–∞']
     –í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π: '–º–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ —ç—Ç–æ –≤–∞–∂–Ω—ã–π —Ä–∞–∑–¥–µ–ª –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω–æ–≥–æ...'
     –°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: True

‚úì
–û–±—É—á–∞–µ—Ç—Å—è WordPiece... ‚úÖ –ú–æ–¥–µ–ª—å WordPiece_vocab_2000 —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞
   üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ (WordPiece):
     –û—Ä–∏–≥–∏–Ω–∞–ª: '–ú–

In [None]:
# After training: build the corpus from `preprocessed_data` (defined elsewhere
# in the notebook) and run the comparison for two vocabulary sizes.
corpus = [item["text"] for item in preprocessed_data]
comparator = SubwordModelComparator(corpus)
results_df = comparator.run_comparison(vocab_sizes=[2000, 5000])

# Tokenize a sentence with one specific registered model.
tokens = comparator.tokenize_text("–ú–∏–Ω —è—Ä–∞—Ç–∞–º —Å–∏–Ω–µ –¢–∞—Ç–∞—Ä—Å—Ç–∞–Ω", "WordPiece", 2000)
print(f"–¢–æ–∫–µ–Ω—ã: {tokens}")

# Or fetch the model object itself and call it directly.
# NOTE(review): despite its name, `bpe_model` holds the "Unigram_HF" model -
# confirm which model was meant before relying on this variable.
bpe_model = comparator.get_trained_model("Unigram_HF", 2000)
if bpe_model:
    encoding = bpe_model.encode("–¢–µ–∫—Å—Ç")
    tokens = encoding.tokens

üöÄ –ó–ê–ü–£–°–ö –°–†–ê–í–ù–ò–¢–ï–õ–¨–ù–û–ì–û –ê–ù–ê–õ–ò–ó–ê –ü–û–î–°–õ–û–í–ù–´–• –ú–û–î–ï–õ–ï–ô

üìä –†–∞–∑–º–µ—Ä —Å–ª–æ–≤–∞—Ä—è: 2000
--------------------------------------------------
–û–±—É—á–∞–µ—Ç—Å—è BPE... ‚úÖ –ú–æ–¥–µ–ª—å BPE_vocab_2000 —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞
   üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ (BPE):
     –û—Ä–∏–≥–∏–Ω–∞–ª: '–†–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à–ª–∞–¥—ã, –∞–Ω–¥–∞ –±–∞–Ω–∞...'
     –¢–æ–∫–µ–Ω—ã: ['–†–æ—Å—Å–∏—è–¥”ô', '–±–µ—Ä–µ–Ω—á–µ', '—Ç–µ–ø–ª–∏—Ü–∞', '—Ç”©–∑–µ–ª”ô', '–±–∞—à', '–ª–∞–¥—ã', ',', '–∞–Ω–¥–∞', '–±–∞–Ω–∞–Ω', '“Ø—Å—Ç–µ—Ä']...
     –í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π: '—Ä–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–ª”ô –±–∞—à –ª–∞–¥—ã, –∞–Ω–¥–∞ –±–∞–Ω...'
     –°–æ–≤–ø–∞–¥–µ–Ω–∏–µ: False

‚úì
–û–±—É—á–∞–µ—Ç—Å—è WordPiece... ‚úÖ –ú–æ–¥–µ–ª—å WordPiece_vocab_2000 —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞
   üîç –ü—Ä–∏–º–µ—Ä —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–∏ (WordPiece):
     –û—Ä–∏–≥–∏–Ω–∞–ª: '–†–æ—Å—Å–∏—è–¥”ô –±–µ—Ä–µ–Ω—á–µ —Ç–µ–ø–ª–∏—Ü–∞ —Ç”©–∑–µ–