# ShohnomaLLM - Сбор данных

Этот ноутбук собирает таджикскую поэзию из нескольких источников:
- **Ganjoor.net** - классическая персидская поэзия (с транслитерацией)
- **Adabiyot.tj** - современная таджикская поэзия

## Инструкции
1. Запустите все ячейки последовательно
2. Данные будут сохранены в Google Drive

In [None]:
# Mount Google Drive (the notebook saves the collected data there)
from google.colab import drive
drive.mount('/content/drive')

# Create the project's raw-data directory on Drive (-p: also creates parents,
# no error if the directory already exists)
!mkdir -p /content/drive/MyDrive/ShohnomaLLM/data/raw

In [None]:
# Install dependencies (-q keeps pip output quiet)
# NOTE(review): beautifulsoup4 and lxml are not used by the cells visible
# here — presumably needed by another scraper; confirm before removing.
!pip install requests beautifulsoup4 lxml tqdm -q

In [None]:
# Clone the repository (if it exists on GitHub)
# !git clone https://github.com/username/ShohnomaLLM.git

# Or write the code directly from this notebook: switch to /content and
# create the package directories that later %%writefile cells write into
import os
os.chdir('/content')
!mkdir -p ShohnomaLLM/scraper/utils ShohnomaLLM/scraper/sources

## Транслитератор
Конвертация персидского текста (арабская графика) в таджикскую кириллицу.

In [None]:
%%writefile /content/ShohnomaLLM/scraper/utils/transliterate.py

import re
import json
from typing import Optional
from pathlib import Path


class PersianToTajikTransliterator:
    """Rule-based transliterator from Persian (Arabic script) to Tajik Cyrillic.

    Text is normalised, split into runs of Arabic-script characters and runs
    of everything else, and each Arabic-script run is converted either through
    the ``WORD_DICT`` of known words or, failing that, letter by letter.

    Short vowels are normally not written in Persian, so the letter-by-letter
    fallback is only an approximation unless the input carries vowel marks.
    """

    # Letter -> Cyrillic for letters that do not depend on their position.
    CONSONANTS = {
        'ب': 'б', 'پ': 'п', 'ت': 'т', 'ث': 'с',
        'ج': 'ҷ', 'چ': 'ч', 'ح': 'ҳ', 'خ': 'х',
        'د': 'д', 'ذ': 'з', 'ر': 'р', 'ز': 'з',
        'ژ': 'ж', 'س': 'с', 'ش': 'ш', 'ص': 'с',
        'ض': 'з', 'ط': 'т', 'ظ': 'з', 'ع': 'ъ',
        'غ': 'ғ', 'ف': 'ф', 'ق': 'қ', 'ک': 'к',
        'ك': 'к', 'گ': 'г', 'ل': 'л', 'م': 'м',
        'ن': 'н', 'ه': 'ҳ', 'ی': 'й', 'ي': 'й',
    }

    # Whole-word overrides; checked before the letter rules are applied.
    WORD_DICT = {
        'من': 'ман', 'تو': 'ту', 'او': 'ӯ', 'ما': 'мо',
        'است': 'аст', 'بود': 'буд', 'شد': 'шуд',
        'دل': 'дил', 'جان': 'ҷон', 'عشق': 'ишқ',
        'و': 'ва', 'که': 'ки', 'از': 'аз', 'به': 'ба',
        'در': 'дар', 'با': 'бо', 'بر': 'бар',
    }

    # Short-vowel marks (harakat). When the source text is vocalised these
    # carry exactly the vowels the letter rules cannot guess.
    SHORT_VOWELS = {
        '\u064E': 'а',  # fatha
        '\u0650': 'и',  # kasra
        '\u064F': 'у',  # damma
    }

    # Applied before tokenising. Arabic punctuation and digits live inside
    # U+0600-U+06FF, so without this step they stay glued to the neighbouring
    # word, which defeats WORD_DICT and the final-letter rules.
    _NORMALIZE_TABLE = str.maketrans({
        '\u064A': '\u06CC',  # Arabic yeh -> Persian yeh
        '\u0643': '\u06A9',  # Arabic kaf -> Persian kaf
        '\u060C': ',',       # Arabic comma
        '\u061B': ';',       # Arabic semicolon
        '\u061F': '?',       # Arabic question mark
        '\u066A': '%',       # Arabic percent sign
        '\u06D4': '.',       # Arabic full stop
        '\u0640': None,      # tatweel (purely decorative elongation)
        **{chr(0x06F0 + d): str(d) for d in range(10)},  # Persian digits
        **{chr(0x0660 + d): str(d) for d in range(10)},  # Arabic-Indic digits
    })

    # Invisible joiners / direction marks: meaningful only in Arabic script,
    # so they are removed from the text that is passed through unchanged.
    _INVISIBLE_TABLE = str.maketrans('', '', '\u200c\u200d\u200e\u200f')

    _TOKEN_RE = re.compile(r'([\u0600-\u06FF]+|[^\u0600-\u06FF]+)')
    _PERSIAN_RE = re.compile(r'[\u0600-\u06FF]+')

    def __init__(self):
        pass

    def transliterate(self, text: str) -> str:
        """Return *text* with every Arabic-script run converted to Cyrillic.

        Non-Arabic-script text (spaces, Latin, punctuation) is kept, minus
        invisible joiners such as ZWNJ. Empty or ``None`` input yields ``""``.
        """
        if not text:
            return ""
        text = self._normalize(text)
        result = []
        for token in self._tokenize(text):
            if self._is_persian(token):
                result.append(self._transliterate_word(token))
            else:
                # ZWNJ still acts as a word boundary here (it is its own
                # token) but must not leak into the Cyrillic output.
                result.append(token.translate(self._INVISIBLE_TABLE))
        return ''.join(result)

    def _normalize(self, text: str) -> str:
        """Unify letter variants and map Arabic punctuation/digits to ASCII."""
        return text.translate(self._NORMALIZE_TABLE)

    def _tokenize(self, text: str) -> list:
        """Split *text* into alternating Arabic-script and other runs."""
        return self._TOKEN_RE.findall(text)

    def _is_persian(self, token: str) -> bool:
        """Return True when *token* consists only of U+0600-U+06FF characters."""
        return self._PERSIAN_RE.fullmatch(token) is not None

    def _transliterate_word(self, word: str) -> str:
        """Convert one word: dictionary hit first, letter rules otherwise."""
        if word in self.WORD_DICT:
            return self.WORD_DICT[word]
        return self._apply_rules(word)

    def _apply_rules(self, word: str) -> str:
        """Convert *word* letter by letter using position-dependent rules."""
        last = len(word) - 1
        result = []
        for i, char in enumerate(word):
            if char == 'ا':
                # Word-initial alef is only a vowel carrier and is dropped.
                result.append('о' if i > 0 else '')
            elif char == 'آ':
                result.append('о')
            elif char == 'و':
                result.append('у' if i > 0 else 'в')
            elif char in ('ی', 'ي'):
                result.append('ӣ' if i == last else 'и')
            elif char == 'ه' and i == last:
                # Final he is treated as the vowel ending, not as a consonant.
                result.append('а')
            elif char == 'ع' and i == 0:
                result.append('')
            elif char in self.CONSONANTS:
                result.append(self.CONSONANTS[char])
            elif char in self.SHORT_VOWELS:
                result.append(self.SHORT_VOWELS[char])
            elif '\u064B' <= char <= '\u065F' or char == '\u0670':
                # Remaining combining marks (tanwin, shadda, sukun, ...) have
                # no Cyrillic counterpart here and are dropped.
                continue
            else:
                result.append(char)
        return ''.join(result)

    def transliterate_poem(self, poem: str) -> str:
        """Transliterate *poem* line by line, preserving the line breaks."""
        lines = poem.split('\n')
        return '\n'.join(self.transliterate(line) for line in lines)

## Парсер Ganjoor.net

In [None]:
import requests
import json
import time
from tqdm import tqdm
from pathlib import Path

# Импортируем транслитератор
import sys
sys.path.insert(0, '/content/ShohnomaLLM')
from scraper.utils.transliterate import PersianToTajikTransliterator


class GanjoorScraper:
    """Collect poems through the Ganjoor REST API and store them as JSON Lines.

    Each collected poem keeps the original Persian text together with a
    Tajik Cyrillic transliteration produced by PersianToTajikTransliterator.

    Args:
        output_dir: Directory for the output files; created if missing.
        delay: Seconds to pause after every poem request (rate limiting).
    """

    BASE_URL = "https://api.ganjoor.net/api"

    # Poet id (as used in the ``ganjoor/poet/{id}`` endpoint) -> display name.
    POETS = {
        2: "Рӯдакӣ",
        5: "Хайём",
        7: "Ҳофиз",
        22: "Саъдӣ",
        26: "Мавлавӣ",
        35: "Ҷомӣ",
    }

    # Substring of a category title -> verse-form label stored with the poem.
    FORM_MAP = {
        'رباعیات': 'rubaiyat', 'رباعی': 'rubaiyat',
        'غزلیات': 'ghazal', 'غزل': 'ghazal',
        'قصاید': 'qasida', 'مثنوی': 'masnavi',
    }

    def __init__(self, output_dir, delay=0.1):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.transliterator = PersianToTajikTransliterator()
        self.session = requests.Session()
        self.delay = delay

    def _request(self, endpoint):
        """GET ``BASE_URL/endpoint`` and return the decoded JSON, or None.

        Network errors, non-2xx responses and undecodable bodies are reported
        and turned into ``None`` so one bad item does not abort a long run.
        KeyboardInterrupt is deliberately not caught, so the run stays
        interruptible.
        """
        url = f"{self.BASE_URL}/{endpoint}"
        try:
            resp = self.session.get(url, timeout=30)
            if not resp.ok:
                tqdm.write(f"HTTP {resp.status_code}: {url}")
                return None
            return resp.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers JSON decoding errors on older requests versions.
            tqdm.write(f"Ошибка запроса {url}: {exc}")
            return None

    def get_categories(self, poet_id):
        """Return a flat list of ``{'id', 'title'}`` dicts for all sub-categories."""
        data = self._request(f"ganjoor/poet/{poet_id}")
        root = data.get('cat') if data else None
        if not root:
            return []
        return self._extract_cats(root)

    def _extract_cats(self, cat, cats=None):
        """Depth-first walk of ``cat['children']``, appending to *cats*."""
        if cats is None:
            cats = []
        # ``or []`` also covers an explicit JSON null for "children".
        for child in cat.get('children') or []:
            cats.append({'id': child['id'], 'title': child['title']})
            self._extract_cats(child, cats)
        return cats

    def get_poems(self, cat_id):
        """Return the poem entries listed for category *cat_id* (may be empty)."""
        data = self._request(f"ganjoor/cat/{cat_id}?poems=true")
        cat = (data.get('cat') if data else None) or {}
        return cat.get('poems') or []

    def get_poem_text(self, poem_id):
        """Return the poem's verses joined by newlines, or '' when unavailable."""
        data = self._request(f"ganjoor/poem/{poem_id}?verseDetails=true")
        verses = (data.get('verses') if data else None) or []
        return '\n'.join(
            v['text'] for v in verses if v.get('text') is not None
        )

    def _detect_form(self, title):
        """Map a category title to a verse-form label; 'other' if none matches."""
        for pattern, form in self.FORM_MAP.items():
            if pattern in title:
                return form
        return 'other'

    def scrape_poet(self, poet_id):
        """Collect every poem of *poet_id* and return them as a list of dicts."""
        poet_name = self.POETS.get(poet_id, f"Poet_{poet_id}")
        print(f"\nСбор: {poet_name}")

        poems = []
        categories = self.get_categories(poet_id)

        for cat in tqdm(categories, desc="Категории"):
            form = self._detect_form(cat['title'])

            for p in self.get_poems(cat['id']):
                text_fa = self.get_poem_text(p['id'])
                # Pause after every request, including failed or empty ones,
                # so errors do not turn into an unthrottled burst of calls.
                time.sleep(self.delay)
                if not text_fa:
                    continue

                poems.append({
                    'id': f"ganjoor_{poet_id}_{p['id']}",
                    'poet': poet_name,
                    'title': p.get('title', ''),
                    'text_persian': text_fa,
                    'text_tajik': self.transliterator.transliterate_poem(text_fa),
                    'form': form,
                    'source': 'ganjoor',
                })

        return poems

    def save(self, poems, filename):
        """Write *poems* to ``output_dir/filename``, one JSON object per line."""
        path = self.output_dir / filename
        with open(path, 'w', encoding='utf-8') as f:
            for p in poems:
                json.dump(p, f, ensure_ascii=False)
                f.write('\n')
        print(f"Сохранено: {len(poems)} -> {path}")

## Запуск сбора данных

In [None]:
# Collect classical poetry and write it to the Drive-backed output directory
OUTPUT_DIR = "/content/drive/MyDrive/ShohnomaLLM/data/raw/ganjoor"
scraper = GanjoorScraper(OUTPUT_DIR)

# Accumulates the poems of every processed poet
all_poems = []

# One pass per poet id: starting with Khayyam (5) and Hafez (7)
for poet_id in (5, 7):
    poems = scraper.scrape_poet(poet_id)
    all_poems += poems
    # Per-poet file, written as soon as that poet is finished
    scraper.save(poems, f"poet_{poet_id}.jsonl")

# Combined file with everything collected in this run
scraper.save(all_poems, "all_classical.jsonl")

print(f"\nВсего собрано: {len(all_poems)} стихов")

In [None]:
# Show the first collected poem as a quick sanity check
if all_poems:
    poem = all_poems[0]
    # One print call; sep="\n" yields the same line-per-field layout
    print(
        "Пример:",
        f"Поэт: {poem['poet']}",
        f"Форма: {poem['form']}",
        f"\nОригинал:\n{poem['text_persian'][:200]}",
        f"\nТранслитерация:\n{poem['text_tajik'][:200]}",
        sep="\n",
    )

## Статистика

In [None]:
from collections import Counter

# Poem counts per verse form, most frequent first
forms = Counter([p['form'] for p in all_poems])
print("Статистика по формам:")
for form, count in sorted(forms.items(), key=lambda kv: kv[1], reverse=True):
    print(f"  {form}: {count}")

# Poem counts per poet, most frequent first
poets = Counter([p['poet'] for p in all_poems])
print("\nСтатистика по поэтам:")
for poet, count in sorted(poets.items(), key=lambda kv: kv[1], reverse=True):
    print(f"  {poet}: {count}")