### Вариант 17, модель: Matcha TTS, язык: kyrgyz

#### Install dependencies

In [None]:
!pip install torch torchaudio accelerate bitsandbytes transformers soundfile numpy requests beautifulsoup4



In [None]:
import os
import random
import tarfile
from pathlib import Path
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import time
from transformers import pipeline
import torch
import requests
from bs4 import BeautifulSoup
import re
import subprocess
import shutil
import json


#### Text generation and preparation

In [None]:
def contains_digits_or_abbreviations(text):
    """Return True when the text has a digit or looks like it holds an abbreviation.

    A text is flagged when any of these patterns is found:
      * any digit character;
      * a standalone word made of two or more capital letters;
      * a run of three or more capital letters anywhere, even inside a word.
    """
    rejection_patterns = (
        r'\d',
        r'\b[A-ZА-ЯӨҮӘ]{2,}\b',
        r'[A-ZА-ЯӨҮӘ]{3,}',
    )
    return any(re.search(pattern, text) for pattern in rejection_patterns)

In [None]:
def clean_kyrgyz_text(text):
    """Clean and format a single Kyrgyz sentence candidate.

    Steps, in order:
      1. drop bracketed / parenthesised fragments;
      2. drop every character that is not a word character, whitespace,
         or one of the kept punctuation marks;
      3. collapse whitespace and remove spaces left in front of punctuation;
      4. reject the text (return "") if it contains digits or abbreviations;
      5. capitalise the first character and make sure the text ends with
         sentence punctuation.

    Returns:
        The cleaned sentence, or "" when the text is rejected or nothing
        is left after cleaning.
    """
    # Remove content in brackets and parentheses
    text = re.sub(r'[\[\(][^\]\)]*[\]\)]', '', text)

    # Remove special characters but keep Kyrgyz letters and basic punctuation
    text = re.sub(r'[^\w\sәөүҥңһіӊӨҮӘҢҺ.,!?;:—–-]', '', text)

    # Normalise whitespace only AFTER the removals above: deleting a
    # parenthetical from "сөз (мисал), сөз" would otherwise leave "сөз , сөз"
    # or a double space in the result.
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)

    # Skip sentences with digits or abbreviations
    if contains_digits_or_abbreviations(text):
        return ""

    # Capitalise the first character when it is not already upper-case
    if text and not text[0].isupper():
        text = text[0].upper() + text[1:]

    # Don't add period if sentence already ends with ; or other punctuation
    if text and not text.endswith(('.', '!', '?', ';', ':')):
        text += '.'

    return text

In [None]:

def is_kyrgyz_text(text):
    """Heuristically decide whether the text is Kyrgyz.

    The text is lower-cased and accepted when it contains at least one of
    the marker letters, or when any keyword occurs in it as a substring
    (not necessarily as a whole word).
    """
    lowered = text.lower()

    # Fast path: a single marker letter is enough.
    if set('әөүҥңһіӊӨҮӘҢҺ').intersection(lowered):
        return True

    keyword_stems = (
        'кыргыз', 'бишкек', 'ош', 'жалал', 'кыргызстан', 'бакай',
        'талдык', 'нарын', 'иссык', 'чыгыш', 'батыш', 'түндүк',
        'түштүк', 'жаштар', 'маданият', 'тарых', 'билим', 'дене',
        'саламаттык', 'экономика', 'саясат', 'коом', 'спорт', 'бизнес',
        'мамлекет', 'өкмөт', 'президент', 'министр', 'парламент',
        'адам', 'жаран', 'кызмат', 'иш', 'соода', 'базар', 'бакча',
        'мектеп', 'университет', 'оорукана', 'дарыгер', 'мугалим',
        'китеп', 'окуу', 'жазуу', 'сүйлөө', 'угуу', 'көрүү', 'ичүү',
        'жеңил', 'оор', 'жылуу', 'муздак', 'жаңы', 'эски', 'чоң', 'кичине',
        'күн', 'ай', 'жыл', 'убакыт', 'жер', 'суу', 'аба', 'отун', 'тамак',
    )
    for stem in keyword_stems:
        if stem in lowered:
            return True
    return False

In [None]:
def get_content_rich_wikipedia_pages():
    """Return the list of Kyrgyz Wikipedia article URLs to scrape.

    The titles below are a hand-curated seed list grouped by topic. Before
    returning, every title is turned into a full URL, spaces are replaced
    with underscores, and repeated entries are dropped (first occurrence
    wins), so each page is requested at most once.

    Returns:
        list[str]: unique article URLs, in the order the titles are listed.
    """
    base_url = "https://ky.wikipedia.org/wiki/"

    # Titles must be written in Cyrillic only: a look-alike Latin letter
    # inside a title produces a different URL.
    page_titles = [
        # MAJOR HISTORICAL EVENTS
        "Биринчи_дүйнөлүк_согуш",
        "Экинчи_дүйнөлүк_согуш",
        "Улуу_Ата_Мекендик_согуш",
        "Чыңгыз_хан",
        "Осмон_империясы",
        "Рим_империясы",
        "Византия_империясы",
        "Моңгол_империясы",
        "Француз_революциясы",
        "Октябрь_революциясы",
        "Америка_революциясы",
        "Кытай_революциясы",
        "Корея_согушу",
        "Вьетнам_согушу",
        "Афганстан_согушу",
        "Байтак_согуштар",
        "Жылаңач_согуш",

        # COMPREHENSIVE COUNTRIES
        "Орусия",
        "Кытай",
        "Индия",
        "АКШ",
        "Германия",
        "Франция",
        "Улуу_Британия",
        "Япония",
        "Корея",
        "Түркия",
        "Иран",
        "Мысыр",
        "Бразилия",
        "Канада",
        "Австралия",
        "Мексика",
        "Индонезия",
        "Пакистан",
        "Бангладеш",
        "Нигерия",
        "Эфиопия",

        # MAJOR SCIENTIFIC TOPICS
        "Күн_системасы",
        "Жер_планетасы",
        "Ай",
        "Марс",
        "Юпитер",
        "Сатурн",
        "Галактика",
        "Жылдыздар",
        "Кара_тешиктер",
        "Эволюция_теориясы",
        "ДНК",
        "Генетика",
        "Клетка",
        "Бактериялар",
        "Вирустар",
        "Иммунитет_системасы",
        "Неврология",
        "Психология",
        "Психиатрия",

        # TECHNOLOGY AND INNOVATION
        "Компьютер",
        "Интернет",
        "Искусственный_интеллект",
        "Робототехника",
        "Нанотехнология",
        "Биотехнология",
        "Космонавтика",
        "Авиация",
        "Телевидение",
        "Радио",
        "Телефон",
        "Мобилдик_телефон",
        "Социальдык_тармактар",
        "Киберкоопсуздук",
        "Виртуалдык_реальность",

        # MAJOR LITERARY WORKS AND AUTHORS
        "Манас_эпосу",
        "Чыңгыз_Айтматов",
        "Лев_Толстой",
        "Фёдор_Достоевский",
        "Антон_Чехов",
        "Александр_Пушкин",
        "Уильям_Шекспир",
        "Чарльз_Диккенс",
        "Марк_Твен",
        "Эрнест_Хемингуэй",
        "Джордж_Оруэлл",
        "Джон_Толкин",
        "Джоан_Роулинг",

        # WORLD RELIGIONS AND PHILOSOPHY
        "Ислам",
        "Христианство",
        "Буддизм",
        "Иудаизм",
        "Индуизм",
        "Конфуций",
        "Даосизм",
        "Сикхизм",
        "Философия",
        "Этика",
        "Логика",
        "Метафизика",
        "Эпистемология",

        # MAJOR ECONOMIC SYSTEMS
        "Экономика",
        "Капитализм",
        "Социализм",
        "Коммунизм",
        "Глобализация",
        "Эл_аралык_соода",
        "Банк_системасы",
        "Валюта",
        "Биржа",
        "Инфляция",
        "Эмгек_базары",
        "Каржы",
        "Инвестиция",

        # COMPREHENSIVE HEALTH AND MEDICINE
        "Медицина",
        "Анатомия",
        "Физиология",
        "Фармакология",
        "Хирургия",
        "Педиатрия",
        "Кардиология",
        "Неврология",
        "Онкология",
        "Эпидемиология",
        "Вирусология",
        "Бактериология",
        "Генетикалык_инженерия",

        # MAJOR ART FORMS AND MOVEMENTS
        "Сүрөт_искусствосу",
        "Музыка",
        "Театр",
        "Кино",
        "Архитектура",
        "Адабият",
        "Бий",
        "Опера",
        "Балет",
        "Фотография",
        "Дизайн",
        "Мода",

        # COMPREHENSIVE SPORTS
        "Олимпиада_оюндары",
        "Футбол",
        "Баскетбол",
        "Волейбол",
        "Теннис",
        "Бейсбол",
        "Крикет",
        "Гольф",
        "Бокс",
        "Дзюдо",
        "Карате",
        "Таэквондо",
        "Жеңил_атлетика",
        "Суу_спорту",
        "Кышкы_спорттор",

        # MAJOR ENVIRONMENTAL TOPICS
        "Климат_өзгөрүүсү",
        "Глобалдык_жылынуу",
        "Экология",
        "Биоар_түрдүүлүк",
        "Токойлор",
        "Дарыялар",
        "Океандар",
        "Атмосфера",
        "Жер_кыртышы",
        "Табигый_байлыктар",
        "Кайра_иштетүү",
        "Эко_система",

        # MAJOR POLITICAL SYSTEMS
        "Демократия",
        "Монархия",
        "Республика",
        "Диктатура",
        "Фашизм",
        "Нацизм",
        "Анархизм",
        "Либерализм",
        "Консерватизм",
        "Национализм",
        "Интернационализм",

        # COMPREHENSIVE EDUCATION
        "Билим_берүү",
        "Педагогика",
        "Психология",
        "Социология",
        "Университет",
        "Мектеп",
        "Колледж",
        "Академия",
        "Илим",
        "Технология",
        "Инженердик",
        "Медициналык_билим",

        # WORLD CULTURES AND CIVILIZATIONS
        "Кыргыз_маданияты",
        "Орус_маданияты",
        "Кытай_маданияты",
        "Индия_маданияты",
        "Япония_маданияты",
        "Корея_маданияты",
        "Араб_маданияты",
        "Европа_маданияты",
        "Африка_маданияты",
        "Латын_Америкасынын_маданияты",

        # MAJOR LANGUAGES AND LINGUISTICS
        "Кыргыз_тили",
        "Орус_тили",
        "Англис_тили",
        "Кытай_тили",
        "Араб_тили",
        "Испан_тили",
        "Француз_тили",
        "Немис_тили",
        "Япон_тили",
        "Корей_тили",
        "Түрк_тили",
        "Фарсы_тили",

        # MAJOR INVENTIONS AND DISCOVERIES
        # (several titles repeat the technology section; repeats are
        # dropped when the URL list is built below)
        "Телеграф",
        "Телефон",
        "Радио",
        "Телевидение",
        "Компьютер",
        "Интернет",
        "Мобилдик_телефон",
        "Пенициллин",
        "Вакцина",
        "Рентген",
        "Микроскоп",
        "Телескоп",
        "Пар_машинасы",
        "Ички_күйүү_кыймылдаткычы",

        # WORLD ORGANIZATIONS
        "БУУ",
        "НАТО",
        "Евросоюз",
        "Азия_өнүктүрүү_банкы",
        "Дүйнөлүк_банк",
        "Эл_аралык_валюта_фонду",
        "ЮНЕСКО",
        "БУУнун_Балдар_фонду",
        "Дүйнөлүк_соода_уюму",

        # MAJOR CITIES OF THE WORLD
        "Москва",
        "Пекин",
        "Токио",
        "Дели",
        "Нью-Йорк",
        "Лондон",
        "Париж",
        "Берлин",
        "Рим",
        "Мадрид",
        "Стамбул",
        "Каир",
        "Сиэтл",
        "Сидней",
        "Рио-де-Жанейро",

        # MAJOR RIVERS AND MOUNTAINS
        "Амазонка",
        "Нил",
        "Янцзы",
        "Миссисипи",
        "Волга",
        "Дунай",
        "Ганг",
        "Эверест",
        "Альп_тоолору",
        "Анды",
        "Рокки_тоолору",
        "Урал_тоолору",
        "Кавказ_тоолору",

        # MAJOR OCEANS AND SEAS
        "Тынч_океан",
        "Атлантика_океаны",
        "Инди_океаны",
        "Түндүк_Муз_океаны",
        "Түштүк_океан",
        "Кариб_деңизи",
        "Жер_орта_деңизи",
        "Кызыл_деңиз",
        "Каспий_деңизи",
        "Балтика_деңизи",

        # MAJOR ANIMALS AND PLANTS
        "Аюу",
        "Бөрү",
        "Түлкү",
        "Жолборс",
        "Арстан",
        "Пил",
        "Жираф",
        "Зебра",
        "Крокодил",
        "Ак_сарык",
        "Бугу",
        "Кой",
        "Жылкы",
    ]

    # Guard against titles typed with spaces, then drop repeats while
    # keeping the first occurrence (dict keys preserve insertion order).
    urls = (base_url + title.replace(' ', '_') for title in page_titles)
    return list(dict.fromkeys(urls))

In [None]:
def scrape_content_rich_page(url, headers, min_sentences=5):
    """Scrape one Wikipedia article and return its cleaned Kyrgyz sentences.

    The article body (``div#mw-content-text``) is split into sentence
    candidates on ``.``, ``!`` and ``?``. Each candidate is cleaned with
    ``clean_kyrgyz_text``, filtered with ``is_kyrgyz_text`` and kept when it
    has between 7 and 70 words. If the page yields at least
    ``min_sentences`` sentences, pages linked from it are scraped as well
    via ``explore_related_pages`` and their sentences are appended.

    Args:
        url: Full URL of the article.
        headers: HTTP headers passed to ``requests.get``.
        min_sentences: Minimum number of sentences this page must yield
            before its linked pages are explored.

    Returns:
        list[str]: unique sentences in extraction order. An empty list is
        returned when the request fails, the server answers with an HTTP
        error status, or any other exception is raised while scraping.
    """
    sentences = []
    seen = set()  # mirrors `sentences` so duplicate checks are O(1)

    try:
        print(f"Scraping content-rich page: {url}")
        response = requests.get(url, timeout=30, headers=headers)
        # A 4xx/5xx answer is an error page, not an article: fail here so it
        # is reported below instead of being mined for sentences.
        response.raise_for_status()
        # Parse the raw bytes and let BeautifulSoup detect the encoding.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the main content area
        content_div = soup.find('div', {'id': 'mw-content-text'})
        if not content_div:
            return sentences

        # 'div' is included, so text inside a <p>/<li> is visited again
        # through its enclosing <div>; `seen` drops those repeats.
        for elem in content_div.find_all(['p', 'li', 'div']):
            text = elem.get_text().strip()
            if len(text) <= 20:  # Only substantial content
                continue

            for sentence in re.split(r'[.!?]', text):
                sentence = sentence.strip()
                if len(sentence) <= 15:
                    continue

                cleaned = clean_kyrgyz_text(sentence)
                if not cleaned or cleaned in seen or not is_kyrgyz_text(cleaned):
                    continue

                # Wider word-count range for content-rich pages
                if 7 <= len(cleaned.split()) <= 70:
                    seen.add(cleaned)
                    sentences.append(cleaned)

        print(f"  Extracted {len(sentences)} sentences from this page")

        # If this page has good content, explore its main links
        if len(sentences) >= min_sentences:
            print("  Page has good content, exploring related pages...")
            related_sentences = explore_related_pages(url, headers, sentences)
            sentences.extend(related_sentences)

        return sentences

    except Exception as e:
        # Best effort: one bad page must not stop the whole crawl.
        print(f"  Error scraping {url}: {e}")
        return []

In [None]:
def explore_related_pages(main_url, headers, existing_sentences, max_related=10):
    """Scrape articles linked from ``main_url`` and return their new sentences.

    The main page is fetched, up to ``max_related`` distinct internal
    article links are collected from the first 100 links of its body, and
    each linked page is scraped. Sentence candidates from ``<p>`` and
    ``<li>`` elements are cleaned with ``clean_kyrgyz_text``, filtered with
    ``is_kyrgyz_text`` and kept when they have between 4 and 60 words.

    Args:
        main_url: URL of the page whose links are followed.
        headers: HTTP headers passed to ``requests.get``.
        existing_sentences: Sentences already collected by the caller;
            these are never returned again. The argument is not modified.
        max_related: Maximum number of linked pages to scrape.

    Returns:
        list[str]: sentences not present in ``existing_sentences``, without
        repeats, in extraction order. Failures are printed and skipped, so
        the list may be partial or empty.
    """
    related_sentences = []
    # One set covers both "already known to the caller" and "already added
    # here", replacing two linear list scans per candidate.
    seen = set(existing_sentences)

    try:
        response = requests.get(main_url, timeout=20, headers=headers)
        response.raise_for_status()  # do not harvest links from an error page
        soup = BeautifulSoup(response.content, 'html.parser')

        content_div = soup.find('div', {'id': 'mw-content-text'})
        if not content_div:
            return related_sentences

        # Find main section links (usually in the first part of the article)
        related_urls = []
        for link in content_div.find_all('a', href=True)[:100]:  # Check first 100 links
            if len(related_urls) >= max_related:
                break

            # Drop "#section" anchors so links into different sections of
            # one article are not counted as different pages.
            href = link['href'].split('#', 1)[0]
            # ':' marks namespaced pages (files, categories, ...); skip them.
            if not href.startswith('/wiki/') or ':' in href:
                continue

            full_url = "https://ky.wikipedia.org" + href
            if full_url not in related_urls:
                related_urls.append(full_url)

        # Scrape the related pages
        for related_url in related_urls:
            try:
                response = requests.get(related_url, timeout=20, headers=headers)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                content_div = soup.find('div', {'id': 'mw-content-text'})
                if content_div:
                    count_before = len(related_sentences)

                    for elem in content_div.find_all(['p', 'li']):
                        text = elem.get_text().strip()
                        if len(text) <= 20:
                            continue

                        for sentence in re.split(r'[.!?]', text):
                            sentence = sentence.strip()
                            if len(sentence) <= 15:
                                continue

                            cleaned = clean_kyrgyz_text(sentence)
                            if not cleaned or cleaned in seen or not is_kyrgyz_text(cleaned):
                                continue

                            if 4 <= len(cleaned.split()) <= 60:
                                seen.add(cleaned)
                                related_sentences.append(cleaned)

                    # Report what THIS page contributed, not the running total.
                    print(f"    Added {len(related_sentences) - count_before} sentences from related page")
                    time.sleep(1)  # Brief pause between related pages

            except Exception as e:
                # Best effort: skip the failing page and keep going.
                print(f"    Error with related page {related_url}: {e}")
                continue

    except Exception as e:
        print(f"Error exploring related pages: {e}")

    return related_sentences

In [None]:
def scrape_content_rich_wikipedia(target_count=9000):
    """Scrape the curated Kyrgyz Wikipedia pages until enough sentences exist.

    Pages from ``get_content_rich_wikipedia_pages`` are scraped one by one
    with ``scrape_content_rich_page`` (``min_sentences=30``). Sentences are
    de-duplicated as they arrive, so the stop condition counts unique
    sentences only and the crawl does not end early because of repeats.

    Args:
        target_count: Number of unique sentences to collect.

    Returns:
        list[str]: at most ``target_count`` unique sentences in the order
        they were first seen; fewer when the pages do not yield enough.
    """
    print(f"Scraping content-rich Kyrgyz Wikipedia pages for {target_count} sentences...")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    unique_sentences = []
    seen = set()
    content_rich_pages = get_content_rich_wikipedia_pages()

    print(f"Targeting {len(content_rich_pages)} content-rich pages")

    for page_url in content_rich_pages:
        if len(unique_sentences) >= target_count:
            break

        # Remove duplicates across pages while keeping first-seen order
        for sentence in scrape_content_rich_page(page_url, headers, min_sentences=30):
            if sentence not in seen:
                seen.add(sentence)
                unique_sentences.append(sentence)

        print(f"Total so far: {len(unique_sentences)} sentences")

        # Respectful delay
        time.sleep(random.uniform(2, 4))

    print(f"\nFinal unique sentences from content-rich pages: {len(unique_sentences)}")
    return unique_sentences[:target_count]

In [None]:
def create_text_file(sentences, filename="texts.txt"):
    """Write the sentences to a UTF-8 text file, one per line, and report the count."""
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(line + '\n' for line in sentences)

    print(f"Created {filename} with {len(sentences)} sentences")

In [None]:
if __name__ == "__main__":
    # Single source of truth for the corpus size: it is used for the scrape
    # target, the shortfall check and the shortfall message below.
    target_sentences = 9000

    print("Starting Content-Rich Kyrgyz Wikipedia Corpus Creation...")
    print("This targets only pages with extensive content like World War I")

    sentences = scrape_content_rich_wikipedia(target_sentences)

    create_text_file(sentences, "texts.txt")

    print("\n=== CONTENT-RICH CORPUS CREATION COMPLETE ===")
    print(f"Total sentences: {len(sentences)}")

    if len(sentences) < target_sentences:
        print(f"Note: Collected {len(sentences)} sentences (target was {target_sentences})")
        print("The synthesis will use whatever we collected")

Starting Content-Rich Kyrgyz Wikipedia Corpus Creation...
This targets only pages with extensive content like World War I
Scraping content-rich Kyrgyz Wikipedia pages for 9000 sentences...
Targeting 238 content-rich pages
Scraping content-rich page: https://ky.wikipedia.org/wiki/Кыргызстан
  Extracted 220 sentences from this page
  Page has good content, exploring related pages...
Total so far: 220 sentences
Scraping content-rich page: https://ky.wikipedia.org/wiki/Бишкек
  Extracted 140 sentences from this page
  Page has good content, exploring related pages...
Total so far: 360 sentences
Scraping content-rich page: https://ky.wikipedia.org/wiki/Ош
  Extracted 27 sentences from this page
Total so far: 387 sentences
Scraping content-rich page: https://ky.wikipedia.org/wiki/Жалал-Абад
  Extracted 71 sentences from this page
  Page has good content, exploring related pages...
Total so far: 458 sentences
Scraping content-rich page: https://ky.wikipedia.org/wiki/Каракол
  Extracted 33 sen

In [None]:
'''
def generate_with_ollama(prompt, model="deepseek-coder-v2:latest"):
    """Generate natural Kyrgyz sentences about coding topics"""
    url = "http://localhost:11434/api/generate" #ollama hosted locally
    
    # More natural prompt without definition structure
    enhanced_prompt = f"""
Сиз программалоо жана технология темасында табигый кыргыз тилинде сүйлөмдөр жаратуучу AI болосуз.

Тема: {prompt}

Жарыялоо: Бул тема боюнча 7-15 сөздөн турган табигый, толук сүйлөм жаратыңыз.
- Сүйлөм табигый кыргыз тилинде болсун
- Англис сөздөрүн колдонбоңуз
- Аныктама бербеңиз
- Сүйлөм баш тамга менен башталып, чекит менен бүтсүн
- Сүйлөм табигый жана коомдо колдонулган сыяктуу болсун

Сүйлөм:"""
    
    payload = {
        "model": model,
        "prompt": enhanced_prompt,
        "stream": False,
        "options": {
            "temperature": 0.9,  # Increased for more creativity
            "num_predict": 100,
            "top_p": 0.95,
            "repeat_penalty": 1.3
        }
    }
    
    try:
        response = requests.post(url, json=payload, timeout=120)
        if response.status_code == 200:
            result = response.json()
            return result["response"].strip()
        else:
            print(f"API error {response.status_code}: {response.text}")
            return None
    except Exception as e:
        print(f"Request failed: {e}")
        return None

def generate_coding_sentences_ollama(num_sentences=9000):
    """Generate natural coding/programming sentences in Kyrgyz"""
    
    # More conversational topics that encourage natural sentences
    coding_topics = [
        # Programming experiences
        "Python менен программа жазуунун жакшы тараптары",
        "Java колдонуп жаңы проект түзүү",
        "JavaScript жардамында динамикалык веб-беттер",
        "C++ менен ыңгайлуу колдонмолор иштеп чыгуу",
        "PHP аркылуу сервердик скриптер жазуу",
        "Ruby программалоо тилин үйрөнүү тажрыйбасы",
        
        # Daily programming activities
        "Код жазууда колдонулуучу негизги каражаттар",
        "Программа иштебей калганда кылган аракеттер",
        "Жаңы функция кошуудан мурун кандай даярдык кылуу",
        "Башка программачылар менен бирге иштөө",
        "Колдонмонун ырачатын жакшыртуу үчүн аракеттер",
        "Кодду текшерүү жана оңдоо процесси",
        
        # Learning and development
        "Программалоо тилин үйрөнүүдөгү кыйынчылыктар",
        "Алгачкы программаңызды иштеткендеги сезимдер",
        "Технологиялык жаңылыктарды күндөлүк көзөмөлдөө",
        "Программалоо боюнча коомдук иш-чараларга катышуу",
        "Жаңы технологияларды долбоордо колдонуу",
        "Код жазуунун жакшы ыкмаларын үйрөнүү",
        
        # Project scenarios
        "Ири проектти аяктоодон кийинки баалоо",
        "Топ менен иштөөдө коммуникациянын маанилүүлүгү",
        "Колдонмонун иштешине мониторинг жүргүзүү",
        "Жаңы версия чыгарганда эске алынуучу жабдуулар",
        "Колдонуучулардын пикири боюнча өзгөртүүлөр киргизүү",
        "Программанын коопсуздугун камсыз кылуу чаралары",
        
        # Problem solving
        "Программа иштебей калганда издөө ыкмалары",
        "Кыйын маселе чечүү үчүн колдонулган стратегиялар",
        "Кодду оңдоодо кезиккен кызыктуу учурлар",
        "Башка адамдын кодун түшүнүүгө болгон машыгуу",
        "Программанын ырачатын жогорулатуу жолдору",
        "Техникалык көйгөй чечүүдөгү тажрыйбалар"
    ]
    
    sentences = []
    attempts = 0
    max_attempts = num_sentences * 2
    
    print(f"Starting generation of {num_sentences} natural coding sentences...")
    
    while len(sentences) < num_sentences and attempts < max_attempts:
        topic = random.choice(coding_topics)
        
        generated_text = generate_with_ollama(topic)
        attempts += 1
        
        if generated_text:
            # Extract just the sentence part
            sentence = extract_sentence(generated_text)
            sentence = clean_sentence(sentence)
            
            if is_valid_natural_sentence(sentence):
                sentences.append(sentence)
                print(f"[{len(sentences)}/{num_sentences}] {sentence}")
                
                # Save progress regularly
                if len(sentences) % 100 == 0:
                    save_progress(sentences, f"progress_{len(sentences)}.txt")
                    print(f"Progress saved: {len(sentences)} sentences")
            else:
                if attempts % 10 == 0:
                    print(f"Attempt {attempts}: Retrying... Last output: {sentence[:60]}...")
        else:
            print(f"Generation failed on attempt {attempts}")
        
        # Rate limiting
        time.sleep(1.5)  # Slightly longer delay for better quality
    
    return sentences

def extract_sentence(text):
    """Pick the first usable sentence out of raw model output.

    Everything up to and including the last "Сүйлөм:" marker (the tail of the
    prompt) is discarded. The remainder is split after '.', '!' or '?' and the
    first fragment is returned that has at least seven words, does not open
    with a dash or an opening guillemet, contains no colon and none of the
    definition phrases. If no fragment qualifies, the stripped text is
    returned unchanged.
    """
    marker = "Сүйлөм:"
    if marker in text:
        # Keep only what follows the last occurrence of the marker.
        text = text.rpartition(marker)[2].strip()

    banned_openers = ('-', '—', '«')
    definition_phrases = ('аныктама', 'дегенди билдирет', 'мааниси')

    for candidate in re.split(r'(?<=[.!?])\s+', text):
        candidate = candidate.strip()
        if len(candidate.split()) < 7:
            continue
        if candidate.startswith(banned_openers) or ':' in candidate:
            continue
        if any(phrase in candidate for phrase in definition_phrases):
            continue
        return candidate

    # Nothing looked like a natural sentence: hand back the whole text.
    return text.strip()

def clean_sentence(sentence):
    """Clean up a generated sentence and strip definition-like structures.

    Steps, in order: drop a leading hyphen/dash, drop quote characters, drop
    parenthesised fragments that contain Latin letters, drop the definition
    patterns, normalise whitespace, make sure the sentence ends with '.', '!'
    or '?', and capitalise the first letter.

    Whitespace is normalised after the removals (not before), so deleting a
    fragment can no longer leave a double space or a space in front of the
    final punctuation mark.

    Args:
        sentence: Raw sentence text; may be empty or None.

    Returns:
        The cleaned sentence, or "" when the input is empty/None or nothing
        is left after cleaning.
    """
    if not sentence:
        return ""

    # Remove definition markers and quote characters
    sentence = re.sub(r'^[-—]\s*', '', sentence)  # Remove starting hyphen/dash
    sentence = re.sub(r'["\']', '', sentence)

    # Remove any parenthetical English terms
    sentence = re.sub(r'\([^)]*[a-zA-Z][^)]*\)', '', sentence)

    # Remove common definition patterns
    sentence = re.sub(r'бул\s+[^.]*\.?\s*', '', sentence)
    sentence = re.sub(r'деп\s+аталат\s*', '', sentence)
    sentence = re.sub(r'мааниси\s*', '', sentence)

    # Collapse the gaps left by the removals and pull punctuation back onto
    # the preceding word.
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = re.sub(r'\s+(?=[.,!?;])', '', sentence)

    # Ensure proper punctuation
    if sentence and not sentence.endswith(('.', '!', '?')):
        sentence += '.'

    # Capitalize first letter
    if sentence and sentence[0].islower():
        sentence = sentence[0].upper() + sentence[1:]

    return sentence

def is_valid_natural_sentence(sentence):
    """Decide whether a cleaned sentence is acceptable for the corpus.

    A sentence passes only if all of the following hold:
      * it is at least 20 characters and 7 to 20 words long;
      * it starts with an uppercase letter and ends with '.', '!' or '?';
      * its lowercase form contains none of the definition markers
        (substring match, which includes '-', '—', ':', '(' and ')');
      * it contains no ASCII Latin letters;
      * its lowercase form contains at least one of the listed Kyrgyz
        connective words (substring match).
    """
    if not sentence or len(sentence) < 20:
        return False

    word_count = len(sentence.split())
    if not 7 <= word_count <= 20:
        return False

    starts_upper = sentence[0].isupper()
    ends_properly = sentence.endswith(('.', '!', '?'))
    if not (starts_upper and ends_properly):
        return False

    lowered = sentence.lower()

    # Anything that looks like a definition is rejected.
    definition_markers = (
        'бул', 'дегенди билдирет', 'мааниси', 'аныктама', 'деп аталат',
        '-', '—', ':', '(', ')',
    )
    for marker in definition_markers:
        if marker in lowered:
            return False

    # English words are not allowed.
    if re.search(r'[a-zA-Z]', sentence) is not None:
        return False

    # Require at least one connective typical of a natural Kyrgyz sentence.
    connectives = (
        'менен', 'үчүн', 'болуп', 'жана', 'бирок', 'андан', 'кийин',
        'ошондо', 'ошентсе', 'аркылуу', 'сайын', 'бойдон',
    )
    return any(word in lowered for word in connectives)

def save_progress(sentences, filename):
    """Write the sentences to `filename` as UTF-8, one per line, replacing any existing file."""
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(sentence + '\n' for sentence in sentences)

def test_ollama():
    """Check that the local Ollama server answers and that the model can generate.

    Sends a GET request to http://localhost:11434/ and, on HTTP 200, runs one
    sample generation through generate_with_ollama. Returns True only when
    both steps succeed; any exception is reported and turned into False.
    """
    print("Testing Ollama connection...")
    try:
        response = requests.get("http://localhost:11434/", timeout=10)
        if response.status_code != 200:
            print(f"Ollama responded with: {response.status_code}")
            return False

        print("Ollama is running!")

        # Smoke-test the model with a natural topic.
        test_result = generate_with_ollama("Python менен программа жазуунун жакшы тараптары")
        if not test_result:
            print("Model test failed")
            return False

        cleaned = clean_sentence(extract_sentence(test_result))
        print(f"Model test successful: {cleaned}")
        return True
    except Exception as e:
        print(f"Cannot connect to Ollama: {e}")
        return False

def prepare_texts(num_sentences=9000):
    """Generate up to `num_sentences` Kyrgyz coding sentences via Ollama.

    Returns the list produced by generate_coding_sentences_ollama, or an empty
    list when the Ollama self-test fails. Prints the number of sentences and
    the elapsed time in hours.
    """
    if not test_ollama():
        print("Ollama setup failed")
        return []

    print(f"\nGenerating {num_sentences} natural coding sentences in Kyrgyz...")
    print("This will take several hours...")

    started_at = time.time()
    texts = generate_coding_sentences_ollama(num_sentences)
    elapsed_hours = (time.time() - started_at) / 3600

    print(f"Successfully generated {len(texts)} natural sentences")
    print(f"Total time: {elapsed_hours:.2f} hours")

    return texts

'''

'\ndef generate_with_ollama(prompt, model="deepseek-coder-v2:latest"):\n    """Generate natural Kyrgyz sentences about coding topics"""\n    url = "http://localhost:11434/api/generate" #ollama hosted locally\n    \n    # More natural prompt without definition structure\n    enhanced_prompt = f"""\nСиз программалоо жана технология темасында табигый кыргыз тилинде сүйлөмдөр жаратуучу AI болосуз.\n\nТема: {prompt}\n\nЖарыялоо: Бул тема боюнча 7-15 сөздөн турган табигый, толук сүйлөм жаратыңыз.\n- Сүйлөм табигый кыргыз тилинде болсун\n- Англис сөздөрүн колдонбоңуз\n- Аныктама бербеңиз\n- Сүйлөм баш тамга менен башталып, чекит менен бүтсүн\n- Сүйлөм табигый жана коомдо колдонулган сыяктуу болсун\n\nСүйлөм:"""\n    \n    payload = {\n        "model": model,\n        "prompt": enhanced_prompt,\n        "stream": False,\n        "options": {\n            "temperature": 0.9,  # Increased for more creativity\n            "num_predict": 100,\n            "top_p": 0.95,\n            "repeat_penalt

In [None]:
def load_texts_from_file(filename="texts.txt"):
    """Read the non-blank lines of a UTF-8 text file for synthesis.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped. Returns the list of texts, or an empty list
    (after printing a message) when the file does not exist.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as source:
            texts = []
            for raw_line in source:
                stripped = raw_line.strip()
                if stripped:
                    texts.append(stripped)
    except FileNotFoundError:
        print(f"File {filename} not found")
        return []
    else:
        print(f"Loaded {len(texts)} texts from {filename}")
        return texts

#### Speech Synthesis with Matcha TTS

In [None]:
def _remove_path(path):
    """Delete `path` whether it is a file or a directory; do nothing if it does not exist."""
    if not os.path.exists(path):
        return
    if os.path.isfile(path):
        os.remove(path)
    else:
        shutil.rmtree(path)


def _remove_new_pngs(pngs_before):
    """Delete PNG files in the working directory that are not listed in `pngs_before`."""
    for png_file in Path(".").glob("*.png"):
        if png_file in pngs_before:
            # Existed before the synthesis call: not ours to delete.
            continue
        try:
            png_file.unlink()
            print(f"Removed PNG file: {png_file}")
        except OSError as e:
            print(f"Could not remove {png_file}: {e}")


def synthesize_with_matcha_cli_from_file(text_file="texts.txt", output_dir="audio_files"):
    """Synthesize one WAV file per line of `text_file` with the matcha-tts CLI.

    Line number i (1-based, blank lines skipped by load_texts_from_file) is
    written to `<output_dir>/<i>.wav`. Each text is passed to
    `matcha-tts --text ... --output temp_audio_<i>.wav` with a 180 second
    timeout. The temporary path may come back either as a file or as a
    directory containing WAV files; both cases are handled.

    Temporary output and PNG files created during a call are removed
    afterwards, also on timeout, error or interruption. PNG files that were
    already present in the working directory before the call are left alone.

    Args:
        text_file: Path of the text file, one utterance per line.
        output_dir: Directory for the resulting WAV files (created if missing).

    Returns:
        The number of texts for which a WAV file was produced; 0 when there
        are no texts or the matcha-tts executable cannot be found.
    """
    # Load texts from file
    texts = load_texts_from_file(text_file)
    if not texts:
        print("No texts found to synthesize!")
        return 0

    # Fail once up front instead of once per text when the CLI is missing.
    if shutil.which('matcha-tts') is None:
        print("matcha-tts executable not found on PATH")
        return 0

    # Create output directory (including missing parents)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    print(f"Starting REAL Matcha TTS synthesis for {len(texts)} texts from {text_file}...")

    successful = 0

    for i, text in enumerate(texts, 1):
        temp_output = f"temp_audio_{i}.wav"
        final_output = f"{output_dir}/{i}.wav"
        # Snapshot so that only PNGs produced by this call get cleaned up.
        pngs_before = set(Path(".").glob("*.png"))

        try:
            print(f"\n[{i}/{len(texts)}] Processing: {text[:50]}...")

            # Clean up any leftover temp output from an earlier run
            _remove_path(temp_output)

            # Run Matcha TTS with output parameter.
            # NOTE(review): in the recorded run the --output path came back as
            # a directory, not a file - confirm the intended CLI flag.
            cmd = [
                'matcha-tts',
                '--text', text,
                '--output', temp_output
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=180)

            # Check what was created
            if os.path.isfile(temp_output):
                # It's a proper file, move to final location
                shutil.move(temp_output, final_output)
                print(f"Created: {final_output}")
                successful += 1
            elif os.path.exists(temp_output):
                # It's a directory, look for WAV files inside
                print(f"Created directory, looking for audio...")
                wav_files = sorted(Path(temp_output).glob("*.wav"))
                if wav_files:
                    # Move the first WAV file (by name) to the final location;
                    # the directory itself is removed in the finally block.
                    shutil.move(str(wav_files[0]), final_output)
                    print(f"Extracted audio: {final_output}")
                    successful += 1
                else:
                    print(f"No audio files in directory")
            else:
                print(f"No output created")
                if result.returncode != 0:
                    # Surface the CLI's own error text instead of dropping it.
                    print(f"matcha-tts exited with code {result.returncode}: {result.stderr.strip()[:500]}")

        except subprocess.TimeoutExpired:
            print(f"Timeout")
        except Exception as e:
            print(f"Error: {e}")
        finally:
            # Best-effort cleanup; a failure here must not stop the batch.
            try:
                _remove_path(temp_output)
            except OSError as e:
                print(f"Could not remove {temp_output}: {e}")
            _remove_new_pngs(pngs_before)

    print(f"\n Synthesis completed: {successful}/{len(texts)} successful")
    return successful

# Synthesize every line of texts.txt into audio_files/<n>.wav;
# `successful` receives the number of texts that produced a WAV file.
successful = synthesize_with_matcha_cli_from_file('texts.txt')

Loaded 4602 texts from texts.txt
Starting REAL Matcha TTS synthesis for 4602 texts from texts.txt...

[1/4602] Processing: Дини Динден тышкары Аянты  Жалпы  Суу бетинин ....
Created directory, looking for audio...
Extracted audio: audio_files/1.wav
Removed PNG file: utterance_001.png

[2/4602] Processing: Түндүктө Казакстан, батышта жана түштүк-батышта Өз...
Created directory, looking for audio...
Extracted audio: audio_files/2.wav
Removed PNG file: utterance_001.png

[3/4602] Processing: Кыргызстан  басымдуу аймагы тоолуу аймактарды камт...


KeyboardInterrupt: 

#### Demonstrate audio

In [None]:
# Audio Demonstration Section
def demonstrate_audio_files(audio_dir="audio_files", num_to_show=5, text_file="texts.txt"):
    """Display and play sample audio files with basic information.

    For the first `num_to_show` WAV files in `audio_dir` this prints duration,
    sample rate, channel count and file size, shows the text the file was
    synthesized from, plots the first two seconds of the waveform and embeds
    an audio player.

    Files are ordered by their numeric name (1.wav, 2.wav, 10.wav, ...); files
    whose name is not a number are listed after the numbered ones. The text
    for `<n>.wav` is the n-th non-blank line of `text_file`, which is the
    numbering the synthesis step uses, so the texts stay aligned even when
    some numbers are missing.

    Args:
        audio_dir: Directory that contains the WAV files.
        num_to_show: Maximum number of files to present.
        text_file: Text file the audio was synthesized from; if it does not
            exist the texts are simply not shown.
    """

    print("\n" + "="*60)
    print("AUDIO FILES DEMONSTRATION")
    print("="*60)

    def numeric_order(path):
        # Numbered files first, in numeric order; anything else after, by name.
        stem = path.stem
        return (0, int(stem), "") if stem.isdecimal() else (1, 0, stem)

    # Get list of audio files
    audio_files = sorted(Path(audio_dir).glob("*.wav"), key=numeric_order)

    # Read the texts the same way the synthesis step does: stripped, blank lines skipped
    try:
        with open(text_file, 'r', encoding='utf-8') as f:
            texts = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"File {text_file} not found; texts will not be shown")
        texts = []

    print(f"Found {len(audio_files)} audio files")
    print(f"Showing first {min(num_to_show, len(audio_files))} files for demonstration\n")

    for i, audio_file in enumerate(audio_files[:num_to_show]):
        print(f"\n{'─'*50}")
        print(f"🎵 AUDIO FILE {i+1}: {audio_file.name}")
        print(f"{'─'*50}")

        try:
            # Load audio data
            audio_data, sample_rate = sf.read(str(audio_file))
            duration = len(audio_data) / sample_rate

            # Display file information
            print(f"File Info:")
            print(f"   • Duration: {duration:.2f} seconds")
            print(f"   • Sample rate: {sample_rate} Hz")
            print(f"   • Channels: {audio_data.shape[1] if len(audio_data.shape) > 1 else 1}")
            print(f"   • File size: {os.path.getsize(audio_file) / 1024:.1f} KB")

            # Display corresponding text: <n>.wav belongs to the n-th text
            text_index = int(audio_file.stem) - 1 if audio_file.stem.isdecimal() else -1
            if 0 <= text_index < len(texts):
                text_preview = texts[text_index]
                # Truncate long texts for display
                if len(text_preview) > 100:
                    text_preview = text_preview[:100] + "..."
                print(f"Text: {text_preview}")

            # Create a simple waveform visualization
            plt.figure(figsize=(10, 3))

            # Handle both mono and stereo audio
            if len(audio_data.shape) > 1:
                audio_mono = audio_data.mean(axis=1)
            else:
                audio_mono = audio_data

            # Plot first 2 seconds or entire audio if shorter
            samples_to_plot = min(len(audio_mono), int(2 * sample_rate))
            time_axis = np.linspace(0, samples_to_plot/sample_rate, samples_to_plot)

            plt.plot(time_axis, audio_mono[:samples_to_plot], color='blue', alpha=0.7)
            plt.title(f'Audio Waveform: {audio_file.name}')
            plt.xlabel('Time (seconds)')
            plt.ylabel('Amplitude')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()
            # Release the figure so repeated calls do not accumulate open figures
            plt.close()

            # Display audio player
            print("Audio Player:")
            display(Audio(str(audio_file)))

            # Add small delay between files for better presentation
            time.sleep(0.5)

        except Exception as e:
            print(f"Error displaying {audio_file.name}: {e}")

    print(f"\n{'='*60}")
    print("Audio demonstration completed!")
    print(f"{'='*60}")

def analyze_audio_quality(audio_dir="audio_files"):
    """Print summary statistics for the WAV files in `audio_dir` and plot histograms.

    Reports the average, range and total of the durations, plus average and
    total file size, then draws side-by-side histograms of duration and file
    size. Files that cannot be read are reported and left out of the
    statistics. Returns None.
    """
    banner = "="*60
    print("\n" + banner)
    print("AUDIO QUALITY ANALYSIS")
    print(banner)

    wav_paths = list(Path(audio_dir).glob("*.wav"))
    if not wav_paths:
        print("No audio files found for analysis")
        return

    durations, file_sizes = [], []
    for wav_path in wav_paths:
        try:
            samples, rate = sf.read(str(wav_path))
            seconds = len(samples) / rate
            kilobytes = os.path.getsize(wav_path) / 1024  # KB
        except Exception as e:
            print(f"Error analyzing {wav_path.name}: {e}")
        else:
            durations.append(seconds)
            file_sizes.append(kilobytes)

    if not durations:
        return

    print(f"Statistics for {len(durations)} audio files:")
    print(f"   Average duration: {np.mean(durations):.2f} seconds")
    print(f"   Duration range: {min(durations):.2f}s - {max(durations):.2f}s")
    print(f"   Total audio duration: {sum(durations)/60:.2f} minutes")
    print(f"   Average file size: {np.mean(file_sizes):.1f} KB")
    print(f"   Total storage used: {sum(file_sizes)/1024:.2f} MB")

    # Summary plot: one histogram panel per metric.
    plt.figure(figsize=(12, 4))
    panels = [
        (durations, 'skyblue', 'Duration (seconds)', 'Audio Duration Distribution'),
        (file_sizes, 'lightgreen', 'File Size (KB)', 'File Size Distribution'),
    ]
    for slot, (values, colour, x_label, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.hist(values, bins=20, alpha=0.7, color=colour, edgecolor='black')
        plt.xlabel(x_label)
        plt.ylabel('Number of Files')
        plt.title(heading)
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Run the demonstration
# NOTE(review): both calls use names from the imports cell (Path, sf, plt, np,
# display, Audio). The recorded output of this cell is a NameError for `Path`,
# so it was presumably executed on a kernel where the imports cell had not
# been run - run the notebook top to bottom.
print("Starting audio demonstration...")

# Show the first five audio files with waveform and player
demonstrate_audio_files(num_to_show=5) 

# Print duration/size statistics and histograms for all audio files
analyze_audio_quality()


Starting audio demonstration...

AUDIO FILES DEMONSTRATION


NameError: name 'Path' is not defined

#### Create archive file

In [None]:
def create_archive(archive_name="bvt2201_nyathi_17_matcha_kyrgyz.tar.gz",
                   text_file="texts.txt",
                   audio_dir="audio_files"):
    """Pack the text file and all WAV files into a gzip-compressed tar archive.

    Audio files are added in numeric order of their file name (1.wav, 2.wav,
    10.wav, ...), so the archive layout is the same on every run and does not
    depend on directory listing order. Files whose name is not a number are
    added after the numbered ones, sorted by name. Inputs that do not exist
    are skipped.

    Args:
        archive_name: Path of the archive to create (overwritten if present).
        text_file: Text file to include.
        audio_dir: Directory whose *.wav files are included.

    Returns:
        `archive_name`.
    """
    print("\n" + "="*60)
    print("CREATING FINAL ARCHIVE")
    print("="*60)

    def numeric_order(path):
        # Numbered files first, in numeric order; anything else after, by name.
        stem = path.stem
        return (0, int(stem), "") if stem.isdecimal() else (1, 0, stem)

    with tarfile.open(archive_name, "w:gz") as tar:
        # Add text file
        if os.path.exists(text_file):
            tar.add(text_file)
            print(f"Added {text_file}")

        # Add audio files
        if os.path.exists(audio_dir):
            audio_files = sorted(Path(audio_dir).glob("*.wav"), key=numeric_order)
            for audio_file in audio_files:
                tar.add(str(audio_file))
            print(f"Added {len(audio_files)} audio files")

    # Check archive size and contents
    archive_size = os.path.getsize(archive_name) / 1024  # Size in KB
    print(f"\n Archive created successfully!")
    print(f"   Archive name: {archive_name}")
    print(f"   Archive size: {archive_size:.2f} KB")

    # Show archive contents
    with tarfile.open(archive_name, "r:gz") as tar:
        members = tar.getnames()
        text_members = [f for f in members if f.endswith('.txt')]
        audio_members = [f for f in members if f.endswith('.wav')]

        print(f"   Text files: {len(text_members)}")
        print(f"   Audio files: {len(audio_members)}")
        print(f"   First 5 files: {members[:5]}")

    return archive_name

# Create the archive; create_archive() returns the name of the file it wrote
archive_path = create_archive()

# Echo the archive name for the submission
print(f"Archive file: {archive_path}")


CREATING FINAL ARCHIVE
Added texts.txt
Added 3 audio files

 Archive created successfully!
   Archive name: bvt2201_nyathi_17_matcha_kyrgyz.tar.gz
   Archive size: 3784.54 KB
   Text files: 1
   Audio files: 3
   First 5 files: ['texts.txt', 'audio_files/1.wav', 'audio_files/2.wav', 'audio_files/3.wav']
Archive file: bvt2201_nyathi_17_matcha_kyrgyz.tar.gz
