In [85]:
import pandas as pd
import numpy as np
import asyncio
from googletrans import Translator
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Settings
# NOTE(review): MAX_WORKERS is not referenced anywhere in the visible code —
# confirm whether it is still needed.
MAX_WORKERS = 5
# Number of symptom strings handed to one translation batch.
CHUNK_SIZE = 50

# Load the input data
df = pd.read_csv('data.csv')

# ================== 1. Animal processing ==================
# Plural / variant spellings mapped to the single name used downstream.
name_aliases = {
    'cattle': 'cow',
    'pigs': 'pig',
    'goats': 'goat',
    'mules': 'mule',
    'wolves': 'wolf',
    'hyaenas': 'hyena',
    'dogs': 'dog',
}

# Lowercase, trim surrounding whitespace, then drop every space character.
normalized_names = df['AnimalName'].str.lower().str.strip().str.replace(' ', '')

# Series.replace with a dict substitutes whole values only (not substrings).
df['AnimalName'] = normalized_names.replace(name_aliases)

# Class label -> normalized animal names that belong to that class.
class_mapping = {
    'птица': [
        'fowl', 'duck', 'bird', 'chicken',
        'otherbird', 'turkey', 'goose', 'birds',
    ],
    'кот': ['cat', 'tiger', 'lion'],
    'собака': ['dog', 'wolf', 'hyena', 'fox'],
    'хомячок': ['hamster', 'gerbil', 'rabbit', 'guineapig'],
}

# Inverted lookup: animal name -> class label.
reverse_mapping = {
    member: label
    for label, members in class_mapping.items()
    for member in members
}

# Look up the class for every animal name; names missing from
# reverse_mapping come back as NaN.
df['AnimalClass'] = df['AnimalName'].map(reverse_mapping)
class_counts = df['AnimalClass'].value_counts()
mask = df['AnimalClass'].isna()

# Rows without a class are assigned to the class that currently has the
# fewest rows. NOTE(review): idxmin() raises if no name was mapped at all
# (empty class_counts) — confirm the input always contains a known animal.
least_common_class = class_counts.idxmin()
df.loc[mask, 'AnimalClass'] = least_common_class

# ================== 2. Asynchronous translation ==================
symptoms_columns = ['symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']

# Flatten the symptom columns into one Series and keep each distinct
# non-missing value once, so every symptom is translated a single time.
stacked_symptoms = df[symptoms_columns].stack().dropna()
all_symptoms = stacked_symptoms.unique().tolist()

# Single translator instance shared by all translation coroutines.
translator = Translator()

async def translate_text(text):
    """Translate one string from English ('en') to Russian ('ru').

    Uses the module-level ``translator``.

    Returns:
        A ``(original, translated)`` tuple. If the translation raises any
        ``Exception``, the error is printed and the original text is
        returned in both positions instead of propagating the error.
    """
    translated = text  # fallback used when the translation fails
    try:
        response = await translator.translate(text, src='en', dest='ru')
        translated = response.text
    except Exception as e:
        print(f"Ошибка перевода для '{text}': {str(e)}")
    return text, translated

async def translate_batch(batch):
    tasks = [translate_text(text) for text in batch]
    return await asyncio.gather(*tasks)

async def main_async():
    """Translate all symptoms, extend ``df`` and write the result to CSV.

    Steps:
      1. Translate ``all_symptoms`` in batches of ``CHUNK_SIZE`` while
         showing a progress bar.
      2. Add the ``'Симптомы_рус'`` column: the translated, non-missing
         symptoms of each row joined with ``', '``.
      3. Fill missing ``'Dangerous'`` values with ``'No'``.
      4. Save ``df`` to ``processed_dataset.csv`` and print the class
         distribution.

    Works on the module-level ``df`` and returns nothing.
    """
    translations = {}
    total = len(all_symptoms)

    with tqdm(total=total, desc="Перевод симптомов") as progress:
        for start in range(0, total, CHUNK_SIZE):
            chunk = all_symptoms[start:start + CHUNK_SIZE]
            # Each result is an (original, translated) pair.
            translations.update(await translate_batch(chunk))
            progress.update(len(chunk))

    def join_translated(row):
        # Values are looked up by their string form; anything without a
        # translation is kept unchanged.
        present = (
            str(row[col]) for col in symptoms_columns if pd.notna(row[col])
        )
        return ', '.join(translations.get(item, item) for item in present)

    df['Симптомы_рус'] = df.apply(join_translated, axis=1)
    df['Dangerous'] = df['Dangerous'].fillna('No')
    df.to_csv('processed_dataset.csv', index=False)

    print("\nОбработка завершена!")
    print("Финальное распределение классов:")
    print(df['AnimalClass'].value_counts())

def run_async_code():
    """Run ``main_async`` to completion from a script or from a notebook.

    Only the loop-detection probe is guarded by ``try``; a ``RuntimeError``
    raised inside ``main_async`` itself propagates to the caller instead of
    silently triggering a second full run of the pipeline.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread (plain script): let
        # asyncio create a fresh loop and close it afterwards.
        asyncio.run(main_async())
        return

    # A loop is already running (e.g. in Jupyter): patch asyncio so the
    # running loop can be re-entered with run_until_complete.
    import nest_asyncio
    nest_asyncio.apply()
    loop.run_until_complete(main_async())

# Entry point: start the pipeline when this module is executed directly.
if __name__ == '__main__':
    run_async_code()

  # a bit longer, but that's ok.
  # a bit longer, but that's ok.
Перевод симптомов: 100%|██████████| 935/935 [00:10<00:00, 92.86it/s] 


Обработка завершена!
Финальное распределение классов:
AnimalClass
хомячок    588
птица      169
кот         69
собака      45
Name: count, dtype: int64



