<a href="https://colab.research.google.com/github/GermannM3/model_sf/blob/master/Kenga_Dual_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Clone the `master` branch of the project repository into ./model_sf
!git clone -b master https://github.com/GermannM3/model_sf.git

Cloning into 'model_sf'...
remote: Enumerating objects: 222, done.[K
remote: Counting objects: 100% (222/222), done.[K
remote: Compressing objects: 100% (205/205), done.[K
remote: Total 222 (delta 35), reused 6 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (222/222), 660.56 KiB | 4.97 MiB/s, done.
Resolving deltas: 100% (35/35), done.


In [4]:
# Work from the repository root so that `src.*` imports and relative paths resolve
%cd model_sf

/content/model_sf


In [None]:
!pip install -r requirements.txt

Collecting fastapi>=0.104.1 (from -r requirements.txt (line 2))
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting python-telegram-bot>=20.6 (from -r requirements.txt (line 3))
  Downloading python_telegram_bot-21.10-py3-none-any.whl.metadata (17 kB)
Collecting asyncio>=3.4.3 (from -r requirements.txt (line 6))
  Downloading asyncio-3.4.3-py3-none-any.whl.metadata (1.7 kB)
Collecting grafana-api>=1.0.3 (from -r requirements.txt (line 13))
  Downloading grafana_api-1.0.3-py2.py3-none-any.whl.metadata (4.3 kB)
Collecting pytest-asyncio>=0.21.1 (from -r requirements.txt (line 17))
  Downloading pytest_asyncio-0.25.3-py3-none-any.whl.metadata (3.9 kB)
Collecting pytest-cov>=4.1.0 (from -r requirements.txt (line 18))
  Downloading pytest_cov-6.0.0-py3-none-any.whl.metadata (27 kB)
Collecting mypy>=1.7.0 (from -r requirements.txt (line 21))
  Downloading mypy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Coll

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
import matplotlib.pyplot as plt

# Import the training and inference modules
from src.neural.models.kenga import KengaConfig, KengaModel
from src.neural.training import KengaTrainer
from src.neural.data import TextDataset
from src.utils.tokenizer import Tokenizer

from src.neural.models.kenga_s import KengaSConfig, KengaSModel
from src.neural.training_kenga_s import KengaSTrainer

def generate_tokens(files, tokenizer):
    """Encode every non-blank line of the given text files.

    Args:
        files: Iterable of paths to UTF-8 text files, read in order.
        tokenizer: Object exposing ``encode(str)``; it is called once per
            non-blank line after surrounding whitespace is stripped.

    Returns:
        A list with one entry per line, in file/line order, holding whatever
        ``tokenizer.encode`` returned. Lines whose encoding is falsy (for
        example an empty list) are dropped.
    """
    encoded_lines = []
    for path in files:
        with open(path, "r", encoding="utf-8") as handle:
            stripped_lines = (raw.strip() for raw in handle)
            encodings = (tokenizer.encode(text) for text in stripped_lines if text)
            # Consume the lazy pipeline while the file is still open.
            encoded_lines.extend(ids for ids in encodings if ids)
    return encoded_lines

# Make sure the working directories exist, then seed a tiny placeholder corpus
# for any split whose file is missing (existing files are left untouched).
for directory in ("data", "models"):
    os.makedirs(directory, exist_ok=True)

train_files = ["data/train.txt"]
val_files = ["data/val.txt"]

for corpus_path in (*train_files, *val_files):
    if os.path.exists(corpus_path):
        continue
    with open(corpus_path, "w", encoding="utf-8") as corpus:
        corpus.write("This is a sample sentence.\nThis is another sentence.\n")

# Initialize the tokenizer, train it on the training files, and save it.
# NOTE(review): the save() arguments are presumably (output directory, name
# prefix) — confirm against Tokenizer.save.
tokenizer = Tokenizer()
tokenizer.train(train_files)
tokenizer.save("models", "kenga")

# Encode each split line by line; blank lines and empty encodings are skipped
# by generate_tokens.
train_tokens = generate_tokens(train_files, tokenizer)
val_tokens = generate_tokens(val_files, tokenizer)

# Wrap the encoded lines in the project's dataset type for the trainers below.
train_dataset = TextDataset(train_tokens)
val_dataset = TextDataset(val_tokens)

# Options shared by both trainers; only "model_path" differs per model.
shared_trainer_options = {
    "lr": 1e-4,
    "batch_size": 4,
    "epochs": 5,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

# Kenga (MML): config sized to the tokenizer's vocabulary, model, and trainer
config = KengaConfig()
config.vocab_size = tokenizer.vocab_size
model_mml = KengaModel(config)
trainer_mml = KengaTrainer(
    model_mml,
    train_dataset,
    val_dataset,
    {**shared_trainer_options, "model_path": "models/kenga.pth"},
)

# KengaS (SML): same setup with its own config, model, and trainer
config_s = KengaSConfig()
config_s.vocab_size = tokenizer.vocab_size
model_sml = KengaSModel(config_s)
trainer_sml = KengaSTrainer(
    model_sml,
    train_dataset,
    val_dataset,
    {**shared_trainer_options, "model_path": "models/kengaS.pth"},
)

# Train both models concurrently, one thread per trainer.
import threading

# An exception raised inside a worker thread does not propagate to the cell:
# join() returns normally and the notebook would carry on with an untrained
# model. Record failures per trainer and re-raise once both threads are done.
training_errors = {}


def _run_trainer(name, trainer):
    """Call ``trainer.train()`` and store any exception under ``name``."""
    try:
        trainer.train()
    except Exception as exc:
        training_errors[name] = exc


t1 = threading.Thread(target=_run_trainer, args=("Kenga (MML)", trainer_mml))
t2 = threading.Thread(target=_run_trainer, args=("KengaS (SML)", trainer_sml))
t1.start()
t2.start()
t1.join()
t2.join()

if training_errors:
    failure_summary = "; ".join(
        f"{name}: {exc!r}" for name, exc in training_errors.items()
    )
    # Chain the first failure so its full traceback is shown.
    raise RuntimeError(
        f"Training failed - {failure_summary}"
    ) from next(iter(training_errors.values()))

# After training, run a smoke-test forward pass through both models
model_mml.eval()
model_sml.eval()
input_ids = torch.randint(0, tokenizer.vocab_size, (1, 10))
# Inference only, so skip autograd bookkeeping. Each model gets the input on
# its own device: the trainers may have moved the models to CUDA, while
# torch.randint allocates on the CPU.
# NOTE(review): assumes both models are torch.nn.Module instances with parameters.
with torch.no_grad():
    mml_output = model_mml(input_ids.to(next(model_mml.parameters()).device))
    sml_output = model_sml(input_ids.to(next(model_sml.parameters()).device))
print("Kenga (MML) Output:", mml_output)
print("KengaS (SML) Output:", sml_output)

# Plot the training progress, one panel per model, but only when both trainers
# recorded a non-empty training-loss history.
if trainer_mml.train_loss_history and trainer_sml.train_loss_history:
    panels = [
        (trainer_mml, "MML", "Kenga (MML) Training Progress"),
        (trainer_sml, "SML", "KengaS (SML) Training Progress"),
    ]

    plt.figure(figsize=(12, 5))
    for position, (trainer, tag, title) in enumerate(panels, start=1):
        # The x axis is the 1-based epoch number, sized from the training history.
        epochs = range(1, len(trainer.train_loss_history) + 1)

        plt.subplot(1, 2, position)
        plt.plot(epochs, trainer.train_loss_history, label=f'Train Loss ({tag})')
        plt.plot(epochs, trainer.val_loss_history, label=f'Val Loss ({tag})')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title(title)
        plt.legend()

    plt.show()


In [None]:
# Если настроены креденциалы Git, можно отправить чекпоинты в GitLab
!git config --global user.email "your_email@example.com"
!git config --global user.name "Your Name"
!git add models/
!git commit -m "Обновленные чекпоинты и графики обучения после тренировки в Colab"
!git push gitlab main