# Model Initiation

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import pandas as pd

In [None]:
MODEL_NAME_YA = "yandex/YandexGPT-5-Lite-8B-instruct"

tokenizer_ya = AutoTokenizer.from_pretrained(MODEL_NAME_YA, legacy=False)
model_ya = AutoModelForCausalLM.from_pretrained(
   MODEL_NAME_YA,
   device_map="cuda",
   torch_dtype="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Data Import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data_path = "/content/drive/My Drive/mag diploma/cyberleninka_ru_sents.txt"

In [None]:
with open(data_path, encoding='utf-8') as f:
    sents = f.readlines()
ru_sents = list(map(lambda x: x.replace('\n', ''), sents))

In [None]:
from statistics import mean


print('Mean length is: ' + str(mean([len(i) for i in ru_sents])))
print('Max length is: ' + str(max([len(i) for i in ru_sents])))

Mean length is: 157.31399212404628
Max length is: 1271


# Playground

In [None]:
input_text = "Ты профессиональный переводчик. Переведи на английский язык: Финский аблатив имеет окончание -lta или -ltä в зависимости от гармонии гласных."
messages = [{"role": "user", "content": input_text}]
input_ids = tokenizer_ya.apply_chat_template(
    messages, tokenize=True, return_tensors="pt"
)

outputs = model_ya.generate(input_ids, max_new_tokens=1024)
print(tokenizer_ya.decode(outputs[0][input_ids.size(1) :], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The Finnish ablative has the ending -lta or -ltä depending on vowel harmony.


# Tranlsating sentences

In [None]:
PROMPT_START = 'Ты профессиональный переводчик. Переведи на английский язык:  '
MAX_NEW_LENGTH = 100

prompts_src = [[{"role": "user", "content": PROMPT_START + text}] for text in ru_sents]

In [3]:
predictions_ya = []

for prompt in prompts_src:
    input_ids = tokenizer_ya.apply_chat_template(
        prompt, tokenize=True, return_tensors="pt"
    )

    outputs = model_ya.generate(input_ids, max_new_tokens=MAX_NEW_LENGTH)
    answer = tokenizer_ya.decode(outputs[0][input_ids.size(1) :], skip_special_tokens=True)
    predictions_ya.append(answer)

100%|██████████| 16252/16252 [2:46:46<00:00,  1.62it/s]


In [None]:
df = pd.DataFrame({'ru': ru_sents, 'en': predictions_ya})

In [None]:
df.to_csv('cyberleninka.csv', sep='\t')