Примените один из трансформеров, например BERT к задаче классификации отзывов клиентов. Сравните полученные результаты с классическими методами машинного обучения, с RNN. Сделайте выводы

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import time

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf
import pandas as pd

In [None]:
import os

In [None]:
def load_reviews_from_folder(label, folder_path):
    reviews = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    review_text = line.strip()
                    reviews.append({'text': review_text, 'label': label})
    return reviews

In [None]:
folder_path_positive_reviews = './p'
positive_label = 1
positive_reviews = load_reviews_from_folder(positive_label, folder_path_positive_reviews)

folder_path_negative_reviews = './n'
negative_label = 0
negative_reviews = load_reviews_from_folder(negative_label, folder_path_negative_reviews)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string


In [None]:

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
positive_reviews = positive_reviews[::2]
negative_reviews = negative_reviews[::2]

In [None]:
def process_reviews(reviews):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    processed_reviews = []
    for review in reviews:
        if 'text' not in review or 'label' not in review:
            continue
        tokens = word_tokenize(review['text'])

        processed_tokens = [
            lemmatizer.lemmatize(token.lower())
            for token in tokens
            if token.lower() not in stop_words and token.lower() not in string.punctuation
        ]

        processed_review = ' '.join(processed_tokens)

        processed_reviews.append({'text': processed_review, 'label': review['label']})

    return processed_reviews

In [None]:
processed_neg_reviews = process_reviews(negative_reviews)
processed_pos_reviews = process_reviews(positive_reviews)

In [None]:
df = pd.DataFrame(processed_pos_reviews + processed_neg_reviews, columns=['text', 'label'])

In [None]:
df.head()

Unnamed: 0,text,label
0,simplistic silly tedious,1
1,exploitative largely devoid depth sophisticati...,1
2,visually flashy narratively opaque emotionally...,1
3,thing give movie point bravado -- take entirel...,1
4,unfortunately story actor served hack script,1


In [None]:
df.tail()

Unnamed: 0,text,label
5327,performance absolute joy,0
5328,grant carry day impeccable comic timing raffis...,0
5329,exuberantly romantic serenely melancholy time ...,0
5330,standing shadow motown best kind documentary o...,0
5331,provides porthole noble trembling incoherence ...,0


In [None]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text)

    processed_tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words and token.lower() not in string.punctuation
    ]

    processed_text = ' '.join(processed_tokens)
    return processed_text

def bert_classification(train_texts, train_labels, test_texts, test_labels):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    train_texts = [preprocess_text(text) for text in train_texts]
    test_texts = [preprocess_text(text) for text in test_texts]

    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
    test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels))

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    optimizer = optim.AdamW(model.parameters(), lr=3e-5)


    loss_fn = nn.CrossEntropyLoss()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.9)

    for epoch in range(3):
        model.train()
        start_time = time.time()

        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1}/{3}", unit="batch")):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)

            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()

            if (batch_idx + 1) % 100 == 0:
                predictions = torch.argmax(outputs.logits, dim=1)
                batch_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
                print(f"Epoch {epoch + 1}/{3}, Batch {batch_idx + 1}, Batch Accuracy: {batch_accuracy:.4f}")

        elapsed_time = time.time() - start_time
        print(f"Epoch {epoch + 1}/{3} completed. Time elapsed: {elapsed_time:.2f} seconds")

        scheduler.step()

    model.eval()
    predicted_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)
            predicted_labels.extend(predicted.cpu().numpy())

    return predicted_labels


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(text)

    processed_tokens = [
        lemmatizer.lemmatize(token.lower())
        for token in tokens
        if token.lower() not in stop_words and token.lower() not in string.punctuation
    ]

    processed_text = ' '.join(processed_tokens)
    return processed_text
def classical_ml_classification(train_df, test_df, epochs=1):
    train_df['text'] = train_df['text'].apply(preprocess_text)
    test_df['text'] = test_df['text'].apply(preprocess_text)

    train_texts = train_df['text'].tolist()
    test_texts = test_df['text'].tolist()
    train_labels = train_df['label'].to_numpy()
    test_labels = test_df['label'].to_numpy()

    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    model = LogisticRegression(random_state=42)

    for epoch in range(epochs):
        model.fit(X_train, train_labels)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(test_labels, y_pred)
        report = classification_report(test_labels, y_pred)

        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)
        print("------------------------------")

    return model, vectorizer

In [None]:
def rnn_classification(train_texts, train_labels, test_texts):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_texts)

    train_sequences = tokenizer.texts_to_sequences(train_texts)
    test_sequences = tokenizer.texts_to_sequences(test_texts)

    vocab_size = len(tokenizer.word_index) + 1

    train_padded = pad_sequences(train_sequences, maxlen=128, padding='post', truncating='post')
    test_padded = pad_sequences(test_sequences, maxlen=128, padding='post', truncating='post')

    model = Sequential([
        Embedding(vocab_size, 32, input_length=128),
        Bidirectional(LSTM(64)),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(train_padded, train_labels, epochs=3, batch_size=16)

    predictions = model.predict(test_padded)
    predicted_labels = (predictions > 0.5).astype(int)

    accuracy = accuracy_score(test_labels, predicted_labels)
    precision = precision_score(test_labels, predicted_labels)
    recall = recall_score(test_labels, predicted_labels)
    f1 = f1_score(test_labels, predicted_labels)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return predicted_labels.flatten()


In [None]:
train_texts = train_df['text'].tolist()
test_texts = test_df['text'].tolist()

In [None]:
train_labels = train_df['label'].to_numpy()
test_labels = test_df['label'].to_numpy()

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [None]:
pip install tqdm



In [None]:
bert_predictions = bert_classification(train_texts, train_labels, test_texts, test_labels)

print("BERT Classification Results:")
print("Accuracy:", accuracy_score(test_labels, bert_predictions))
print("Classification Report:\n", classification_report(test_labels, bert_predictions))


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3:  37%|███▋      | 100/267 [05:14<09:41,  3.48s/batch]

Epoch 1/3, Batch 100, Batch Accuracy: 0.8750


Epoch 1/3:  75%|███████▍  | 200/267 [10:25<03:27,  3.10s/batch]

Epoch 1/3, Batch 200, Batch Accuracy: 1.0000


Epoch 1/3: 100%|██████████| 267/267 [13:54<00:00,  3.13s/batch]


Epoch 1/3 completed. Time elapsed: 834.48 seconds


Epoch 2/3:  37%|███▋      | 100/267 [05:14<08:34,  3.08s/batch]

Epoch 2/3, Batch 100, Batch Accuracy: 0.8125


Epoch 2/3:  75%|███████▍  | 200/267 [10:29<03:26,  3.09s/batch]

Epoch 2/3, Batch 200, Batch Accuracy: 0.8125


Epoch 2/3: 100%|██████████| 267/267 [13:58<00:00,  3.14s/batch]


Epoch 2/3 completed. Time elapsed: 838.66 seconds


Epoch 3/3:  37%|███▋      | 100/267 [05:13<08:34,  3.08s/batch]

Epoch 3/3, Batch 100, Batch Accuracy: 1.0000


Epoch 3/3:  75%|███████▍  | 200/267 [10:26<03:28,  3.11s/batch]

Epoch 3/3, Batch 200, Batch Accuracy: 1.0000


Epoch 3/3: 100%|██████████| 267/267 [13:55<00:00,  3.13s/batch]


Epoch 3/3 completed. Time elapsed: 835.68 seconds
BERT Classification Results:
Accuracy: 0.7478912839737581
Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.88      0.78       529
           1       0.84      0.62      0.71       538

    accuracy                           0.75      1067
   macro avg       0.77      0.75      0.74      1067
weighted avg       0.77      0.75      0.74      1067



In [None]:
ml_predictions = classical_ml_classification(train_texts, train_labels, test_texts)

print("\nClassical ML Classification Results:")
print("Accuracy:", accuracy_score(test_labels, ml_predictions))
print("Classification Report:\n", classification_report(test_labels, ml_predictions))



Classical ML Classification Results:
Accuracy: 0.7497656982193065
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.76      0.75       529
           1       0.76      0.74      0.75       538

    accuracy                           0.75      1067
   macro avg       0.75      0.75      0.75      1067
weighted avg       0.75      0.75      0.75      1067



In [None]:
model, vectorizer = classical_ml_classification(train_df, test_df, epochs=3)

Epoch 1/3
Accuracy: 0.7516
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.75       529
           1       0.75      0.76      0.76       538

    accuracy                           0.75      1067
   macro avg       0.75      0.75      0.75      1067
weighted avg       0.75      0.75      0.75      1067

------------------------------
Epoch 2/3
Accuracy: 0.7516
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.75       529
           1       0.75      0.76      0.76       538

    accuracy                           0.75      1067
   macro avg       0.75      0.75      0.75      1067
weighted avg       0.75      0.75      0.75      1067

------------------------------
Epoch 3/3
Accuracy: 0.7516
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.75       529
           1       0.75  

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
rnn_predictions = rnn_classification(train_texts, train_labels, test_texts)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.7301
Precision: 0.7216
Recall: 0.7565
F1 Score: 0.7387


ВЫводы

BERT Classification:

Преимущества:

Высокая точность классификации (74.79%).
BERT позволяет учесть контекст и зависимости между словами.
Обработка текста и токенизация выполняются автоматически с использованием BERT-токенайзера.
Модель способна работать с небольшим объемом данных благодаря передаче знаний из предобученной модели.

Недостатки:

Требует значительных вычислительных ресурсов и времени для обучения, даже после урезки датасета в 2 раза, потребовалось намного больше времени для обучения, чем для остальных способов
Возможны сложности в настройке и использовании для новых задач.

Classical ML Classification (TF-IDF + Logistic Regression):

Преимущества:

Приемлемая точность классификации (74.98%).
Простота и быстрота обучения по сравнению с BERT. (Обучилась практически мнгновенно, если сравнивать с bert)
Эффективен для небольших и средних по размеру наборов данных.

Недостатки:

Модель ограничена в способности улавливать сложные зависимости и контекст в тексте.

RNN Classification:

Преимущества:

Относительно хорошая точность классификации (73.01%).
Способность учитывать последовательные зависимости в тексте.
Реализация с использованием билинейного LSTM для улучшения понимания контекста.

Недостатки:

Требует времени на обучение, хотя из-за того, что bert требовал слишком много времени на обучение, я использовала лишь 3 эпохи, и чтобы сравнить модели, не стала увеличивать кол-во эпох в других примерах, поэтому в данном случае rnn обучилась довольно быстро.
Может потребовать больше данных для достижения лучшей обобщающей способности.


**Задание 3. Примените один из трансформеров, например BERT, к задаче генерации англоязычного и русскоязычного текстов. Сравните результаты с LSTM. Сделайте выводы.**

Т.к. bert не очень подходит для задачи генерации текста (больше подходит для классификации или извлечения информациии), то я буду использовать gpt

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn as nn
import torch.optim as optim

import numpy as np
text=''
with open('dostoevsky.txt', 'r', encoding='utf-8') as file:
    text = file.read()
text = text[:len(text)//3]

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

prompt = "I'm a little tired I want New Year already"
input_ids = tokenizer.encode(prompt, return_tensors='pt')

output = model.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print('----------------------------------')
print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------------
Generated Text: I'm a little tired. I want New Year already.

"I want to go to the gym. It's a good time. We're going to be together. And I'm going out there and I'll be ready for it."


In [None]:
prompt = "waiting for the new year"
input_ids = tokenizer.encode(prompt, return_tensors='pt')

output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print('----------------------------------')
print("Generated Text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------------------------------
Generated Text: waiting for the new year.

"I'm not going to be able to do that," he said. "I've got to get back to work. I'm going back home to my family and my kids. It's going well."
.@TroySnyder is back in the lineup for tonight's game against the Detroit Lions. pic.twitter.com/QJ9XJ6Xq9W — The Detroit News (@TheDetroitNews) December 11, 2017
, the first of three games in which Snyder will be in Detroit. The first game of the season will take place at 7 p.m. on Sunday, December 12.


In [None]:
import torch
import torch.nn as nn

class LSTMGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(LSTMGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        output = self.fc(lstm_out)
        return output, hidden

def generate_text_lstm(model, tokenizer, prompt, max_length=50):
    model.eval()

    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

    hidden = None

    generated_text = []

    for _ in range(max_length):
        output, hidden = model(input_ids, hidden)
        predicted_id = torch.argmax(output[:, -1, :]).item()

        generated_text.append(predicted_id)

        input_ids = torch.tensor([[predicted_id]])

    generated_text = tokenizer.decode(generated_text, skip_special_tokens=True)
    return generated_text

vocab_size = len(tokenizer)
embedding_dim = 256
hidden_size = 512
num_layers = 2

lstm_model = LSTMGenerator(vocab_size, embedding_dim, hidden_size, num_layers)

prompt = "waiting for the new year"
generated_text_lstm = generate_text_lstm(lstm_model, tokenizer, prompt, max_length=50)

print('----------------------------------')
print("Generated Text (LSTM):", generated_text_lstm)


----------------------------------
Generated Text (LSTM): afe PROGRAM PROGRAMCustom textureorryorryorry rescuing grains grains Roose Roose adulteryCrewODUCT legally indicating speciallyExportExport smuggled attention attention attentionMarcusigne favoredaferistfledgedfledged 2011 2011^^Custom refreshed spont refreshedCustom Catal Andromedaresist kicking kicking FANTcule%); FANTeus


Как мы видим, gpt справился куда лучше, тк LSTM инициализируется случайными весами, а не обучается на наборе данных (а она требует большой набор данных). Кроме того, можно заметить, что в gpt при увеличении max_length, улучшается и генерируемый текст

**Задание 4. Примените один из трансформеров, например BERT, к задаче машинного перевода.**

In [None]:
!pip install sentencepiece
from transformers import MarianMTModel, MarianTokenizer



In [None]:
model_name = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

source_text = "Hello, how are you?"

input_ids = tokenizer.encode(source_text, return_tensors="pt")

output_ids = model.generate(input_ids)

translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Source Text:", source_text)
print("Translated Text:", translated_text)



tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Source Text: Hello, how are you?
Translated Text: Привет, как дела?


In [None]:
source_text = "I want a New Year already"

input_ids = tokenizer.encode(source_text, return_tensors="pt")

output_ids = model.generate(input_ids)

translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Source Text:", source_text)
print("Translated Text:", translated_text)

Source Text: I want a New Year already
Translated Text: Я уже хочу новый год.


In [None]:
model_name = "Helsinki-NLP/opus-mt-ru-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
source_text = "Хочется Новый год уже"

input_ids = tokenizer.encode(source_text, return_tensors="pt")
output_ids = model.generate(input_ids)

translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Source Text:", source_text)
print("Translated Text:", translated_text)


Source Text: Хочется Новый год уже
Translated Text: I'd like New Year's already.


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

input_text = "Новый год"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

max_length = 100
for _ in range(max_length):
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits[:, -1, :]
        predicted_id = torch.argmax(logits, dim=-1)
        input_ids = torch.cat([input_ids, predicted_id.unsqueeze(1)], dim=-1)

generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(generated_text)


Новый года простивально простивально простивально простивально простивально простивально простивально простив


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'sberbank-ai/rugpt3large_based_on_gpt2'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def generate_text(prompt, model, tokenizer, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

russian_prompt = "Новый год - это"
print(generate_text(russian_prompt, model, tokenizer))

pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

Новый год - это время, когда мы начинаем жить по-новому.

В этом году мы решили сделать для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.

Мы подготовили для вас новогодний подарок.


