In [2]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 1. Parameters
# Hub id of the base checkpoint to fine-tune and the CSV file with the training rows.
model_name = "openai-community/gpt2-large"
dataset_path = "/content/drive/MyDrive/IMBD_Train.csv"  # path to your table

# 2. Load the tokenizer and the model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad_token, so reuse EOS for padding

model = GPT2LMHeadModel.from_pretrained(model_name)

# 3. Load the dataset
# The rows are later accessed through raw_ds["train"]; the columns 'rate' and 'Text'
# are read by format_row.
raw_ds = load_dataset("csv", data_files=dataset_path)

def rate_1_8(r):
    """Collapse a rating onto a contiguous scale by closing a gap of two.

    Ratings greater than 6 are shifted down by 2; ratings of 6 or below are
    returned unchanged.
    """
    return r - 2 if r > 6 else r

# 4. Turn dataset rows into training text
def format_row(row):
    """Render one dataset row as a "Rate: <rating>, Text: <review>" string.

    The row's 'rate' value is first remapped with rate_1_8; the 'Text' value
    is inserted as-is.
    """
    rating = rate_1_8(row['rate'])
    review_text = row['Text']
    return f"Rate: {rating}, Text: {review_text}"

# Build the "text" field for every row from format_row's output.
# Note: this rebinds raw_ds, so from here on raw_ds refers to the mapped dataset.
raw_ds = raw_ds.map(lambda x: {"text": format_row(x)})

# 5. Tokenization
def tokenize_fn(batch):
    """Tokenize the "text" entries of a batch to a fixed length of 256.

    Longer texts are truncated and shorter ones are padded to max_length,
    using the module-level tokenizer.
    """
    texts = batch["text"]
    encoded = tokenizer(
        texts,
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize in batches and drop every original column so only the tokenizer output remains.
tokenized_ds = raw_ds.map(tokenize_fn, batched=True, remove_columns=raw_ds["train"].column_names)

# 6. Data collator for language modeling (mlm=False: no masked-LM objective)
# NOTE(review): pad_token was set to eos_token above. If this collator excludes pad
# positions from the loss, EOS is never a training target and the model may not learn
# where a review ends — confirm against the collator documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 7. Training parameters
training_args = TrainingArguments(
    output_dir="./gpt2-large-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,    # an A100 can handle 8-16
    gradient_accumulation_steps=2,    # overall effective batch ~16-32
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=2,
    fp16=True,                        # A100 supports mixed precision
    dataloader_num_workers=4,
    report_to="none"                  # avoid pulling in wandb
)


# 8. Trainer (train split only; no evaluation dataset is passed)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
)

# 9. Start training
trainer.train()

# 10. Save the model and the tokenizer to the same Drive directory,
# which the generation cell later loads from.
trainer.save_model("/content/drive/MyDrive/gpt2-large-finetuned")
tokenizer.save_pretrained("/content/drive/MyDrive/gpt2-large-finetuned")


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Step,Training Loss
10,3.4388
20,3.4909
30,3.3927
40,3.3264
50,3.2636
60,3.2613
70,3.2837
80,3.2095
90,3.1998
100,3.2533


('/content/drive/MyDrive/gpt2-large-finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2-large-finetuned/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2-large-finetuned/vocab.json',
 '/content/drive/MyDrive/gpt2-large-finetuned/merges.txt',
 '/content/drive/MyDrive/gpt2-large-finetuned/added_tokens.json')

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# 1. Load the fine-tuned model and tokenizer from the directory written by the training cell
model_path = "/content/drive/MyDrive/gpt2-large-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.eval()  # put the model in evaluation mode for generation
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 2. Generation function
def generate_review(rate, max_length=100, num_return_sequences=1):
    """Sample reviews from the fine-tuned model for a given rating.

    Args:
        rate: Rating inserted into the "Rate: {rate}, Text:" prompt.
        max_length: Maximum number of tokens to generate after the prompt.
        num_return_sequences: Number of independent samples to draw.

    Returns:
        A list with one string per sampled sequence. Each string contains only
        the generated continuation (prompt removed, surrounding whitespace
        stripped).
    """
    prompt = f"Rate: {rate}, Text:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Number of prompt tokens; every returned sequence starts with these.
    prompt_len = inputs["input_ids"].shape[1]

    output_sequences = model.generate(
        **inputs,
        max_new_tokens=max_length,  # budget counts generated tokens only, not the prompt
        temperature=0.8,          # generation creativity
        top_k=50,                 # randomness of token choice
        top_p=0.95,               # nucleus sampling
        do_sample=True,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens after the prompt. Splitting the decoded text on
    # "Text:" and keeping the last piece would silently drop the beginning of
    # any review that itself contains "Text:".
    return [
        tokenizer.decode(seq[prompt_len:], skip_special_tokens=True).strip()
        for seq in output_sequences
    ]

# 3. Example usage: read a rating from the user and print three sampled reviews
rating_input = int(input("Введите рейтинг (1-8): "))
reviews = generate_review(rating_input, max_length=100, num_return_sequences=3)

for number, review_text in enumerate(reviews, start=1):
    print(f"\nReview {number}:\n{review_text}")


Введите рейтинг (1-8): 3

Review 1:
The movie has the same feel as a "drama" which is a rather odd thing to have in a movie, especially a movie which has a great cast. However, this does not detract from the quality of the movie. The acting is average, but that is expected, because it's not your typical movie. In fact, the only thing that made this movie a 3 was the acting. I don't know about you, but I am a very slow viewer. I would much rather watch

Review 2:
In this film, the main character, David, is a gay, British film student living in France. He meets a girl, Gisele, on a night out, they fall in love and decide to marry. However, the wedding is stolen, and Gisele is killed by her abusive husband. David then has to go through a series of horrible events to track down the killer.<br /><br />The acting is very weak. The camera work is slow and choppy, the

Review 3:
I saw this film at the New York Film Festival and was really impressed by it. The performances were excellent and th

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

# Model used to score the generated reviews
eval_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
eval_tokenizer = AutoTokenizer.from_pretrained(eval_model_name)
eval_model = AutoModelForSequenceClassification.from_pretrained(eval_model_name)
eval_model.eval()  # put the evaluator in evaluation mode
# Use the GPU when one is available, otherwise fall back to the CPU.
# This rebinds the notebook-level `device` that generate_review also reads.
device = "cuda" if torch.cuda.is_available() else "cpu"
eval_model.to(device)

# Returns a score between 0 and 1
def sentiment_score(text):
    """Return the evaluator's probability that `text` is positive.

    Args:
        text: Review text to score.

    Returns:
        A float in [0, 1]: the softmax probability of class index 2, which
        this notebook treats as the positive class.
    """
    # Truncate explicitly so that an overly long text cannot overflow the
    # evaluator's position embeddings and raise inside the forward pass.
    inputs = eval_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)
    with torch.no_grad():
        logits = eval_model(**inputs).logits
    probs = torch.softmax(logits, dim=-1)
    # Positive class (index 2), first (and only) item in the batch
    return probs[0][2].item()

# Target value in [0, 1] for each 1-8 rating used in the prompts. The spacing is
# uneven: ratings 1-4 map to 0..3/9 and ratings 5-8 map to 6/9..1.
# (Alternative, not used: linear scaling (rating_input - 1) / (8 - 1).)
rate_norm = {1: 0, 2: 1/9, 3: 2/9, 4: 3/9, 5: 6/9, 6: 7/9, 7: 8/9, 8: 1}

mae_list = []

for rating_input in range(1, 9):
    # Sample 24 reviews for this rating and score each one with the evaluator.
    reviews = generate_review(rating_input, max_length=100, num_return_sequences=24)
    normalized_rating = rate_norm[rating_input]
    scores = [sentiment_score(r) for r in reviews]

    # Mean absolute error between the sentiment scores and the rating's target.
    mae = np.mean(np.abs(np.asarray(scores) - normalized_rating))
    mae_list.append(mae)

# Average of the per-rating MAE values.
print(f"mae: {sum(mae_list)/len(mae_list)}")


0.19358987867693672
