<a href="https://colab.research.google.com/github/KrzRac/UGP/blob/main/enkoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `datasets` package, which provides the `load_dataset` function imported in this notebook.
!pip install datasets

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Folder on the mounted Drive that this notebook uses for its outputs
# (trainer output/logs, metric files, saved model); created if it is missing.
drive_results_dir = "/content/drive/MyDrive/roberta_results"
os.makedirs(drive_results_dir, exist_ok=True)

In [None]:
# Load the "rotten_tomatoes" dataset; its "train", "validation" and "test"
# splits are used later in the notebook.
dataset = load_dataset("rotten_tomatoes")

In [None]:
def preprocess_function(examples):
    """Tokenize the ``'text'`` entry of a batch of examples.

    Relies on the notebook-level ``tokenizer``. Truncation and padding are
    both enabled and the maximum length is 128.

    Args:
        examples: Mapping that holds the raw text(s) under the ``'text'`` key.

    Returns:
        The tokenizer's output for those texts.
    """
    texts = examples['text']
    encoding = tokenizer(texts, max_length=128, truncation=True, padding=True)
    return encoding

In [None]:
# Load the tokenizer that matches the "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
# Tokenize the dataset by mapping preprocess_function over it in batched mode.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# Show a summary of the tokenized dataset as a quick sanity check.
print(tokenized_datasets)

In [None]:
# Pull the three tokenized splits out under short names.
train_dataset, valid_dataset, test_dataset = (
    tokenized_datasets[split_name]
    for split_name in ("train", "validation", "test")
)

In [None]:
# Load the pretrained "roberta-base" checkpoint as a sequence-classification
# model configured for two labels.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

In [None]:
frozen_layer_count = 4  # how many of the lowest-indexed encoder layers to freeze

# A parameter is frozen when its name contains "embeddings" or one of
# "layer.0.", ..., "layer.<frozen_layer_count - 1>.". The trailing dot keeps
# "layer.1." from also matching "layer.10." / "layer.11.".
_frozen_name_markers = (
    "embeddings",
    *(f"layer.{i}." for i in range(frozen_layer_count)),
)
for param_name, parameter in model.named_parameters():
    if any(marker in param_name for marker in _frozen_name_markers):
        parameter.requires_grad = False

In [None]:
# Print the model architecture under a heading, then the names of all
# parameters that have gradients disabled.
print("RobertaForSequenceClassification:", model, sep="\n")
frozen_layers = [
    param_name
    for param_name, parameter in model.named_parameters()
    if not parameter.requires_grad
]
print("Frozen layers:", frozen_layers)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Two-element sequence unpacked as ``(logits, labels)``.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

In [None]:
import inspect

# transformers renamed two keyword arguments across releases:
#   TrainingArguments(evaluation_strategy=...) -> eval_strategy
#   Trainer(tokenizer=...)                     -> processing_class
# Newer releases reject or warn about the old names, older releases do not
# know the new ones. Inspect the installed signatures and use whichever name
# is accepted, preferring the new one when both exist.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# Evaluate and checkpoint once per epoch. The two strategies must match
# because load_best_model_at_end=True restores the checkpoint with the best
# validation accuracy after training.
training_args = TrainingArguments(
    output_dir=drive_results_dir,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir=drive_results_dir,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)

# Train on the train split; the validation split drives the per-epoch
# evaluation and therefore the best-model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
# Start the training run on the configured `trainer`.
print("Training RobertaForSequenceClassification...")
trainer.train()

In [None]:
# Evaluate the trainer's model on the test split.
eval_results = trainer.evaluate(test_dataset)

# Store the metrics (in their str() form) in the results folder.
eval_results_file = os.path.join(drive_results_dir, "eval_results.txt")
with open(eval_results_file, "w") as f:
    print(eval_results, end="", file=f)

print(f"Evaluation results saved at: {eval_results_file}")
print("Evaluation results:", eval_results)

In [None]:
print("Generating detailed evaluation report...")
# predict() returns (predictions, label_ids, metrics); only the raw model
# outputs and the reference labels are needed here.
_test_output = trainer.predict(test_dataset)
logits, labels = _test_output.predictions, _test_output.label_ids
predictions = np.argmax(logits, axis=-1)
report = classification_report(
    labels,
    predictions,
    target_names=["negative", "positive"],
    digits=4,
)

# Keep a copy of the report in the results folder.
classification_report_file = os.path.join(drive_results_dir, "classification_report.txt")
with open(classification_report_file, "w") as f:
    f.write(report)

print(f"Classification report saved at: {classification_report_file}")
print("Classification Report:\n", report)

In [None]:
# Target directories inside the results folder for the model and the tokenizer.
model_save_path = os.path.join(drive_results_dir, "roberta_finetuned")
tokenizer_save_path = os.path.join(drive_results_dir, "roberta_finetuned_tokenizer")

# Write the model and the tokenizer to their respective directories.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model saved at: {model_save_path}")
print(f"Tokenizer saved at: {tokenizer_save_path}")

In [None]:
# Classify the first three test reviews with the model as a smoke test.
# Slice the rows first so only three examples are read, not the whole column.
examples = test_dataset[:3]['text']

tokens = tokenizer(examples, truncation=True, padding=True, max_length=128, return_tensors="pt")

# Ensure the model and data are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokens = {key: val.to(device) for key, val in tokens.items()}

# Get predictions. Switch to eval mode explicitly so dropout is off no matter
# which cell ran last, and skip autograd bookkeeping for this forward pass.
model.eval()
with torch.no_grad():
    outputs = model(**tokens)
predictions = torch.argmax(outputs.logits, dim=-1)

print("Predictions:")
# .tolist() yields plain ints, avoiding a tensor comparison per review.
for example, label_id in zip(examples, predictions.tolist()):
    print(f"Review: {example}")
    print(f"Predicted sentiment: {'positive' if label_id == 1 else 'negative'}")
