In [None]:
from transformers import (
    GPT2Tokenizer,
    GPT2ForSequenceClassification,  # Use sequence classification model
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
import numpy as np
import evaluate
import os
from datasets import load_dataset
# Set a PyTorch CUDA allocator option through the environment
# (presumably to limit GPU memory fragmentation during training — confirm).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Load the accuracy metric once; compute_metrics below reuses this object.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    Args:
        eval_pred: Two-item iterable unpacked as (logits, labels).

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted)

# Step 1: Load the Rotten Tomatoes movie-review dataset
dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")

# Step 2: Initialize the GPT-2 Tokenizer and Assign a Padding Token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# The end-of-sequence token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token  # Use <|endoftext|> as the padding token

# Step 3: Tokenize the Dataset and Add Labels
def tokenize_function(example):
    """Tokenize a batch of reviews and attach the classification labels.

    Args:
        example: Batch handed over by ``dataset.map(..., batched=True)``;
            its 'text' and 'label' columns are read.

    Returns:
        The tokenizer output with an added "labels" entry copied from the
        batch's 'label' column.
    """
    tokens = tokenizer(
        example['text'],             # Process the text column
        truncation=True,             # Truncate to max length
        max_length=1024,             # Define maximum input length
        # No padding here: the DataCollatorWithPadding given to the Trainer
        # pads each batch to its longest sequence, so short reviews are not
        # blown up to 1024 tokens of padding before reaching the model.
    )
    tokens["labels"] = example['label']  # Use label from dataset for classification task
    return tokens

# Run tokenize_function over the dataset in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Expose only the model inputs and the labels, formatted as PyTorch tensors.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Select a subset of the dataset:
# fixed-seed shuffle, then the first 2000 "train" rows and 1000 "test" rows.
# Note that the evaluation subset is drawn from the "test" split.
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(2000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))

# Step 4: Define the Model for Sequence Classification
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id  # Tell the model which token id is padding

# Step 5: Create a Data Collator for Padding
# The collator is built from the same tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Step 6: Training configuration
training_args = TrainingArguments(
    output_dir='test_train_gpt2',       # checkpoints are written here
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",              # evaluate at the end of every epoch
    prediction_loss_only=False,
)

# Step 7: Wire model, data and metric into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# Step 8: Run fine-tuning
trainer.train()

# Step 9: Write the fine-tuned model and its tokenizer to the same directory
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')


In [None]:
import torch
# GPT2Tokenizer is imported here as well so this cell runs on a fresh kernel
# without first executing the training cell.
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

# Load the fine-tuned classification model and its tokenizer
model = GPT2ForSequenceClassification.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

# Input text
input_text = "This is a great product!"

# Encode the text
inputs = tokenizer(input_text, return_tensors='pt')

# Run the model; gradients are not needed for inference
with torch.no_grad():
    outputs = model(**inputs)

# Take the class with the highest logit as the prediction
logits = outputs.logits
predicted_class = logits.argmax(dim=-1).item()

print(f"Predicted class: {predicted_class}")


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Rotten Tomatoes reviews, plus the tokenizer of the uncased BERT base checkpoint.
dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" column, padding/truncating every row to 512 tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        padding="max_length",
    )


from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

# Apply tokenize_function to the dataset in batches; `dataset` is expected to
# be a Hugging Face `datasets` object.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Fixed-seed shuffle, then the first 2000 "train" rows and 1000 "test" rows.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=2)
# No TrainingArguments are built here: the only instance that is actually used
# is created right before the Trainer further down in this cell.
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation run.

    Args:
        eval_pred: Two-item iterable unpacked as (logits, labels).

    Returns:
        The result of ``metric.compute`` on the arg-max class per row.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )
from transformers import TrainingArguments, Trainer

# Evaluate once per epoch; 10 epochs, batch size 8, learning rate 5e-5.
training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch",num_train_epochs=10,per_device_train_batch_size=8,learning_rate=0.00005)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

# Persist the fine-tuned model and its tokenizer to a fixed directory, the same
# way the GPT-2 cell writes './fine_tuned_gpt2'. Without this the result is
# only available through the step-numbered checkpoints under output_dir.
trainer.save_model("./fine_tuned_bert")
tokenizer.save_pretrained("./fine_tuned_bert")

In [None]:
import os

import gradio as gr
from transformers import BertTokenizer, BertForSequenceClassification, GPT2Tokenizer, GPT2ForSequenceClassification
import torch

# Step 1: Load BERT and GPT-2 models for sentiment classification


def _resolve_checkpoint(fine_tuned_dir, base_checkpoint):
    """Return ``fine_tuned_dir`` if it exists on disk, else ``base_checkpoint``."""
    return fine_tuned_dir if os.path.isdir(fine_tuned_dir) else base_checkpoint


# The base checkpoints ship without a trained classification head, so their
# sentiment predictions are meaningless. A local fine-tuned directory is used
# whenever one is present; otherwise the base checkpoint is loaded as before.
bert_checkpoint = _resolve_checkpoint('./fine_tuned_bert', 'bert-base-uncased')
gpt2_checkpoint = _resolve_checkpoint('./fine_tuned_gpt2', 'gpt2')

# Load BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

# Load GPT-2 model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
# The stock GPT-2 tokenizer has no padding token; reuse EOS so padding works.
if gpt2_tokenizer.pad_token_id is None:
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
gpt2_model = GPT2ForSequenceClassification.from_pretrained(gpt2_checkpoint, num_labels=2)
# Tell the model which token id is padding.
gpt2_model.config.pad_token_id = gpt2_tokenizer.pad_token_id

# Step 2: Define inference functions for each model

# BERT sentiment classification
def classify_with_bert(text):
    """Classify ``text`` with the BERT sequence classifier.

    Args:
        text: Input passed to ``bert_tokenizer``.

    Returns:
        "Positive" when the arg-max class index is 1, otherwise "Negative".
    """
    encoded = bert_tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors='pt',
    )
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        scores = bert_model(**encoded).logits
    label_id = scores.argmax(dim=-1).item()
    if label_id == 1:
        return "Positive"
    return "Negative"

# GPT-2 sentiment classification
def classify_with_gpt2(text):
    """Classify ``text`` with the GPT-2 sequence classifier.

    Args:
        text: Input passed to ``gpt2_tokenizer``.

    Returns:
        "Positive" when the arg-max class index is 1, otherwise "Negative".
    """
    # A GPT-2 tokenizer without a padding token raises ValueError as soon as
    # padding is requested, so only ask for it when a pad token is configured.
    can_pad = gpt2_tokenizer.pad_token_id is not None
    inputs = gpt2_tokenizer(text, return_tensors='pt', padding=can_pad, truncation=True, max_length=1024)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = gpt2_model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=-1).item()
    return "Positive" if prediction == 1 else "Negative"

# Step 3: Define Gradio interface

def analyze_sentiment(text):
    """Run both classifiers on ``text`` and return (BERT label, GPT-2 label)."""
    return classify_with_bert(text), classify_with_gpt2(text)

# Build the Gradio UI: one text input and one output box per model.
iface = gr.Interface(
    title="Sentiment Analysis with BERT and GPT-2",
    description="This interface analyzes the sentiment of the input text using both BERT and GPT-2 models.",
    fn=analyze_sentiment,
    inputs=gr.Textbox(label="Enter Text for Sentiment Analysis"),
    outputs=[
        gr.Textbox(label="BERT Sentiment"),
        gr.Textbox(label="GPT-2 Sentiment"),
    ],
    live=False,
)

# Step 4: Launch the interface
iface.launch()
