In [1]:
# Import necessary libraries
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Read the three pre-cleaned Stanford Sentiment Treebank splits (train / dev / test)
# from CSV files in the working directory. Unpacking the lazy ``map`` reads the
# files one after another, in the order listed.
train_df, dev_df, test_df = map(
    pd.read_csv,
    ("train_cleaned.csv", "dev_cleaned.csv", "test_cleaned.csv"),
)

# Show the first rows of the training split as a quick sanity check.
print(train_df.head())

# Tokenizer matching the "bert-base-uncased" checkpoint used for the model below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Function to tokenize text
def tokenize_function(examples):
    """Tokenize the ``"sentence"`` entry of ``examples`` with the module-level tokenizer.

    Args:
        examples: Mapping-like object indexed by ``"sentence"``; its value is
            passed straight to the tokenizer.

    Returns:
        Whatever the tokenizer returns when asked to pad to ``max_length`` and
        truncate, with ``max_length`` set to 128.
    """
    sentences = examples["sentence"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(sentences, **encoding_options)

# Wrap each pandas split in a Hugging Face ``datasets.Dataset``.
# NOTE: this import rebinds the name ``Dataset`` that was imported from
# ``torch.utils.data`` at the top of the file; from here on ``Dataset`` refers
# to the Hugging Face class.
from datasets import Dataset

train_dataset, dev_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Run the tokenizer over every split in batched mode (train, then dev, then test).
train_dataset, dev_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, dev_dataset, test_dataset)
)

# Sequence-classification BERT loaded from the "bert-base-uncased" checkpoint and
# configured for 5 output labels (the five sentiment classes used at inference).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
)

# Fine-tuning hyper-parameters, collected in one mapping and unpacked into
# ``TrainingArguments`` below.
# NOTE(review): newer transformers releases appear to rename
# ``evaluation_strategy`` to ``eval_strategy`` — confirm against the installed version.
training_options = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 10,
}
training_args = TrainingArguments(**training_options)

# Metric callback for the Trainer: without one, evaluation only reports the loss.
def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    Args:
        eval_pred: The pair the Trainer passes to its metric callback; it is
            unpacked as ``(logits, labels)``.

    Returns:
        A dict with a single ``"accuracy"`` entry computed by
        ``sklearn.metrics.accuracy_score`` on the arg-max class per example.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the dev split. The returned metrics dict is kept and
# printed; previously it was computed and silently discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Write the current model and tokenizer into one directory, then read both back
# from that directory so the inference code below runs on the saved artifacts.
saved_model_dir = "bert_sentiment_model"

model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

model = BertForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)

# Function to predict sentiment of a given text
def predict_sentiment(text):
    """Classify ``text`` into one of five sentiment labels.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to classify. Only a single sequence is supported,
            because the predicted class index is read with ``.item()``.

    Returns:
        One of ``"very negative"``, ``"negative"``, ``"neutral"``,
        ``"positive"`` or ``"very positive"``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Keep the encoded tensors on the same device as the model so the call
    # also works when the model has been moved to an accelerator.
    inputs = inputs.to(model.device)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    label_map = {0: "very negative", 1: "negative", 2: "neutral", 3: "positive", 4: "very positive"}
    return label_map[prediction]

# Smoke test: classify one hand-written review and print the predicted label.
test_text = "The movie was fantastic and very enjoyable!"
predicted_label = predict_sentiment(text=test_text)
print(f"Predicted Sentiment: {predicted_label}")


   label                                           sentence
0      3  The Rock is destined to be the 21st Century 's...
1      4  The gorgeously elaborate continuation of `` Th...
2      3  Singer\/composer Bryan Adams contributes a sle...
3      2  You 'd think by now America would have had eno...
4      3               Yet the act is still charming here .


ModuleNotFoundError: No module named 'datasets'