In [8]:
!pip install datasets



In [None]:
!pip install openai==0.28

In [9]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import accelerate
import random

In [10]:
# Report whether a CUDA device is visible and remember the matching torch device
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")

True


In [11]:
# Start from the pre-trained DistilBERT encoder with a fresh two-class classification head
print("Loading pre-trained DistilBERT...")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Loading pre-trained DistilBERT...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preprocess the dataset

In [12]:
def preprocess_data(examples, tokenizer, device, max_length=512):
    """
    Tokenize a batch of reviews and turn every tokenizer output into a tensor.

    Args:
        examples: Mapping with a "text" entry holding the batch of raw strings.
        tokenizer: Callable tokenizer; invoked with truncation enabled and
            padding to exactly `max_length` tokens.
        device: Target passed to `Tensor.to` for every produced tensor
            (may be a CPU or a GPU device).
        max_length: Sequence length used for both padding and truncation.

    Returns:
        The tokenizer's output object, with each of its fields replaced by a
        tensor located on `device`.
    """
    batch = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # Replace each field in place with its tensor form on the requested device
    for field in list(batch):
        as_tensor = torch.tensor(batch[field])
        batch[field] = as_tensor.to(device)

    return batch

# Load dataset
print("Loading dataset...")
dataset = load_dataset("imdb")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Draw a reproducible random sample of SAMPLE_SIZE rows from each split:
# shuffle with a fixed seed, then keep the first SAMPLE_SIZE rows of the shuffled split.
SAMPLE_SIZE = 5000
SHUFFLE_SEED = 40
train_sample = dataset["train"].shuffle(seed=SHUFFLE_SEED).select(range(SAMPLE_SIZE))
test_sample = dataset["test"].shuffle(seed=SHUFFLE_SEED).select(range(SAMPLE_SIZE))

# Preprocess the sampled dataset.
# Tokenization runs on the CPU: `Dataset.map` writes its results back into the
# dataset's host-side storage, so pushing every batch to the GPU first would only
# add a device round trip. The Trainer moves each batch to the training device itself.
print("Preprocessing sampled dataset...")
preprocess_device = torch.device("cpu")
tokenized_train = train_sample.map(
    lambda batch: preprocess_data(batch, tokenizer, preprocess_device), batched=True
)
tokenized_test = test_sample.map(
    lambda batch: preprocess_data(batch, tokenizer, preprocess_device), batched=True
)

# The model expects the target column to be called "labels"
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_test = tokenized_test.rename_column("label", "labels")

# Expose only the model inputs, as torch tensors
tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Now you have tokenized, sampled datasets
train_dataset = tokenized_train
test_dataset = tokenized_test

Loading dataset...
Preprocessing sampled dataset...


In [13]:
def compute_metrics(pred):
    """
    Score a set of predictions with accuracy, precision, recall and F1.

    Args:
        pred: A pair `(logits, labels)`; `logits` holds one row of class
            scores per example, `labels` the reference class ids (either a
            torch tensor or an array-like).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use binary averaging.
    """
    raw_scores, gold = pred

    # The predicted class is the column with the highest score in each row
    predicted = torch.tensor(raw_scores).argmax(dim=1).cpu().numpy()

    # Reference labels may arrive as a tensor; the metric functions get arrays
    if isinstance(gold, torch.Tensor):
        gold = gold.cpu().numpy()

    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average="binary"
    )

    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }


### Define Trainer and TrainingArguments

In [14]:
# Training configuration.
# NOTE(review): `eval_strategy` and `processing_class` are the current names of the
# former `evaluation_strategy` and `tokenizer` arguments; they need a recent
# transformers release (the deprecation warning this cell used to emit indicates one is installed).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",        # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save the model at the end of each epoch (must match eval strategy)
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,  # Reload the best checkpoint when training finishes
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Mixed precision only when a CUDA device is present, so the notebook
    # still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
    report_to="none",
    # `no_cuda=False` was dropped: it is the default and the argument is deprecated.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


### Train the model

In [15]:
# Use the GPU when one is visible, otherwise the CPU, and place the model on it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [16]:
print("Training the model...")
# Report where the model's parameters live before starting
model_device = next(model.parameters()).device
print(f"Model is on device: {model_device}")
trainer.train()

Training the model...
Model is on device: cuda:0


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.253806,0.8934,0.876201,0.915261,0.895305
2,0.280100,0.362496,0.889,0.839831,0.960241,0.896009
3,0.280100,0.357314,0.9164,0.920114,0.911245,0.915658


TrainOutput(global_step=939, training_loss=0.19960033347685371, metrics={'train_runtime': 289.4679, 'train_samples_per_second': 51.819, 'train_steps_per_second': 3.244, 'total_flos': 1987010979840000.0, 'train_loss': 0.19960033347685371, 'epoch': 3.0})

In [17]:
# Score the trained model on the evaluation set and show the resulting metric dictionary
print("Evaluating the model...")
results = trainer.evaluate()
results_line = f"Results: {results}"
print(results_line)

Evaluating the model...


Results: {'eval_loss': 0.3573143184185028, 'eval_accuracy': 0.9164, 'eval_precision': 0.9201135442011354, 'eval_recall': 0.9112449799196787, 'eval_f1': 0.9156577885391445, 'eval_runtime': 20.5768, 'eval_samples_per_second': 242.992, 'eval_steps_per_second': 15.211, 'epoch': 3.0}


In [18]:
# Write the fine-tuned weights and the tokenizer files into one directory
export_dir = "./distilbert-imdb-classifier"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('./distilbert-imdb-classifier/tokenizer_config.json',
 './distilbert-imdb-classifier/special_tokens_map.json',
 './distilbert-imdb-classifier/vocab.txt',
 './distilbert-imdb-classifier/added_tokens.json')

In [19]:
import openai
import pandas as pd
import time

In [20]:
# Set your OpenAI API key.
# The key is never stored in the notebook: it is read from the OPENAI_API_KEY
# environment variable, and if that is unset you are prompted for it (input hidden).
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [21]:
# Sample 5000 rows of the IMDB test split (seeded shuffle, then the first 5000 rows).
# Reuses the `dataset` object loaded earlier in the notebook.
eval_data = (
    dataset["test"]
    .shuffle(seed=40)
    .select(range(5000))
)

In [22]:
# Prompt template used for every sentiment query
def create_prompt(review_text):
    """
    Build the classification prompt for a single movie review.

    Args:
        review_text: The review to embed (in double quotes) in the prompt.

    Returns:
        A multi-line string: the instruction to answer "Positive" or
        "Negative", the quoted review, and a trailing "Sentiment:" cue.
    """
    prompt = f"""
    The following text is a movie review. Classify it as either "Positive" or "Negative" based on the sentiment expressed in the review.
    Review: "{review_text}"
    Sentiment:
    """
    return prompt

### Loop through the evaluation dataset and collect responses

In [23]:
def query_openai(review_text, max_retries=2, retry_delay=2.0):
    """
    Sends a review to GPT-3.5 for sentiment classification.

    A failed request is retried a limited number of times with exponential
    backoff, so a transient error (rate limit, timeout) does not immediately
    turn into a missing prediction.

    Args:
        review_text: The text of the movie review.
        max_retries: How many additional attempts to make after the first
            failure (0 disables retrying).
        retry_delay: Seconds to wait before the first retry; the wait doubles
            on each subsequent retry. A value <= 0 retries without waiting.

    Returns:
        The model's reply with surrounding whitespace removed, or None if
        every attempt failed.
    """
    prompt = create_prompt(review_text)
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                max_tokens=10,   # the expected answer is a single word
                temperature=0,   # deterministic output
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: report the problem and either retry or give up
            print(f"Error: {e}")
            is_last_attempt = attempt == attempts - 1
            if not is_last_attempt and retry_delay > 0:
                time.sleep(retry_delay * (2 ** attempt))

    return None

In [24]:
# Parallel lists: reference sentiment and GPT answer for each evaluated review
true_labels = []
predicted_labels = []

for example in eval_data:
    # Dataset label 1 means a positive review; anything else is treated as negative
    gold = "Positive" if example["label"] == 1 else "Negative"

    # Ask GPT for its classification (None when the request failed)
    guess = query_openai(example["text"])

    true_labels.append(gold)
    predicted_labels.append(guess)

    # Pause between requests to stay below the API rate limits
    time.sleep(1)


In [25]:
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

In [26]:
def _clean_label(label):
    """Return a raw label lower-cased, without surrounding whitespace, quotes or punctuation ("" for None)."""
    if label is None:
        return ""
    return str(label).strip().strip("\"'.!:").strip().lower()


def _label_to_int(label):
    """Map a sentiment label to 1 (positive) or 0 (everything else).

    Integers are returned unchanged, so re-running this cell on already
    converted labels does not collapse every label to 0.
    """
    if isinstance(label, int):
        return label
    return 1 if _clean_label(label).startswith("positive") else 0


# Count answers that are neither "Positive" nor "Negative" (this includes failed
# API calls, which are stored as None). They are scored as Negative below, so
# make that visible instead of letting it silently distort the metrics.
n_unrecognized = sum(
    1
    for label in predicted_labels
    if not isinstance(label, int)
    and not _clean_label(label).startswith(("positive", "negative"))
)
if n_unrecognized:
    print(f"Warning: {n_unrecognized} responses were neither Positive nor Negative; they are scored as Negative.")

# Convert both label lists to 0/1. Matching is tolerant of case, quotes and
# trailing punctuation (e.g. "Positive." or "positive").
true_labels = [_label_to_int(label) for label in true_labels]
predicted_labels = [_label_to_int(label) for label in predicted_labels]

# Calculate metrics with average='binary' (class 1 = positive)
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, pos_label=1, average='binary')
recall = recall_score(true_labels, predicted_labels, pos_label=1, average='binary')
f1 = f1_score(true_labels, predicted_labels, pos_label=1, average='binary')

# Print evaluation metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Save results to a CSV file (one row per evaluated review, labels as 0/1)
results_df = pd.DataFrame({
    "Review": [row["text"] for row in eval_data],
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})
results_df.to_csv("imdb_openai_evaluation_results.csv", index=False)
print("Results saved to imdb_openai_evaluation_results.csv")

Evaluation Metrics:
Accuracy: 0.9180
Precision: 0.9622
Recall: 0.8605
F1 Score: 0.9085
Results saved to imdb_openai_evaluation_results.csv
