In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [26]:
import pandas as pd
import re
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import accuracy_score

In [20]:
# Load dataset
df = pd.read_csv("iphone14_customer_review.csv")

In [21]:
df['sentiment'] = np.where(df['rating'] > 3, 'Positive', 'Negative')
df = df[['review', 'sentiment']]

In [22]:
# Data Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[\U00010000-\U0010FFFF]", "", text)  # Remove emojis (Unicode range)
    text = re.sub(r"read more", "", text)  # Remove 'read more' phrase
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

df["review"] = df["review"].apply(clean_text)

In [23]:
# Convert labels to numerical format
df["sentiment"] = df["sentiment"].map({"Positive": 1, "Negative": 0})

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=42
)

In [27]:

# Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=512)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.tolist()  # Ensure that labels are in list format for each sample
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels.tolist()  # Same for validation labels
})

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Function to compute accuracy during evaluation
def compute_metrics(p):
    predictions = p.predictions.argmax(axis=-1)  # Get predicted labels
    labels = p.label_ids  # Ground truth labels
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
)



# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # Pass the compute_metrics function to evaluate accuracy
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Train the model
trainer.train()

# Save the model
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)


Step,Training Loss
10,0.0747
20,0.1018
30,0.0034
40,0.0802
50,0.0965
60,0.0809
70,0.1631
80,0.1448
90,0.1526
100,0.0043


Evaluation results: {'eval_loss': 0.09003015607595444, 'eval_accuracy': 0.9853658536585366, 'eval_runtime': 1.344, 'eval_samples_per_second': 152.526, 'eval_steps_per_second': 19.345, 'epoch': 3.0}
