In [12]:
import pandas as pd
import re
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

print("All libraries imported successfully.")

# Check if a GPU is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

All libraries imported successfully.
Using device: cpu


In [13]:
# --- Configuration ---

# 1. List all your manually graded CSV files
FILE_PATHS = [
    'ball_test.csv', 
    'bracelet_test.csv', 
    'lens_test.csv', 
    'fan_test.csv', 
    'coffee_test.csv', 
    'ps4_controller_test.csv'
]

# 2. Choose the base model to fine-tune. RoBERTa is our best performer so far.
BASE_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# 3. Define the name for your new, custom model
NEW_MODEL_NAME = "review-sentiment-roberta-custom"



In [14]:
#### Cell 4: Load and Prepare the Dataset
# --- Step 1: Load, Combine, and Preprocess Data ---

# Load all CSVs into a single DataFrame
list_of_dfs = [pd.read_csv(fp) for fp in FILE_PATHS]
df = pd.concat(list_of_dfs, ignore_index=True)

# Use the same text cleaning function from our analysis notebook
def clean_review_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace('READ MORE', '')
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text'] = df['review_text'].apply(clean_review_text)

# Handle labels: Replace 'F' with 'N' and create integer labels
df['label_str'] = df['review_feel'].replace('F', 'N')

# Define our label mapping
labels = ['positive', 'neutral', 'negative']
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# Map string labels ('P', 'O', 'N') to our new integer labels
str_to_id_map = {'P': label2id['positive'], 'O': label2id['neutral'], 'N': label2id['negative']}
df['label'] = df['label_str'].map(str_to_id_map)

# Remove rows with missing labels or text
df = df.dropna(subset=['text', 'label'])
df['label'] = df['label'].astype(int)

# Keep only the columns we need
final_df = df[['text', 'label']]

print(f"Dataset prepared. Total examples: {len(final_df)}")
print("\nLabel Distribution:")
print(final_df['label'].value_counts())
final_df.head()

Dataset prepared. Total examples: 374

Label Distribution:
label
0    302
2     52
1     20
Name: count, dtype: int64


Unnamed: 0,text,label
0,good,0
1,good,0
2,usefull for overloading the hand,0
3,good product,0
4,good,0


In [15]:
# --- Step 2: Split Data into Training and Validation Sets ---

# Split the DataFrame (e.g., 90% for training, 10% for validation)
train_df, eval_df = train_test_split(final_df, test_size=0.1, random_state=42, stratify=final_df['label'])

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

print(f"Training set size: {len(train_dataset)}")
print(f"Evaluation set size: {len(eval_dataset)}")

Training set size: 336
Evaluation set size: 38


In [16]:
# --- Step 3: Tokenize the Datasets ---

# Load the tokenizer for our base model
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Create a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenization to our datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

print("Tokenization complete.")

Map:   0%|                                                            | 0/336 [00:00<?, ? examples/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|███████████████████████████████████████████████| 336/336 [00:00<00:00, 15863.37 examples/s]
Map: 100%|██████████████████████████████████████████████████| 38/38 [00:00<00:00, 7206.38 examples/s]

Tokenization complete.





In [17]:
# --- Step 4: Load Model and Define Metrics ---

# Load the pre-trained model, configured for our 3 labels
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL, 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
).to(device) # Move model to GPU if available

# Define the function to compute metrics during evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    
    return {"accuracy": accuracy, "f1_macro": f1}

print("Base model loaded and metrics function defined.")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Base model loaded and metrics function defined.


In [18]:
# --- Step 5: Define Training Arguments (FINAL, SIMPLIFIED VERSION) ---

# We will evaluate and save at the end of each epoch.
# Training set size = 436, batch_size = 8 -> Steps per epoch = 55.
EVAL_AND_SAVE_STEPS = 55 

training_args = TrainingArguments(
    output_dir=NEW_MODEL_NAME,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    
    # --- Correct arguments for your library version ---
    do_eval=True,                      # Enable evaluation
    eval_steps=EVAL_AND_SAVE_STEPS,    # Evaluate every 55 steps
    save_steps=EVAL_AND_SAVE_STEPS,    # Save every 55 steps
    
    # We are removing `load_best_model_at_end` to work around a version-specific bug.
    # The trainer will still save and evaluate every 55 steps.
)

print("Training arguments set successfully.")

Training arguments set successfully.


In [19]:
# --- Step 6: Create and Run the Trainer (FINAL CORRECTED VERSION) ---

from transformers import DataCollatorWithPadding # <-- 1. IMPORT THE HELPER

# Instantiate the data collator, which will handle padding for each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator, # <-- 2. TELL THE TRAINER TO USE IT
)

print("Starting the fine-tuning process...")
trainer.train()
print("Fine-tuning complete!")

Starting the fine-tuning process...




Step,Training Loss
10,2.9193
20,0.8713
30,0.7967
40,0.841
50,0.6494
60,0.5201
70,0.2224
80,0.3856
90,0.2805
100,0.247




Fine-tuning complete!


In [21]:
# --- Step 7: Test Your Fine-Tuned Model (Corrected Path) ---

from transformers import pipeline

# The path must point to the specific checkpoint with the saved model files.
# This should be the last checkpoint saved during training.
MODEL_CHECKPOINT_PATH = "review-sentiment-roberta-custom/checkpoint-165" 

print(f"Loading your custom fine-tuned model from '{MODEL_CHECKPOINT_PATH}'...")

my_custom_pipeline = pipeline(
    "sentiment-analysis", 
    model=MODEL_CHECKPOINT_PATH, # <-- This now points to the correct sub-folder
    device=-1
)

print("Model loaded successfully. Running test predictions...")

reviews = [
    "The product stopped working after one day, very disappointing.",
    "It works exactly as described, I'm very happy with this purchase.",
    "The delivery was on time.",
    "this is the worst thing i have ever bought",
    "it is okay, not great but not bad either"
]

results = my_custom_pipeline(reviews)

for review, result in zip(reviews, results):
    print(f"\nReview: '{review}'")
    print(f"Predicted Sentiment: {result['label']} (Score: {result['score']:.4f})")

Loading your custom fine-tuned model from 'review-sentiment-roberta-custom/checkpoint-165'...


Device set to use cpu


Model loaded successfully. Running test predictions...

Review: 'The product stopped working after one day, very disappointing.'
Predicted Sentiment: negative (Score: 0.9935)

Review: 'It works exactly as described, I'm very happy with this purchase.'
Predicted Sentiment: positive (Score: 0.9975)

Review: 'The delivery was on time.'
Predicted Sentiment: positive (Score: 0.8855)

Review: 'this is the worst thing i have ever bought'
Predicted Sentiment: negative (Score: 0.9908)

Review: 'it is okay, not great but not bad either'
Predicted Sentiment: neutral (Score: 0.6627)
