In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

# Location of the raw sentiment dataset inside the Kaggle input mount.
csv_path = "/kaggle/input/sentiment-bert/sentiment-analysis-extended-v2.csv"

# Load the file without treating any line as a header, so each line
# (including the file's own header line) lands in the single "raw" column.
df = pd.read_csv(csv_path, names=["raw"], header=None)

# Quick look at the first few unparsed lines.
print(df.head())

# Function to extract text + sentiment
def extract_text_and_label(row):
    """Split one raw CSV line into its quoted text and its sentiment label.

    The line must begin with a double-quoted text field that is followed by
    a comma and the word Positive or Negative (matched case-insensitively).

    Returns a two-element ``pd.Series``: ``[text, label]`` with the text
    stripped and the label stripped and lower-cased, or ``[None, None]`` when
    the line does not have that shape, so the caller can drop it afterwards.
    """
    # Group 1: the quoted text. Group 2: the sentiment word.
    pattern = r'\"(.+?)\",\s*(Positive|Negative)'
    found = re.match(pattern, str(row), flags=re.IGNORECASE)

    # Guard clause: unparseable lines yield an all-None pair.
    if found is None:
        return pd.Series([None, None])

    raw_text, raw_label = found.groups()
    return pd.Series([raw_text.strip(), raw_label.strip().lower()])

# Parse every raw line into a (text, label) pair; lines the regex rejects
# come back as (None, None).
df[['text', 'label']] = df['raw'].apply(extract_text_and_label)

# Discard the lines that could not be parsed.
df = df.dropna(subset=['text', 'label'])

# Encode the lower-cased sentiment strings as integer class ids.
label2id = {'negative': 0, 'positive': 1}
df['label_id'] = df['label'].map(label2id)

print(df.head())

# Stratified 80/10/10 split: carve off 20% first, then halve that remainder
# into the validation and test sets.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label_id'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label_id'],
)

print("Splits:", *(len(part) for part in (train_df, val_df, test_df)))

print("10 rows of train_df:")

# Sampling picks rows independently of the columns, so selecting the column
# after sampling shows the same ten texts.
print(train_df.sample(10, random_state=42)[['text']])

                                                 raw
0  Text, Sentiment, Source, Date/Time, User ID, L...
1  "I love this product!", Positive, Twitter, 202...
2  "The service was terrible.", Negative, Yelp Re...
3  "This movie is amazing!", Positive, IMDb, 2023...
4  "I'm so disappointed with their customer suppo...
                                                 raw  \
1  "I love this product!", Positive, Twitter, 202...   
2  "The service was terrible.", Negative, Yelp Re...   
3  "This movie is amazing!", Positive, IMDb, 2023...   
4  "I'm so disappointed with their customer suppo...   
5  "Just had the best meal of my life!", Positive...   

                                               text     label  label_id  
1                              I love this product!  positive         1  
2                         The service was terrible.  negative         0  
3                            This movie is amazing!  positive         1  
4  I'm so disappointed with their customer suppor

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow, evaluate
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [4]:
import evaluate
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer

# Checkpoint name shared by the tokenizer here and the classifier loaded
# further down in this cell.
model_name = "bert-base-uncased"

# Token length used for both padding and truncation during tokenization.
max_length = 128

# Fast tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def tokenize_fn(batch):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Texts are truncated and padded using the module-level ``max_length``.
    Returns whatever the tokenizer call returns.
    """
    texts = batch["text"]
    encoding = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    return encoding

def _to_hf_dataset(frame):
    """Convert one pandas split into a Hugging Face ``Dataset``.

    Keeps only the ``text`` and ``label_id`` columns, resets the index so the
    old pandas index is not carried along, and renames ``label_id`` to
    ``label`` (the column name expected by transformers).
    """
    hf_ds = Dataset.from_pandas(frame[['text', 'label_id']].reset_index(drop=True))
    return hf_ds.rename_column("label_id", "label")


# Build hf datasets from pandas (one helper call per split instead of three
# copy-pasted conversions).
train_ds = _to_hf_dataset(train_df)
val_ds = _to_hf_dataset(val_df)
test_ds = _to_hf_dataset(test_df)

dataset = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

# map tokenization (batched) - reuse tokenize_fn rather than repeating its
# tokenizer call in an inline lambda, so the settings live in one place.
dataset = dataset.map(tokenize_fn, batched=True)

# set format to PyTorch tensors
dataset.set_format(type="torch", columns=['input_ids','attention_mask','label','token_type_ids'])

# Binary classification head on top of the pretrained checkpoint.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Simplified metrics - only accuracy
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` evaluation pair into an accuracy dict.

    The predicted class is the argmax over the last axis of the logits; the
    score is computed by the module-level ``accuracy`` metric and returned
    under the ``"accuracy"`` key.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    result = accuracy.compute(references=labels, predictions=predicted_ids)
    return {"accuracy": result["accuracy"]}

# Training arguments with UPDATED parameter names
training_args = TrainingArguments(
    output_dir="./bert-sentiment",
    eval_strategy="epoch",           # CHANGED: evaluation_strategy -> eval_strategy
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook still runs on a CPU-only session.
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_strategy="epoch",
    report_to="none",
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # CHANGED: tokenizer -> processing_class (the `tokenizer` argument is
    # deprecated and triggers a FutureWarning on this transformers version).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Directory that receives both the fine-tuned weights and the tokenizer files.
final_dir = "/kaggle/working/bert-sentiment-final"

print("Starting training...")
train_output = trainer.train()

# Score the held-out test split with the trained model.
print("\nEvaluating on test set...")
test_metrics = trainer.evaluate(dataset["test"])
print(f"Test Loss: {test_metrics['eval_loss']:.4f}")
print(f"Test Accuracy: {test_metrics['eval_accuracy']:.4f}")

# Persist the model, then the tokenizer, into the same directory.
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

print("\nModel saved successfully!")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/404 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


Starting training...




Epoch,Training Loss,Validation Loss,Accuracy
1,0.5535,0.341788,1.0
2,0.2913,0.229894,1.0
3,0.2133,0.188164,1.0





Evaluating on test set...




Test Loss: 0.3327
Test Accuracy: 1.0000

Model saved successfully!


In [5]:
from transformers import pipeline
# Human-readable names for the integer class ids (0 = negative, 1 = positive).
id2label = {0: "NEGATIVE", 1: "POSITIVE"}

# Load the model AND tokenizer from the saved directory, so this cell checks
# the exported artifact and does not depend on an in-memory tokenizer.
# The deprecated `return_all_scores` argument is dropped: the default already
# returns only the top label per input.
clf = pipeline("text-classification", model="/kaggle/working/bert-sentiment-final")

examples = [
    "I love this product!",
    "The service was terrible.",
    "I'm so disappointed with their customer support."
]

predictions = clf(examples)
print(predictions)

# The pipeline reports the label names stored in the model config (e.g.
# 'LABEL_1'). Translate each one to its class id through the config's own
# label2id table, then to a readable name via id2label.
config_label2id = clf.model.config.label2id
for text, pred in zip(examples, predictions):
    readable = id2label[config_label2id[pred["label"]]]
    print(f"{readable} ({pred['score']:.4f}): {text}")


Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.86412513256073}, {'label': 'LABEL_0', 'score': 0.6569504141807556}, {'label': 'LABEL_0', 'score': 0.6091249585151672}]


In [6]:
# ---------------------------------
# 1.  create a ZIP of the whole folder
# ---------------------------------
import shutil, os
zip_path = "/kaggle/working/bert-sentiment-final.zip"
# make_archive appends ".zip" itself, so it is given the path without the
# extension. splitext removes only the trailing extension, whereas
# str.replace would strip every ".zip" occurring anywhere in the path.
shutil.make_archive(os.path.splitext(zip_path)[0], 'zip', "/kaggle/working/bert-sentiment-final")



'/kaggle/working/bert-sentiment-final.zip'