In [None]:
import os
# Set both Weights & Biases switches in this process's environment in one go.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "disabled",
    }
)


In [None]:


import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer

# -----------------------------
# 1. Load your custom CSV dataset
# -----------------------------
# Read the raw reviews. The rest of this cell reads the "sentiment" column
# (below) and the "review" column (in tokenize), so both must be present.
df = pd.read_csv("small_movie_dataset.csv")

# Convert sentiment to numerical labels
label_map = {"positive": 1, "negative": 0, "neutral": 2}  # keep neutral if needed

# Normalise case and surrounding whitespace first so that values such as
# "Positive" or " negative " still hit a key in label_map.
sentiment_normalised = df["sentiment"].astype(str).str.strip().str.lower()
df["labels"] = sentiment_normalised.map(label_map)

# Series.map() yields NaN for every value that is not a key of label_map,
# which silently turns the column into floats. Fail here, naming the
# offending values, instead of passing NaN labels on to training.
unmapped_rows = df["labels"].isna()
if unmapped_rows.any():
    unknown_values = sorted(df.loc[unmapped_rows, "sentiment"].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a sentiment outside "
        f"{sorted(label_map)}: {unknown_values}"
    )

# Every row is mapped at this point, so the class ids can be stored as integers.
df["labels"] = df["labels"].astype("int64")

# Convert to HuggingFace Dataset format
dataset = Dataset.from_pandas(df)

# -----------------------------
# 2. Tokenizer
# -----------------------------
# Tokenizer that belongs to the same checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Tokenize the ``review`` entries of ``batch``.

    Each review is truncated or padded to exactly 256 tokens, so every
    example comes out with the same length.

    Args:
        batch: Mapping with a ``"review"`` entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer call returns for those reviews.
    """
    reviews = batch["review"]
    encoded = tokenizer(
        reviews,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    return encoded

# Apply tokenize to the dataset batch-wise (batched=True hands it batches
# rather than single rows).
tokenized = dataset.map(tokenize, batched=True)

# -----------------------------
# 3. Prepare data
# -----------------------------
# Expose only these three columns, formatted as PyTorch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
tokenized.set_format(type="torch", columns=model_columns)

# -----------------------------
# 4. Model
# -----------------------------
# Derive the classification head from label_map instead of repeating the class
# count by hand, so the two cannot drift apart. Passing id2label/label2id
# stores the human-readable class names in the model config.
id2label = {class_id: name for name, class_id in label_map.items()}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_map),   # 3 labels because neutral exists
    id2label=id2label,
    label2id=dict(label_map),
)

# -----------------------------
# 5. Training args
# -----------------------------
# Hyper-parameters for fine-tuning. Checkpoints are written under
# ./bert_large once per epoch; evaluation is switched off.
training_args = TrainingArguments(
    output_dir="./bert_large",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    eval_strategy="no",   # small dataset → disable eval
    save_strategy="epoch",
    # The string "none" is what turns the logging integrations off. Python's
    # None is not that value (in transformers 4.x it falls back to the
    # default set of integrations).
    report_to="none",
)

# -----------------------------
# 6. Trainer
# -----------------------------
# Bundle the model, the hyper-parameters and the tokenized data. Only a
# training set is supplied, in line with eval_strategy="no" above.
trainer = Trainer(train_dataset=tokenized, args=training_args, model=model)

# Run the fine-tuning loop.
trainer.train()

# -----------------------------
# 7. Save model
# -----------------------------
# Write the model and the tokenizer into the same directory so both can later
# be reloaded from that single path.
save_dir = "sentiment_bert"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)




Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss




('sentiment_bert/tokenizer_config.json',
 'sentiment_bert/special_tokens_map.json',
 'sentiment_bert/vocab.txt',
 'sentiment_bert/added_tokens.json',
 'sentiment_bert/tokenizer.json')

In [None]:


import gradio as gr
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

# Reload the tokenizer and the classifier from the "sentiment_bert" directory.
model_dir = "sentiment_bert"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Display names for the class ids used at training time
# (label_map there: negative=0, positive=1, neutral=2).
SENTIMENT_NAMES = {0: "Negative", 1: "Positive", 2: "Neutral"}


def predict(text):
    """Classify one movie review and return its sentiment as display text.

    Args:
        text: The review as a single string.

    Returns:
        "Negative", "Positive" or "Neutral" for class ids 0, 1 and 2;
        "Unknown (class N)" for any other id; a short prompt when ``text``
        is empty or only whitespace.
    """
    # Nothing to classify: answer with a prompt instead of a made-up label.
    if not text or not text.strip():
        return "Please enter a review."

    # max_length=256 mirrors the truncation length used when tokenizing the
    # training data.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        padding=True,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # argmax over the class axis (not over the flattened tensor); the input is
    # a single review, so exactly one id comes back.
    label = outputs.logits.argmax(dim=-1).item()
    return SENTIMENT_NAMES.get(label, f"Unknown (class {label})")

# Plain text in, plain text out: the textbox content is passed to predict and
# its return value is shown as the result.
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    title="Movie Review Sentiment Analyzer",
)
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2cd5dcc53daea34b9b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


