In [5]:
!pip install -q transformers datasets torch accelerate pandas gradio scikit-learn

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

print("Libraries installed and W&B disabled.")


Libraries installed and W&B disabled.


In [6]:
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import gradio as gr


In [7]:
# Module-level tokenizer shared by training (tokenize_fn) and inference code.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_fn(batch):
    """Tokenize the ``review`` field of a batch.

    Args:
        batch: Mapping with a ``"review"`` entry holding the texts to encode
            (called with ``batched=True`` by ``Dataset.map`` in this notebook).

    Returns:
        The tokenizer output for those texts, produced with truncation enabled
        and ``padding="max_length"`` at ``max_length=256``.
    """
    texts = batch["review"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    return encoded


In [8]:
def batch_analyze(df_input, model, batch_size=32, max_length=256):
    """Predict a sentiment label and confidence for every row of ``df_input``.

    Uses the module-level ``tokenizer`` to encode the ``review`` column and runs
    the model in evaluation mode without gradient tracking.

    Args:
        df_input: DataFrame with a ``review`` column; values are cast to ``str``.
        model: Sequence-classification model whose output exposes ``.logits``.
            Class indices are decoded as 0 -> negative, 1 -> positive,
            2 -> neutral.
        batch_size: Number of reviews scored per forward pass (default 32).
        max_length: Truncation length for tokenization. Defaults to 256 so that
            inference sees inputs of the same maximum length as ``tokenize_fn``
            uses for training.

    Returns:
        tuple: ``(df_out, stats)`` where ``df_out`` is a copy of ``df_input``
        with added ``predicted`` (label string) and ``confidence`` (softmax
        probability of the predicted class) columns, and ``stats`` maps each
        predicted label to its share of rows in percent, rounded to 2 decimals.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If ``df_input`` has no ``review`` column.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Run on whatever device the model already lives on; picking a device
    # independently of the model leads to a device-mismatch error when the
    # model is on CPU but CUDA is available (or vice versa).
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    reviews = df_input["review"].astype(str).tolist()
    inv_map = {1: "positive", 0: "negative", 2: "neutral"}

    preds, confidences = [], []

    model.eval()
    with torch.no_grad():
        for start in tqdm(range(0, len(reviews), batch_size)):
            batch = reviews[start:start + batch_size]
            enc = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(device)

            out = model(**enc)
            # Cast to float32 first so half-precision logits convert to NumPy cleanly.
            logits = out.logits.float().cpu().numpy()

            # Numerically stable softmax: subtracting the row maximum keeps
            # exp() from overflowing on large logits without changing the result.
            shifted = logits - logits.max(axis=1, keepdims=True)
            exp_shifted = np.exp(shifted)
            softmax = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

            batch_preds = logits.argmax(axis=1)
            preds.extend(batch_preds.tolist())
            confidences.extend(softmax[np.arange(len(batch_preds)), batch_preds].tolist())

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    df_out = df_input.copy()
    df_out["predicted"] = [inv_map[p] for p in preds]
    df_out["confidence"] = confidences

    stats = df_out["predicted"].value_counts(normalize=True).mul(100).round(2).to_dict()

    return df_out, stats


In [9]:
def _uploaded_path(upload, role):
    """Return the filesystem path of an uploaded file, or raise if none was given."""
    if upload is None:
        raise ValueError(f"No {role} file was uploaded.")
    # The upload may be an object exposing the path as `.name` or the path
    # string itself, depending on the Gradio version -- accept both.
    return getattr(upload, "name", upload)


def web_app(training_file, input_file):
    """Fine-tune BERT on the training CSV, then label every row of the input CSV.

    Args:
        training_file: Uploaded CSV with ``review`` and ``sentiment`` columns.
            ``sentiment`` must be one of positive / negative / neutral
            (case and surrounding whitespace are ignored).
        input_file: Uploaded CSV with a ``review`` column to classify.

    Returns:
        tuple: ``(stats_text, out_path)`` where ``stats_text`` has one
        ``label: percent%`` line per predicted label and ``out_path`` is the
        path of a CSV holding the input rows plus the prediction columns
        produced by ``batch_analyze``.

    Raises:
        ValueError: If a file is missing, a required column is absent, the
            training data has no usable rows, or a sentiment value is not
            recognised.
    """
    import os
    import tempfile

    # STEP A: Load training dataset
    df_train = pd.read_csv(_uploaded_path(training_file, "training"))

    missing_cols = {"review", "sentiment"} - set(df_train.columns)
    if missing_cols:
        raise ValueError(f"Training CSV is missing column(s): {sorted(missing_cols)}")

    # Rows without text or label cannot be used for training. Resetting the
    # index keeps a stale pandas index from being carried into the HF Dataset.
    df_train = df_train.dropna(subset=["review", "sentiment"]).reset_index(drop=True)
    if df_train.empty:
        raise ValueError("Training CSV contains no rows with both review and sentiment.")
    df_train["review"] = df_train["review"].astype(str)

    # Convert sentiment labels; normalise case/whitespace so "Positive " still maps.
    label_map = {"positive": 1, "negative": 0, "neutral": 2}
    normalized = df_train["sentiment"].astype(str).str.strip().str.lower()
    labels = normalized.map(label_map)
    if labels.isna().any():
        # Fail early with a clear message instead of passing NaN labels to the trainer.
        unknown = sorted(df_train.loc[labels.isna(), "sentiment"].astype(str).unique())
        raise ValueError(
            f"Unrecognised sentiment value(s) {unknown}; expected one of {sorted(label_map)}"
        )
    df_train["labels"] = labels.astype(int)

    # Convert to HF Dataset
    dataset = Dataset.from_pandas(df_train)

    # Tokenize training data
    tokenized = dataset.map(tokenize_fn, batched=True)
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # STEP B: Train the model
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

    training_args = TrainingArguments(
        output_dir="./ui_sentiment_model",
        per_device_train_batch_size=8,
        num_train_epochs=5,
        save_strategy="no",
        eval_strategy="no",
        # The string "none" is the documented way to turn off all logging
        # integrations; Python None does not express that.
        report_to="none",
    )

    trainer = Trainer(model=model, args=training_args, train_dataset=tokenized)
    trainer.train()

    # STEP C: Load input dataset
    df_input = pd.read_csv(_uploaded_path(input_file, "input"))
    if "review" not in df_input.columns:
        raise ValueError("Input CSV is missing the 'review' column.")

    # STEP D: Analyze input dataset
    df_out, stats = batch_analyze(df_input, model)

    # Convert stats dict -> text
    stats_text = "\n".join([f"{k}: {v}%" for k, v in stats.items()])

    # Save output CSV in a fresh per-request directory so that simultaneous
    # users of the app cannot overwrite each other's results.
    out_dir = tempfile.mkdtemp(prefix="sentiment_predictions_")
    out_path = os.path.join(out_dir, "predictions.csv")
    df_out.to_csv(out_path, index=False)

    return stats_text, out_path


In [10]:
# Gradio front end: two CSV uploads in, a statistics text box and a
# downloadable predictions CSV out. `web_app` does the actual work.
ui = gr.Interface(
    fn=web_app,
    inputs=[
        gr.File(label="📥 Upload TRAINING dataset (CSV with review + sentiment)"),
        gr.File(label="📥 Upload INPUT dataset (CSV with review only)")
    ],
    outputs=[
        gr.Textbox(label="📊 Sentiment Statistics"),
        gr.File(label="📄 Download Predictions CSV")
    ],
    title="Sentiment Analysis — FULL Training + Batch Prediction",
    description="Upload training & input CSV. The model will train and analyze automatically."
)

# NOTE(review): share=True publishes a public URL, so anyone with the link can
# upload files and trigger a full training run on this machine -- confirm that
# is intended before sharing the link.
ui.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2b3f8dfc744074b00f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


