# Import required libraries
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import Dataset
import gradio as gr

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 1: Load and prepare the dataset
def load_and_prepare_data():
    # Load AG News dataset from Hugging Face
    dataset = load_dataset("ag_news")

    # Get class names
    class_names = dataset['train'].features['label'].names

    # Split the dataset
    train_data = dataset['train']
    test_data = dataset['test']

    # For demonstration, we'll use a smaller subset
    train_data = train_data.select(range(10000))
    test_data = test_data.select(range(2000))

    return train_data, test_data, class_names

# Step 2: Tokenization
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def tokenize_data(train_data, test_data):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the text
    train_texts = train_data['text']
    test_texts = test_data['text']

    train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)


    # Create datasets
    train_dataset = NewsDataset(train_encodings, train_data['label'])
    test_dataset = NewsDataset(test_encodings, test_data['label'])

    return train_dataset, test_dataset, tokenizer

# Step 3: Model Training
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

def train_model(train_dataset, test_dataset, class_names):
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(class_names)
    ).to(device)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    return model

# Step 4: Evaluation
def evaluate_model(model, test_dataset):
    trainer = Trainer(model=model)
    predictions = trainer.predict(test_dataset)
    metrics = compute_metrics(predictions)
    return metrics

# Step 5: Deployment with Gradio
def create_gradio_interface(model, tokenizer, class_names):
    def classify_headline(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        top_prob, top_label = torch.max(probs, dim=1)

        return {
            "Predicted Category": class_names[top_label.item()],
            "Confidence": f"{top_prob.item():.2f}"
        }

    interface = gr.Interface(
        fn=classify_headline,
        inputs=gr.Textbox(label="News Headline", placeholder="Enter news headline here..."),
        outputs=gr.Label(label="Classification Result"),
        title="News Topic Classifier",
        description="Classify news headlines into categories using BERT",
        examples=[
            ["Apple announces new iPhone with revolutionary features"],
            ["Football team wins championship after dramatic final match"],
            ["Stock market reaches all-time high amid economic recovery"],
            ["Scientists discover new species in Amazon rainforest"]
        ]
    )

    return interface

# Main execution
def main():
    # Step 1: Load data
    train_data, test_data, class_names = load_and_prepare_data()
    print(f"Class names: {class_names}")
    print(f"Train samples: {len(train_data)}, Test samples: {len(test_data)}")

    # Step 2: Tokenize
    train_dataset, test_dataset, tokenizer = tokenize_data(train_data, test_data)

    # Step 3: Train model
    model = train_model(train_dataset, test_dataset, class_names)

    # Step 4: Evaluate
    metrics = evaluate_model(model, test_dataset)
    print(f"\nEvaluation Results:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")

    # Step 5: Deploy
    print("\nLaunching Gradio interface...")
    interface = create_gradio_interface(model, tokenizer, class_names)
    interface.launch()

if __name__ == "__main__":
    main()

In [1]:
pip install torch transformers  scikit-learn gradio



In [6]:
# Import required libraries
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import Dataset
import gradio as gr
from datetime import datetime

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Check if GPU is available and enable mixed precision
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Mixed precision training - Removed explicit autocast
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    # torch.cuda.amp.autocast(enabled=True) # Removed this line

# Step 1: Load and prepare the dataset (optimized)
def load_and_prepare_data():
    print(f"{datetime.now().strftime('%H:%M:%S')} - Loading dataset...")
    dataset = load_dataset("ag_news", streaming=False)  # Disable streaming for faster access

    class_names = dataset['train'].features['label'].names

    # Use smaller subsets for demonstration
    train_data = dataset['train'].select(range(8000))  # Reduced from 10000
    test_data = dataset['test'].select(range(1000))    # Reduced from 2000

    # Convert to lists
    train_texts = train_data['text']
    train_labels = train_data['label']
    test_texts = test_data['text']
    test_labels = test_data['label']

    print(f"{datetime.now().strftime('%H:%M:%S')} - Dataset loaded")
    return train_texts, train_labels, test_texts, test_labels, class_names

# Optimized Dataset class
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

def tokenize_data(train_texts, train_labels, test_texts, test_labels):
    print(f"{datetime.now().strftime('%H:%M:%S')} - Tokenizing data...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize in batches
    train_encodings = tokenizer(
        list(train_texts),
        truncation=True,
        padding='max_length',
        max_length=96,  # Reduced from 128
        return_tensors='pt'
    )

    test_encodings = tokenizer(
        list(test_texts),
        truncation=True,
        padding='max_length',
        max_length=96,
        return_tensors='pt'
    )

    print(f"{datetime.now().strftime('%H:%M:%S')} - Tokenization complete")
    return (
        NewsDataset(train_encodings, list(train_labels)),
        NewsDataset(test_encodings, list(test_labels)),
        tokenizer
    )

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

def train_model(train_dataset, test_dataset, class_names):
    print(f"{datetime.now().strftime('%H:%M:%S')} - Starting training...")

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(class_names)
    ).to(device)

    # Optimized training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=2,  # Reduced from 3
        per_device_train_batch_size=8,  # Reduced from 16
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=4,  # Simulates larger batch size
        warmup_steps=100,  # Reduced from 500
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=50,
        # eval_strategy="steps", # Temporarily removed
        # eval_steps=500,  # Temporarily removed
        # save_strategy="steps", # Temporarily removed
        # save_steps=500, # Temporarily removed
        # load_best_model_at_end=True, # Temporarily removed
        report_to="none",
        fp16=torch.cuda.is_available(),  # Enable mixed precision
        dataloader_num_workers=4 if torch.cuda.is_available() else 2,
        gradient_checkpointing=False  # Disabled gradient checkpointing
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    print(f"{datetime.now().strftime('%H:%M:%S')} - Training complete")
    return model

def create_gradio_interface(model, tokenizer, class_names):
    def classify_headline(text):
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding='max_length',
            max_length=96
        ).to(device)

        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        top_prob, top_label = torch.max(probs, dim=1)

        return {
            "Predicted Category": class_names[top_label.item()],
            "Confidence": f"{top_prob.item():.4f}"
        }

    return gr.Interface(
        fn=classify_headline,
        inputs=gr.Textbox(label="News Headline"),
        outputs=gr.Label(label="Classification Result"),
        title="News Topic Classifier",
        examples=[
            ["Apple announces new iPhone with revolutionary features"],
            ["Football team wins championship after dramatic final match"]
        ]
    )

def main():
    # Load data
    train_texts, train_labels, test_texts, test_labels, class_names = load_and_prepare_data()

    # Tokenize
    train_dataset, test_dataset, tokenizer = tokenize_data(
        train_texts, train_labels, test_texts, test_labels
    )

    # Train
    model = train_model(train_dataset, test_dataset, class_names)

    # Evaluate
    metrics = evaluate_model(model, test_dataset)
    print(f"\nEvaluation Results:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")

    # Deploy
    print("\nLaunching Gradio interface...")
    interface = create_gradio_interface(model, tokenizer, class_names)
    interface.launch()

if __name__ == "__main__":
    main()

Using device: cuda
11:48:29 - Loading dataset...
11:48:31 - Dataset loaded
11:48:31 - Tokenizing data...
11:48:39 - Tokenization complete
11:48:39 - Starting training...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),


Step,Training Loss,Validation Loss,Accuracy,F1
500,0.1775,0.22746,0.928,0.927906


  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention

11:51:31 - Training complete


  'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
  'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfaeezsalman939[0m ([33mfaeezsalman939-bahria-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



Evaluation Results:
Accuracy: 0.9280
F1 Score: 0.9279

Launching Gradio interface...
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ae7e887c5c07b789f8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [10]:
!pip install -U datasets

