In [92]:
!pip install transformers datasets accelerate evaluate pyarrow torch --quiet

In [93]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

dataset = load_dataset("ag_news")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [94]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [95]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(5000))
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


In [96]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
from transformers import TrainingArguments, Trainer
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir='./news-category-model',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    report_to='none'  # disable reporting to wandb
)

In [98]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [99]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3458,0.366692,0.892
2,0.2693,0.36205,0.904


TrainOutput(global_step=1250, training_loss=0.3487506675720215, metrics={'train_runtime': 189.7747, 'train_samples_per_second': 52.694, 'train_steps_per_second': 6.587, 'total_flos': 331180308480000.0, 'train_loss': 0.3487506675720215, 'epoch': 2.0})

In [100]:
trainer.evaluate()


{'eval_loss': 0.3620496690273285,
 'eval_accuracy': 0.904,
 'eval_runtime': 3.6609,
 'eval_samples_per_second': 273.156,
 'eval_steps_per_second': 34.145,
 'epoch': 2.0}

In [101]:
import torch

text = "The stock market saw a major rise today due to strong economic reports."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)


device = next(model.parameters()).device
inputs = {k: v.to(device) for k, v in inputs.items()}


with torch.no_grad():
    outputs = model(**inputs)

predicted_class = outputs.logits.argmax(dim=-1).item()
labels = ["World", "Sports", "Business", "Sci/Tech"]

print("Predicted Category:", labels[predicted_class])


Predicted Category: Business


In [113]:
from huggingface_hub import HfApi


api = HfApi(token=my_write_token)
try:
  user_info = api.whoami()
  actual_username = user_info['name']
except Exception as e:
  print(f"Could not retrieve username. Please ensure your token is valid and you have internet access. Error: {e}")
  actual_username = "YOUR_HF_USERNAME"


repo_name = f"{actual_username}/news-category-classifier"
print(f"Uploading to: {repo_name}")


trainer.push_to_hub(repo_name, token=my_write_token)
tokenizer.push_to_hub(repo_name, token=my_write_token)

print("Upload successful!")

Uploading to: naitikrishu/news-category-classifier


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...y-model/training_args.bin: 100%|##########| 5.78kB / 5.78kB            

  ...y-model/model.safetensors:  16%|#5        | 41.9MB /  268MB            

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Upload successful!


In [114]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="naitikrishu/news-category-model")

Device set to use cuda:0


**Storing your API Key Securely in Colab Secrets**

1.  **Open the Secrets tab:** In the left sidebar of your Colab notebook, click on the "🔑 Secrets" tab.
2.  **Add a new secret:** Click on "Add new secret".
3.  **Name the secret:** In the "Name" field, enter `HF_API_KEY`.
4.  **Enter the value:** In the "Value" field, paste your Hugging Face API key with write permissions.
5.  **Save:** Click "Save secret".

Now you can access this secret in your notebook code without exposing the key itself.

In [111]:
from google.colab import userdata

# Access the API key from Colab Secrets
my_write_token = userdata.get('HF_API_KEY')



Now you can use the `my_write_token` variable in the code cells that require your Hugging Face token, like when pushing to the hub or using the `HfApi`.

In [109]:
text1 = "The Warriors won the championship game last night."
text2 = "Apple just announced their new iPhone and a new M-series chip."
text3 = "The federal reserve is expected to raise interest rates next quarter."

print(pipe(text1))
print(pipe(text2))
print(pipe(text3))

[{'label': 'LABEL_1', 'score': 0.9934438467025757}]
[{'label': 'LABEL_3', 'score': 0.9860444068908691}]
[{'label': 'LABEL_2', 'score': 0.9818921685218811}]
