In [7]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Load the data
df = pd.read_csv('/kaggle/input/news-dataset/final_data.csv')
df.head()

Unnamed: 0,category,text
0,CRIME,I Won't Be Coming Home for Christmas: The Chri...
1,CRIME,Gang Used Drone Swarm To Thwart FBI Hostage Ra...
2,CRIME,Bystander Opens Fire On Suspected Home Depot S...
3,CRIME,'Very Disorderly' Waldo Makes It Easy For Poli...
4,CRIME,3 Seriously Injured In Grand Central Station S...


In [9]:
encoder = LabelEncoder()
df['categoryEncoded'] = encoder.fit_transform(df['category'])

# df['headline'] = df['headline'].apply(lambda headline: str(headline).lower())
# df['short_description'] = df['short_description'].apply(lambda descr: str(descr).lower())
# df['descr_len'] = df['short_description'].apply(lambda x: len(str(x).split()))
# df['headline_len'] = df['headline'].apply(lambda x: len(str(x).split()))

In [None]:
# Split the data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['categoryEncoded'], 
    random_state=2020, test_size=0.15
)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Tokenization function
def tokenize_texts(texts, tokenizer, max_length=80):
    return tokenizer(
        texts.tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

In [None]:
train_encodings = tokenize_texts(train_texts.astype('str'), tokenizer, max_length=80)
test_encodings = tokenize_texts(test_texts.astype('str'), tokenizer, max_length=80)

# 5. Create a custom Dataset class
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.values  
        
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx] , dtype=torch.long)
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Initialize the model (BERT for sequence classification with 11 classes)
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=11)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [14]:
print(device)

cuda


In [None]:
from tqdm import tqdm

# Training loop
for epoch in range(10):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{10}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        optimizer.zero_grad()
        outputs = model(**batch)  # the model returns loss and logits
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{10}, Loss: {avg_loss:.4f}")

Epoch 1/10: 100%|██████████| 1183/1183 [15:10<00:00,  1.30it/s, loss=0.597]


Epoch 1/10, Loss: 0.6786


Epoch 2/10: 100%|██████████| 1183/1183 [15:12<00:00,  1.30it/s, loss=0.491] 


Epoch 2/10, Loss: 0.2756


Epoch 3/10: 100%|██████████| 1183/1183 [15:14<00:00,  1.29it/s, loss=0.24]   


Epoch 3/10, Loss: 0.1258


Epoch 4/10: 100%|██████████| 1183/1183 [15:16<00:00,  1.29it/s, loss=0.344]  


Epoch 4/10, Loss: 0.0747


Epoch 5/10: 100%|██████████| 1183/1183 [15:17<00:00,  1.29it/s, loss=0.00531]


Epoch 5/10, Loss: 0.0548


Epoch 6/10: 100%|██████████| 1183/1183 [15:17<00:00,  1.29it/s, loss=0.00388]


Epoch 6/10, Loss: 0.0442


Epoch 7/10: 100%|██████████| 1183/1183 [15:17<00:00,  1.29it/s, loss=0.00889] 


Epoch 7/10, Loss: 0.0398


Epoch 8/10: 100%|██████████| 1183/1183 [15:17<00:00,  1.29it/s, loss=0.00251] 


Epoch 8/10, Loss: 0.0274


Epoch 9/10: 100%|██████████| 1183/1183 [15:17<00:00,  1.29it/s, loss=0.267]  


Epoch 9/10, Loss: 0.0357


Epoch 10/10: 100%|██████████| 1183/1183 [15:17<00:00,  1.29it/s, loss=0.00121]

Epoch 10/10, Loss: 0.0743





In [None]:
model.eval()
all_preds = []
all_labels = []

# Test DataLoader
for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    preds = torch.argmax(logits, dim=-1)
    
    all_preds.extend(preds.cpu().numpy())
    all_labels.extend(batch['labels'].cpu().numpy())

# Map integer predictions back to category names
predicted_categories = [encoder.classes_[i] for i in all_preds]
true_categories = [encoder.classes_[i] for i in all_labels]

# Create a results DataFrame
result_df = pd.DataFrame({
    'description': test_texts.tolist(),  # test_texts is the original text Series from train_test_split
    'true_category': true_categories,
    'predicted_category': predicted_categories
})

print(result_df.head())
print(f"Accuracy is {accuracy_score(true_categories, predicted_categories):.4f}")

                                         description  true_category  \
0  Stop Worrying: 2 Tools That Work \n Worrying i...         HEALTH   
1  get scared people say shes careerbest taapsee ...  ENTERTAINMENT   
2  Lob Haircuts Are Having A Moment In This Week'...      LIFESTYLE   
3  The Future of Venture Capital (Part 2) \n I wa...        SCIENCE   
4  Steve Martin Says ‘Only Murders In The Buildin...  ENTERTAINMENT   

  predicted_category  
0             HEALTH  
1      ENTERTAINMENT  
2          LIFESTYLE  
3            FINANCE  
4      ENTERTAINMENT  
Accuracy is 0.8374


In [None]:
import os
from shutil import make_archive
from IPython.display import FileLink

output_dir = "/kaggle/working/saved_model"

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Zip the model directory
zip_path = "/kaggle/working/saved_model_zip"
make_archive(zip_path, 'zip', output_dir)

FileLink(zip_path + ".zip")

In [None]:
import torch.nn.functional as F

def predict_category(text, model, tokenizer, device, max_length=80 , threshold=0.5 , common_class="COMMON"):
    """
    Given a text input, predicts the news category.
    
    Args:
        text (str): The input text (news description) [if heading available pass heading + \n + content].
        model (torch.nn.Module): The trained classification model.
        tokenizer: Hugging Face tokenizer corresponding to the model.
        device: Device on which model runs (cpu or cuda).
        label_encoder: Fitted LabelEncoder to convert class indices to category names.
        max_length (int): Maximum token length for model inputs.
        
    Returns:
        predicted_category (str): The predicted news category.
    """
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

    # Get model output
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get logits and convert to probabilities
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=-1)

    # Get highest probability and corresponding label
    max_prob, predicted_class = torch.max(probabilities, dim=-1)
    
    # Decode predicted class
    predicted_class = predicted_class.item()
    confidence = max_prob.item()

    # Load the category labels (modify this based on how labels are stored)
    category_labels = ["POLITICS", "SPORTS", "ENTERTAINMENT", "FINANCE", "HEALTH", "SCIENCE", "WORLD" , "LIFESTYLE" , "CRIME" , "ENVIRONMENT" , "EDUCATION"]  # Update with actual labels
    predicted_label = category_labels[predicted_class] if confidence >= threshold else common_class

    return predicted_label, confidence

text = "Stock markets are experiencing a major crash due to economic instability."
predicted_label, confidence = predict_category(model=model, tokenizer=tokenizer, text=text ,device=device)
print(f"Predicted Category: {predicted_label}, Confidence: {confidence}")

Predicted Category: HEALTH, Confidence: 0.9987987279891968


<table align="left">
  <td>
    <a href="https://colab.research.google.com/github/nyandwi/machine_learning_complete/blob/main/9_nlp_with_tensorflow/5_using_pretrained_bert_for_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  </td>
</table>

In [24]:
result_df

Unnamed: 0,description,true_category,predicted_category
0,Stop Worrying: 2 Tools That Work \n Worrying i...,HEALTH,HEALTH
1,get scared people say shes careerbest taapsee ...,ENTERTAINMENT,ENTERTAINMENT
2,Lob Haircuts Are Having A Moment In This Week'...,LIFESTYLE,LIFESTYLE
3,The Future of Venture Capital (Part 2) \n I wa...,SCIENCE,FINANCE
4,Steve Martin Says ‘Only Murders In The Buildin...,ENTERTAINMENT,ENTERTAINMENT
...,...,...,...
6673,Catalan Parliament Declares Independence From ...,WORLD,WORLD
6674,Alden Ehrenreich Cast As The New Han Solo In '...,ENTERTAINMENT,ENTERTAINMENT
6675,Republicans Happy To Have Trump Distract The P...,POLITICS,POLITICS
6676,Brainstorming Middle School \n If we want our ...,EDUCATION,EDUCATION
