In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import GPT2Tokenizer, GPT2Model
import pandas as pd


In [3]:
file="/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json"

In [4]:
import json

def process_file(file_path):
  news = []
  with open(file_path, 'r') as file:
          for line in file:
              json_object = json.loads(line)
              news.append(json_object)
  return news

news = process_file(file)
print(len(news))

209527


In [5]:
import random
# Take 10% subset
subset_size = int(0.1 * len(news))  # Calculate 10% of the data
random.shuffle(news)  # Shuffle the data
subset = news[:subset_size]  # Take the first `subset_size` entries

# Output
print(f"Total news objects: {len(news)}")
print(f"Subset size: {len(subset)}")

Total news objects: 209527
Subset size: 20952


In [6]:
 

text_cat_pairs = []

for news_item in subset:
    headline = news_item.get("headline")
    short_description = news_item.get("short_description")
    text = headline + " || " + short_description
    category = news_item.get("category")
    text_cat_pairs.append((text, category))

text_cat_pairs[5]
 

('Steve Scalise Readmitted To ICU In Serious Condition Over Concerns Of Infection || The GOP whip had been previously upgraded to "fair condition" and moved out of the ICU.',
 'POLITICS')

In [7]:
label_to_index = {}
index_to_label = {}
i = 0

# Assuming text_cat_pairs is a list of labels or a list of (text, label) pairs
for text, label in text_cat_pairs:
    if label not in label_to_index:
        label_to_index[label] = i
        index_to_label[i] = label
        i += 1

# Outputs
print("Label to Index Mapping:", label_to_index)
print("Index to Label Mapping:", index_to_label)


Label to Index Mapping: {'WELLNESS': 0, 'PARENTING': 1, 'STYLE & BEAUTY': 2, 'TRAVEL': 3, 'HEALTHY LIVING': 4, 'POLITICS': 5, 'BUSINESS': 6, 'ENTERTAINMENT': 7, 'STYLE': 8, 'SPORTS': 9, 'THE WORLDPOST': 10, 'QUEER VOICES': 11, 'WOMEN': 12, 'WORLD NEWS': 13, 'WEIRD NEWS': 14, 'WEDDINGS': 15, 'FOOD & DRINK': 16, 'ENVIRONMENT': 17, 'HOME & LIVING': 18, 'ARTS': 19, 'ARTS & CULTURE': 20, 'IMPACT': 21, 'GREEN': 22, 'PARENTS': 23, 'FIFTY': 24, 'DIVORCE': 25, 'MEDIA': 26, 'COLLEGE': 27, 'U.S. NEWS': 28, 'COMEDY': 29, 'CRIME': 30, 'TASTE': 31, 'GOOD NEWS': 32, 'BLACK VOICES': 33, 'WORLDPOST': 34, 'TECH': 35, 'EDUCATION': 36, 'RELIGION': 37, 'LATINO VOICES': 38, 'MONEY': 39, 'SCIENCE': 40, 'CULTURE & ARTS': 41}
Index to Label Mapping: {0: 'WELLNESS', 1: 'PARENTING', 2: 'STYLE & BEAUTY', 3: 'TRAVEL', 4: 'HEALTHY LIVING', 5: 'POLITICS', 6: 'BUSINESS', 7: 'ENTERTAINMENT', 8: 'STYLE', 9: 'SPORTS', 10: 'THE WORLDPOST', 11: 'QUEER VOICES', 12: 'WOMEN', 13: 'WORLD NEWS', 14: 'WEIRD NEWS', 15: 'WEDDINGS

In [8]:
  index_to_label

{0: 'WELLNESS',
 1: 'PARENTING',
 2: 'STYLE & BEAUTY',
 3: 'TRAVEL',
 4: 'HEALTHY LIVING',
 5: 'POLITICS',
 6: 'BUSINESS',
 7: 'ENTERTAINMENT',
 8: 'STYLE',
 9: 'SPORTS',
 10: 'THE WORLDPOST',
 11: 'QUEER VOICES',
 12: 'WOMEN',
 13: 'WORLD NEWS',
 14: 'WEIRD NEWS',
 15: 'WEDDINGS',
 16: 'FOOD & DRINK',
 17: 'ENVIRONMENT',
 18: 'HOME & LIVING',
 19: 'ARTS',
 20: 'ARTS & CULTURE',
 21: 'IMPACT',
 22: 'GREEN',
 23: 'PARENTS',
 24: 'FIFTY',
 25: 'DIVORCE',
 26: 'MEDIA',
 27: 'COLLEGE',
 28: 'U.S. NEWS',
 29: 'COMEDY',
 30: 'CRIME',
 31: 'TASTE',
 32: 'GOOD NEWS',
 33: 'BLACK VOICES',
 34: 'WORLDPOST',
 35: 'TECH',
 36: 'EDUCATION',
 37: 'RELIGION',
 38: 'LATINO VOICES',
 39: 'MONEY',
 40: 'SCIENCE',
 41: 'CULTURE & ARTS'}

In [9]:
import torch

def convert_labels(label):
  return torch.tensor(label_to_index[label])


labels = [cat for (text, cat) in text_cat_pairs]
print(labels[5])
print(convert_labels(labels[5]))

POLITICS
tensor(5)


In [10]:
# Generate the labels
labels = [convert_labels(label) for _, label in text_cat_pairs]
stacked_tensors_y = torch.tensor(labels)

In [11]:
stacked_tensors_y = stacked_tensors_y.long()

In [12]:
from transformers import GPT2Tokenizer, GPT2Model

# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
embedding_model = GPT2Model.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [13]:
import torch

def embed_sentence_with_padding(sentence, max_length=128, device=None):
     

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer(
        sentence,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length
    ).to(device)  # Move inputs to the device

    with torch.no_grad():
        outputs = embedding_model(**inputs)  # Move model and outputs to the device
        embeddings = outputs.last_hidden_state.to(device)
    return embeddings.view(embeddings.size(1), -1)

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)


GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [15]:
def generate_embeddings_in_batches(text_cat_pairs, batch_size=100):
    all_embeddings = []
    for i in range(0, len(text_cat_pairs), batch_size):
        batch = text_cat_pairs[i:i + batch_size]
        batch_embeddings = [embed_sentence_with_padding(text,max_length=128) for text, _ in batch]
        all_embeddings.extend(batch_embeddings)
    return all_embeddings

In [17]:
all_embeddings = generate_embeddings_in_batches(text_cat_pairs)
 
torch.save(all_embeddings, 'all_embeddings.pt')
#all_embeddings = torch.load('all_embeddings.pt')

In [18]:
 len(all_embeddings)

20952

In [19]:
batch_size = 100  # You can try to keep your original batch size 
stacked_tensors_x = []

for i in range(0, len(all_embeddings), batch_size):
    batch = all_embeddings[i:i + batch_size]
    
    # Move the batch to CPU before stacking
    batch = [tensor.cpu() for tensor in batch] 
    
    torch.cuda.empty_cache()  
    stacked_batch = torch.stack(batch)
    stacked_tensors_x.append(stacked_batch)
    del batch, stacked_batch  # Delete to free up memory

print(len(stacked_tensors_x))


210


In [20]:
stacked_tensors_x = torch.stack([embedding.cpu() for embedding in all_embeddings])
print(stacked_tensors_x.shape)

torch.Size([20952, 128, 768])


In [21]:
import torch
from torch.utils.data import random_split

# Assuming you have stacked_tensors_x and stacked_tensors_y

# Define the dataset
dataset = torch.utils.data.TensorDataset(stacked_tensors_x, stacked_tensors_y)

# Define the split ratios (e.g., 80% train, 10% validation, 10% test)
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Calculate dataset sizes
dataset_size = len(dataset)
train_size = int(train_ratio * dataset_size)
val_size = int(val_ratio * dataset_size)
test_size = dataset_size - train_size - val_size

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 16761
Validation dataset size: 2095
Test dataset size: 2096


In [22]:
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 2 for bidirection

    def forward(self, x):  # Remove hidden
        # Create hidden state inside forward method
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * 2, batch_size, self.hidden_size).to(x.device)

        # Forward propagate LSTM (using provided or initialized hidden)
        out, _ = self.lstm(x, (h0, c0))  # Update hidden

        # Decode the hidden state of the last time step
        last_hidden = self.fc(out[:, -1, :])
        first_hidden = self.fc(out[:, 0, :])
        output = last_hidden + first_hidden

        return output  # Return output

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

 

# Hyperparameters
input_size = stacked_tensors_x.shape[2]  # embedding_size for GPT2 embeddings
hidden_size = 128
num_layers = 2
num_classes = len(label_to_index)
learning_rate = 0.001
num_epochs = 20
batch_size = 128  # Adjust as needed

# Create the model
model = BiLSTM(input_size, hidden_size, num_layers, num_classes)

# Move the model to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

   

In [24]:
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs):
    model.to(device)

    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        for batch_idx, (data, targets) in enumerate(train_loader):
            data = data.to(device)
            targets = targets.to(device)

            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Calculate training accuracy and loss
            _, predicted = torch.max(scores.data, 1)
            train_total += targets.size(0)
            train_correct += (predicted == targets).sum().item()
            train_loss += loss.item()

        # Calculate training accuracy for the epoch
        train_accuracy = 100 * train_correct / train_total

        # Validation Phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for data, targets in val_loader:
                data = data.to(device)
                targets = targets.to(device)

                # Forward pass
                scores = model(data)
                loss = criterion(scores, targets)

                # Calculate validation accuracy and loss
                _, predicted = torch.max(scores.data, 1)
                val_total += targets.size(0)
                val_correct += (predicted == targets).sum().item()
                val_loss += loss.item()

        # Calculate validation accuracy for the epoch
        val_accuracy = 100 * val_correct / val_total

        # Print epoch statistics
        print(f"Epoch [{epoch+1}/{num_epochs}] - "
              f"Train Loss: {train_loss / len(train_loader):.4f}, "
              f"Train Acc: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss / len(val_loader):.4f}, "
              f"Val Acc: {val_accuracy:.2f}%")

    print("Training finished!")


In [25]:
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)

Epoch [1/20] - Train Loss: 2.9440, Train Acc: 26.11%, Val Loss: 2.5565, Val Acc: 34.84%
Epoch [2/20] - Train Loss: 2.3163, Train Acc: 39.13%, Val Loss: 2.2238, Val Acc: 41.29%
Epoch [3/20] - Train Loss: 2.0418, Train Acc: 45.06%, Val Loss: 2.0240, Val Acc: 44.34%
Epoch [4/20] - Train Loss: 1.8345, Train Acc: 49.16%, Val Loss: 1.9231, Val Acc: 47.73%
Epoch [5/20] - Train Loss: 1.6649, Train Acc: 53.16%, Val Loss: 1.7579, Val Acc: 52.17%
Epoch [6/20] - Train Loss: 1.5107, Train Acc: 56.82%, Val Loss: 1.7362, Val Acc: 51.93%
Epoch [7/20] - Train Loss: 1.3835, Train Acc: 60.22%, Val Loss: 1.6391, Val Acc: 54.18%
Epoch [8/20] - Train Loss: 1.2656, Train Acc: 62.83%, Val Loss: 1.6159, Val Acc: 55.42%
Epoch [9/20] - Train Loss: 1.1365, Train Acc: 66.48%, Val Loss: 1.6674, Val Acc: 54.70%
Epoch [10/20] - Train Loss: 1.0360, Train Acc: 69.20%, Val Loss: 1.6293, Val Acc: 55.75%
Epoch [11/20] - Train Loss: 0.9111, Train Acc: 72.95%, Val Loss: 1.6921, Val Acc: 54.46%
Epoch [12/20] - Train Loss: 0.

In [26]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
def calculate_metrics(model, loader, device):
    model.eval()
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for data, targets in loader:
            data = data.to(device)
            targets = targets.to(device)

            # Forward pass
            scores = model(data)
            _, predicted = torch.max(scores, 1)

            # Collect predictions and true labels
            all_preds.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(all_targets, all_preds, average="weighted")
    accuracy = accuracy_score(all_targets, all_preds) * 100

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy
    }


In [27]:
train_metrics = calculate_metrics(model, train_loader, device)
val_metrics = calculate_metrics(model, val_loader, device)

print("\nEvaluation Metrics:")
print(f"Train Precision: {train_metrics['precision']:.4f}")
print(f"Train Recall: {train_metrics['recall']:.4f}")
print(f"Train F1-Score: {train_metrics['f1']:.4f}")
print(f"Train Accuracy: {train_metrics['accuracy']:.2f}%")

print(f"Val Precision: {val_metrics['precision']:.4f}")
print(f"Val Recall: {val_metrics['recall']:.4f}")
print(f"Val F1-Score: {val_metrics['f1']:.4f}")
print(f"Val Accuracy: {val_metrics['accuracy']:.2f}%")



Evaluation Metrics:
Train Precision: 0.9827
Train Recall: 0.9824
Train F1-Score: 0.9824
Train Accuracy: 98.24%
Val Precision: 0.5207
Val Recall: 0.5251
Val F1-Score: 0.5173
Val Accuracy: 52.51%


In [28]:
def test(model, test_loader, criterion, device):
    model.eval()  # Set model to evaluation mode
    test_loss = 0
    test_correct = 0
    test_total = 0

    with torch.no_grad():
        for data, targets in test_loader:
            data = data.to(device)
            targets = targets.to(device)

            # Forward pass
            scores = model(data)
            loss = criterion(scores, targets)

            # Calculate test accuracy and loss
            _, predicted = torch.max(scores.data, 1)
            test_total += targets.size(0)
            test_correct += (predicted == targets).sum().item()
            test_loss += loss.item()

    # Calculate test accuracy
    test_accuracy = 100 * test_correct / test_total

    print(f"Test Loss: {test_loss / len(test_loader):.4f}, "
          f"Test Accuracy: {test_accuracy:.2f}%")

    return test_loss / len(test_loader), test_accuracy


In [29]:
 
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [30]:
test_loss, test_accuracy = test(model, test_loader, criterion, device)

Test Loss: 2.3980, Test Accuracy: 55.92%


In [98]:
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, num_heads=2, dropout=0.7):  # Increased dropout
        super(TransformerClassifier, self).__init__()
         
        self.transformer_encoder = torch.nn.MultiheadAttention(input_dim, num_heads,
                                                               dropout=dropout, bias=False,
                                                               kdim=input_dim, vdim=input_dim,
                                                               batch_first=True)
        
        # Layer normalization
        self.norm = nn.LayerNorm(input_dim)
        
        # Fully connected Head with dropout
        self.fc = nn.Linear(input_dim, num_classes)
        self.dropout = nn.Dropout(dropout)  # Added dropout layer

    def forward(self, x):
         
        # Transformer Encoder
        output, _ = self.transformer_encoder(x, x, x, need_weights=False)
        # apply layer normalization
        output = self.norm(output)
        # take the average over all attention (hidden) states
        output = torch.mean(output, dim=1)
        # Apply dropout before the fully connected layer
        output = self.dropout(output)  
        # Fully Connected Layer for Classification
        output = self.fc(output)

        return output

In [32]:
device = torch.device('cuda')

In [96]:
model = TransformerClassifier(input_size, num_classes).to(device)

In [55]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4,weight_decay=0.04 )
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=50)

Epoch [1/50] - Train Loss: 3.3540, Train Acc: 16.97%, Val Loss: 4.1332, Val Acc: 19.81%
Epoch [2/50] - Train Loss: 3.0750, Train Acc: 23.70%, Val Loss: 3.2804, Val Acc: 30.21%
Epoch [3/50] - Train Loss: 2.7273, Train Acc: 32.35%, Val Loss: 2.9430, Val Acc: 35.18%
Epoch [4/50] - Train Loss: 2.3978, Train Acc: 39.61%, Val Loss: 2.4687, Val Acc: 41.00%
Epoch [5/50] - Train Loss: 2.0960, Train Acc: 45.18%, Val Loss: 2.1649, Val Acc: 47.35%
Epoch [6/50] - Train Loss: 1.8501, Train Acc: 50.87%, Val Loss: 1.9937, Val Acc: 50.41%
Epoch [7/50] - Train Loss: 1.6888, Train Acc: 54.13%, Val Loss: 1.9570, Val Acc: 50.55%
Epoch [8/50] - Train Loss: 1.5822, Train Acc: 56.11%, Val Loss: 1.7808, Val Acc: 53.94%
Epoch [9/50] - Train Loss: 1.5002, Train Acc: 58.16%, Val Loss: 1.7835, Val Acc: 54.08%
Epoch [10/50] - Train Loss: 1.4369, Train Acc: 59.44%, Val Loss: 1.7158, Val Acc: 55.80%
Epoch [11/50] - Train Loss: 1.3844, Train Acc: 60.61%, Val Loss: 1.6811, Val Acc: 56.09%
Epoch [12/50] - Train Loss: 1.

In [56]:
train_metrics = calculate_metrics(model, train_loader, device)
val_metrics = calculate_metrics(model, val_loader, device)

print("\nEvaluation Metrics:")
print(f"Train Precision: {train_metrics['precision']:.4f}")
print(f"Train Recall: {train_metrics['recall']:.4f}")
print(f"Train F1-Score: {train_metrics['f1']:.4f}")
print(f"Train Accuracy: {train_metrics['accuracy']:.2f}%")

print(f"Val Precision: {val_metrics['precision']:.4f}")
print(f"Val Recall: {val_metrics['recall']:.4f}")
print(f"Val F1-Score: {val_metrics['f1']:.4f}")
print(f"Val Accuracy: {val_metrics['accuracy']:.2f}%")



Evaluation Metrics:
Train Precision: 0.7956
Train Recall: 0.7935
Train F1-Score: 0.7903
Train Accuracy: 79.35%
Val Precision: 0.5566
Val Recall: 0.5690
Val F1-Score: 0.5548
Val Accuracy: 56.90%


In [57]:
test_loss, test_accuracy = test(model, test_loader, criterion, device)

Test Loss: 2.0631, Test Accuracy: 58.11%


In [130]:
import torch.nn.functional as F

def predict_user_input(model, user_input, tokenizer, embedding_model, device, class_labels):
    # Step 1: Embed the user input
    embeddings = embed_sentence_with_padding(user_input, tokenizer, embedding_model, device=device)
    
    # Step 2: Get the embedding shape
    batch_size, seq_len, embedding_dim = embeddings.size()
    
    # Debugging: Check the shape of the embeddings
    print(f"Embeddings shape: {embeddings.shape}")  # Should be (batch_size, 128, 768)
    
    # Step 3: Transpose the embedding tensor for compatibility with the model
    embeddings = embeddings.permute(0, 2, 1)  # (batch_size, embedding_dim, seq_len)
    
    # Now embeddings will have shape (1, 768, 128), which should match the model's expected input

    # Step 4: Pass the embeddings through the classifier model
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        output = model(embeddings)  # Output shape: (batch_size, num_classes)
    
    # Step 5: Apply softmax to get probabilities
    probabilities = F.softmax(output, dim=1)

    # Step 6: Get the predicted class index
    predicted_idx = torch.argmax(probabilities, dim=1).item()

    # Step 7: Map the predicted index to the class label
    predicted_class = class_labels[predicted_idx]

    return predicted_class, probabilities[0][predicted_idx].item()

# Example usage
user_input = "Apple unveiled its latest iPhone with advanced camera features."
predicted_class, confidence = predict_user_input(model, user_input, tokenizer, embedding_model, device, class_labels)
print(f"Predicted class: {predicted_class} with confidence: {confidence}")


Embeddings shape: torch.Size([1, 128, 768])
Predicted class: ARTS & CULTURE with confidence: 0.03960195556282997
