## Imports

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import utils

## NewDataset class

In [24]:
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

## Define the NewsClassifier class

In [25]:
class NewsClassifier(nn.Module):
    def __init__(self, n_classes):
        super(NewsClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(outputs.pooler_output)
        return self.out(output)


## Load and preprocess the data

In [26]:
data_df = utils.load_data_from_directories(articles_dir='./data/news', summaries_dir='./data/summaries')
label_encoder = LabelEncoder()
data_df['category_encoded'] = label_encoder.fit_transform(data_df['category'])

train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)
train_texts = train_df['content'].tolist()
test_texts = test_df['content'].tolist()
train_labels = train_df['category_encoded'].tolist()
test_labels = test_df['category_encoded'].tolist()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

train_dataset = NewsDataset(train_encodings, train_labels)
test_dataset = NewsDataset(test_encodings, test_labels)


## Train the model

In [27]:
def train_model(model, train_dataset, num_epochs=3, learning_rate=2e-5, batch_size=8):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct_predictions = 0
        with tqdm(total=len(train_loader), desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch') as pbar:
            for batch in train_loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()

                _, preds = torch.max(outputs, dim=1)
                correct_predictions += torch.sum(preds == labels)

                pbar.set_postfix(loss=epoch_loss/len(train_loader), accuracy=correct_predictions.double()/len(train_loader.dataset))
                pbar.update(1)

    torch.save(model.state_dict(), 'saved_models/bert_model.pt')
    print('Model saved to saved_models/bert_model.pt')


## Evaluate the model

In [28]:
def evaluate_model(model, test_dataset):
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
    model.eval()
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            all_preds.extend(preds)
            all_labels.extend(labels)

    accuracy = correct_predictions.double() / len(test_loader.dataset)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')


## Initialize and train the model

In [29]:
num_classes = len(label_encoder.classes_)
model = NewsClassifier(n_classes=num_classes)
train_model(model, train_dataset, num_epochs=3, learning_rate=2e-5, batch_size=8)

Epoch 1/3: 100%|██████████| 445/445 [35:51<00:00,  4.84s/batch, accuracy=tensor(0.9236, dtype=torch.float64), loss=0.281] 
Epoch 2/3: 100%|██████████| 445/445 [33:39<00:00,  4.54s/batch, accuracy=tensor(0.9857, dtype=torch.float64), loss=0.0563] 
Epoch 3/3: 100%|██████████| 445/445 [32:55<00:00,  4.44s/batch, accuracy=tensor(0.9944, dtype=torch.float64), loss=0.0207] 


Model saved to saved_models/news_classifier_model.pt


## Evaluate the model

In [30]:
evaluate_model(model, test_dataset)

Accuracy: 98.09%
Precision: 0.9815
Recall: 0.9809
F1 Score: 0.9809
