In [13]:
## BERT Categories

import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import classification_report
import time

# Define a custom class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw inputs (e.g. a pandas Series). Every entry is
            converted with ``str()`` so non-string cells such as NaN or
            numbers can still be tokenized.
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=max_len, return_tensors='pt')``
            and returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors that carry a leading batch axis.
        max_len: Length every example is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Store plain lists so that __getitem__ indexes by POSITION.
        # DataLoader samplers hand out positions 0..len-1, whereas indexing a
        # pandas Series with an integer is a LABEL lookup, which breaks (or
        # silently returns the wrong row) when the frame was filtered,
        # shuffled or concatenated and no longer has a 0..n-1 index.
        self.texts = [str(text) for text in texts]
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch axis removed) and ``'labels'``
            (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_len == 1.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Load the training split (the path is specific to the author's machine)
training_file_path = r"C:\Users\Mohammad Hijjawi\Desktop\Data Science Project Code\Datasets for Classification Layer\training_dataset.csv"
df_train = pd.read_csv(training_file_path)

# Give every distinct infobox type an integer id, numbered in order of first
# appearance in the training file, and attach it as the 'label' column
label_mapping = {}
for infobox_type in df_train['infobox_type'].unique():
    label_mapping[infobox_type] = len(label_mapping)
df_train['label'] = df_train['infobox_type'].map(label_mapping)

# Pretrained BERT tokenizer plus a classification head sized to the label set
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_mapping),
)

# Wrap the category text and labels in a dataset and a shuffled batch loader
train_dataset = CustomDataset(
    df_train['categories_text'],
    df_train['label'],
    tokenizer,
    max_len=512,
)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# PyTorch's AdamW over all model parameters; the loss itself is computed by
# the model when labels are passed to its forward call
optimizer = AdamW(model.parameters(), lr=2e-5)

# Use the GPU when one is available, then switch the model to training mode
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

# Training loop
# Runs up to max_epochs passes over train_loader and stops early once the
# accuracy measured on the TRAINING batches reaches target_accuracy.
# NOTE(review): that accuracy is computed on training data while the model is
# in train mode, so it is not an estimate of held-out performance.
max_epochs = 10
target_accuracy = 0.95  # 95% target accuracy

# Start time for training
start_time_training = time.time()

for epoch in range(max_epochs):
    print(f"Epoch {epoch+1}/{max_epochs}")
    # Re-assert training mode each epoch so the loop stays correct even if an
    # evaluation step is ever added between epochs.
    model.train()
    total_loss = 0.0
    # Kept as a plain Python int: the previous version started from the int 0
    # and later called .double() on it, which only worked because a tensor had
    # been added in the meantime (and failed for an empty loader).
    correct_predictions = 0

    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        preds = torch.argmax(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()

    # max(..., 1) avoids ZeroDivisionError when the loader/dataset is empty
    avg_loss = total_loss / max(len(train_loader), 1)
    accuracy = correct_predictions / max(len(train_loader.dataset), 1)

    print(f"Training loss: {avg_loss:.4f}, accuracy: {accuracy:.4f}")

    # Check if the target accuracy has been reached
    if accuracy >= target_accuracy:
        print(f"Target accuracy of {target_accuracy * 100}% reached. Stopping training.")
        break


end_time_training = time.time()

# Total wall-clock time spent in the training loop
print(f"Total training time: {end_time_training - start_time_training:.2f} seconds")

# Save the trained model (optional)
torch.save(model.state_dict(), 'trained_model.pth')

# Evaluation: predict the test set in chunks, report the most frequently
# misclassified true labels and per-class metrics, and save the predictions.

# Set the model to evaluation mode
model.eval()


test_data_path = r"C:\Users\Mohammad Hijjawi\Desktop\Data Science Project Code\Datasets for Classification Layer\testing_data_set_for_Classification_Layer.csv"
test_data = pd.read_csv(test_data_path)

# Number of test rows sent through the model per forward pass
chunk_size = 100

results_df_list = []
start_time = time.time()

# Stepping the range by chunk_size visits every chunk, including a shorter
# final one, without computing the chunk count by hand.
for start_idx in tqdm(range(0, len(test_data), chunk_size), desc="Processing test data"):
    chunk_df = test_data.iloc[start_idx:start_idx + chunk_size]
    X_test_texts = chunk_df['categories_text'].apply(str).tolist()
    y_test = chunk_df['infobox_type'].tolist()
    titles_test = chunk_df['title'].tolist()

    # padding=True pads only to the longest sequence in this chunk (still
    # truncated at 512 tokens). The attention mask hides padding, so the
    # predictions are unaffected while short chunks become much cheaper than
    # always padding to 512.
    inputs = tokenizer(X_test_texts, truncation=True, padding=True, max_length=512, return_tensors='pt')

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Predict the test data chunk
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        y_pred = torch.argmax(logits, dim=1).cpu().numpy()

    # Create a DataFrame with the chunk results
    chunk_results_df = pd.DataFrame({
        'Title': titles_test,
        'Predicted Label': y_pred,
        'True Label': y_test
    })
    results_df_list.append(chunk_results_df)

# Concatenate the per-chunk results into one frame
results_df = pd.concat(results_df_list, ignore_index=True)

# Convert numeric predictions back to their original string labels
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
results_df['Predicted Label'] = results_df['Predicted Label'].map(reverse_label_mapping)

# Count errors per true label to see which classes are missed most often
misclassified = results_df[results_df['Predicted Label'] != results_df['True Label']]
misclass_counts = misclassified['True Label'].value_counts()
print("Top 5 Most Misclassified Labels:")
print(misclass_counts.head(5))

# Both columns already hold the string labels, so the report names its rows
# from the data itself. Passing target_names=list(label_mapping.keys()) was
# wrong: scikit-learn orders rows by the SORTED labels, whereas label_mapping
# is in first-appearance order, so rows could be mislabelled, and the call
# fails or warns whenever the test label set differs from the training one.
print(classification_report(results_df['True Label'], results_df['Predicted Label']))

results_csv_path = r"C:\Users\Mohammad Hijjawi\Desktop\Data Science Project Code\BERT - Categories\Classification Layer\prediction_results.csv"

# Write the predictions to CSV
results_df.to_csv(results_csv_path, index=False)
print(f"Results saved to {results_csv_path}")
# start_time was taken just before the test loop, so this figure covers the
# evaluation phase only, not training.
print(f"Total execution time: {time.time() - start_time:.2f} seconds")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


Training: 100%|██████████████████████████████████████████████████████████████████| 1863/1863 [3:01:53<00:00,  5.86s/it]


Training loss: 3.9978, accuracy: 0.5123
Epoch 2/10


Training: 100%|██████████████████████████████████████████████████████████████████| 1863/1863 [3:04:26<00:00,  5.94s/it]


Training loss: 1.5484, accuracy: 0.8648
Epoch 3/10


Training: 100%|██████████████████████████████████████████████████████████████████| 1863/1863 [3:05:03<00:00,  5.96s/it]


Training loss: 0.6181, accuracy: 0.9214
Epoch 4/10


Training: 100%|██████████████████████████████████████████████████████████████████| 1863/1863 [3:09:21<00:00,  6.10s/it]


Training loss: 0.3260, accuracy: 0.9438
Epoch 5/10


Training: 100%|██████████████████████████████████████████████████████████████████| 1863/1863 [3:07:56<00:00,  6.05s/it]


Training loss: 0.2180, accuracy: 0.9556
Target accuracy of 95.0% reached. Stopping training.
Total training time: 55721.16 seconds


Processing test data: 100%|████████████████████████████████████████████████████████| 298/298 [2:30:09<00:00, 30.23s/it]


Top 5 Most Misclassified Labels:
True Label
infobox language                    93
infobox gridiron football person    90
infobox person                      82
infobox korean name                 72
infobox event                       56
Name: count, dtype: int64
                                               precision    recall  f1-score   support

                        infobox afl biography       0.54      0.59      0.56       100
                       infobox aircraft begin       0.99      1.00      1.00       100
                              infobox artwork       1.00      1.00      1.00       100
              infobox athletics championships       0.98      0.97      0.97       100
                     infobox australian place       1.00      0.98      0.99       100
                               infobox bridge       0.98      0.93      0.95       100
                          infobox bus transit       0.97      1.00      0.99       100
                        infobox cfl bi