In [1]:
!pip install pandas scikit-learn transformers torch tqdm openpyxl


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load data
df = pd.read_excel('Violation_Training_Data.xlsx')

# Split data
train, val = train_test_split(df, test_size=0.2)

# Tokenize text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train['Combined Text Data'].tolist(), truncation=True, padding=True, return_tensors='pt', max_length=512)
val_encodings = tokenizer(val['Combined Text Data'].tolist(), truncation=True, padding=True, return_tensors='pt', max_length=512)

factor = 'Violation Factor'

# Extract ids, masks, and targets
train_input_ids, train_attention_masks, train_targets = train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train[factor].values)
val_input_ids, val_attention_masks, val_targets = val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val[factor].values)

# Create Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# DataLoaders
batch_size = 16
train_dataloader = DataLoader(TensorDataset(train_input_ids, train_attention_masks, train_targets), batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(TensorDataset(val_input_ids, val_attention_masks, val_targets), batch_size=batch_size)

# Number of training epochs
num_epochs = 30

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Device & Mixed Precision
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
scaler = torch.cuda.amp.GradScaler()

# Training and Validation Loop
gradient_accumulation_steps = 2  # Adjust for available GPU memory
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    train_preds, train_labels = [], []

    for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}')):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps

        scaler.scale(loss).backward()

        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        total_loss += loss.item() * gradient_accumulation_steps

        # Collect predictions and labels for metrics calculation
        with torch.no_grad():
            preds = torch.argmax(outputs.logits, dim=1)
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(train_dataloader)
    train_acc = accuracy_score(train_labels, train_preds)
    train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(train_labels, train_preds, average='binary')

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, Accuracy: {train_acc:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}, F1: {train_f1:.4f}')

# Validation step
model.eval()
val_preds, val_labels = [], []
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc='Validating'):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)  # Corrected line here
        preds = torch.argmax(outputs.logits, dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(labels.cpu().numpy())

val_acc = accuracy_score(val_labels, val_preds)
val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')
print(f'Validation Accuracy: {val_acc:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}')

# Save the model and tokenizer
model_save_path = 'bert_model.pt'
tokenizer_save_path = 'bert_tokenizer'

# Save model state_dict
torch.save(model.state_dict(), model_save_path)
# Save tokenizer
tokenizer.save_pretrained(tokenizer_save_path)

print(f'Model saved to {model_save_path}')
print(f'Tokenizer saved to {tokenizer_save_path}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/30: 100%|██████████| 16/16 [00:09<00:00,  1.69it/s]


Epoch 1/30, Loss: 0.7000, Accuracy: 0.5892, Precision: 0.5373, Recall: 0.6606, F1: 0.5926


Epoch 2/30: 100%|██████████| 16/16 [00:08<00:00,  1.93it/s]


Epoch 2/30, Loss: 0.6353, Accuracy: 0.6556, Precision: 0.5915, Recall: 0.7706, F1: 0.6693


Epoch 3/30: 100%|██████████| 16/16 [00:08<00:00,  1.91it/s]


Epoch 3/30, Loss: 0.6019, Accuracy: 0.6929, Precision: 0.6224, Recall: 0.8165, F1: 0.7063


Epoch 4/30: 100%|██████████| 16/16 [00:08<00:00,  1.87it/s]


Epoch 4/30, Loss: 0.5844, Accuracy: 0.6680, Precision: 0.5973, Recall: 0.8165, F1: 0.6899


Epoch 5/30: 100%|██████████| 16/16 [00:08<00:00,  1.84it/s]


Epoch 5/30, Loss: 0.5448, Accuracy: 0.7386, Precision: 0.6855, Recall: 0.7798, F1: 0.7296


Epoch 6/30: 100%|██████████| 16/16 [00:08<00:00,  1.80it/s]


Epoch 6/30, Loss: 0.5131, Accuracy: 0.7718, Precision: 0.7547, Recall: 0.7339, F1: 0.7442


Epoch 7/30: 100%|██████████| 16/16 [00:08<00:00,  1.78it/s]


Epoch 7/30, Loss: 0.4739, Accuracy: 0.8091, Precision: 0.7944, Recall: 0.7798, F1: 0.7870


Epoch 8/30: 100%|██████████| 16/16 [00:09<00:00,  1.77it/s]


Epoch 8/30, Loss: 0.4261, Accuracy: 0.8755, Precision: 0.9647, Recall: 0.7523, F1: 0.8454


Epoch 9/30: 100%|██████████| 16/16 [00:08<00:00,  1.80it/s]


Epoch 9/30, Loss: 0.3933, Accuracy: 0.8382, Precision: 0.8365, Recall: 0.7982, F1: 0.8169


Epoch 10/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 10/30, Loss: 0.3272, Accuracy: 0.8755, Precision: 0.9438, Recall: 0.7706, F1: 0.8485


Epoch 11/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 11/30, Loss: 0.3131, Accuracy: 0.8838, Precision: 0.9175, Recall: 0.8165, F1: 0.8641


Epoch 12/30: 100%|██████████| 16/16 [00:08<00:00,  1.85it/s]


Epoch 12/30, Loss: 0.2806, Accuracy: 0.9087, Precision: 0.9780, Recall: 0.8165, F1: 0.8900


Epoch 13/30: 100%|██████████| 16/16 [00:08<00:00,  1.85it/s]


Epoch 13/30, Loss: 0.2557, Accuracy: 0.8963, Precision: 0.9468, Recall: 0.8165, F1: 0.8768


Epoch 14/30: 100%|██████████| 16/16 [00:08<00:00,  1.84it/s]


Epoch 14/30, Loss: 0.2450, Accuracy: 0.9129, Precision: 1.0000, Recall: 0.8073, F1: 0.8934


Epoch 15/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 15/30, Loss: 0.2009, Accuracy: 0.9129, Precision: 0.9889, Recall: 0.8165, F1: 0.8945


Epoch 16/30: 100%|██████████| 16/16 [00:08<00:00,  1.81it/s]


Epoch 16/30, Loss: 0.1818, Accuracy: 0.9129, Precision: 0.9889, Recall: 0.8165, F1: 0.8945


Epoch 17/30: 100%|██████████| 16/16 [00:08<00:00,  1.81it/s]


Epoch 17/30, Loss: 0.1440, Accuracy: 0.9544, Precision: 1.0000, Recall: 0.8991, F1: 0.9469


Epoch 18/30: 100%|██████████| 16/16 [00:08<00:00,  1.81it/s]


Epoch 18/30, Loss: 0.1387, Accuracy: 0.9502, Precision: 0.9369, Recall: 0.9541, F1: 0.9455


Epoch 19/30: 100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


Epoch 19/30, Loss: 0.1682, Accuracy: 0.9336, Precision: 0.9346, Recall: 0.9174, F1: 0.9259


Epoch 20/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 20/30, Loss: 0.1172, Accuracy: 0.9502, Precision: 0.9802, Recall: 0.9083, F1: 0.9429


Epoch 21/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 21/30, Loss: 0.0688, Accuracy: 0.9959, Precision: 1.0000, Recall: 0.9908, F1: 0.9954


Epoch 22/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 22/30, Loss: 0.0569, Accuracy: 0.9876, Precision: 0.9818, Recall: 0.9908, F1: 0.9863


Epoch 23/30: 100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


Epoch 23/30, Loss: 0.0585, Accuracy: 0.9834, Precision: 0.9817, Recall: 0.9817, F1: 0.9817


Epoch 24/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 24/30, Loss: 0.0485, Accuracy: 0.9917, Precision: 0.9908, Recall: 0.9908, F1: 0.9908


Epoch 25/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 25/30, Loss: 0.0439, Accuracy: 0.9959, Precision: 1.0000, Recall: 0.9908, F1: 0.9954


Epoch 26/30: 100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


Epoch 26/30, Loss: 0.0431, Accuracy: 0.9959, Precision: 1.0000, Recall: 0.9908, F1: 0.9954


Epoch 27/30: 100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


Epoch 27/30, Loss: 0.0345, Accuracy: 0.9959, Precision: 1.0000, Recall: 0.9908, F1: 0.9954


Epoch 28/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 28/30, Loss: 0.0346, Accuracy: 0.9959, Precision: 1.0000, Recall: 0.9908, F1: 0.9954


Epoch 29/30: 100%|██████████| 16/16 [00:08<00:00,  1.82it/s]


Epoch 29/30, Loss: 0.0272, Accuracy: 0.9959, Precision: 0.9909, Recall: 1.0000, F1: 0.9954


Epoch 30/30: 100%|██████████| 16/16 [00:08<00:00,  1.83it/s]


Epoch 30/30, Loss: 0.0179, Accuracy: 1.0000, Precision: 1.0000, Recall: 1.0000, F1: 1.0000


Validating: 100%|██████████| 4/4 [00:02<00:00,  1.74it/s]


Validation Accuracy: 0.9016, Precision: 0.8571, Recall: 0.9231, F1: 0.8889
Model saved to bert_model.pt
Tokenizer saved to bert_tokenizer
