In [36]:
import torch
import pandas as pd
import random

from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
# Load pretrained BERT model and tokenizer.
# Seed PyTorch as well as Python's `random`: the classification head is newly
# initialized when the checkpoint is loaded and the training DataLoader
# shuffles its batches, and both of those draw from PyTorch's RNG. Seeding
# `random` alone does not make a run repeatable.
random.seed(10)
torch.manual_seed(10)
model_name = 'bert-base-uncased'  # You can choose a different BERT variant
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary classification

tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 440kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 596kB/s]
config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
model.safetensors: 100%|██████████| 440M/440M [00:43<00:00, 10.1MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Dataset: a CSV file with a 'Messages' column (the input text) and a
# 'gen_label' column (the class label) -- these are the two columns the
# tokenization cell reads. To train on another dataset, point read_csv at it
# and keep (or rename to) those column names.

# Read the whole file, then hold out 20% of the rows as the test split.
# random_state is fixed so the split is the same on every run.
data = pd.read_csv("test.csv")
train, test = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Display the training split as the cell output.
train


Unnamed: 0,Messages,gen_label
75,Take steps to secure your wallet. Enable two-f...,1
442,"""Welcome to NFTCollective, the social network ...",1
15,Experience the future of music streaming! Our ...,0
325,"""Experience seamless trading with TrustSwap's ...",1
388,"""Immerse yourself in the world of NFT collater...",1
...,...,...
106,"""Unlock the power of DeFi! Explore our decentr...",1
270,"""Stay ahead of the game with real-time market ...",0
348,"""Calling all lenders and borrowers! Participat...",1
435,"""Attention: Join the QuantumMining revolution ...",1


In [34]:
def _encode_split(frame):
    """Return ``(sentences, labels, tokens)`` for one split of the dataset.

    Text is read from the 'Messages' column and labels from 'gen_label'.
    The text is encoded with the notebook-level ``tokenizer`` with padding
    and truncation enabled, and the encoding is returned as PyTorch tensors.
    """
    sentences = list(frame["Messages"])
    labels = list(frame["gen_label"])
    tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    return sentences, labels, tokens


# Encode each split independently with the same settings.
train_sentences, train_labels, train_tokens = _encode_split(train)
test_sentences, test_labels, test_tokens = _encode_split(test)

In [32]:
# Create PyTorch DataLoader: each item is (input_ids, attention_mask, label).
train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(train_labels))
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Set up training parameters.
# Use PyTorch's AdamW rather than the deprecated implementation shipped with
# transformers. eps and weight_decay are set explicitly to the values the
# transformers optimizer used by default (torch's own defaults are 1e-8 and
# 0.01), so the optimization behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# NOTE: `criterion` is not used by the loop below -- the loss is taken from
# `outputs.loss`, which the model returns when `labels` is passed to it.
criterion = torch.nn.CrossEntropyLoss()

# Fine-tuning loop
num_epochs = 5  # Adjust as needed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()  # training mode (enables dropout)
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # Move the batch onto the same device as the model.
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}')


Epoch 1/5:   0%|          | 0/26 [00:00<?, ?it/s]

Epoch 1/5: 100%|██████████| 26/26 [03:22<00:00,  7.79s/it]


Epoch 1/5, Average Loss: 0.0470759086489964


Epoch 2/5: 100%|██████████| 26/26 [03:36<00:00,  8.31s/it]


Epoch 2/5, Average Loss: 0.025575558493773524


Epoch 3/5: 100%|██████████| 26/26 [03:56<00:00,  9.11s/it]


Epoch 3/5, Average Loss: 0.020036610801560946


Epoch 4/5: 100%|██████████| 26/26 [03:46<00:00,  8.70s/it]


Epoch 4/5, Average Loss: 0.029863285353013243


Epoch 5/5: 100%|██████████| 26/26 [03:49<00:00,  8.82s/it]

Epoch 5/5, Average Loss: 0.0063331938850191925





In [37]:
# Build the evaluation DataLoader; no shuffling so predictions stay in row order.
test_dataset = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], torch.tensor(test_labels))
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model.eval()  # inference mode (disables dropout)

# The model may have been moved to a GPU during training. Read the device from
# its parameters so the inputs can be placed alongside it; feeding CPU tensors
# to a CUDA model raises a device-mismatch error.
eval_device = next(model.parameters()).device

# Lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Iterate through the test dataset without tracking gradients
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(eval_device)
        attention_mask = attention_mask.to(eval_device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit for each example.
        predictions = torch.argmax(logits, dim=1).tolist()

        all_predictions.extend(predictions)
        # Labels are only compared on the host, so they never leave the CPU.
        all_true_labels.extend(labels.tolist())

# Calculate and print metrics
accuracy = accuracy_score(all_true_labels, all_predictions)
confusion_mat = confusion_matrix(all_true_labels, all_predictions)
classification_rep = classification_report(all_true_labels, all_predictions)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(confusion_mat)
print('Classification Report:')
print(classification_rep)

Accuracy: 0.9900990099009901
Confusion Matrix:
[[33  0]
 [ 1 67]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        33
           1       1.00      0.99      0.99        68

    accuracy                           0.99       101
   macro avg       0.99      0.99      0.99       101
weighted avg       0.99      0.99      0.99       101



In [None]:
# Save the fine-tuned model together with its tokenizer, so the output
# directory holds everything needed to reload both with from_pretrained.
output_dir = 'finetuned_bert_model'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)