In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
import joblib

# Configuration: values shared by the training and inference code below.
MODEL_NAME: str = 'bert-base-uncased'  # checkpoint name handed to from_pretrained
BATCH_SIZE: int = 16                   # batch size used for both DataLoaders
MAX_LEN: int = 128                     # max_length given to the tokenizer
EPOCHS: int = 4                        # number of training epochs
# Use the GPU when torch reports one, otherwise run on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 1. Data Preparation
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable collection of raw texts (each is passed through ``str``).
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: ``max_length`` used for padding and truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer returns tensors with a leading batch axis; flatten it
        # away so the DataLoader can stack items into batches itself.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

def load_data(file_path):
    """Read the labelled CSV and attach an integer ``label`` column.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        Tuple ``(frame, encoder)``: the DataFrame with an added ``label``
        column, and the fitted ``LabelEncoder`` that maps ``Category`` values
        to those integer ids.

    Raises:
        ValueError: If the CSV lacks a ``Category`` or ``Message`` column.
    """
    frame = pd.read_csv(file_path)

    # Fail early when either required column is absent.
    missing_columns = {'Category', 'Message'} - set(frame.columns)
    if missing_columns:
        raise ValueError("CSV must contain 'Category' and 'Message' columns")

    # Turn the string categories into integer class ids.
    encoder = LabelEncoder()
    frame['label'] = encoder.fit_transform(frame['Category'])

    return frame, encoder

# 2. Model Training with Progress Tracking
class BERTClassifier:
    """BERT sequence classifier with fine-tuning, evaluation and checkpointing.

    The model is moved to ``DEVICE`` on construction and the matching tokenizer
    is stored next to it so that both can be saved together.
    """

    def __init__(self, model_name, num_labels):
        """Load the pretrained model and its tokenizer.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
            num_labels: Number of classes for the classification head.
        """
        self.model = BertForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels)
        self.model.to(DEVICE)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def train(self, train_data, val_data, epochs, label_encoder):
        """Fine-tune the model and keep the checkpoint with the best validation accuracy.

        Args:
            train_data: Dataset yielding dicts with ``input_ids``,
                ``attention_mask`` and ``labels`` tensors; shuffled every epoch.
            val_data: Dataset of the same form, evaluated after every epoch.
            epochs: Number of passes over ``train_data``.
            label_encoder: Fitted label encoder, saved alongside the model.
        """
        train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
        val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

        # NOTE(review): transformers' own AdamW is deprecated upstream in favour
        # of torch.optim.AdamW (which has no correct_bias option) — confirm
        # against the installed transformers version before upgrading.
        optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        # The scheduler is stepped once per batch, so it needs the total batch
        # count; the learning rate decays linearly with no warm-up.
        total_steps = len(train_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        best_accuracy = 0.0
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')
            print('-' * 10)

            # Training phase
            self.model.train()
            train_loss = 0
            progress_bar = tqdm(train_loader, desc='Training')
            for batch in progress_bar:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                # Read the scalar once; it feeds both the running total and the
                # progress bar.
                batch_loss = loss.item()
                train_loss += batch_loss

                loss.backward()
                # Clip the gradient norm before the optimizer update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

                progress_bar.set_postfix({'loss': batch_loss})

            avg_train_loss = train_loss / len(train_loader)

            # Validation phase
            val_loss, val_accuracy = self.evaluate(val_loader)
            print(f'Train loss: {avg_train_loss:.4f}')
            print(f'Val loss: {val_loss:.4f}')
            print(f'Val accuracy: {val_accuracy:.4f}\n')

            # Save only on a strict improvement, so ties keep the earlier checkpoint.
            if val_accuracy > best_accuracy:
                self.save_model(label_encoder)
                best_accuracy = val_accuracy

        print(f'Training complete. Best validation accuracy: {best_accuracy:.4f}')

    def evaluate(self, data_loader):
        """Compute mean batch loss and accuracy over ``data_loader``.

        Args:
            data_loader: Sized iterable of batches (dicts with ``input_ids``,
                ``attention_mask`` and ``labels``) exposing a sized ``dataset``.

        Returns:
            Tuple ``(avg_loss, accuracy)`` of plain Python floats: the loss
            averaged over batches and the fraction of correctly classified
            examples in ``data_loader.dataset``.

        Raises:
            ValueError: If the loader has no batches or its dataset is empty.
        """
        num_batches = len(data_loader)
        num_examples = len(data_loader.dataset)
        if num_batches == 0 or num_examples == 0:
            raise ValueError('Cannot evaluate on an empty data loader')

        self.model.eval()
        total_loss = 0.0
        correct_predictions = 0

        with torch.no_grad():
            for batch in tqdm(data_loader, desc='Evaluating'):
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['labels'].to(DEVICE)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_loss += outputs.loss.item()

                preds = torch.argmax(outputs.logits, dim=1)
                # Accumulate as a Python int so the returned accuracy is a
                # plain float rather than a tensor living on DEVICE.
                correct_predictions += (preds == labels).sum().item()

        avg_val_loss = total_loss / num_batches
        accuracy = correct_predictions / num_examples
        return avg_val_loss, accuracy

    def save_model(self, label_encoder, output_dir='saved_model'):
        """Write model, tokenizer and label encoder into ``output_dir``.

        Args:
            label_encoder: Fitted label encoder, dumped with joblib as
                ``label_encoder.pkl``.
            output_dir: Target directory; created if it does not exist.
        """
        os.makedirs(output_dir, exist_ok=True)

        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        joblib.dump(label_encoder, os.path.join(output_dir, 'label_encoder.pkl'))
        print('Model saved successfully!')

# 3. Prediction Function
def predict(text, model_path='saved_model'):
    """Classify a single text with the artifacts stored under ``model_path``.

    The model, tokenizer and label encoder are reloaded from disk on every
    call.

    Args:
        text: The message to classify.
        model_path: Directory holding the saved model, tokenizer and
            ``label_encoder.pkl``.

    Returns:
        The predicted class label, decoded by the stored label encoder.
    """
    # Restore the three artifacts, then put the model on DEVICE in eval mode.
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    # joblib.load unpickles the file, so only use directories you trust.
    label_encoder = joblib.load(f'{model_path}/label_encoder.pkl')
    model.to(DEVICE)
    model.eval()

    # Tokenize with the same options the training dataset uses.
    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': MAX_LEN,
        'return_token_type_ids': False,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    encoding = tokenizer.encode_plus(text, **tokenizer_options)

    # Forward pass without gradient tracking; take the highest-scoring class.
    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'].to(DEVICE),
            attention_mask=encoding['attention_mask'].to(DEVICE),
        ).logits
        class_indices = logits.argmax(dim=1).cpu().numpy()

    decoded = label_encoder.inverse_transform(class_indices)
    return decoded[0]

# Main Execution Flow
if __name__ == '__main__':
    # Seed torch's generator and the train/validation split so that a fresh
    # "Restart & Run All" does not silently change the split or torch's random
    # draws (e.g. batch shuffling), and the logged metrics can be compared.
    SEED = 42
    torch.manual_seed(SEED)

    # Load and prepare data
    df, label_encoder = load_data('spam.csv')

    # Stratified 80/20 split keeps the class ratio equal in both parts.
    train_df, val_df = train_test_split(
        df, test_size=0.2, stratify=df['label'], random_state=SEED)

    # The classifier is built first because the datasets reuse its tokenizer.
    classifier = BERTClassifier(MODEL_NAME, num_labels=len(label_encoder.classes_))

    train_dataset = TextDataset(
        texts=train_df.Message.values,
        labels=train_df.label.values,
        tokenizer=classifier.tokenizer,
        max_len=MAX_LEN
    )

    val_dataset = TextDataset(
        texts=val_df.Message.values,
        labels=val_df.label.values,
        tokenizer=classifier.tokenizer,
        max_len=MAX_LEN
    )

    # Train model
    classifier.train(train_dataset, val_dataset, EPOCHS, label_encoder)

    # Example prediction (uses the checkpoint that train() saved to disk)
    test_text = "Congratulations! You've won a free vacation!"
    print(f"\nPrediction for '{test_text}': {predict(test_text)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
----------


Training: 100%|██████████| 279/279 [01:38<00:00,  2.83it/s, loss=0.00328]
Evaluating: 100%|██████████| 70/70 [00:08<00:00,  8.39it/s]


Train loss: 0.0845
Val loss: 0.0387
Val accuracy: 0.9910

Model saved successfully!
Epoch 2/4
----------


Training: 100%|██████████| 279/279 [01:37<00:00,  2.86it/s, loss=0.000585]
Evaluating: 100%|██████████| 70/70 [00:08<00:00,  8.50it/s]


Train loss: 0.0228
Val loss: 0.0521
Val accuracy: 0.9874

Epoch 3/4
----------


Training: 100%|██████████| 279/279 [01:37<00:00,  2.86it/s, loss=0.000254]
Evaluating: 100%|██████████| 70/70 [00:08<00:00,  8.50it/s]


Train loss: 0.0041
Val loss: 0.0538
Val accuracy: 0.9901

Epoch 4/4
----------


Training: 100%|██████████| 279/279 [01:37<00:00,  2.86it/s, loss=0.000208]
Evaluating: 100%|██████████| 70/70 [00:08<00:00,  8.52it/s]


Train loss: 0.0029
Val loss: 0.0524
Val accuracy: 0.9910

Training complete. Best validation accuracy: 0.9910

Prediction for 'Congratulations! You've won a free vacation!': ham


In [4]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import joblib

def classify_text(text, model_path='saved_model'):
    """Classify one text and report the confidence of the prediction.

    Args:
        text: The message to classify.
        model_path: Directory holding the saved model, tokenizer and
            ``label_encoder.pkl``.

    Returns:
        Dict with ``prediction`` (the decoded class label) and ``confidence``
        (the softmax probability of that class, as a float).
    """
    # Reload every artifact from disk on each call.
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(model_path)
    # joblib.load unpickles the file, so only use directories you trust.
    label_encoder = joblib.load(f'{model_path}/label_encoder.pkl')

    # Tokenize to fixed-length tensors.
    tokenizer_options = {
        'max_length': 128,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    inputs = tokenizer(text, **tokenizer_options)

    # Inference only: eval mode, no gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = torch.softmax(logits, dim=1)
        # argmax over the flattened tensor, which for a single text is the
        # class index.
        pred_idx = probs.argmax().item()

    predicted_label = label_encoder.inverse_transform([pred_idx])[0]
    confidence = probs[0][pred_idx].item()
    return {'prediction': predicted_label, 'confidence': confidence}

# Example usage
if __name__ == '__main__':
    # Three sample messages to run through the saved classifier.
    texts = [
        "Free entry to win concert tickets! Text NOW to 12345",
        "Hey, are we still meeting for lunch tomorrow?",
        "Your account needs immediate verification. Click here to update: http://secure-login.net"
    ]

    # Emit the same three lines per message (text, prediction with confidence
    # as a percentage, separator) through a single print call.
    for text in texts:
        result = classify_text(text)
        print(
            f"Text: {text}",
            f"Prediction: {result['prediction']} ({result['confidence']:.2%})",
            "-" * 50,
            sep="\n",
        )

Text: Free entry to win concert tickets! Text NOW to 12345
Prediction: spam (99.81%)
--------------------------------------------------
Text: Hey, are we still meeting for lunch tomorrow?
Prediction: ham (99.83%)
--------------------------------------------------
Text: Your account needs immediate verification. Click here to update: http://secure-login.net
Prediction: spam (99.51%)
--------------------------------------------------
