In [40]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import Adam
import torch.nn.functional as F
import torch

## Data Loader & Tokenizer

In [41]:
class ClassifierDataset(Dataset):
    """Map-style dataset that tokenizes one text row per item for sequence classification.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame exposing a ``text`` column and a ``label`` column. Labels must already
        be integer class ids, because they are converted to ``torch.long`` tensors.
    tokenizer
        Object providing an ``encode_plus`` method that returns a mapping with
        ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    max_len : int
        Length every example is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.labels = dataframe.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tokenized example at *position* ``index``.

        Returns a dict with ``ids``, ``mask``, ``token_type_ids`` and ``labels``,
        each a ``torch.long`` tensor.
        """
        # Look rows up by position (.iloc), not by index label: a frame that was
        # filtered, sampled or concatenated no longer has labels 0..len-1, and a
        # label-based lookup would raise KeyError or silently return another row.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to a single space.
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(self.labels.iloc[index], dtype=torch.long)
        }

# Load data and return DataLoader
def load_data(file, tokenizer, max_len, batch_size=32, shuffle=True, label_mapping=None):
    """Read a labelled CSV file and wrap it in a ``DataLoader``.

    Parameters
    ----------
    file : str or path-like
        CSV file with a ``text`` column and a string ``label`` column.
    tokenizer
        Tokenizer forwarded to ``ClassifierDataset``.
    max_len : int
        Padded/truncated sequence length forwarded to ``ClassifierDataset``.
    batch_size : int, default 32
        Number of examples per batch.
    shuffle : bool, default True
        Whether the loader reshuffles the examples on every pass.
    label_mapping : dict, optional
        Maps each label string to its integer class id. Defaults to
        ``{'other': 0, 'question': 1, 'concern': 2}``.

    Returns
    -------
    torch.utils.data.DataLoader

    Raises
    ------
    ValueError
        If the ``label`` column contains a value (or a missing cell) that is not
        a key of ``label_mapping``.
    """
    if label_mapping is None:
        label_mapping = {'other': 0, 'question': 1, 'concern': 2}

    df = pd.read_csv(file)
    mapped = df['label'].map(label_mapping)

    # Series.map turns every value without a mapping entry into NaN. Fail here with
    # a readable message instead of letting NaN labels reach the dataset.
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unmapped label value(s) in {file!r}: {unknown}; "
            f"expected one of {list(label_mapping)}"
        )
    df['label'] = mapped.astype('int64')

    dataset = ClassifierDataset(df, tokenizer, max_len)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0)

# Evaluate model
def evaluate_model(model, data_loader):
    """Compute classification accuracy of ``model`` over ``data_loader``.

    Parameters
    ----------
    model
        Callable as ``model(ids, attention_mask=..., token_type_ids=...)`` whose
        result exposes ``.logits``. It is switched to evaluation mode via
        ``model.eval()`` and left in that mode.
    data_loader
        Iterable of batches; each batch is a dict with the keys ``ids``, ``mask``,
        ``token_type_ids`` and ``labels``.

    Returns
    -------
    float
        Fraction of examples whose highest-scoring class equals the label, or
        ``0.0`` when the loader yields no examples.
    """
    model.eval()
    correct_predictions = 0
    total_predictions = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            labels = batch['labels']
            outputs = model(
                batch['ids'],
                attention_mask=batch['mask'],
                token_type_ids=batch['token_type_ids'],
            )
            predicted = outputs.logits.argmax(dim=1)

            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

    # An empty loader would otherwise raise ZeroDivisionError.
    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions

In [42]:
# Settings
SEED = 42
MAX_LEN = 100  # every example is padded/truncated to this many tokens
MODEL_NAME = 'bert-base-multilingual-cased'
TRAIN_FILE = 'data/train.csv'

# Seed torch's global RNG so the shuffled batch order of the training loader
# is the same on every Restart & Run All.
torch.manual_seed(SEED)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Create DataLoader for training data
train_loader = load_data(TRAIN_FILE, tokenizer, MAX_LEN)

## Fine-Tune BERT Classifier

In [43]:
# BERT model initialization
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3)

# Optimizer and Loss function
optimizer = Adam(model.parameters(), lr=1e-5)
# NOTE: the loop below uses the loss the model computes itself when `labels` is
# passed, so this object is not used for training; it is kept so that any other
# cell referring to `loss_function` keeps working.
loss_function = torch.nn.CrossEntropyLoss()

# Fine-tuning loop
epochs = 3  # Replace with the number of epochs you want

# from_pretrained() hands the model back in evaluation mode (dropout disabled);
# switch to training mode before optimizing.
model.train()

for epoch in range(epochs):
    # Iterate over `train_loader`, the loader built from data/train.csv. The name
    # `data_loader` is not defined at notebook level, so the loop would raise
    # NameError on a fresh kernel (or silently train on a stale loader).
    for batch in train_loader:
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        labels = batch['labels']

        outputs = model(ids, attention_mask=mask, token_type_ids=token_type_ids, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

print("Fine-tuning completed!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuning completed!


## Evaluating the BERT Classifier

In [44]:
# Build the loader for the test data; order does not matter for accuracy,
# so shuffling is switched off.
test_loader = load_data(
    file='data/test.csv',
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    shuffle=False,
)

# Score the fine-tuned model on the test data and report the result
accuracy = evaluate_model(model=model, data_loader=test_loader)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.3333333333333333


## Using the BERT Classifier

In [36]:
def classify_user_prompt(text, model, tokenizer, label_mapping):
    """Predict the label string for a single piece of text.

    Parameters
    ----------
    text : str
        Text to classify.
    model
        Sequence-classification model; called as ``model(**inputs)`` and expected
        to return an object with ``.logits``. It is switched to evaluation mode.
    tokenizer
        Callable that turns ``text`` into a mapping of model inputs
        (called with ``padding=True, truncation=True, return_tensors="pt"``).
    label_mapping : dict
        Maps label strings to class indices.

    Returns
    -------
    str or None
        The first label in ``label_mapping`` whose index equals the predicted
        class, or ``None`` when no entry matches.
    """
    # Prepare the text into tokenized tensor
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Inference must run in evaluation mode (as evaluate_model does); otherwise
    # dropout stays active whenever the model was last used for training.
    model.eval()

    # Run the text through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted label index
    predicted_idx = outputs.logits.argmax(dim=1).item()

    # Convert the index to the corresponding label string (first match wins)
    return next(
        (label for label, idx in label_mapping.items() if idx == predicted_idx),
        None,
    )

In [37]:
# Example usage
# Label-to-index mapping, identical to the one load_data applies to the CSV labels.
# It is defined here because that mapping is local to load_data, so without this
# line the cell raises NameError on a fresh kernel.
label_mapping = {'other': 0, 'question': 1, 'concern': 2}

text = "Mir geht es schlecht, das Studium ist sehr anstrengend."
predicted_label = classify_user_prompt(text, model, tokenizer, label_mapping)
print(f"The predicted label for the text is: {predicted_label}")


The predicted label for the text is: concern
