In [1]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Sample data with NaN values
data = {
    'Sentence': ["Service was excellent.", "The food quality was amazing.", "The room was not clean.", None],
    'OpinionTerm1': ["excellent service", "amazing food", "not clean room", None],
    'Aspect Category1': ["Company_Service", "Food", "Cleanliness", None],
    'AspectPolarity1': ["positive", "positive", "negative", None]
}

# Create DataFrame
df = pd.DataFrame(data)

# Replace NaN values with an empty string or handle them based on your requirements
df = df.fillna("")

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Define a function to tokenize and prepare the input data
def prepare_data(sentences, labels, max_len=128):
    # Ensure 'sentences' is a list of strings
    sentences = list(sentences.astype(str))
    
    # Tokenize inputs
    tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
    
    # Convert labels to tensor
    labels_list = []

    for row in labels:
        # Handle NaN values for each element in the row
        processed_labels = [
            tokenizer.encode(str(label), add_special_tokens=False) if pd.notna(label) else [0]
            for label in row
        ]

        # Flatten the list
        flattened_labels = [item for sublist in processed_labels for item in sublist]

        # Pad or truncate the label sequence
        padding_length = max_len - len(flattened_labels)
        if padding_length > 0:
            padding = np.zeros(padding_length, dtype=int)
            flattened_labels += list(padding)
        else:
            flattened_labels = flattened_labels[:max_len]

        labels_list.append(flattened_labels)
    
    labels_tensor = torch.tensor(labels_list)

    return tokenized_inputs, labels_tensor

# Example: Prepare data
# Assuming df contains your dataset
max_len = 128
tokenized_inputs, labels = prepare_data(df['Sentence'], df[['OpinionTerm1', 'Aspect Category1', 'AspectPolarity1']].values, max_len)

# Split the data into training and testing sets with consistent random_state
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    tokenized_inputs['input_ids'], labels, test_size=0.2, random_state=42
)

# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
epochs = 1

for epoch in range(epochs):
    outputs = model(train_inputs, labels=train_labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

# Evaluate the model on the test set
with torch.no_grad():
    model.eval()
    test_outputs = model(test_inputs, labels=test_labels)
    predicted_labels = torch.argmax(test_outputs.logits, dim=2).numpy()

# Flatten the predictions and true labels
flat_predicted_labels = predicted_labels.flatten()
flat_true_labels = test_labels.flatten().numpy()

# Print classification report
print(classification_report(flat_true_labels, flat_predicted_labels))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


ValueError: Expected input batch_size (24) to match target batch_size (384).