In [None]:

# Fine-tune a BERT sequence classifier that predicts the binary 'judge' label
# from the concatenated 'CATALOG_NM' and 'KEYWORD' text, then report accuracy
# on a held-out 20% split.

# Hyperparameters (adjust to the dataset size and available hardware).
NUM_EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# Combine 'CATALOG_NM' and 'KEYWORD' into a single text column.
# Missing values are replaced with '' and both columns are cast to str first;
# otherwise one missing field turns the whole combined value into NaN (which
# the later astype(str) would feed to the tokenizer as the literal "nan"),
# and a non-string column would fail on concatenation.
merged_df['combined_text'] = (
    merged_df['CATALOG_NM'].fillna('').astype(str)
    + ' '
    + merged_df['KEYWORD'].fillna('').astype(str)
)

# Train-test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    merged_df.drop('judge', axis=1),
    merged_df['judge'],
    test_size=0.2,
    random_state=42
)

# Tokenize text data. padding=True pads every sequence to the longest one in
# the call, so each split becomes a single rectangular tensor.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(
    X_train['combined_text'].astype(str).tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)
test_encodings = tokenizer(
    X_test['combined_text'].astype(str).tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Create DataLoader.
# Labels are cast to int64 explicitly: the classification loss expects integer
# class indices, and the dtype inferred from the 'judge' column may be float
# or bool depending on how the frame was loaded.
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(y_train.values, dtype=torch.long),
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    torch.tensor(y_test.values, dtype=torch.long),
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training
model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        # Batches come from CPU tensors; move them next to the model.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        # Labels are not needed here; accuracy is computed from y_test below.
        input_ids, attention_mask, _ = (t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        # Bring predictions back to the CPU before converting to NumPy.
        all_preds.extend(preds.cpu().numpy())

# Calculate accuracy. test_loader is not shuffled, so all_preds is in the
# same row order as y_test.
accuracy = accuracy_score(y_test.values, all_preds)
print(f"Accuracy: {accuracy}")