In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score

# Sample data: event titles and their corresponding types
data = [
    ("Tech Summit 2024", "tech"),
    ("Fashion Show Gala", "fashion"),
    ("Music Festival", "music"),
    ("Entertainment Awards Night", "entertainment"),
    ("NLP Conference", "tech"),
    ("Go Karting", "gaming"),
    ("Community Meetup for Networking", "social"),
    ("Formula one", "gaming"),
    ("Football night", "gaming")
    # Add more examples as needed
]

# Split the data into training and testing sets
titles, labels = zip(*data)
X_train, X_test, y_train, y_test = train_test_split(titles, labels, test_size=0.2, random_state=42)

labels = list(set(labels))
print(labels)

  from .autonotebook import tqdm as notebook_tqdm


['tech', 'gaming', 'music', 'fashion', 'entertainment', 'social']


In [2]:
import torch
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels))

# Tokenize and encode the training data
X_train_encoded = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt")
X_test_encoded = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt")

# Train the BERT-based model
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
for epoch in range(3):  # You might want to adjust the number of epochs
    optimizer.zero_grad()
    outputs = model(**X_train_encoded, labels=torch.tensor([labels.index(label) for label in y_train]))
    loss = outputs.loss
    loss.backward()
    optimizer.step()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    outputs = model(**X_test_encoded)
    logits = outputs.logits
    predictions = np.argmax(logits, axis=1)

# Convert predictions back to labels
predicted_labels = [labels[idx] for idx in predictions]

# Evaluate the accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Function to classify a new event title
def classify_event(title):
    title_encoded = tokenizer(title, padding=True, truncation=True, return_tensors="pt")
    model.eval()
    with torch.no_grad():
      outputs = model(**title_encoded)
      logits = outputs.logits
      prediction = np.argmax(logits, axis=1)
    return labels[prediction.item()]


Accuracy: 50.00%


In [4]:
new_event_title = "baseball match"
predicted_type = classify_event(new_event_title)
print(f"The event '{new_event_title}' is classified as '{predicted_type}'.")

The event 'baseball match' is classified as 'gaming'.


In [5]:
model.save_pretrained('bert_model')

# Save the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained('bert_tokenizer')

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')