In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
# import matplotlib.pyplot as plt

In [6]:
def generate_X_Y_data(df, test_size=0.3, random_state=None):
    labels = df.columns.tolist()
    X = []
    y = []
    for _, row in df.iterrows():
        for value, label in zip(row.values,labels):
            X.append(value)
            y.append(label)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train = [str(x) for x in X_train]
    X_test = [str(x) for x in X_test]
    return X_train, X_test, y_train, y_test

df = pd.read_excel(r'..\database\problems\undersampling_test.xlsx')
try:
    df = df.drop('Unnamed: 0', axis=1)
except:
    pass

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_train, X_test, y_train, y_test = generate_X_Y_data(df)

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

encoded_inputs_train = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
y_train = torch.LongTensor(y_train)

encoded_inputs_test = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')
y_test = torch.LongTensor(y_test)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 10
batch_size = 32

train_dataset = TensorDataset(encoded_inputs_train['input_ids'], encoded_inputs_train['attention_mask'], y_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

loss_fn = torch.nn.CrossEntropyLoss()

train_loss_history = []
train_acc_history = []
val_loss_history = []
val_acc_history = []

for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_correct = 0
    total_train_samples = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        loss = loss_fn(outputs.logits, labels)
        train_loss += loss.item()
        train_correct += (predicted == labels).sum().item()
        total_train_samples += labels.size(0)
        loss.backward()
        optimizer.step()

    train_loss /= len(train_dataloader)
    train_accuracy = train_correct / total_train_samples
    train_loss_history.append(train_loss)
    train_acc_history.append(train_accuracy)

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Train Accuracy: {train_accuracy:.4f}")

model.save_pretrained('model_bert_problem')
torch.save(label_encoder.classes_, 'label_encoder_classes.pt')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch 1/10 - Train Loss: 4.7031 - Train Accuracy: 0.0187


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch

def predict_intent(text):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(r'model_bert')
    label_encoder = LabelEncoder()
    label_encoder.classes_ = torch.load('label_encoder_classes.pt')

    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    model.eval()
    with torch.no_grad():
        logits = model(**encoded_input).logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_labels = torch.argmax(probabilities, dim=1)
        predicted_classes = label_encoder.inverse_transform(predicted_labels)


    probability = probabilities[0][predicted_labels].item()
    if probability > 0.8:
        classe = predicted_classes.item()
    else:
        classe = 'NaN'
        probability = 1.0

    return {"text":text,"class":classe,"probability":probability}
