In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.4 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Fine-tune bert-base-uncased as a multiclass classifier on the 'text' /
# 'label' columns of the CSV below and report test accuracy after each epoch.

# Load your dataset
data = pd.read_csv('PROMISE_mod.csv')  # Replace with your dataset filename

# Use LabelEncoder to assign numerical labels
label_encoder = LabelEncoder()
data['encoded_label'] = label_encoder.fit_transform(data['label'])

# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Get the training and testing text and label data
train_texts = train_data['text'].tolist()
train_labels = train_data['encoded_label'].tolist()

test_texts = test_data['text'].tolist()
test_labels = test_data['encoded_label'].tolist()

# Seed torch before the model is built: the classification head is newly
# initialised (not loaded from the checkpoint), so without a seed every run
# starts from different weights even though the data split is fixed.
torch.manual_seed(42)

# Use the GPU when one is available; everything still runs on CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT tokenizer and classification model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
num_classes = len(label_encoder.classes_)
multiclass_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
multiclass_model.to(device)

# Tokenize and encode the text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

# The full encoded datasets stay on the CPU; only the current mini-batch is
# moved to `device`, so GPU memory use is bounded by `batch_size`.
train_inputs = {key: val for key, val in train_encodings.items()}
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

test_inputs = {key: val for key, val in test_encodings.items()}

# Set up the optimizer.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are passed explicitly to mirror the defaults documented for the
# deprecated class (torch's own defaults are eps=1e-8, weight_decay=1e-2).
multiclass_optimizer = torch.optim.AdamW(
    multiclass_model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0
)

# Training loop for multiclass classification
num_epochs = 3
batch_size = 16

for epoch in range(num_epochs):
    multiclass_model.train()
    for i in range(0, len(train_labels), batch_size):
        batch_inputs = {key: val[i:i+batch_size].to(device) for key, val in train_inputs.items()}
        batch_labels = train_labels_tensor[i:i+batch_size].to(device)

        multiclass_optimizer.zero_grad()
        outputs = multiclass_model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        multiclass_optimizer.step()

    # Evaluate in mini-batches: pushing the whole test set through BERT in a
    # single forward pass can exhaust memory on larger datasets.
    multiclass_model.eval()
    batch_predictions = []
    with torch.no_grad():
        for i in range(0, len(test_labels), batch_size):
            eval_inputs = {key: val[i:i+batch_size].to(device) for key, val in test_inputs.items()}
            eval_logits = multiclass_model(**eval_inputs).logits
            batch_predictions.append(torch.argmax(eval_logits, dim=1).cpu())

    predicted_labels = torch.cat(batch_predictions).numpy()
    accuracy = accuracy_score(test_labels, predicted_labels)
    print(f'Epoch {epoch + 1} - Multiclass Test Accuracy: {accuracy:.4f}')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 - Multiclass Test Accuracy: 0.3760
Epoch 2 - Multiclass Test Accuracy: 0.3760
Epoch 3 - Multiclass Test Accuracy: 0.4800
