In [None]:
!pip install transformers
!pip install -U sentence-transformers



In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset. Later cells read the 'text' column (input to the
# tokenizer) and the 'label' column (input to the label encoder), so verify
# both exist here and fail with a readable message instead of a KeyError
# several cells further down.
data = pd.read_csv('PROMISE_mod.csv')

missing_columns = {'text', 'label'}.difference(data.columns)
if missing_columns:
    raise ValueError(
        f"PROMISE_mod.csv is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
# Map every distinct value of the 'label' column to an integer id.
# The fitted encoder is kept so the set of classes can be read back later.
label_encoder = LabelEncoder().fit(data['label'])
data['encoded_label'] = label_encoder.transform(data['label'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical from one run to the next.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the tokenizer and the encoder from one shared checkpoint name so the
# two can never drift apart.
# NOTE(review): despite the `sbert_` naming used throughout this notebook,
# 'bert-base-uncased' is the plain BERT checkpoint, not a Sentence-BERT one —
# confirm whether a sentence-transformers checkpoint was intended.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# The encoder is only run under torch.no_grad() as a fixed feature extractor,
# so put it in inference mode (dropout off) explicitly instead of relying on
# the loader's default.
sbert_model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT).eval()

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:

# Pull the raw sentences out of each split as plain Python lists ...
train_texts, test_texts = (
    frame['text'].tolist() for frame in (train_data, test_data)
)

# ... and turn each list into padded, truncated PyTorch tensors.
train_encodings, test_encodings = (
    tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    for sentences in (train_texts, test_texts)
)

In [None]:
# Integer class ids for each split as int64 tensors, the dtype
# CrossEntropyLoss expects for its targets.
train_labels, test_labels = (
    torch.tensor(split['encoded_label'].tolist(), dtype=torch.long)
    for split in (train_data, test_data)
)


In [None]:
# Extract one fixed-size sentence embedding per input text.
def _masked_mean_pool(last_hidden_state, attention_mask):
    """Average token vectors over real (non-padding) positions only.

    The tokenizer pads every sentence to a common length, so a plain
    ``mean(dim=1)`` would mix padding-token states into the sentence vector
    and dilute short sentences the most. Weighting by the attention mask
    restricts the average to actual tokens.

    Args:
        last_hidden_state: float tensor indexed as (sentence, token, feature).
        attention_mask: tensor indexed as (sentence, token); 1 for real
            tokens, 0 for padding.

    Returns:
        Tensor indexed as (sentence, feature).
    """
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * mask).sum(dim=1)
    # Guard against division by zero for a (theoretical) all-padding row.
    token_count = mask.sum(dim=1).clamp(min=1.0)
    return token_sum / token_count


def _embed_in_batches(encodings, batch_size=32):
    """Run the encoder over ``encodings`` in slices and pool each slice.

    Processing the whole split in a single forward pass keeps every
    activation in memory at once; slicing bounds peak memory by
    ``batch_size`` sentences.

    Args:
        encodings: tokenizer output whose values are tensors with the
            sentence index as the first dimension.
        batch_size: number of sentences per forward pass.

    Returns:
        Tensor with one pooled embedding row per input sentence.
    """
    total = encodings['input_ids'].size(0)
    if total == 0:
        return torch.empty(0, sbert_model.config.hidden_size)

    pooled_chunks = []
    with torch.no_grad():  # feature extraction only, no gradients needed
        for start in range(0, total, batch_size):
            chunk = {
                key: tensor[start:start + batch_size]
                for key, tensor in encodings.items()
            }
            hidden = sbert_model(**chunk).last_hidden_state
            pooled_chunks.append(_masked_mean_pool(hidden, chunk['attention_mask']))
    return torch.cat(pooled_chunks, dim=0)


train_embeddings = _embed_in_batches(train_encodings)
test_embeddings = _embed_in_batches(test_encodings)

In [None]:
# Define the classification model (training happens in a later cell)
class SbertClassifier(torch.nn.Module):
    """Single linear layer mapping a sentence embedding to class logits.

    Args:
        num_classes: number of output logits (one per class).
        embedding_dim: size of each input embedding vector. Defaults to 768,
            the value originally hard-coded here for the encoder output, so
            existing ``SbertClassifier(num_classes)`` calls are unaffected.
    """

    def __init__(self, num_classes, embedding_dim=768):
        super().__init__()
        self.fc = torch.nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        """Return raw (unnormalised) logits for a batch of embeddings ``x``."""
        return self.fc(x)

In [None]:
# Initialize the model.
# Seed PyTorch first: the linear layer's weights are drawn at random, so
# without a seed every kernel restart would produce different accuracy numbers.
torch.manual_seed(42)
num_classes = len(label_encoder.classes_)  # one logit per distinct label
model = SbertClassifier(num_classes)

In [None]:
# Define optimizer and loss function.
# Use PyTorch's own AdamW; the `transformers.AdamW` class is deprecated in
# favour of it.
# Only the small, freshly initialised linear head is trained here (the
# encoder embeddings are precomputed), so it needs a far larger step size
# than the 1e-5 used previously: with 1e-5 the reported test accuracy stayed
# at the same value for every epoch.
# weight_decay=0.0 mirrors the default of the optimizer being replaced.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.0)
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
# Training loop: mini-batch training of the linear head on the precomputed
# embeddings, with a test-set accuracy report after every epoch.
num_epochs = 3
batch_size = 16

# Dedicated seeded generator so the per-epoch shuffles are reproducible
# without depending on (or disturbing) the global RNG state.
shuffle_rng = torch.Generator().manual_seed(42)
num_train = len(train_labels)
# scikit-learn metrics work on NumPy arrays; convert the targets once.
test_targets = test_labels.cpu().numpy()

for epoch in range(num_epochs):
    model.train()
    # Visit the samples in a fresh random order each epoch; a fixed order
    # would feed the optimizer the exact same batches every time.
    order = torch.randperm(num_train, generator=shuffle_rng)
    for start in range(0, num_train, batch_size):
        batch_idx = order[start:start + batch_size]
        batch_embeddings = train_embeddings[batch_idx]
        batch_labels = train_labels[batch_idx]

        optimizer.zero_grad()
        logits = model(batch_embeddings)
        loss = loss_fn(logits, batch_labels)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        test_logits = model(test_embeddings)
        predicted_labels = torch.argmax(test_logits, dim=1).cpu().numpy()
    accuracy = accuracy_score(test_targets, predicted_labels)
    print(f'Epoch {epoch + 1} - Test Accuracy: {accuracy:.4f}')


Epoch 1 - Test Accuracy: 0.3760
Epoch 2 - Test Accuracy: 0.3760
Epoch 3 - Test Accuracy: 0.3760
