In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.8 MB/s[0m eta [36m0:00:0

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import BartTokenizer, BartForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Read the CSV once, then pull the two columns used downstream out as plain
# Python lists: `texts` from the 'text' column, `labels` from the 'label' column.
data = pd.read_csv('PROMISE_mod.csv')  # Replace with your dataset filename
texts, labels = (data[column].tolist() for column in ('text', 'label'))

In [None]:
# Fit a LabelEncoder on the raw labels and map every label to its integer id.
# The fitted encoder stays in scope: its `classes_` are read later to size the model.
label_encoder = LabelEncoder().fit(labels)
encoded_labels = label_encoder.transform(labels)

In [None]:
# Hold out 20% of the examples for testing; the fixed random_state makes the
# shuffle-and-split repeatable across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One output logit per distinct label seen by the encoder.
num_classes = len(label_encoder.classes_)

# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
multiclass_model = BartForSequenceClassification.from_pretrained(
    'facebook/bart-base',
    num_labels=num_classes,  # Multiclass classification
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the training split, then the test split, with identical settings.
# Each call returns PyTorch tensors, truncating and padding within that split.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, return_tensors='pt')
    for split_texts in (train_texts, test_texts)
)

# Plain dicts of tensors so the training loop can slice them batch by batch.
train_inputs = dict(train_encodings.items())
test_inputs = dict(test_encodings.items())

# Integer class ids for the training examples, as a long tensor.
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)


In [None]:
# Set up the optimizer.
# transformers.AdamW is deprecated in favour of torch.optim.AdamW, so use the
# PyTorch implementation directly. weight_decay=0.0 and eps=1e-6 are the defaults
# of the transformers class being replaced (PyTorch's own defaults are 0.01 and
# 1e-8), which keeps the optimisation settings as close as possible to before.
multiclass_optimizer = torch.optim.AdamW(
    multiclass_model.parameters(),
    lr=1e-5,
    weight_decay=0.0,
    eps=1e-6,
)



In [None]:
num_epochs = 3
batch_size = 16

# Follow whatever device the model's parameters are on, so the loop keeps
# working if the model is moved to an accelerator; on CPU the .to() calls are no-ops.
device = next(multiclass_model.parameters()).device
num_train = len(train_labels)
num_test = len(test_labels)

# Training loop for multiclass classification
for epoch in range(num_epochs):
    # --- Train: one pass over the training set in fixed-size slices. ---
    # NOTE: slices are taken in the same order every epoch (no per-epoch reshuffle).
    multiclass_model.train()
    for i in range(0, num_train, batch_size):
        batch_inputs = {key: val[i:i+batch_size].to(device) for key, val in train_inputs.items()}
        batch_labels = train_labels_tensor[i:i+batch_size].to(device)

        multiclass_optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss.
        outputs = multiclass_model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        multiclass_optimizer.step()

    # --- Evaluate: score the held-out set after every epoch. ---
    # The test set is processed in mini-batches rather than one forward pass,
    # so memory use is bounded by batch_size instead of the test-set size.
    multiclass_model.eval()
    batch_predictions = []
    with torch.no_grad():
        for i in range(0, num_test, batch_size):
            eval_inputs = {key: val[i:i+batch_size].to(device) for key, val in test_inputs.items()}
            logits = multiclass_model(**eval_inputs).logits
            # Highest-scoring class per example; moved to CPU before the NumPy conversion.
            batch_predictions.append(logits.argmax(dim=1).cpu())

    predicted_labels = torch.cat(batch_predictions).numpy()
    accuracy = accuracy_score(test_labels, predicted_labels)
    print(f'Epoch {epoch + 1} - Multiclass Test Accuracy: {accuracy:.4f}')

Epoch 1 - Multiclass Test Accuracy: 0.3760
Epoch 2 - Multiclass Test Accuracy: 0.4240
Epoch 3 - Multiclass Test Accuracy: 0.5920
