In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

# Load the BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Read the dataset
dataset_path = '/content/drive/MyDrive/DS204/dataset.txt'
with open(dataset_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()

samples = []
labels = []
for line in lines:
    text, label = line.strip().rsplit('_label_', 1)
    samples.append(text.strip())
    labels.append(label.strip().split('|'))

df = pd.DataFrame({'Text': samples, 'Label': labels})

# Preprocess the data
encoded_inputs = tokenizer(df['Text'].tolist(), truncation=True, padding=True, return_tensors='pt')
labels = [label for label in df['Label'].tolist()]

# Convert labels to the appropriate format
label_mapping = {}
label_id = 0
for label_list in labels:
    for label in label_list:
        if label not in label_mapping:
            label_mapping[label] = label_id
            label_id += 1

encoded_labels = []
for label_list in labels:
    encoded_label = [label_mapping[label] for label in label_list]
    encoded_labels.append(encoded_label)

# Convert labels to tensor
labels_tensor = torch.tensor(encoded_labels)

# Apply BERT for multi-output classification
with torch.no_grad():
    outputs = model(**encoded_inputs, labels=labels_tensor)

# Get the predicted labels
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

# Convert the predicted labels to array format
predicted_labels_array = [[k for k, v in label_mapping.items() if v == label_id] for label_id in predicted_labels]

print(predicted_labels_array)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [3]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip -o uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
import numpy as np
from tqdm import tqdm
import os

SEQ_LEN = 256
BATCH_SIZE = 32

dataset = tf.keras.utils.get_file(
    fname="20news-18828.tar.gz", 
    origin="http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz", 
    extract=True,
)

tokenizer = Tokenizer()
def load_data(path, tagset):
    global tokenizer
    indices, labels = [], []
    for folder, label in tagset:
        folder = os.path.join(path, folder)
        for name in tqdm(os.listdir(folder)):
            with open(os.path.join(folder, name), 'r', encoding="utf-8", errors='ignore') as reader:
                  text = reader.read()
            tokenizer.fit_on_texts([text])  # Fit the tokenizer on the current text
            ids = tokenizer.texts_to_sequences([text])[0]  # Convert text to sequences
            indices.append(ids)
            labels.append(label)
  
    items = list(zip(indices, labels))
    np.random.shuffle(items)
    indices, labels = zip(*items)
    indices = np.array(indices)
    mod = indices.shape[0] % BATCH_SIZE
    if mod > 0:
        indices, labels = indices[:-mod], labels[:-mod]
    return [indices, np.zeros_like(indices)], np.array(labels)


path = os.path.join(os.path.dirname(dataset), '20news-18828')
tagset = [(x, i) for i, x in enumerate(os.listdir(path))]
id_to_labels = {id_: label for label, id_ in tagset}

# Load data, split 80-20 for training/testing.
all_x, all_y = load_data(path, tagset)

train_perc = 0.8
total = len(all_y)

n_train = int(train_perc * total)
n_test = total - n_train

test_x = [all_x[0][n_train:], all_x[1][n_train:]]
train_x = [all_x[0][:n_train], all_x[1][:n_train]]

train_y, test_y = all_y[:n_train], all_y[n_train:]

print("# Total: %s, # Train: %s, # Test: %s" % (total, n_train, n_test))


100%|██████████| 775/775 [00:16<00:00, 47.68it/s]
100%|██████████| 999/999 [00:58<00:00, 17.16it/s]
100%|██████████| 973/973 [01:23<00:00, 11.72it/s]
100%|██████████| 994/994 [01:49<00:00,  9.05it/s]
100%|██████████| 799/799 [01:44<00:00,  7.67it/s]
100%|██████████| 990/990 [02:27<00:00,  6.73it/s]
100%|██████████| 987/987 [02:48<00:00,  5.87it/s]
100%|██████████| 961/961 [03:02<00:00,  5.27it/s]
100%|██████████| 980/980 [03:28<00:00,  4.69it/s]
100%|██████████| 940/940 [03:58<00:00,  3.94it/s]
100%|██████████| 985/985 [05:31<00:00,  2.98it/s]
 52%|█████▏    | 326/628 [02:09<01:59,  2.52it/s]


KeyboardInterrupt: ignored

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

# Define your Scires dataset
scires_data = [
    {
        "text": "In other words , when there is sufficient evidence to compute term weights for words , we should do so , but in other cases , we should back off and use bins . These changes may improve our system , which is already faring well against competitors . Additional information about this work can be found at _CITE_ , and we will continue to expand this page as our research continues",
        "label": ["Supplement", "Document", "Produce"]
    },
    # Add more samples to your dataset
]

# Split the dataset into train and test sets
train_data, test_data = train_test_split(scires_data, test_size=0.2, random_state=42)

# Tokenizer initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a custom dataset class
class SciresDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        sample = self.data[index]
        text = sample['text']
        label = sample['label']
        
        # Tokenize and convert text to input features
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        
        return input_ids, attention_mask, label

# Create instances of the dataset class
train_dataset = SciresDataset(train_data, tokenizer)
test_dataset = SciresDataset(test_data, tokenizer)

# Define the collate function to pad sequences
def collate_fn(batch):
    input_ids, attention_mask, labels = zip(*batch)
    
    input_ids = pad_sequence(input_ids, batch_first=True)
    attention_mask = pad_sequence(attention_mask, batch_first=True)
    
    return input_ids, attention_mask, labels

# Create data loaders
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model initialization
num_labels = len(train_dataset[0][-1])  # Number of labels
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

num_epochs = 10
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    
    for input_ids, attention_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = torch.tensor(labels).to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

# Evaluation
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = torch.tensor(labels).to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        predicted_labels = torch.argmax(logits, dim=1)
        correct += (predicted_labels == labels).sum().item()
        total += len(labels)
    
    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
