In [7]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from sklearn.metrics import classification_report

In [4]:
train_df = pd.read_csv('DBPEDIA_train.csv')
val_df = pd.read_csv('DBPEDIA_val.csv')
test_df = pd.read_csv('DBPEDIA_test.csv')

In [5]:
# Combine l1, l2, l3 into a single column for labels
train_df['labels'] = train_df[['l1', 'l2', 'l3']].apply(lambda x: f'{x[0]} > {x[1]} > {x[2]}', axis=1)
val_df['labels'] = val_df[['l1', 'l2', 'l3']].apply(lambda x: f'{x[0]} > {x[1]} > {x[2]}', axis=1)
test_df['labels'] = test_df[['l1', 'l2', 'l3']].apply(lambda x: f'{x[0]} > {x[1]} > {x[2]}', axis=1)

  train_df['labels'] = train_df[['l1', 'l2', 'l3']].apply(lambda x: f'{x[0]} > {x[1]} > {x[2]}', axis=1)
  val_df['labels'] = val_df[['l1', 'l2', 'l3']].apply(lambda x: f'{x[0]} > {x[1]} > {x[2]}', axis=1)
  test_df['labels'] = test_df[['l1', 'l2', 'l3']].apply(lambda x: f'{x[0]} > {x[1]} > {x[2]}', axis=1)


In [6]:
# Encode labels
le = LabelEncoder()
train_df['label_encoded'] = le.fit_transform(train_df['labels'])
val_df['label_encoded'] = le.transform(val_df['labels'])
test_df['label_encoded'] = le.transform(test_df['labels'])

In [8]:
# Tokenize the text data
def tokenize_text(text, vocab):
    return [vocab[word] if word in vocab else vocab['<UNK>'] for word in text.split()]

# Pad sequences to the same length
def pad_sequences(sequences, maxlen):
    return np.array([np.pad(seq, (0, maxlen - len(seq)), mode='constant') if len(seq) < maxlen else seq[:maxlen] for seq in sequences])

# Create a vocabulary from the training data
vocab = {'<PAD>': 0, '<UNK>': 1}
for text in train_df['text']:
    for word in text.split():
        if word not in vocab:
            vocab[word] = len(vocab)

# Tokenize and pad the text data
maxlen = 100  # Maximum sequence length
train_texts = pad_sequences([tokenize_text(text, vocab) for text in train_df['text']], maxlen)
val_texts = pad_sequences([tokenize_text(text, vocab) for text in val_df['text']], maxlen)
test_texts = pad_sequences([tokenize_text(text, vocab) for text in test_df['text']], maxlen)

In [9]:
class CustomTextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {'text': self.texts[idx], 'label': self.labels[idx]}

train_dataset = CustomTextDataset(train_texts, train_df['label_encoded'])
val_dataset = CustomTextDataset(val_texts, val_df['label_encoded'])
test_dataset = CustomTextDataset(test_texts, test_df['label_encoded'])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [11]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            dropout=dropout, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        avg_pool = torch.mean(lstm_out, dim=1)
        logits = self.fc(avg_pool)
        output = self.softmax(logits)
        return output

# Model parameters
embedding_dim = 128
hidden_dim = 256
output_dim = len(le.classes_)
num_layers = 2
dropout = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(len(vocab), embedding_dim, hidden_dim, output_dim, num_layers, dropout)
model.to(device)

LSTMClassifier(
  (embedding): Embedding(1091095, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=219, bias=True)
  (softmax): Softmax(dim=1)
)

In [13]:
import torch.optim as optim
from tqdm  import tqdm

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

def train_epoch(model, data_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    loop = tqdm(data_loader, leave=True)

    for batch in loop:
        texts = batch['text'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        loop.set_description(f"Train Loss: {total_loss / (loop.n + 1):.4f}")

    return total_loss / len(data_loader)

# Training loop
n_epochs = 3
for epoch in range(n_epochs):
    print(f'Epoch {epoch + 1}/{n_epochs}')
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    print(f'Training Loss: {train_loss}')

Epoch 1/3


Train Loss: 5.3758: 100%|██████████| 7530/7530 [08:27<00:00, 14.83it/s]


Training Loss: 5.375785676289998
Epoch 2/3


Train Loss: 5.3312: 100%|██████████| 7530/7530 [08:34<00:00, 14.64it/s]


Training Loss: 5.3304553257358345
Epoch 3/3


Train Loss: 5.1113: 100%|██████████| 7530/7530 [08:31<00:00, 14.71it/s]

Training Loss: 5.111302017184051





In [14]:
from sklearn.metrics import classification_report

def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    loop = tqdm(data_loader, leave=True)

    with torch.no_grad():
        for batch in loop:
            texts = batch['text'].to(device)
            labels = batch['label'].to(device)

            outputs = model(texts)
            preds = torch.argmax(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return classification_report(true_labels, predictions, target_names=le.classes_)

print(evaluate(model, val_loader))

100%|██████████| 1126/1126 [00:08<00:00, 134.51it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                    precision    recall  f1-score   support

                                        Agent > Actor > AdultActor       0.00      0.00      0.00       125
                                        Agent > Actor > VoiceActor       0.00      0.00      0.00        52
                                         Agent > Artist > Comedian       0.00      0.00      0.00       127
                                    Agent > Artist > ComicsCreator       0.00      0.00      0.00       268
                                  Agent > Artist > FashionDesigner       0.00      0.00      0.00        70
                                          Agent > Artist > Painter       0.00      0.00      0.00       233
                                     Agent > Artist > Photographer       0.00      0.00      0.00        41
                   Agent > Athlete > AustralianRulesFootballPlayer       0.00      0.00      0.00       284
                           

  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
def predict_sentence(sentence, model, vocab, maxlen):
    model.eval()
    tokens = tokenize_text(sentence, vocab)
    tokens_padded = pad_sequences([tokens], maxlen)[0]
    tokens_tensor = torch.tensor([tokens_padded], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(tokens_tensor)
        pred = torch.argmax(outputs, dim=1)
        return le.inverse_transform(pred.cpu().numpy())[0]

# Example usage
sentence = "The new car is extremely fast"
prediction = predict_sentence(sentence, model, vocab, maxlen)
print(f"This sentence belongs to: {prediction}")

sentence = "This person is a works for the government now, he was the Presidential candidate."
prediction = predict_sentence(sentence, model, vocab, maxlen)
print(f"This sentence belongs to: {prediction}")

This sentence belongs to: Place > NaturalPlace > MountainRange
This sentence belongs to: Agent > Athlete > GaelicGamesPlayer


In [5]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [7]:
max_len = 128
batch_size = 32

train_dataset = TextDataset(train_df['text'].tolist(), train_df['label_encoded'].tolist(), tokenizer, max_len)
val_dataset = TextDataset(val_df['text'].tolist(), val_df['label_encoded'].tolist(), tokenizer, max_len)
test_dataset = TextDataset(test_df['text'].tolist(), test_df['label_encoded'].tolist(), tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [8]:
class MultiLabelBERTModel(torch.nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.logits

num_labels = len(le.classes_)
model = MultiLabelBERTModel(num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [11]:
from tqdm import tqdm

def train_epoch(model, data_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    loop = tqdm(data_loader, leave=True)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Update progress bar with the loss value
        loop.set_description(f"Train Loss: {total_loss / (loop.n + 1):.4f}")

    return total_loss / len(data_loader)

def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []
    loop = tqdm(data_loader, leave=True)

    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            # Optionally, you can update the progress bar with additional info
            loop.set_description(f"Evaluating")

    return classification_report(true_labels, predictions, target_names=le.classes_)

# Training loop with tqdm progress bars
n = 3
for epoch in range(n):
    print(f'Epoch {epoch + 1}/{n}')
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    print(f'Training Loss: {train_loss}')

Epoch 1/3


Train Loss: 2.6702:   1%|          | 66/7530 [00:47<1:30:13,  1.38it/s]


KeyboardInterrupt: 

In [None]:
# Evaluation
print("Validation Report:")
print(evaluate(model, val_loader))

print("Test Report:")
print(evaluate(model, test_loader))