In [1]:
!pip install transformers torch datasets




In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from datasets import load_dataset


In [None]:
dataset = load_dataset('few_rel', split='train_wiki')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dataset = load_dataset('few_rel', split='train_wiki')
relation_labels = {relation: idx for idx, relation in enumerate(sorted(set(dataset['relation'])))}


def preprocess_data(examples):
    tokens = tokenizer(examples['tokens'], is_split_into_words=True, padding='longest', truncation=True, return_tensors='pt')
    labels = [relation_labels[relation] for relation in examples['relation']]
    return {'input_ids': tokens['input_ids'], 'attention_mask': tokens['attention_mask'], 'labels': labels}

processed_dataset = dataset.map(preprocess_data, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/44800 [00:00<?, ? examples/s]

In [None]:
class CNNDistilBERT(nn.Module):
    def __init__(self, num_classes):
        super(CNNDistilBERT, self).__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.conv1 = nn.Conv1d(in_channels=768, out_channels=128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        self.fc = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        hidden_states = hidden_states.permute(0, 2, 1)

        x = self.conv1(hidden_states)
        x = nn.ReLU()(x)
        x = self.conv2(x)
        x = nn.ReLU()(x)
        x = x.max(dim=2)[0]

        x = self.dropout(x)
        x = self.fc(x)
        return x


In [None]:
num_classes = len(set(processed_dataset['labels']))
print(num_classes)
batch_size = 8
learning_rate = 2e-5
num_epochs = 3



def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True)
    labels = torch.stack(labels)

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

train_loader = DataLoader(processed_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)



model = CNNDistilBERT(num_classes)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


model.train()
print(range(num_epochs))
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')


64


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

range(0, 3)


  scaler = torch.cuda.amp.GradScaler()  # Enable mixed precision
  with torch.cuda.amp.autocast():  # Apply mixed precision


Epoch 1/3, Loss: 3.048797607421875
Epoch 2/3, Loss: 0.8939785957336426
Epoch 3/3, Loss: 0.6700363159179688


In [None]:
import torch

MODEL_PATH = "relation_extraction_model.pth"

torch.save(model.state_dict(), MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")

tokenizer.save_pretrained("tokenizer")

Model saved to relation_extraction_model.pth


('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json')

In [None]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("tokenizer")

model = CNNDistilBERT(num_classes=len(set(processed_dataset['labels'])))
model.load_state_dict(torch.load("relation_extraction_model.pth"))
model.to(device)
model.eval()
print("Model loaded successfully!")


  model.load_state_dict(torch.load("relation_extraction_model.pth"))


Model loaded successfully!


In [None]:
def predict_relation(text):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs, dim=1)

    predicted_class = torch.argmax(probabilities, dim=1).item()

    return predicted_class, probabilities.cpu().numpy()

text = "Cristiano Ronaldo Plays for Al-Nassr"
predicted_relation, confidence_scores = predict_relation(text)
print(f"Predicted Relation: {predicted_relation}")
print(f"Confidence Scores: {confidence_scores}")


Predicted Relation: 5
Confidence Scores: [[1.1379649e-03 6.4438838e-04 3.8318071e-04 8.2499482e-06 2.3451888e-03
  7.0132929e-01 1.8307143e-03 2.6116478e-03 5.7024212e-04 1.1869547e-03
  8.3972514e-03 5.2410890e-03 1.9094736e-05 3.9782678e-03 1.2684519e-04
  8.8439515e-04 1.9983659e-04 7.7456578e-05 5.0432753e-04 4.5250482e-03
  2.9483477e-03 2.2008121e-02 8.7549444e-03 6.2917883e-04 3.5026969e-04
  1.3016153e-03 4.8163743e-05 4.9895160e-02 3.4968680e-04 3.0399386e-05
  4.3464534e-05 5.9651155e-02 1.1394570e-03 7.7466882e-04 6.4764931e-03
  3.3486716e-04 8.6022858e-03 3.6775458e-04 4.1805021e-03 1.9887355e-03
  4.2238386e-05 1.0787446e-02 1.5875924e-04 3.0063824e-05 8.0546748e-04
  2.0275400e-03 1.5854478e-02 1.5019560e-03 1.0005643e-02 4.0961456e-04
  1.2058190e-03 1.1680715e-02 3.3043427e-04 1.0008109e-03 5.0714011e-03
  2.1618227e-03 2.8832441e-03 1.7430598e-03 8.0360413e-05 1.4873187e-04
  1.9885241e-03 1.8225765e-02 1.7987042e-05 5.9613325e-03]]


In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dataset2 = load_dataset('few_rel', split='val_wiki')

relation_labels = {relation: idx for idx, relation in enumerate(sorted(set(dataset2['relation'])))}

def preprocess_data(examples):
    tokens = tokenizer(examples['tokens'], is_split_into_words=True, padding='longest', truncation=True, return_tensors='pt')
    labels = [relation_labels[relation] for relation in examples['relation']]
    return {'input_ids': tokens['input_ids'], 'attention_mask': tokens['attention_mask'], 'labels': labels}

processed_dataset2 = dataset2.map(preprocess_data, batched=True)

Map:   0%|          | 0/11200 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            predictions = torch.argmax(outputs, dim=1)

            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, precision, recall, f1

test_loader = DataLoader(
    processed_dataset2, 
    batch_size=batch_size, 
    shuffle=False,
    collate_fn=collate_fn
)

evaluate_model(model, test_loader, device)


Accuracy: 0.0060
Precision: 0.0541
Recall: 0.0060
F1 Score: 0.0101


  _warn_prf(average, modifier, msg_start, len(result))


(0.005982142857142857,
 0.054053044332831104,
 0.005982142857142857,
 0.010142925354795216)