In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizerFast, CamembertForSequenceClassification, CamembertConfig, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import json
from torch.nn.utils.rnn import pad_sequence

In [82]:
# Load the data
data = pd.read_csv('../data/train.csv')
data = data.set_index("id")
data.entities = data.entities.apply(json.loads) # parsing des entités
data.relations = data.relations.apply(json.loads) # parsing des relations
data.head()

Unnamed: 0_level_0,text,entities,relations
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
181,"Anam Destresse, président de l'ONG ""Ma passion...","[{'id': 0, 'mentions': [{'value': 'accident', ...","[[0, STARTED_IN, 9], [7, IS_LOCATED_IN, 9], [5..."
31669,"À Paris, le 8 avril 2022, l'usine de déodorant...","[{'id': 0, 'mentions': [{'value': 'explosé', '...","[[9, IS_LOCATED_IN, 8], [11, OPERATES_IN, 8], ..."
51470,"En Espagne, dans une région agricole, une cont...","[{'id': 0, 'mentions': [{'value': 'contaminati...","[[7, IS_PART_OF, 8], [9, OPERATES_IN, 1], [0, ..."
51332,Un important incendie a fait des ravages dans ...,"[{'id': 0, 'mentions': [{'value': 'incendie', ...","[[12, IS_IN_CONTACT_WITH, 5], [0, IS_LOCATED_I..."
1131,« Je coule » : onze heures après avoir envoyé ...,"[{'id': 0, 'mentions': [{'value': 'renversé', ...","[[9, IS_LOCATED_IN, 2], [0, START_DATE, 17], [..."


In [72]:
# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Initialize the CamemBERT tokenizer
tokenizer = CamembertTokenizerFast.from_pretrained('camembert-base')

In [73]:
train_data["text"]

id
3703     Les travailleurs de la Centrale de Surveillanc...
11699    \nCe matin à Athènes, une séance de sensibilis...
2453     Le 18 avril 2005, les stagiaires de l'entrepri...
3726     Le matin du 6 mai 2013, le parvis de la basili...
2484     Un drame s'est produit cette nuit à Pretoria, ...
                               ...                        
2567     Ce matin, Jeanick Martin, présidente de l'asso...
51464    Durant l’inauguration d’une nouvelle station-s...
1223     M. Thompson Charlton, membre de l'association ...
1106     Une collision entre deux camions a eu lieu à S...
3721     Des milliers de personnes se sont retrouvées s...
Name: text, Length: 640, dtype: object

In [74]:
# Function to tokenize the text and align entities
def tokenize_and_align_entities(row):
    text = row['text']
    entities = row['entities']
    relations = row['relations']
    
    # Tokenize the text
    tokenized = tokenizer(text, padding='max_length', truncation=True, max_length=512) #return_offsets_mapping=True
    
    # Align entities with tokenized text
    aligned_entities = []
    for entity in entities:
        for mention in entity["mentions"]:
            start, end = mention['start'], mention['end']
            token_start = tokenized.char_to_token(start)
            token_end = tokenized.char_to_token(end - 1)
            if token_start is not None and token_end is not None:
                aligned_entities.append({
                    'start': token_start,
                    'end': token_end,
                    'type': entity['type']
                })
            
    # Align relations with tokenized text
    aligned_relations = []
    for relation in relations:
        head, tail = relation[0], relation[2]
        aligned_relations.append({
            'head': head,
            'tail': tail,
            'type': relation[1]
        })

    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'entities': aligned_entities,
        'relations': aligned_relations
    }

# Apply the tokenization and alignment function to the data
train_data = train_data.apply(tokenize_and_align_entities, axis=1)
val_data = val_data.apply(tokenize_and_align_entities, axis=1)

In [75]:
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    entities = [item['entities'] for item in batch]
    relations = [item['relations'] for item in batch]

    # Pad input_ids and attention_mask
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)

    # Pad entities and relations
    max_entities_len = max(len(e) for e in entities)
    max_relations_len = max(len(r) for r in relations)

    padded_entities = torch.zeros((len(entities), max_entities_len, 3), dtype=torch.long)
    padded_relations = torch.zeros((len(relations), max_relations_len, 3), dtype=torch.long)

    for i, (e, r) in enumerate(zip(entities, relations)):
        for j, entity in enumerate(e):
            padded_entities[i, j, 0] = entity['start']
            padded_entities[i, j, 1] = entity['end']
            padded_entities[i, j, 2] = tokenizer.convert_tokens_to_ids(entity['type'])

        for j, relation in enumerate(r):
            padded_relations[i, j, 0] = relation['head']
            padded_relations[i, j, 1] = tokenizer.convert_tokens_to_ids(relation['type'])
            padded_relations[i, j, 2] = relation['tail']

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'entities': padded_entities,
        'relations': padded_relations
    }


In [76]:
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]

train_dataset = TextDataset(train_data)
val_dataset = TextDataset(val_data)

train_dataset = RelationExtractionDataset(train_data)
val_dataset = RelationExtractionDataset(val_data)

In [77]:
ONTOLOGY_RELATIONS = [
    "HAS_CONTROL_OVER",
    "STARTED_IN",
    "IS_LOCATED_IN",
    "HAS_CATEGORY",
    "IS_PART_OF",
    "INJURED_NUMBER",
    "IS_OF_NATIONALITY",
    "OPERATES_IN",
    "INITIATED",
    "RESIDES_IN",
    "HAS_CONSEQUENCE",
    "IS_COOPERATING_WITH",
    "IS_IN_CONTACT_WITH",
    "IS_OF_SIZE",
    "HAS_QUANTITY",
    "HAS_FOR_LENGTH",
    "IS_BORN_IN",
    "WEIGHS",
    "HAS_FOR_WIDTH",
    "HAS_COLOR",
    "HAS_LATITUDE",
    "IS_REGISTERED_AS",
    "IS_AT_ODDS_WITH",
    "CREATED",
    "HAS_FAMILY_RELATIONSHIP",
    "DEATHS_NUMBER",
    "HAS_FOR_HEIGHT",
    "HAS_LONGITUDE",
    "IS_DEAD_ON",
    "START_DATE",
    "END_DATE",
    "WAS_CREATED_IN",
    "IS_BORN_ON",
    "WAS_DISSOLVED_IN",
    "DIED_IN",
    "GENDER_FEMALE",
    "GENDER_MALE",
]

In [78]:
# Define the model
config = CamembertConfig.from_pretrained('camembert-base', num_labels=len(ONTOLOGY_RELATIONS))
model = CamembertForSequenceClassification.from_pretrained('camembert-base', config=config)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
# Define the DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(3):  # Number of epochs
    model.train()
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['relations'].to(device)
        entities = batch['entities'].to(device)
        relations = batch['relations'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    for batch in tqdm(val_loader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['relations'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    print(f'Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_loader)}')

  input_ids = [torch.tensor(item['input_ids']) for item in batch]
  attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
  0%|          | 0/80 [00:11<?, ?it/s]


ValueError: Expected input batch_size (8) to match target batch_size (1176).

In [None]:
model.save_pretrained('../model/camembert-relation-extraction')
tokenizer.save_pretrained('../model/camembert-relation-extraction')

Notes:
Relation Types: Ensure you have a list of all possible relation types (relation_types) to map the predicted indices back to relation labels.

Entity Handling: The code assumes that entities are already identified and aligned. If not, you may need to add a Named Entity Recognition (NER) step before relation extraction.

Hyperparameters: Adjust hyperparameters like learning rate, batch size, and number of epochs based on your specific dataset and requirements.