In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import (
    CamembertTokenizer, 
    CamembertForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
import torch
from torch.utils.data import Dataset


In [21]:

# Setup instructions:
# conda create -n city_extraction python=3.9
# conda activate city_extraction
# conda install pytorch torchvision torchaudio -c pytorch
# pip install transformers pandas scikit-learn datasets

# Load your data
df = pd.read_csv('fake_data.csv')


In [22]:

# Prepare data for token classification
class CityExtractionDataset(Dataset):
    def __init__(self, sentences, origins, destinations, tokenizer, max_length=128):
        self.sentences = sentences
        self.origins = origins
        self.destinations = destinations
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = str(self.sentences[idx])
        origin = str(self.origins[idx]) if pd.notna(self.origins[idx]) else ""
        destination = str(self.destinations[idx]) if pd.notna(self.destinations[idx]) else ""
        
        # Tokenize
        encoding = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_offsets_mapping=True
        )
        
        # Create labels: 0=O, 1=B-ORIGIN, 2=I-ORIGIN, 3=B-DEST, 4=I-DEST
        labels = [0] * self.max_length
        offset_mapping = encoding.pop('offset_mapping')
        
        # Tag origin city
        if origin and origin in sentence:
            origin_start = sentence.lower().find(origin.lower())
            if origin_start != -1:
                origin_end = origin_start + len(origin)
                for i, (start, end) in enumerate(offset_mapping):
                    if start >= origin_start and end <= origin_end and start != end:
                        labels[i] = 1 if start == origin_start or i == 0 or labels[i-1] == 0 else 2
        
        # Tag destination city
        if destination and destination in sentence:
            dest_start = sentence.lower().find(destination.lower())
            if dest_start != -1:
                dest_end = dest_start + len(destination)
                for i, (start, end) in enumerate(offset_mapping):
                    if start >= dest_start and end <= dest_end and start != end:
                        if labels[i] == 0:  # Don't override origin labels
                            labels[i] = 3 if start == dest_start or i == 0 or labels[i-1] not in [3, 4] else 4
        
        encoding['labels'] = labels
        return {key: torch.tensor(val) for key, val in encoding.items()}


In [23]:
# Initialize tokenizer and model (use Fast tokenizer)
from transformers import CamembertTokenizerFast

tokenizer = CamembertTokenizerFast.from_pretrained('camembert-base')
model = CamembertForTokenClassification.from_pretrained(
    'camembert-base',
    num_labels=5  # O, B-ORIGIN, I-ORIGIN, B-DEST, I-DEST
)


Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:

# Split data
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


In [25]:

# Create datasets
train_dataset = CityExtractionDataset(
    train_df['sentence'].values,
    train_df['ville_origine'].values,
    train_df['ville_arrivee'].values,
    tokenizer
)

val_dataset = CityExtractionDataset(
    val_df['sentence'].values,
    val_df['ville_origine'].values,
    val_df['ville_arrivee'].values,
    tokenizer
)


In [26]:

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_pin_memory=False
)


In [27]:

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)


In [28]:

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)


In [29]:

# Train
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0136,0.011521
2,0.0043,0.003584
3,0.0028,0.002481


TrainOutput(global_step=3000, training_loss=0.0714250594060868, metrics={'train_runtime': 319.449, 'train_samples_per_second': 150.259, 'train_steps_per_second': 9.391, 'total_flos': 3135646126080000.0, 'train_loss': 0.0714250594060868, 'epoch': 3.0})

In [30]:

# Save model
model.save_pretrained('./city_extraction_model')
tokenizer.save_pretrained('./city_extraction_model')


('./city_extraction_model/tokenizer_config.json',
 './city_extraction_model/special_tokens_map.json',
 './city_extraction_model/sentencepiece.bpe.model',
 './city_extraction_model/added_tokens.json',
 './city_extraction_model/tokenizer.json')

In [31]:
# Prediction function
def extract_cities(sentence, model, tokenizer):
    # Get the device the model is on
    device = next(model.parameters()).device
    
    # Tokenize
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    
    # Move all inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    
    origin_tokens = []
    dest_tokens = []
    
    for token, pred in zip(tokens, predictions[0]):
        if pred == 1 or pred == 2:  # ORIGIN
            if token not in ['<s>', '</s>', '<pad>']:
                origin_tokens.append(token.replace('▁', ''))
        elif pred == 3 or pred == 4:  # DEST
            if token not in ['<s>', '</s>', '<pad>']:
                dest_tokens.append(token.replace('▁', ''))
    
    origin = ''.join(origin_tokens).strip()
    destination = ''.join(dest_tokens).strip()
    
    return origin, destination

In [None]:

# Test
test_sentence = "Je veux aller à Paris en partant de Lyon demain"
origin, destination = extract_cities(test_sentence, model, tokenizer)
print(f"Origine: {origin}")
print(f"Destination: {destination}")

Origine: Lyon
Destination: Paris
