# Travel Order Resolver — Notebook
This notebook implements an intent classifier + NER (Departure/Destination) for the **Travel Order Resolver** project.
It uses **CamemBERT** (transfer learning) via Hugging Face Transformers.

**What is included**
- Installation of dependencies
- Loading the dataset (from Google Drive or from uploaded files)
- Preprocessing and tokenization
- Fine-tuning (intent classification) and token-classification (NER)
- Inference pipeline that reads `sentenceID,sentence` and writes outputs in the required format

**Important notes**
- The GPU notebook is optimized to run on Google Colab with a GPU runtime.
- The CPU notebook is lighter and intended for local execution without a GPU (slower).

---


## GPU notebook (Colab)

In [None]:
# Install required libraries (GPU / Colab)
!pip install -q transformers datasets evaluate seqeval accelerate tokenizers sacrebleu
print("Dependencies installed.")

In [None]:
# Mount Google Drive (optional) to load dataset from Drive
# google.colab only exists inside a Colab runtime; guard the import so this
# optional cell does not abort a run in any other environment.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')
# Example path in Drive: /content/drive/MyDrive/nlp_miniprojects/train_set.csv

In [None]:

# Load dataset (adjust paths as needed)
import os
import pandas as pd

# Default paths (if you uploaded files to Colab /content/)
train_csv = "/content/train_set.csv"
test_csv = "/content/test_set.csv"

# If using Drive, change the two paths above to the Drive folder
# (example shown in the previous cell).
# Stop here, naming the missing file(s), rather than printing a hint and letting
# a later cell fail with a NameError on train_df / test_df.
missing_csvs = [path for path in (train_csv, test_csv) if not os.path.exists(path)]
if missing_csvs:
    raise FileNotFoundError(
        "Train or test CSV not found: " + ", ".join(missing_csvs)
        + ". If using Drive, set the paths to your Drive folder."
    )

train_df = pd.read_csv(train_csv, encoding="utf-8")
test_df = pd.read_csv(test_csv, encoding="utf-8")
print("Train shape:", train_df.shape, "Test shape:", test_df.shape)

In [None]:

# Inspect and parse 'entities' JSON field (if present)
import json
def parse_entities_field(row):
    """Return the entity spans of *row* that lie inside its text.

    Args:
        row: Mapping-like record (dict or pandas row) with an 'entities' field
            holding a JSON-encoded list of span dicts and a 'text' field.

    Returns:
        The span dicts whose 'start'/'end' satisfy
        ``0 <= start < end <= len(text)``. Unparseable JSON, a missing
        'entities' field, JSON that is not a list, entries that are not dicts,
        and spans with missing or non-comparable offsets all contribute
        nothing. A 'text' value that is not a string is treated as empty.
    """
    try:
        ents = json.loads(row['entities'])
    except (KeyError, TypeError, ValueError):
        # Missing field, non-string value (e.g. NaN) or malformed JSON.
        ents = []
    if not isinstance(ents, list) or not ents:
        return []

    text = row['text']
    text_len = len(text) if isinstance(text, str) else 0

    valid = []
    for ent in ents:
        if not isinstance(ent, dict):
            continue
        try:
            in_bounds = 0 <= ent['start'] < ent['end'] <= text_len
        except (KeyError, TypeError):
            # Offsets missing or not comparable with integers.
            continue
        if in_bounds:
            valid.append(ent)
    return valid

# Attach the validated entity spans to both splits as a 'parsed_entities' column.
for split_df in (train_df, test_df):
    split_df['parsed_entities'] = split_df.apply(parse_entities_field, axis=1)
print('Parsed entities (example):')
train_df.head(3)

In [None]:

# Prepare Hugging Face datasets for intent classification
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer

# Map intent names to integer ids; the encoder is fitted on the training intents only.
label_encoder = LabelEncoder().fit(train_df['intent'].unique())
num_labels = len(label_encoder.classes_)
print("Intent classes:", list(label_encoder.classes_))

# Keep only the sentence and its intent; the intent column is renamed to 'label'.
intent_columns = ['text', 'intent']
label_rename = {'intent': 'label'}
hf_train = Dataset.from_pandas(train_df[intent_columns].rename(columns=label_rename))
hf_test = Dataset.from_pandas(test_df[intent_columns].rename(columns=label_rename))

def encode_label(example):
    """Replace the intent name stored under 'label' with its integer id.

    The id is looked up with the module-level ``label_encoder``; the example
    is modified in place and returned.
    """
    (encoded,) = label_encoder.transform([example['label']])
    example['label'] = int(encoded)
    return example

# Turn the string intents of both splits into integer ids.
hf_train, hf_test = [split.map(encode_label) for split in (hf_train, hf_test)]

# Checkpoint name, reused by later cells when building models and tokenizers.
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_classification(examples):
    """Tokenize the 'text' entries of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to exactly 128 tokens.
    """
    texts = examples['text']
    encoded = tokenizer(texts, max_length=128, padding='max_length', truncation=True)
    return encoded

# Tokenize in batches, drop the raw text column, and rename 'label' to 'labels'.
map_options = {'batched': True, 'remove_columns': ['text']}
hf_train = hf_train.map(tokenize_classification, **map_options).rename_column("label", "labels")
hf_test = hf_test.map(tokenize_classification, **map_options).rename_column("label", "labels")

# Return torch tensors when the datasets are indexed.
for intent_split in (hf_train, hf_test):
    intent_split.set_format(type="torch")

print(hf_train[0])

In [None]:

# Fine-tune CamemBERT for Intent Classification (GPU-optimized args)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate, numpy as np

# Store the intent names in the model config so the saved checkpoint (and any
# pipeline built from it) reports real intents instead of LABEL_0, LABEL_1, ...
intent_id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
intent_label2id = {name: idx for idx, name in intent_id2label.items()}
model_cls = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=intent_id2label,
    label2id=intent_label2id,
)
# F1 metric used by compute_metrics_intent.
metric = evaluate.load("f1")

def compute_metrics_intent(eval_pred):
    """Compute accuracy and macro-F1 from a (logits, labels) pair.

    The predicted class is the argmax over the last axis of the logits; F1 is
    computed by the module-level ``metric`` with ``average="macro"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    f1_result = metric.compute(predictions=predicted, references=labels, average="macro")
    return {
        "accuracy": (predicted == labels).mean(),
        "f1_macro": f1_result['f1'],
    }

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and Trainer's `tokenizer` argument to `processing_class`. The install cell
# does not pin a version, so use whichever name the installed release accepts.
import inspect

_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

training_args = TrainingArguments(
    output_dir="./camembert-intent",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    **{_eval_strategy_key: "epoch"},
)

# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is
# selected on test data; consider holding out a validation split instead.
trainer = Trainer(
    model=model_cls,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_test,
    compute_metrics=compute_metrics_intent,
    **{_tokenizer_key: tokenizer},
)

trainer.train()
trainer.save_model("./camembert-intent-best")

In [None]:

# Prepare NER dataset (token-classification) - convert spans to BIO using fast tokenizer offsets
from transformers import AutoTokenizer
# Fast tokenizer: spans_to_bio needs its character offsets.
tokenizer_fast = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# BIO tag set; list position is the integer id of each tag.
ner_labels = ['O', 'B-Departure', 'I-Departure', 'B-Destination', 'I-Destination']
id2label = dict(enumerate(ner_labels))
label2id = {tag: idx for idx, tag in id2label.items()}

def spans_to_bio(text, spans):
    """Tokenize *text* and derive one BIO label id per token from character spans.

    Args:
        text: Sentence to tokenize with the module-level ``tokenizer_fast``
            (truncated to 128 tokens).
        spans: Iterable of dicts with 'start'/'end' character offsets and a
            'label' such that 'B-<label>' and 'I-<label>' are keys of
            ``label2id``. Spans with a missing or unknown label are skipped.

    Returns:
        ``(encoding, label_ids)``: the tokenizer output (including
        'offset_mapping') and a list with one int per token. Tokens whose
        offsets are (0, 0) (the special tokens) get -100, the index the
        token-classification loss ignores, so they are not trained or scored
        as 'O'. All other tokens get the id of their BIO tag.
    """
    ignore_id = -100
    outside_id = label2id['O']

    encoding = tokenizer_fast(text, return_offsets_mapping=True, truncation=True, max_length=128)
    offsets = encoding['offset_mapping']
    label_ids = [ignore_id if a == b == 0 else outside_id for a, b in offsets]

    for ent in spans:
        begin_id = label2id.get('B-' + str(ent.get('label')))
        inside_id = label2id.get('I-' + str(ent.get('label')))
        if begin_id is None or inside_id is None:
            continue
        st, ed = ent['start'], ent['end']
        # A token belongs to the entity when its character range overlaps [st, ed).
        token_indices = [
            i for i, (a, b) in enumerate(offsets)
            if not (a == b == 0) and a < ed and b > st
        ]
        if not token_indices:
            continue
        label_ids[token_indices[0]] = begin_id
        for idx in token_indices[1:]:
            label_ids[idx] = inside_id
    return encoding, label_ids

# Build HF datasets (train/test)
from datasets import Dataset
def _ner_rows_for(frame):
    """Return one {'text', 'labels'} dict per row of *frame*, labelled via spans_to_bio."""
    rows = []
    for _, record in frame.iterrows():
        sentence = record['text']
        # Entity spans are only used for rows whose intent is 'TRIP'.
        entity_spans = record['parsed_entities'] if record['intent'] == 'TRIP' else []
        _, token_labels = spans_to_bio(sentence, entity_spans)
        rows.append({'text': sentence, 'labels': token_labels})
    return rows


ner_rows = _ner_rows_for(train_df)
ner_train_ds = Dataset.from_list(ner_rows)

ner_rows_test = _ner_rows_for(test_df)
ner_test_ds = Dataset.from_list(ner_rows_test)
print('NER datasets prepared (counts):', len(ner_train_ds), len(ner_test_ds))

In [None]:

# Fine-tune CamemBERT for NER (GPU-optimized)
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np, seqeval.metrics as seq_metrics

# Token-classification head sized to the BIO tag set; the id<->tag maps are
# passed in so they are stored with the model.
model_ner = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(ner_labels), id2label=id2label, label2id=label2id)
# Collator used by the NER Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer_fast)

# Tokenize texts and align labels (we'll create input_ids and attention_mask)
def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts to a fixed length and pass the labels through.

    The texts are truncated and padded to 128 tokens with the module-level
    ``tokenizer_fast``. 'labels' is returned unchanged: it was already
    computed per token position and keeps its unpadded length here
    (presumably the data collator pads it at batch time — confirm).

    Returns:
        Dict with 'input_ids' and 'attention_mask' as lists of ints per
        example, plus the incoming 'labels'.
    """
    # Without return_tensors the tokenizer already returns plain Python lists,
    # so no tensor round-trip is needed.
    tokenized_inputs = tokenizer_fast(examples['text'], truncation=True, padding='max_length', max_length=128)
    return {
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': examples['labels'],
    }

# Tokenize both NER splits in batches.
ner_train_tok = ner_train_ds.map(tokenize_and_align_labels, batched=True)
ner_test_tok = ner_test_ds.map(tokenize_and_align_labels, batched=True)

# Expose only the model inputs, as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'labels']
for tokenized_split in (ner_train_tok, ner_test_tok):
    tokenized_split.set_format(type='torch', columns=model_columns)

def align_preds(predictions, label_ids, label_map=None, ignore_index=-100):
    """Convert logits and label ids into tag sequences, dropping ignored positions.

    Args:
        predictions: Array-like of logits; the class is the argmax over the
            last axis.
        label_ids: Array-like of gold label ids, one row per sequence.
        label_map: Mapping from label id to tag string. Defaults to the
            module-level ``id2label``.
        ignore_index: Label id marking positions to skip (padding and other
            masked tokens). These positions are removed from both outputs,
            which also avoids a KeyError when looking them up in the map.

    Returns:
        ``(preds_list, labels_list)``: two lists of tag-string lists, aligned
        position by position.
    """
    if label_map is None:
        label_map = id2label
    pred_ids = np.argmax(predictions, axis=-1)
    preds_list, labels_list = [], []
    for pred_row, label_row in zip(pred_ids, label_ids):
        kept = [(int(p), int(l)) for p, l in zip(pred_row, label_row) if l != ignore_index]
        preds_list.append([label_map[p] for p, _ in kept])
        labels_list.append([label_map[l] for _, l in kept])
    return preds_list, labels_list

def compute_metrics_ner(p):
    """Compute seqeval precision, recall and F1 from a (predictions, labels) pair.

    Both members are converted to tag sequences with ``align_preds`` first.
    """
    predictions, references = p
    predicted_tags, gold_tags = align_preds(predictions, references)
    scorers = {
        'precision': seq_metrics.precision_score,
        'recall': seq_metrics.recall_score,
        'f1': seq_metrics.f1_score,
    }
    return {name: scorer(gold_tags, predicted_tags) for name, scorer in scorers.items()}

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and Trainer's `tokenizer` argument to `processing_class`. The install cell
# does not pin a version, so use whichever name the installed release accepts.
import inspect

_ner_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_ner_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _ner_training_arg_names else 'evaluation_strategy'
)
_ner_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_ner_tokenizer_key = (
    'processing_class' if 'processing_class' in _ner_trainer_arg_names else 'tokenizer'
)

training_args = TrainingArguments(
    output_dir='./camembert-ner',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_strategy='epoch',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    **{_ner_eval_strategy_key: 'epoch'},
)

# NOTE(review): eval_dataset is the test split, so the "best" checkpoint is
# selected on test data; consider holding out a validation split instead.
trainer_ner = Trainer(
    model=model_ner,
    args=training_args,
    train_dataset=ner_train_tok,
    eval_dataset=ner_test_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics_ner,
    **{_ner_tokenizer_key: tokenizer_fast},
)

trainer_ner.train()
trainer_ner.save_model('./camembert-ner-best')

In [None]:

# Inference pipeline (load saved models and run on new lines)
from transformers import pipeline, AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoTokenizer

# Directories written by the two training cells.
intent_model_dir = './camembert-intent-best'
ner_model_dir = './camembert-ner-best'

# Intent classifier and its tokenizer
intent_tokenizer = AutoTokenizer.from_pretrained(intent_model_dir)
intent_model = AutoModelForSequenceClassification.from_pretrained(intent_model_dir)

# NER tagger and its fast tokenizer
ner_tokenizer = AutoTokenizer.from_pretrained(ner_model_dir, use_fast=True)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_dir)

# Ready-made pipelines; the NER one groups tokens with the 'simple' aggregation strategy.
intent_pipe = pipeline('text-classification', model=intent_model, tokenizer=intent_tokenizer)
ner_pipe = pipeline('token-classification', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy='simple')

# Example inference function
def predict_line(sentenceID, sentence):
    """Classify the intent of *sentence* and extract its entities.

    Args:
        sentenceID: Identifier of the sentence; returned unchanged so the
            caller can match the result to its input line.
        sentence: Text to analyse.

    Returns:
        Dict with 'sentenceID', 'intent_id' (argmax index of the intent
        logits) and 'entities' (the raw output of ``ner_pipe``). The intent id
        and NER result are also printed, as before.
    """
    import torch  # local import: torch is not imported at module level in this notebook

    # intent
    inputs = intent_tokenizer(sentence, return_tensors='pt', truncation=True, max_length=128)
    # The model may have been moved to another device (e.g. by a pipeline), so
    # put the inputs wherever the model currently lives.
    inputs = inputs.to(intent_model.device)
    with torch.no_grad():  # inference only: no autograd graph needed
        logits = intent_model(**inputs).logits.detach().cpu().numpy()[0]
    pred_id = int(logits.argmax())
    print('Pred intent id:', pred_id)
    # NER
    ner_res = ner_pipe(sentence)
    print('NER result:', ner_res)
    return {'sentenceID': sentenceID, 'intent_id': pred_id, 'entities': ner_res}

# Example usage
# Runs one sample French sentence through predict_line and prints its return value.
print(predict_line('1', 'Je voudrais un billet Toulouse Paris.'))


# Final notes
- The GPU notebook is for Colab with a GPU runtime (recommended). Use Runtime -> Change runtime type -> GPU.
- The CPU notebook is slower; training on a CPU may take a long time.
- Save your models to Google Drive if you want to keep them across sessions (example path: /content/drive/MyDrive/nlp_miniprojects/).
- Save the tokenizer and label mappings alongside the model for correct inference later.
