# TP2 – Preparing NER Data
 
**Lab:** TP2 Named Entity Recognition  
**Notebook:** 01 Data Preparation  

This notebook prepares the medical NER datasets for the CNN/LSTM and Transformer
models provided by the instructor.

The original datasets are in a CoNLL-like format and will be converted into
CSV files compatible with the provided scripts.


## Imports and Dataset Paths


In [17]:
import ast
import os

import pandas as pd
from tqdm import tqdm


In [36]:

# Root folder of the French medical NER corpus (relative path).
BASE_DATA_DIR = "../../data/ner_corpus/FrenchMed"

# Output directory for the processed CSV files; created if it is missing.
OUTPUT_DIR = "../../data/ner_processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# EMEA corpus: one file per split, in train / dev / test order.
EMEA_TRAIN, EMEA_DEV, EMEA_TEST = (
    os.path.join(BASE_DATA_DIR, relative_name)
    for relative_name in (
        "EMEA/EMEAtrain_layer1_ID.conll",
        "EMEA/EMEAdev_layer1_ID.conll",
        "EMEA/EMEAtest_layer1_ID.conll",
    )
)

# Report whether each split file can be found from the current working directory.
for message, split_path in (
    ("Train exists:", EMEA_TRAIN),
    ("Dev exists:", EMEA_DEV),
    ("Test exists:", EMEA_TEST),
):
    print(message, os.path.exists(split_path))


Train exists: True
Dev exists: True
Test exists: True


In [37]:
# Re-check that every EMEA split file is reachable before parsing.
for message, split_path in (
    ("Train exists:", EMEA_TRAIN),
    ("Dev exists:", EMEA_DEV),
    ("Test exists:", EMEA_TEST),
):
    print(message, os.path.exists(split_path))


Train exists: True
Dev exists: True
Test exists: True


## Reading CONLL Files


In [38]:
def read_conll_file(filepath):
    """Read a CoNLL-like file and group its rows into sentences.

    Every non-blank line is split on whitespace. The second field is used as
    the token and the last field as its label. A blank (or whitespace-only)
    line closes the current sentence.

    Lines with fewer than two fields have no token column and are skipped
    instead of raising ``IndexError``.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        A list of ``(tokens, labels)`` tuples, one per sentence, where both
        elements are lists of strings of equal length.
    """
    sentences = []
    tokens, labels = [], []

    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if not fields:
                # Sentence boundary; consecutive blank lines add nothing.
                if tokens:
                    sentences.append((tokens, labels))
                    tokens, labels = [], []
                continue

            if len(fields) < 2:
                # No token column on this row: skip it rather than crash.
                continue

            tokens.append(fields[1])
            labels.append(fields[-1])

    # The file may end without a trailing blank line.
    if tokens:
        sentences.append((tokens, labels))

    return sentences


## Converting Sentences to a DataFrame


In [39]:
def sentences_to_dataframe(sentences):
    """Build a sentence-level DataFrame from ``(tokens, labels)`` pairs.

    Args:
        sentences: Iterable of ``(tokens, labels)`` tuples, where ``tokens``
            is a list of strings and ``labels`` the matching list of tags.

    Returns:
        A DataFrame with two columns: ``review`` (tokens joined by a single
        space) and ``label`` (the list of labels, stored as-is).
    """
    records = [
        {"review": " ".join(tokens), "label": labels}
        for tokens, labels in sentences
    ]
    # Naming the columns explicitly keeps the schema stable when there are no
    # sentences; otherwise the result would have no columns at all.
    return pd.DataFrame(records, columns=["review", "label"])


## Processing One Dataset Split


In [40]:
def process_and_save(conll_path, output_csv):
    """Convert one CoNLL-like file to CSV and print how many sentences were saved.

    Args:
        conll_path: Path of the input file, parsed with ``read_conll_file``.
        output_csv: Destination path of the CSV file (written without the index).
    """
    frame = sentences_to_dataframe(read_conll_file(conll_path))
    frame.to_csv(output_csv, index=False)
    print(f"Saved {len(frame)} sentences to {output_csv}")


## Generating CSV Files for NER


In [41]:
# Convert each EMEA split in train / dev / test order; CSVs go to OUTPUT_DIR.
for conll_path, csv_name in (
    (EMEA_TRAIN, "emea_train.csv"),
    (EMEA_DEV, "emea_dev.csv"),
    (EMEA_TEST, "emea_test.csv"),
):
    process_and_save(conll_path, os.path.join(OUTPUT_DIR, csv_name))


Saved 706 sentences to ../../data/ner_processed/emea_train.csv
Saved 649 sentences to ../../data/ner_processed/emea_dev.csv
Saved 578 sentences to ../../data/ner_processed/emea_test.csv


## Inspecting the Prepared Data


In [42]:
# Reload the exported CSV files to inspect what was actually written.
# Note: the "label" column comes back as text (e.g. "['B-CHEM']"), not as a list.
train_df, dev_df, test_df = (
    pd.read_csv(os.path.join(OUTPUT_DIR, csv_name))
    for csv_name in ("emea_train.csv", "emea_dev.csv", "emea_test.csv")
)

train_df.head()


Unnamed: 0,review,label
0,PRIALT,['B-CHEM']
1,EMEA / H / C / 551,"['O', 'O', 'O', 'O', 'O', 'O', 'O']"
2,Qu ’ est ce que Prialt ?,"['O', 'O', 'O', 'O', 'O', 'B-CHEM', 'O']"
3,Prialt est une solution pour perfusion contena...,"['B-CHEM', 'O', 'O', 'B-CHEM', 'O', 'B-PROC', ..."
4,Dans quel cas Prialt est - il utilisé ?,"['O', 'O', 'O', 'B-CHEM', 'O', 'O', 'O', 'O', ..."


## Sanity Checks


In [43]:
print("Number of training sentences:", len(train_df))

# Show the text and the stored labels of one example row.
example_idx = 0
for heading, column in (
    ("\nExample sentence:", "review"),
    ("\nNER labels:", "label"),
):
    print(heading)
    print(train_df.iloc[example_idx][column])


Number of training sentences: 706

Example sentence:
PRIALT

NER labels:
['B-CHEM']


## Conclusion (EMEA Corpus)

In this notebook, we:
- Loaded the medical NER corpus in CONLL format
- Grouped tokens into sentences
- Preserved BIO NER labels
- Exported clean CSV files compatible with:
  - cnn_classification.py
  - transformer.py

These processed datasets will be used directly in the next notebooks
without modifying the instructor-provided scripts.


## Repeating the Pipeline for the French Press Dataset

In [106]:
# Root folder of the French press NER corpus (relative path).
BASE_PRESS_DIR = "../../data/ner_corpus/FrenchPress"

# One file per split.
PRESS_TRAIN = os.path.join(BASE_PRESS_DIR, "fra4_ID.train")
PRESS_DEV   = os.path.join(BASE_PRESS_DIR, "fra4_ID.dev")
PRESS_TEST  = os.path.join(BASE_PRESS_DIR, "fra4_ID.test")

# Dedicated name for the press output folder. Rebinding OUTPUT_DIR here would
# make the earlier EMEA cells read and write in the wrong directory when they
# are re-run after this cell.
PRESS_OUTPUT_DIR = "../../data/ner_processed/final"
os.makedirs(PRESS_OUTPUT_DIR, exist_ok=True)



In [107]:
import os
import pandas as pd

def read_conll_press(path):
    """Read a press-corpus CoNLL-like file into a sentence-level DataFrame.

    Every non-blank line is split on whitespace. Rows with fewer than five
    fields are skipped. For the remaining rows the second field is the token
    and the last field, upper-cased, is the NER label. A blank line closes
    the current sentence.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A DataFrame with columns ``review`` (tokens joined by a single space)
        and ``label`` (list of upper-cased labels), one row per sentence.
        The two columns are present even when no sentence was read.
    """
    records = []
    tokens, labels = [], []

    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.split()

            if not fields:  # end of sentence
                if tokens:
                    records.append({"review": " ".join(tokens), "label": labels})
                    tokens, labels = [], []
                continue

            # NOTE(review): the 5-field minimum presumably matches the corpus
            # column layout; confirm against the data files.
            if len(fields) < 5:
                continue

            tokens.append(fields[1])
            labels.append(fields[-1].upper())  # normalize label casing

    # The file may end without a trailing blank line.
    if tokens:
        records.append({"review": " ".join(tokens), "label": labels})

    # Explicit columns keep the schema stable when no sentence was read.
    return pd.DataFrame(records, columns=["review", "label"])


## Generating CSV Files for the Press Dataset

In [108]:
BASE = "../../data/ner_corpus/FrenchPress"
OUT  = "../../data/ner_processed/final"
os.makedirs(OUT, exist_ok=True)

# Split name -> corpus file name inside BASE.
splits = {"train": "fra4_ID.train", "dev": "fra4_ID.dev", "test": "fra4_ID.test"}

# Parse each split and write it to OUT as press_<split>.csv.
for split in splits:
    df = read_conll_press(os.path.join(BASE, splits[split]))
    csv_path = os.path.join(OUT, f"press_{split}.csv")
    df.to_csv(csv_path, index=False)
    print(f"{split}: {len(df)} sentences saved → {csv_path}")


train: 35723 sentences saved → ../../data/ner_processed/final/press_train.csv
dev: 2825 sentences saved → ../../data/ner_processed/final/press_dev.csv
test: 2880 sentences saved → ../../data/ner_processed/final/press_test.csv


In [109]:
df = pd.read_csv("../../data/ner_processed/final/press_train.csv")

# Check token/label alignment on one sentence.
i = 0
tokens = df.iloc[i]["review"].split()
print(tokens)
print(df.iloc[i]["label"])

# The label column is stored as the text form of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would
# execute arbitrary code read from the CSV file.
labels = ast.literal_eval(df.iloc[i]["label"])
print(len(tokens), len(labels))

assert len(tokens) == len(labels), "token/label count mismatch"


['Patricia', 'Martin', ',', 'que', 'voici', ',', 'que', 'voilà', '!', 'oh', ',', 'bonjour', 'Nicolas', 'Stoufflet', '.']
['B-PERS', 'I-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'I-PERS', 'O']
15 15


In [95]:
# Confirm that each exported press CSV is present on disk.
for message, csv_path in (
    ("Press train exists:", "../../data/ner_processed/final/press_train.csv"),
    ("Press dev exists:", "../../data/ner_processed/final/press_dev.csv"),
    ("Press test exists:", "../../data/ner_processed/final/press_test.csv"),
):
    print(message, os.path.exists(csv_path))



Press train exists: True
Press dev exists: True
Press test exists: True
