## Imports

In [1]:
import os
import numpy as np
import pandas as pd
from gensim.models import FastText, KeyedVectors


In [2]:
# Directory holding the final NER datasets.
DATA_DIR = "../../data/ner_processed/final"

# One CSV per split; the three paths are derived from DATA_DIR in train / dev / test order.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    os.path.join(DATA_DIR, filename)
    for filename in ("emea_train.csv", "emea_dev.csv", "emea_test.csv")
)

# FastText model trained in TP1 (medical corpus).
FASTTEXT_MODEL_PATH = "../../embeddings/fasttext_medical_cbow.model"

# Embedding dimensionality used in this notebook.
EMBEDDING_DIM = 100


## Load NER datasets

In [3]:
# Read the three splits in train / dev / test order, one DataFrame per CSV.
train_df, dev_df, test_df = map(pd.read_csv, (TRAIN_PATH, DEV_PATH, TEST_PATH))

# Report (rows, columns) for every split.
print("Train size:", train_df.shape)
print("Dev size:", dev_df.shape)
print("Test size:", test_df.shape)

# Preview the first rows of the training split (rendered as the cell output).
train_df.head()


Train size: (706, 2)
Dev size: (649, 2)
Test size: (578, 2)


Unnamed: 0,review,label
0,PRIALT,1
1,EMEA / H / C / 551,0
2,Qu ’ est ce que Prialt ?,1
3,Prialt est une solution pour perfusion contena...,1
4,Dans quel cas Prialt est - il utilisé ?,1


## Load FastText model (medical)

In [4]:
# Load the gensim FastText model saved at FASTTEXT_MODEL_PATH and keep a
# handle on its word vectors, which the cells below query.
fasttext_model = FastText.load(FASTTEXT_MODEL_PATH)
ft = fasttext_model.wv

# Fail fast when the loaded vectors do not match the configured dimensionality,
# instead of discovering the mismatch later in a downstream cell.
assert ft.vector_size == EMBEDDING_DIM, (
    f"FastText vectors have dimension {ft.vector_size}, "
    f"expected EMBEDDING_DIM={EMBEDDING_DIM}"
)

print("FastText vocabulary size:", len(ft))
print("Embedding dimension:", ft.vector_size)


FastText vocabulary size: 9104
Embedding dimension: 100


## Vocabulary coverage analysis

In [5]:
def get_vocab_from_df(df, column="review"):
    """Collect the set of whitespace-separated tokens found in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one sentence per row.
    column : str, default "review"
        Name of the column that contains the sentences.

    Returns
    -------
    set of str
        Every distinct token produced by ``str.split()`` over the column.
        Missing cells (NaN / None) are skipped; non-string cells are
        converted with ``str()`` before splitting.
    """
    vocab = set()
    # dropna() avoids AttributeError on the float NaN that read_csv produces
    # for empty cells.
    for sent in df[column].dropna():
        vocab.update(str(sent).split())
    return vocab

train_vocab = get_vocab_from_df(train_df)

# `word in ft` is always True for gensim 4 FastText vectors, because a vector
# can be built for any word from its character n-grams. Membership is
# therefore tested against the trained vocabulary itself (key_to_index), so
# that out-of-vocabulary words are actually detected.
known_words = ft.key_to_index

covered = [w for w in train_vocab if w in known_words]
oov = [w for w in train_vocab if w not in known_words]

# Guard against an empty vocabulary to avoid a ZeroDivisionError.
coverage_ratio = len(covered) / len(train_vocab) if train_vocab else 0.0

print(f"NER vocabulary size: {len(train_vocab)}")
print(f"Covered words: {len(covered)}")
print(f"OOV words: {len(oov)}")
print(f"Coverage ratio: {coverage_ratio:.2%}")


NER vocabulary size: 2599
Covered words: 2599
OOV words: 0
Coverage ratio: 100.00%


## Inspect FastText semantic behavior

In [6]:
# Probe words whose nearest neighbours are inspected below.
medical_words = ["patient", "traitement", "maladie", "solution"]

for word in medical_words:
    print(f"\nMost similar words to '{word}':")
    # Five closest entries, as (word, similarity score) pairs.
    neighbours = ft.most_similar(word, topn=5)
    rows = [f"  {token:15s} {score:.3f}" for token, score in neighbours]
    # One neighbour per line; print nothing when the list is empty.
    if rows:
        print("\n".join(rows))



Most similar words to 'patient':
  Patient         0.999
  tremblements    0.999
  pansements      0.999
  patiente        0.999
  Tremblements    0.999

Most similar words to 'traitement':
  Traitement      1.000
  Taaitement      1.000
  Allaitement     0.999
  allaitement     0.999
  traitements     0.999

Most similar words to 'maladie':
  Maladie         1.000
  malade          1.000
  professionnelle 1.000
  professionnel   1.000
  hyrgathione     1.000

Most similar words to 'solution':
  Dissolution     1.000
  évolution       0.999
  dilution        0.999
  Solution        0.999
  Evolution       0.999


## Training command: medical corpus (CNN + FastText)

In [16]:
# Run the project's CNN classification script on the EMEA train/dev/test CSVs for 25 epochs.
# NOTE(review): the captured log reports "Validation set: (578, 128)" and
# "Test set: (649, 128)", while the dev CSV has 649 rows and the test CSV has
# 578 — the script may be swapping --valid/--test (or their labels); confirm
# in cnn_classification.py.
# NOTE(review): no embedding path is passed here; whether the script loads the
# FastText vectors by itself cannot be determined from this notebook — confirm.
!python ../../scripts/cnn_classification.py \
    --model cnn \
    --train ../../data/ner_processed/final/emea_train.csv \
    --valid ../../data/ner_processed/final/emea_dev.csv \
    --test ../../data/ner_processed/final/emea_test.csv \
    --epochs 25


loading files...
Merging files...
Building vocab...
Encoding reviews...
100%|█████████████████████████████████████| 706/706 [00:00<00:00, 365984.26it/s]
100%|█████████████████████████████████████| 578/578 [00:00<00:00, 440542.92it/s]
100%|█████████████████████████████████████| 649/649 [00:00<00:00, 452477.28it/s]
[OK] Vocabulary saved to ../../data/ner_processed/final/emea_train.csv_vocab.pkl
Vocabulary size: 4590
Feature Shapes:
Train set: (706, 128)
Validation set: (578, 128)
Test set: (649, 128)
Taille vocabulaire 4590
SentimentModelCNN(
  (embed): Embedding(4590, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=300, out_features=2, bias=True)
)
Epoch 1/25 | Train Loss: 50.008 Train Acc: 0.561 | Val Loss: 38.842 Val Acc: 0.880
Epoch 2/25 | Train Loss

## Training command: press corpus (CNN + FastText)

In [17]:
# Run the project's CNN classification script on the press train/dev/test CSVs for 25 epochs.
# NOTE(review): no embedding path is passed here; whether the script loads the
# FastText vectors by itself cannot be determined from this notebook — confirm.
# NOTE(review): check in cnn_classification.py that --valid and --test are
# assigned to the validation and test sets as labelled in the log — TODO confirm.
!python ../../scripts/cnn_classification.py \
    --model cnn \
    --train ../../data/ner_processed/final/press_train_final.csv \
    --valid ../../data/ner_processed/final/press_dev_final.csv \
    --test ../../data/ner_processed/final/press_test_final.csv \
    --epochs 25


loading files...
Merging files...
Building vocab...
Encoding reviews...
100%|█████████████████████████████████| 35723/35723 [00:00<00:00, 261649.15it/s]
100%|███████████████████████████████████| 2880/2880 [00:00<00:00, 269459.40it/s]
100%|███████████████████████████████████| 2825/2825 [00:00<00:00, 256120.63it/s]
[OK] Vocabulary saved to ../../data/ner_processed/final/press_train_final.csv_vocab.pkl
Vocabulary size: 32002
Feature Shapes:
Train set: (35723, 128)
Validation set: (2880, 128)
Test set: (2825, 128)
Taille vocabulaire 32002
SentimentModelCNN(
  (embed): Embedding(32002, 100)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=300, out_features=2, bias=True)
)
Epoch 1/25 | Train Loss: 66.826 Train Acc: 0.617 | Val Loss: 78.671 Val Acc: 0.661
Epoch 2/2