In [20]:
!pip install seqeval



In [21]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from gensim.models import Word2Vec
from seqeval.metrics import classification_report, f1_score
from tqdm import tqdm
import os

### `CNN Classification`

In [22]:
# --- 1. CONFIGURATION ---
SEQUENCE_LENGTH = 128
EMBEDDING_DIM = 100
BATCH_SIZE = 64
EPOCHS = 150
LEARNING_RATE = 0.003
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Paths
W2V_PATH = "w2v_med_cbow.model"

# CSV Files (Use lists to combine MEDLINE + EMEA)
TRAIN_FILES = [
    "TP_ISD2020/QUAERO_FrenchMed/MEDLINE/MEDLINEtrain_layer1_ID.csv",
    "TP_ISD2020/QUAERO_FrenchMed/EMEA/EMEAtrain_layer1_ID.csv",
]
VALID_FILES = [
    "TP_ISD2020/QUAERO_FrenchMed/MEDLINE/MEDLINEdev_layer1_ID.csv",
    "TP_ISD2020/QUAERO_FrenchMed/EMEA/EMEAdev_layer1_ID.csv",
]
TEST_FILES = [
    "TP_ISD2020/QUAERO_FrenchMed/MEDLINE/MEDLINEtest_layer1_ID.csv",
    "TP_ISD2020/QUAERO_FrenchMed/EMEA/EMEAtest_layer1_ID.csv",
]

In [24]:
def load_data_from_csv(file_paths):
    all_sentences = []
    all_tags = []

    for fpath in file_paths:
        if not os.path.exists(fpath):
            print(f"❌ File not found: {fpath}")
            continue

        print(f"Loading {os.path.basename(fpath)}...", end=" ")

        # Read CSV with auto-separator detection
        try:
            df = pd.read_csv(
                fpath,
                sep=None,
                engine="python",
                keep_default_na=False,
                skip_blank_lines=False,
            )
        except:
            print("Read failed.")
            continue

        # Extract Words and Tags
        if "Mot" in df.columns and "Tag" in df.columns:
            words = df["Mot"].astype(str).values
            tags = df["Tag"].astype(str).values
        else:
            words = df.iloc[:, 0].astype(str).values
            tags = df.iloc[:, -1].astype(str).values

        # Group into sentences
        curr_s, curr_t = [], []
        file_s, file_t = [], []

        for w, t in zip(words, tags):
            if not w.strip():  # Empty line = Sentence Break
                if curr_s:
                    file_s.append(curr_s)
                    file_t.append(curr_t)
                    curr_s, curr_t = [], []
            else:
                curr_s.append(w)
                curr_t.append(t)
        if curr_s:
            file_s.append(curr_s)
            file_t.append(curr_t)

        # Fallback: If chunking failed (1 giant sentence), force split
        if len(file_s) < 10 and len(words) > 500:
            print("[Chunking Fallback]", end=" ")
            flat_w = [w for s in file_s for w in s]
            flat_t = [t for s in file_t for t in s]
            file_s = [
                flat_w[i : i + SEQUENCE_LENGTH]
                for i in range(0, len(flat_w), SEQUENCE_LENGTH)
            ]
            file_t = [
                flat_t[i : i + SEQUENCE_LENGTH]
                for i in range(0, len(flat_t), SEQUENCE_LENGTH)
            ]

        print(f"-> {len(file_s)} sentences.")
        all_sentences.extend(file_s)
        all_tags.extend(file_t)

    return all_sentences, all_tags


print("--- 1. LOADING TEXT DATA ---")
train_sents, train_tags = load_data_from_csv(TRAIN_FILES)
valid_sents, valid_tags = load_data_from_csv(VALID_FILES)
test_sents, test_tags = load_data_from_csv(TEST_FILES)

print(f"Train Size: {len(train_sents)}")
print(f"Test Size:  {len(test_sents)} (Should be > 1000)")

--- 1. LOADING TEXT DATA ---
Loading MEDLINEtrain_layer1_ID.csv... [Chunking Fallback] -> 91 sentences.
Loading EMEAtrain_layer1_ID.csv... [Chunking Fallback] -> 120 sentences.
Loading MEDLINEdev_layer1_ID.csv... [Chunking Fallback] -> 90 sentences.
Loading EMEAdev_layer1_ID.csv... [Chunking Fallback] -> 106 sentences.
Loading MEDLINEtest_layer1_ID.csv... [Chunking Fallback] -> 94 sentences.
Loading EMEAtest_layer1_ID.csv... [Chunking Fallback] -> 97 sentences.
Train Size: 211
Test Size:  191 (Should be > 1000)


In [25]:
print("\n--- 2. VECTORIZING FEATURES ---")
w2v_model = Word2Vec.load(W2V_PATH)


def vectorize(sentences, model, max_len=128, dim=100):
    X = np.zeros((len(sentences), max_len, dim), dtype=np.float32)
    for i, sent in enumerate(sentences):
        length = min(len(sent), max_len)
        for j in range(length):
            word = sent[j]
            # Try Exact match -> Lowercase match -> Zero
            if word in model.wv:
                X[i, j] = model.wv[word]
            elif word.lower() in model.wv:
                X[i, j] = model.wv[word.lower()]
    return torch.tensor(X)


X_train = vectorize(train_sents, w2v_model)
X_valid = vectorize(valid_sents, w2v_model)
X_test = vectorize(test_sents, w2v_model)

# --- 4. ENCODING LABELS (Y) ---
print("--- 3. ENCODING LABELS ---")
tag_set = set(t for s in train_tags + valid_tags + test_tags for t in s)
tag2idx = {t: i + 1 for i, t in enumerate(sorted(list(tag_set)))}
tag2idx["<PAD>"] = 0
idx2tag = {v: k for k, v in tag2idx.items()}
print(f"Tags: {tag2idx}")


def encode_labels(labels, mapping, max_len=128):
    Y = []
    for s in labels:
        seq = [mapping.get(t, 0) for t in s]
        if len(seq) < max_len:
            seq += [0] * (max_len - len(seq))
        else:
            seq = seq[:max_len]
        Y.append(seq)
    return torch.tensor(Y, dtype=torch.long)


y_train = encode_labels(train_tags, tag2idx)
y_valid = encode_labels(valid_tags, tag2idx)
y_test = encode_labels(test_tags, tag2idx)

# DataLoaders
train_loader = DataLoader(
    TensorDataset(X_train, y_train), shuffle=True, batch_size=BATCH_SIZE
)
valid_loader = DataLoader(
    TensorDataset(X_valid, y_valid), shuffle=False, batch_size=BATCH_SIZE
)
test_loader = DataLoader(
    TensorDataset(X_test, y_test), shuffle=False, batch_size=BATCH_SIZE
)


--- 2. VECTORIZING FEATURES ---
--- 3. ENCODING LABELS ---
Tags: {'B-ANAT': 1, 'B-CHEM': 2, 'B-DEVI': 3, 'B-DISO': 4, 'B-GEOG': 5, 'B-LIVB': 6, 'B-OBJC': 7, 'B-PHEN': 8, 'B-PHYS': 9, 'B-PROC': 10, 'I-ANAT': 11, 'I-CHEM': 12, 'I-DEVI': 13, 'I-DISO': 14, 'I-GEOG': 15, 'I-LIVB': 16, 'I-OBJC': 17, 'I-PHEN': 18, 'I-PHYS': 19, 'I-PROC': 20, 'O': 21, '<PAD>': 0}


In [None]:
# --- 5. CNN MODEL ---
class CNN_NER(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNN_NER, self).__init__()
        self.conv1 = nn.Conv1d(input_dim, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        out = F.relu(self.conv1(x))
        out = self.dropout(out)
        out = F.relu(self.conv2(out))
        out = self.dropout(out)
        out = out.permute(0, 2, 1) 
        return self.fc(out)


model = CNN_NER(EMBEDDING_DIM, len(tag2idx)).to(DEVICE)

# Class Weights to handle "O" dominance
weights = torch.ones(len(tag2idx)).to(DEVICE)
if "O" in tag2idx:
    weights[tag2idx["O"]] = 0.5
criterion = nn.CrossEntropyLoss(weight=weights, ignore_index=0)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [27]:
# --- 6. TRAINING ---
print(f"\n--- 4. TRAINING ON {DEVICE} ---")
best_f1 = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out.view(-1, len(tag2idx)), y.view(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    all_true, all_pred = [], []
    with torch.no_grad():
        for x, y in valid_loader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            out = model(x)
            preds = torch.argmax(out, dim=2).cpu().numpy()
            labels = y.cpu().numpy()

            for i in range(len(x)):
                p_s, t_s = [], []
                for j in range(SEQUENCE_LENGTH):
                    if labels[i][j] == 0:
                        break
                    p_s.append(idx2tag[preds[i][j]])
                    t_s.append(idx2tag[labels[i][j]])
                all_pred.append(p_s)
                all_true.append(t_s)

    val_f1 = f1_score(all_true, all_pred)
    print(f"Loss: {train_loss/len(train_loader):.4f} | Val F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), "best_ner_cnn.pt")


--- 4. TRAINING ON cpu ---


Epoch 1: 100%|██████████| 4/4 [00:00<00:00,  6.32it/s]


Loss: 2.6544 | Val F1: 0.0000


Epoch 2: 100%|██████████| 4/4 [00:00<00:00,  6.76it/s]


Loss: 1.9641 | Val F1: 0.0394


Epoch 3: 100%|██████████| 4/4 [00:00<00:00,  5.82it/s]


Loss: 1.7628 | Val F1: 0.0857


Epoch 4: 100%|██████████| 4/4 [00:00<00:00,  6.53it/s]


Loss: 1.6147 | Val F1: 0.1300


Epoch 5: 100%|██████████| 4/4 [00:00<00:00,  6.43it/s]


Loss: 1.5609 | Val F1: 0.1657


Epoch 6: 100%|██████████| 4/4 [00:00<00:00,  6.91it/s]


Loss: 1.4890 | Val F1: 0.1637


Epoch 7: 100%|██████████| 4/4 [00:00<00:00,  6.88it/s]


Loss: 1.4067 | Val F1: 0.1895


Epoch 8: 100%|██████████| 4/4 [00:00<00:00,  6.86it/s]


Loss: 1.3409 | Val F1: 0.1896


Epoch 9: 100%|██████████| 4/4 [00:00<00:00,  6.81it/s]


Loss: 1.3286 | Val F1: 0.2267


Epoch 10: 100%|██████████| 4/4 [00:00<00:00,  6.79it/s]


Loss: 1.3123 | Val F1: 0.2342


Epoch 11: 100%|██████████| 4/4 [00:00<00:00,  6.77it/s]


Loss: 1.2652 | Val F1: 0.2421


Epoch 12: 100%|██████████| 4/4 [00:00<00:00,  6.84it/s]


Loss: 1.2856 | Val F1: 0.2491


Epoch 13: 100%|██████████| 4/4 [00:00<00:00,  6.85it/s]


Loss: 1.2654 | Val F1: 0.2651


Epoch 14: 100%|██████████| 4/4 [00:00<00:00,  6.85it/s]


Loss: 1.2112 | Val F1: 0.2851


Epoch 15: 100%|██████████| 4/4 [00:00<00:00,  6.42it/s]


Loss: 1.2025 | Val F1: 0.2898


Epoch 16: 100%|██████████| 4/4 [00:00<00:00,  6.61it/s]


Loss: 1.2081 | Val F1: 0.3017


Epoch 17: 100%|██████████| 4/4 [00:00<00:00,  6.76it/s]


Loss: 1.1825 | Val F1: 0.2967


Epoch 18: 100%|██████████| 4/4 [00:00<00:00,  6.75it/s]


Loss: 1.1832 | Val F1: 0.3119


Epoch 19: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 1.1623 | Val F1: 0.3140


Epoch 20: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 1.1479 | Val F1: 0.3274


Epoch 21: 100%|██████████| 4/4 [00:00<00:00,  6.90it/s]


Loss: 1.1486 | Val F1: 0.3343


Epoch 22: 100%|██████████| 4/4 [00:00<00:00,  6.89it/s]


Loss: 1.1254 | Val F1: 0.3362


Epoch 23: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 1.1235 | Val F1: 0.3448


Epoch 24: 100%|██████████| 4/4 [00:00<00:00,  6.77it/s]


Loss: 1.0961 | Val F1: 0.3461


Epoch 25: 100%|██████████| 4/4 [00:00<00:00,  6.79it/s]


Loss: 1.0787 | Val F1: 0.3451


Epoch 26: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 1.0995 | Val F1: 0.3445


Epoch 27: 100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


Loss: 1.0794 | Val F1: 0.3494


Epoch 28: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 1.0564 | Val F1: 0.3488


Epoch 29: 100%|██████████| 4/4 [00:00<00:00,  6.71it/s]


Loss: 1.0626 | Val F1: 0.3483


Epoch 30: 100%|██████████| 4/4 [00:00<00:00,  6.75it/s]


Loss: 1.0607 | Val F1: 0.3510


Epoch 31: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 1.0434 | Val F1: 0.3595


Epoch 32: 100%|██████████| 4/4 [00:00<00:00,  5.60it/s]


Loss: 1.0529 | Val F1: 0.3472


Epoch 33: 100%|██████████| 4/4 [00:00<00:00,  6.55it/s]


Loss: 1.0565 | Val F1: 0.3594


Epoch 34: 100%|██████████| 4/4 [00:00<00:00,  6.76it/s]


Loss: 1.0325 | Val F1: 0.3565


Epoch 35: 100%|██████████| 4/4 [00:00<00:00,  6.75it/s]


Loss: 1.0258 | Val F1: 0.3558


Epoch 36: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 1.0287 | Val F1: 0.3566


Epoch 37: 100%|██████████| 4/4 [00:00<00:00,  5.86it/s]


Loss: 1.0050 | Val F1: 0.3589


Epoch 38: 100%|██████████| 4/4 [00:00<00:00,  6.74it/s]


Loss: 0.9927 | Val F1: 0.3606


Epoch 39: 100%|██████████| 4/4 [00:00<00:00,  6.54it/s]


Loss: 1.0090 | Val F1: 0.3585


Epoch 40: 100%|██████████| 4/4 [00:00<00:00,  6.57it/s]


Loss: 1.0367 | Val F1: 0.3668


Epoch 41: 100%|██████████| 4/4 [00:00<00:00,  6.63it/s]


Loss: 1.0093 | Val F1: 0.3592


Epoch 42: 100%|██████████| 4/4 [00:00<00:00,  6.58it/s]


Loss: 0.9902 | Val F1: 0.3746


Epoch 43: 100%|██████████| 4/4 [00:00<00:00,  6.65it/s]


Loss: 0.9737 | Val F1: 0.3574


Epoch 44: 100%|██████████| 4/4 [00:00<00:00,  6.70it/s]


Loss: 0.9595 | Val F1: 0.3835


Epoch 45: 100%|██████████| 4/4 [00:00<00:00,  4.75it/s]


Loss: 0.9728 | Val F1: 0.3717


Epoch 46: 100%|██████████| 4/4 [00:00<00:00,  5.28it/s]


Loss: 0.9822 | Val F1: 0.3761


Epoch 47: 100%|██████████| 4/4 [00:00<00:00,  6.65it/s]


Loss: 0.9794 | Val F1: 0.3705


Epoch 48: 100%|██████████| 4/4 [00:00<00:00,  6.60it/s]


Loss: 0.9517 | Val F1: 0.3714


Epoch 49: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.9582 | Val F1: 0.3707


Epoch 50: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.9692 | Val F1: 0.3711


Epoch 51: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.9728 | Val F1: 0.3760


Epoch 52: 100%|██████████| 4/4 [00:00<00:00,  6.75it/s]


Loss: 0.9713 | Val F1: 0.3694


Epoch 53: 100%|██████████| 4/4 [00:00<00:00,  6.75it/s]


Loss: 0.9368 | Val F1: 0.3783


Epoch 54: 100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


Loss: 0.9651 | Val F1: 0.3595


Epoch 55: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 0.9610 | Val F1: 0.3714


Epoch 56: 100%|██████████| 4/4 [00:00<00:00,  6.59it/s]


Loss: 0.9554 | Val F1: 0.3715


Epoch 57: 100%|██████████| 4/4 [00:00<00:00,  6.77it/s]


Loss: 0.9224 | Val F1: 0.3804


Epoch 58: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 0.9561 | Val F1: 0.3727


Epoch 59: 100%|██████████| 4/4 [00:00<00:00,  6.57it/s]


Loss: 0.9412 | Val F1: 0.3848


Epoch 60: 100%|██████████| 4/4 [00:00<00:00,  6.61it/s]


Loss: 0.9304 | Val F1: 0.3750


Epoch 61: 100%|██████████| 4/4 [00:00<00:00,  6.42it/s]


Loss: 0.9220 | Val F1: 0.3747


Epoch 62: 100%|██████████| 4/4 [00:00<00:00,  4.92it/s]


Loss: 0.9429 | Val F1: 0.3832


Epoch 63: 100%|██████████| 4/4 [00:00<00:00,  5.01it/s]


Loss: 0.9360 | Val F1: 0.3931


Epoch 64: 100%|██████████| 4/4 [00:00<00:00,  5.45it/s]


Loss: 0.9313 | Val F1: 0.3713


Epoch 65: 100%|██████████| 4/4 [00:00<00:00,  6.38it/s]


Loss: 0.9348 | Val F1: 0.3890


Epoch 66: 100%|██████████| 4/4 [00:00<00:00,  6.60it/s]


Loss: 0.8964 | Val F1: 0.3780


Epoch 67: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 0.9253 | Val F1: 0.3784


Epoch 68: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 0.8994 | Val F1: 0.3848


Epoch 69: 100%|██████████| 4/4 [00:00<00:00,  6.59it/s]


Loss: 0.9114 | Val F1: 0.3690


Epoch 70: 100%|██████████| 4/4 [00:00<00:00,  6.65it/s]


Loss: 0.9150 | Val F1: 0.3886


Epoch 71: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 0.9270 | Val F1: 0.3748


Epoch 72: 100%|██████████| 4/4 [00:00<00:00,  6.34it/s]


Loss: 0.9312 | Val F1: 0.3793


Epoch 73: 100%|██████████| 4/4 [00:00<00:00,  6.58it/s]


Loss: 0.9053 | Val F1: 0.3905


Epoch 74: 100%|██████████| 4/4 [00:00<00:00,  6.63it/s]


Loss: 0.8999 | Val F1: 0.3823


Epoch 75: 100%|██████████| 4/4 [00:00<00:00,  5.78it/s]


Loss: 0.8765 | Val F1: 0.3915


Epoch 76: 100%|██████████| 4/4 [00:00<00:00,  6.46it/s]


Loss: 0.8916 | Val F1: 0.3821


Epoch 77: 100%|██████████| 4/4 [00:00<00:00,  5.55it/s]


Loss: 0.9005 | Val F1: 0.3892


Epoch 78: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 0.8582 | Val F1: 0.3859


Epoch 79: 100%|██████████| 4/4 [00:00<00:00,  6.46it/s]


Loss: 0.8809 | Val F1: 0.3788


Epoch 80: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 0.9286 | Val F1: 0.3886


Epoch 81: 100%|██████████| 4/4 [00:00<00:00,  6.57it/s]


Loss: 0.8690 | Val F1: 0.3688


Epoch 82: 100%|██████████| 4/4 [00:00<00:00,  6.65it/s]


Loss: 0.9007 | Val F1: 0.3844


Epoch 83: 100%|██████████| 4/4 [00:00<00:00,  6.67it/s]


Loss: 0.8732 | Val F1: 0.3908


Epoch 84: 100%|██████████| 4/4 [00:00<00:00,  6.59it/s]


Loss: 0.9025 | Val F1: 0.3769


Epoch 85: 100%|██████████| 4/4 [00:00<00:00,  6.59it/s]


Loss: 0.8915 | Val F1: 0.3897


Epoch 86: 100%|██████████| 4/4 [00:00<00:00,  6.44it/s]


Loss: 0.8553 | Val F1: 0.3923


Epoch 87: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.8728 | Val F1: 0.3917


Epoch 88: 100%|██████████| 4/4 [00:00<00:00,  6.49it/s]


Loss: 0.8871 | Val F1: 0.3820


Epoch 89: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 0.8936 | Val F1: 0.3848


Epoch 90: 100%|██████████| 4/4 [00:00<00:00,  6.58it/s]


Loss: 0.9027 | Val F1: 0.3879


Epoch 91: 100%|██████████| 4/4 [00:00<00:00,  6.59it/s]


Loss: 0.8714 | Val F1: 0.3873


Epoch 92: 100%|██████████| 4/4 [00:00<00:00,  6.14it/s]


Loss: 0.8552 | Val F1: 0.3857


Epoch 93: 100%|██████████| 4/4 [00:00<00:00,  6.65it/s]


Loss: 0.8601 | Val F1: 0.3895


Epoch 94: 100%|██████████| 4/4 [00:00<00:00,  6.58it/s]


Loss: 0.8536 | Val F1: 0.3882


Epoch 95: 100%|██████████| 4/4 [00:00<00:00,  6.54it/s]


Loss: 0.8413 | Val F1: 0.3934


Epoch 96: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 0.8670 | Val F1: 0.3790


Epoch 97: 100%|██████████| 4/4 [00:00<00:00,  5.34it/s]


Loss: 0.8559 | Val F1: 0.3923


Epoch 98: 100%|██████████| 4/4 [00:00<00:00,  6.49it/s]


Loss: 0.8495 | Val F1: 0.3870


Epoch 99: 100%|██████████| 4/4 [00:00<00:00,  6.68it/s]


Loss: 0.8441 | Val F1: 0.3968


Epoch 100: 100%|██████████| 4/4 [00:00<00:00,  6.57it/s]


Loss: 0.8679 | Val F1: 0.3759


Epoch 101: 100%|██████████| 4/4 [00:00<00:00,  6.51it/s]


Loss: 0.8607 | Val F1: 0.3912


Epoch 102: 100%|██████████| 4/4 [00:00<00:00,  6.62it/s]


Loss: 0.8672 | Val F1: 0.3791


Epoch 103: 100%|██████████| 4/4 [00:00<00:00,  6.60it/s]


Loss: 0.8374 | Val F1: 0.3888


Epoch 104: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 0.8702 | Val F1: 0.3895


Epoch 105: 100%|██████████| 4/4 [00:00<00:00,  6.76it/s]


Loss: 0.8633 | Val F1: 0.3941


Epoch 106: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 0.8438 | Val F1: 0.3973


Epoch 107: 100%|██████████| 4/4 [00:00<00:00,  6.72it/s]


Loss: 0.8286 | Val F1: 0.3906


Epoch 108: 100%|██████████| 4/4 [00:00<00:00,  6.50it/s]


Loss: 0.8201 | Val F1: 0.3941


Epoch 109: 100%|██████████| 4/4 [00:00<00:00,  6.61it/s]


Loss: 0.8804 | Val F1: 0.3887


Epoch 110: 100%|██████████| 4/4 [00:00<00:00,  6.74it/s]


Loss: 0.8319 | Val F1: 0.3870


Epoch 111: 100%|██████████| 4/4 [00:00<00:00,  6.77it/s]


Loss: 0.8317 | Val F1: 0.3899


Epoch 112: 100%|██████████| 4/4 [00:00<00:00,  6.41it/s]


Loss: 0.8343 | Val F1: 0.3905


Epoch 113: 100%|██████████| 4/4 [00:00<00:00,  6.71it/s]


Loss: 0.8393 | Val F1: 0.3920


Epoch 114: 100%|██████████| 4/4 [00:00<00:00,  6.55it/s]


Loss: 0.8410 | Val F1: 0.3929


Epoch 115: 100%|██████████| 4/4 [00:00<00:00,  6.57it/s]


Loss: 0.8127 | Val F1: 0.3935


Epoch 116: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 0.8267 | Val F1: 0.3988


Epoch 117: 100%|██████████| 4/4 [00:00<00:00,  6.48it/s]


Loss: 0.8420 | Val F1: 0.3907


Epoch 118: 100%|██████████| 4/4 [00:00<00:00,  5.88it/s]


Loss: 0.8125 | Val F1: 0.3854


Epoch 119: 100%|██████████| 4/4 [00:00<00:00,  6.68it/s]


Loss: 0.8382 | Val F1: 0.4003


Epoch 120: 100%|██████████| 4/4 [00:00<00:00,  6.73it/s]


Loss: 0.8118 | Val F1: 0.3871


Epoch 121: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.8225 | Val F1: 0.3928


Epoch 122: 100%|██████████| 4/4 [00:00<00:00,  6.63it/s]


Loss: 0.8242 | Val F1: 0.3887


Epoch 123: 100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


Loss: 0.8359 | Val F1: 0.3958


Epoch 124: 100%|██████████| 4/4 [00:00<00:00,  5.83it/s]


Loss: 0.8352 | Val F1: 0.3902


Epoch 125: 100%|██████████| 4/4 [00:00<00:00,  6.56it/s]


Loss: 0.8403 | Val F1: 0.3966


Epoch 126: 100%|██████████| 4/4 [00:00<00:00,  6.44it/s]


Loss: 0.8096 | Val F1: 0.3952


Epoch 127: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 0.8306 | Val F1: 0.3853


Epoch 128: 100%|██████████| 4/4 [00:00<00:00,  6.70it/s]


Loss: 0.8250 | Val F1: 0.3942


Epoch 129: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.8096 | Val F1: 0.3925


Epoch 130: 100%|██████████| 4/4 [00:00<00:00,  6.69it/s]


Loss: 0.8410 | Val F1: 0.3941


Epoch 131: 100%|██████████| 4/4 [00:00<00:00,  6.53it/s]


Loss: 0.8208 | Val F1: 0.3894


Epoch 132: 100%|██████████| 4/4 [00:00<00:00,  6.74it/s]


Loss: 0.8032 | Val F1: 0.3941


Epoch 133: 100%|██████████| 4/4 [00:00<00:00,  6.30it/s]


Loss: 0.8275 | Val F1: 0.3922


Epoch 134: 100%|██████████| 4/4 [00:00<00:00,  6.60it/s]


Loss: 0.8244 | Val F1: 0.3893


Epoch 135: 100%|██████████| 4/4 [00:00<00:00,  5.89it/s]


Loss: 0.7979 | Val F1: 0.3962


Epoch 136: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.7987 | Val F1: 0.3947


Epoch 137: 100%|██████████| 4/4 [00:00<00:00,  6.74it/s]


Loss: 0.8014 | Val F1: 0.3916


Epoch 138: 100%|██████████| 4/4 [00:00<00:00,  6.71it/s]


Loss: 0.7856 | Val F1: 0.4006


Epoch 139: 100%|██████████| 4/4 [00:00<00:00,  6.24it/s]


Loss: 0.7901 | Val F1: 0.3908


Epoch 140: 100%|██████████| 4/4 [00:00<00:00,  6.27it/s]


Loss: 0.7924 | Val F1: 0.3951


Epoch 141: 100%|██████████| 4/4 [00:00<00:00,  6.19it/s]


Loss: 0.7940 | Val F1: 0.3832


Epoch 142: 100%|██████████| 4/4 [00:00<00:00,  5.13it/s]


Loss: 0.8161 | Val F1: 0.3949


Epoch 143: 100%|██████████| 4/4 [00:00<00:00,  6.39it/s]


Loss: 0.7828 | Val F1: 0.3906


Epoch 144: 100%|██████████| 4/4 [00:00<00:00,  6.36it/s]


Loss: 0.7885 | Val F1: 0.3948


Epoch 145: 100%|██████████| 4/4 [00:00<00:00,  6.66it/s]


Loss: 0.7896 | Val F1: 0.3920


Epoch 146: 100%|██████████| 4/4 [00:00<00:00,  6.61it/s]


Loss: 0.8033 | Val F1: 0.3917


Epoch 147: 100%|██████████| 4/4 [00:00<00:00,  6.63it/s]


Loss: 0.7853 | Val F1: 0.4040


Epoch 148: 100%|██████████| 4/4 [00:00<00:00,  6.37it/s]


Loss: 0.8056 | Val F1: 0.3960


Epoch 149: 100%|██████████| 4/4 [00:00<00:00,  6.24it/s]


Loss: 0.8062 | Val F1: 0.3949


Epoch 150: 100%|██████████| 4/4 [00:00<00:00,  6.64it/s]


Loss: 0.7762 | Val F1: 0.3988


In [28]:
# --- 7. FINAL TEST ---
print("\n--- FINAL TEST EVALUATION ---")
model.load_state_dict(torch.load("best_ner_cnn.pt"))
model.eval()
test_true, test_pred = [], []

with torch.no_grad():
    for x, y in test_loader:
        x = x.to(DEVICE)
        out = model(x)
        preds = torch.argmax(out, dim=2).cpu().numpy()
        labels = y.numpy()
        for i in range(len(x)):
            p_s, t_s = [], []
            for j in range(SEQUENCE_LENGTH):
                if labels[i][j] == 0:
                    break
                p_s.append(idx2tag[preds[i][j]])
                t_s.append(idx2tag[labels[i][j]])
            test_pred.append(p_s)
            test_true.append(t_s)

print(classification_report(test_true, test_pred))


--- FINAL TEST EVALUATION ---
              precision    recall  f1-score   support

        ANAT       0.27      0.11      0.16       364
        CHEM       0.29      0.22      0.25      1037
        DEVI       0.00      0.00      0.00       107
        DISO       0.22      0.31      0.26       977
        GEOG       0.33      0.02      0.03        63
        LIVB       0.70      0.53      0.61       498
        OBJC       0.38      0.07      0.12        81
        PHEN       0.00      0.00      0.00        70
        PHYS       0.45      0.16      0.24       190
        PROC       0.48      0.38      0.42       761

   micro avg       0.34      0.28      0.31      4148
   macro avg       0.31      0.18      0.21      4148
weighted avg       0.35      0.28      0.30      4148



### LSTM