In [1]:
from mednlpix.data.loader import get_medical_dataset
from mednlpix.data.splitter import split_medical_dataset
from mednlpix.features.vectorization import TFIDFVectorizerWrapper, Word2VecVectorizer
from mednlpix.features.preprocessing.spacy_preprocessor import MedicalTextPreprocessor

In [2]:
print("\n" + "="*60)
print("Load Medical Dataset")
print("="*60 + "\n")

data, dataset_path = get_medical_dataset(return_path=True)

print(f"Loaded dataset: {dataset_path.name}, shape = {data.shape}")
display(data.head())

2025-10-08 01:02:00,148 - mednlpix.data.loader - INFO - Searching for dataset in: /Users/surelmanda/Data-Science-Projects/MedNLPix/data/raw
2025-10-08 01:02:00,150 - mednlpix.data.loader - INFO - Dataset found: medical-abstracts-dataset.csv
2025-10-08 01:02:00,152 - mednlpix.data.loader - INFO - Loading dataset from: /Users/surelmanda/Data-Science-Projects/MedNLPix/data/raw/medical-abstracts-dataset.csv
2025-10-08 01:02:00,189 - mednlpix.data.loader - INFO - Dataset loaded successfully — the input data has 2286 rows and 2 columns.
2025-10-08 01:02:00,190 - mednlpix.data.loader - INFO - Dataset successfully retrieved and loaded.



Load Medical Dataset

Loaded dataset: medical-abstracts-dataset.csv, shape = (2286, 2)


Unnamed: 0,condition_label,medical_abstract
0,cardiovascular diseases,Coronary artery bypass grafting in a patient w...
1,cardiovascular diseases,Conservative management of aortic lacerations ...
2,cardiovascular diseases,Sudden death from acute cocaine intoxication i...
3,cardiovascular diseases,Catheterization of coronary artery bypass graf...
4,cardiovascular diseases,Spontaneous rupture of an aortic aneurysm into...


In [3]:
# ==============================================
# Split Medical Dataset into train/valid/test
# ==============================================
print("\n" + "=" * 60)
print("Split Medical Dataset into train/valid/test")
print("=" * 60 + "\n")

# Perform the split
train_size, valid_size, test_size = 0.7, 0.15, 0.15
train_df, valid_df, test_df = split_medical_dataset(data,train_size, valid_size, test_size)

# Display summary information
print(f"Training set: {train_df.shape[0]} rows × {train_df.shape[1]} columns")
display(train_df.head(3))

print(f"Validation set: {valid_df.shape[0]} rows × {valid_df.shape[1]} columns")
display(valid_df.head(3))

print(f"Test set: {test_df.shape[0]} rows × {test_df.shape[1]} columns")
display(test_df.head(3))


2025-10-08 01:02:00,291 - mednlpix.data.splitter - INFO - Split - Train: (1600, 2), Valid: (343, 2), Test: (343, 2)



Split Medical Dataset into train/valid/test

Training set: 1600 rows × 2 columns


Unnamed: 0,condition_label,medical_abstract
300,cardiovascular diseases,Recurrent intracaval renal cell carcinoma: the...
771,general pathological conditions,Anterior cervical discectomy and fusion. A ret...
1085,general pathological conditions,Stiffman syndrome: a rare paraneoplastic disor...


Validation set: 343 rows × 2 columns


Unnamed: 0,condition_label,medical_abstract
1270,neoplasms,Primary prevention of cancer. The case for com...
1264,general pathological conditions,Spontaneous rupture of a normal bladder. We ha...
1933,nervous system diseases,Orbital intramuscular schwannoma. In an 8-year...


Test set: 343 rows × 2 columns


Unnamed: 0,condition_label,medical_abstract
785,general pathological conditions,Urinary tract infections. Urinary tract infect...
436,digestive system diseases,Endoscopic appearance and significance of func...
1831,neoplasms,Flexible sigmoidoscopy as a screening procedur...


In [4]:
# ==============================================
# Apply Text Preprocessing with spaCy
# ==============================================
print("\n" + "=" * 60)
print("Preprocess Medical Abstracts with spaCy")
print("=" * 60 + "\n")

# Initialize the spaCy preprocessor
preprocessor = MedicalTextPreprocessor(model_name="en_core_web_sm")

# Apply preprocessing separately to each subset
train_df = preprocessor.apply(train_df.head(), text_column="medical_abstract")
#valid_df = preprocessor.apply(valid_df, text_column="medical_abstract")
#test_df = preprocessor.apply(test_df, text_column="medical_abstract")

# Display result
display(train_df.head())



2025-10-08 01:02:00,936 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Loading spaCy model: en_core_web_sm



Preprocess Medical Abstracts with spaCy



2025-10-08 01:02:01,350 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - spaCy model loaded successfully.
2025-10-08 01:02:01,351 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Preprocessing column 'medical_abstract' using spaCy...
2025-10-08 01:02:01,411 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Preprocessing completed. Average token count: 41.60


Unnamed: 0,condition_label,medical_abstract,cleaned_medical_abstract
300,cardiovascular diseases,Recurrent intracaval renal cell carcinoma: the...,recurrent intracaval renal cell carcinoma role...
771,general pathological conditions,Anterior cervical discectomy and fusion. A ret...,anterior cervical discectomy fusion retrospect...
1085,general pathological conditions,Stiffman syndrome: a rare paraneoplastic disor...,stiffman syndrome rare paraneoplastic disorder...
1033,general pathological conditions,Adult respiratory distress syndrome after limi...,adult respiratory distress syndrome limited re...
1803,neoplasms,Bowenoid papulosis. Bowenoid papulosis is an u...,bowenoid papulosis bowenoid papulosis uncommon...


In [11]:
# src/mednlpix/training/train_logreg.py
from mednlpix.pipelines.preprocessing_pipeline import PreprocessingPipeline
from mednlpix.pipelines.registry_manager import update_registry
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
import joblib
from pathlib import Path
import datetime

# ==========================================================
# 1. Data overview
# ==========================================================
print(f"Training set: {train_df.shape[0]} rows × {train_df.shape[1]} columns")
display(train_df.head(3))

print(f"Validation set: {valid_df.shape[0]} rows × {valid_df.shape[1]} columns")
display(valid_df.head(3))

# ==========================================================
# 2. Preprocessing (TF-IDF)
# ==========================================================
tfidf_pipe = PreprocessingPipeline(method="tfidf", max_features=3000)

print("\n--- Fitting TF-IDF preprocessing pipeline ---")
X_train_tfidf = tfidf_pipe.fit_transform(train_df)
X_valid_tfidf = tfidf_pipe.transform(valid_df)

# Save preprocessing pipeline
preproc_path = tfidf_pipe.save()

# ==========================================================
# 3. Model definition and training
# ==========================================================
random_state = 42
model_lr = LogisticRegression(
    C=1.0,
    solver="lbfgs",
    class_weight="balanced",
    max_iter=3000,
    random_state=42
)


print("\n--- Training Logistic Regression model ---")
y_train = train_df["condition_label"]
model_lr.fit(X_train_tfidf, y_train)
print("Training completed successfully.")

# ==========================================================
# 4. Evaluation on validation set
# ==========================================================
print("\n--- Evaluating model on validation set ---")
y_valid = valid_df["condition_label"]
y_pred = model_lr.predict(X_valid_tfidf)

print("\nClassification report:")
print(classification_report(y_valid, y_pred))

f1_macro = f1_score(y_valid, y_pred, average="macro")
print(f"Validation F1 (macro): {f1_macro:.4f}")

# ==========================================================
# 5. Save model and update registry
# ==========================================================
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_dir = Path("models/artifacts/")
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / f"logreg_tfidf_{timestamp}.joblib"

joblib.dump(model_lr, model_path)
print(f"\nModel saved successfully: {model_path}")

# Update registry
update_registry(str(preproc_path), model_type="preprocessor_tfidf")
update_registry(str(model_path), model_type="model_tfidf")

print("Registry updated successfully.")


Training set: 1600 rows × 3 columns


Unnamed: 0,condition_label,medical_abstract,cleaned_medical_abstract
300,cardiovascular diseases,Recurrent intracaval renal cell carcinoma: the...,recurrent intracaval renal cell carcinoma role...
771,general pathological conditions,Anterior cervical discectomy and fusion. A ret...,anterior cervical discectomy fusion retrospect...
1085,general pathological conditions,Stiffman syndrome: a rare paraneoplastic disor...,stiffman syndrome rare paraneoplastic disorder...


Validation set: 343 rows × 3 columns


Unnamed: 0,condition_label,medical_abstract,cleaned_medical_abstract
1270,neoplasms,Primary prevention of cancer. The case for com...,primary prevention cancer case comprehensive s...
1264,general pathological conditions,Spontaneous rupture of a normal bladder. We ha...,spontaneous rupture normal bladder report case...
1933,nervous system diseases,Orbital intramuscular schwannoma. In an 8-year...,orbital intramuscular schwannoma year old girl...


2025-10-08 00:30:20,409 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Loading spaCy model: en_core_web_sm
2025-10-08 00:30:20,744 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - spaCy model loaded successfully.
2025-10-08 00:30:20,745 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - TFIDFVectorizerWrapper initialized with params: {'max_features': 3000, 'ngram_range': (1, 2), 'sublinear_tf': True, 'stop_words': 'english'}
2025-10-08 00:30:20,746 - preprocessing_pipeline - INFO - Initialized TF-IDF pipeline with parameters: {'max_features': 3000}
2025-10-08 00:30:20,750 - preprocessing_pipeline - INFO - Starting preprocessing training (tfidf) on 1600 samples.
2025-10-08 00:30:20,751 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Preprocessing column 'medical_abstract' using spaCy...



--- Fitting TF-IDF preprocessing pipeline ---


2025-10-08 00:30:32,006 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Done. Avg. token count: 47.76
2025-10-08 00:30:32,008 - preprocessing_pipeline - INFO - Text cleaning and lemmatization completed. Starting vectorization.
2025-10-08 00:30:32,009 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - Fitting and transforming 1600 documents with TF-IDF...
2025-10-08 00:30:32,122 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - TF-IDF fit_transform complete. Output shape: (1600, 3000)
2025-10-08 00:30:32,122 - preprocessing_pipeline - INFO - Vectorization completed (tfidf) — feature matrix shape: (1600, 3000).
2025-10-08 00:30:32,123 - preprocessing_pipeline - INFO - Applying preprocessing pipeline (tfidf) to new dataset (343 rows).
2025-10-08 00:30:32,123 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Preprocessing column 'medical_abstract' using spaCy...
2025-10-08 00:30:34,209 - mednlpix.features.preprocessing.spacy_preprocessor - INFO 


--- Training Logistic Regression model ---
Training completed successfully.

--- Evaluating model on validation set ---

Classification report:
                                 precision    recall  f1-score   support

        cardiovascular diseases       0.63      0.60      0.62        63
      digestive system diseases       0.50      0.54      0.52        52
general pathological conditions       0.37      0.27      0.31        86
                      neoplasms       0.64      0.71      0.68        76
        nervous system diseases       0.51      0.62      0.56        66

                       accuracy                           0.54       343
                      macro avg       0.53      0.55      0.54       343
                   weighted avg       0.52      0.54      0.53       343

Validation F1 (macro): 0.5364

Model saved successfully: models/artifacts/logreg_tfidf_20251008_003034.joblib
Registry updated for preprocessor_tfidf: models/artifacts/preprocessors/preprocessor_

In [None]:
# src/mednlpix/training/train_logreg.py
import joblib
import datetime
from pathlib import Path
from mednlpix.inference.preprocessing_pipeline import PreprocessingPipeline
from mednlpix.inference.registry_manager import update_registry
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score


# ==========================================================
# 1. Data overview
# ==========================================================
print(f"Training set: {train_df.shape[0]} rows × {train_df.shape[1]} columns")
display(train_df.head(3))

print(f"Validation set: {valid_df.shape[0]} rows × {valid_df.shape[1]} columns")
display(valid_df.head(3))

# ==========================================================
# 2. Preprocessing (TF-IDF)
# ==========================================================
tfidf_pipe = PreprocessingPipeline(method="tfidf", max_features=3000)

print("\n--- Fitting TF-IDF preprocessing pipeline ---")
X_train_tfidf = tfidf_pipe.fit_transform(train_df)
X_valid_tfidf = tfidf_pipe.transform(valid_df)

# Save preprocessing pipeline
preproc_path = tfidf_pipe.save()

# ==========================================================
# 3. Model definition
# ==========================================================
random_state = 42
model_lr = LogisticRegression(
    multi_class="multinomial",
    random_state=random_state,
    class_weight="balanced"
)

logistic_params = {
    "C": [0.1, 1.0, 10.0],
    "solver": ["lbfgs", "saga"],
    "max_iter": [3000]
}

# ==========================================================
# 4. Grid search on training data
# ==========================================================
print("\n--- Starting hyperparameter optimization ---")
grid = GridSearchCV(
    estimator=model_lr,
    param_grid=logistic_params,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid.fit(X_train_tfidf, train_df["condition_label"])

best_model = grid.best_estimator_
print(f"\nBest parameters: {grid.best_params_}")
print(f"Best cross-validation F1 (macro): {grid.best_score_:.4f}")

# ==========================================================
# 5. Evaluation on validation set
# ==========================================================
print("\n--- Evaluating model on validation set ---")
y_valid = valid_df["condition_label"]
y_pred = best_model.predict(X_valid_tfidf)

print("\nClassification report:")
print(classification_report(y_valid, y_pred))

f1_macro = f1_score(y_valid, y_pred, average="macro")
print(f"Validation F1 (macro): {f1_macro:.4f}")

# ==========================================================
# 6. Save model and update registry
# ==========================================================
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_dir = Path("models/artifacts/")
model_dir.mkdir(parents=True, exist_ok=True)
model_path = model_dir / f"logreg_tfidf_{timestamp}.joblib"

joblib.dump(best_model, model_path)
print(f"\nModel saved successfully: {model_path}")

# Update registry
update_registry(str(preproc_path), model_type="preprocessor_tfidf")
update_registry(str(model_path), model_type="model_tfidf")

print("Registry updated successfully.")


In [None]:
from mednlpix.inference.preprocessing_pipeline import PreprocessingPipeline
from mednlpix.inference.registry_manager import update_registry


# Exemple avec TF-IDF
tfidf_pipe = PreprocessingPipeline(method="tfidf", max_features=3000)
y_train = train_df["condition_label"].head()
X_train = tfidf_pipe.fit_transform(train_df.head())




In [6]:
from mednlpix.inference.preprocessing_pipeline import PreprocessingPipeline
from mednlpix.inference.registry_manager import update_registry


# Exemple avec TF-IDF
tfidf_pipe = PreprocessingPipeline(method="tfidf", max_features=3000)
X_train = tfidf_pipe.fit_transform(train_df.head())

path = tfidf_pipe.save()
#update_registry(str(path), model_type="preprocessor_tfidf")

# Exemple avec Word2Vec
#w2v_pipe = PreprocessingPipeline(method="word2vec", vector_size=150, window=10)
#X_train = w2v_pipe.fit_transform(train_df.head())

#path = w2v_pipe.save()
#update_registry(str(path), model_type="preprocessor_word2vec")


2025-10-07 23:34:11,795 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Loading spaCy model: en_core_web_sm
2025-10-07 23:34:12,033 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - spaCy model loaded successfully.
2025-10-07 23:34:12,034 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - TFIDFVectorizerWrapper initialized with params: {'max_features': 3000, 'ngram_range': (1, 2), 'sublinear_tf': True, 'stop_words': 'english'}
2025-10-07 23:34:12,035 - preprocessing_pipeline - INFO - Initialized TF-IDF pipeline with parameters: {'max_features': 3000}
2025-10-07 23:34:12,039 - preprocessing_pipeline - INFO - Starting preprocessing training (tfidf) on 5 samples.
2025-10-07 23:34:12,040 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Preprocessing column 'medical_abstract' using spaCy...
2025-10-07 23:34:12,083 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Done. Avg. token count: 41.60
2025-10-07 23:34:12,084 - preprocessing

In [7]:
# Exemple avec Word2Vec
w2v_pipe = PreprocessingPipeline(method="word2vec", vector_size=150, window=10)
X_train = w2v_pipe.fit_transform(train_df.head())

path = w2v_pipe.save()
#update_registry(str(path), model_type="preprocessor_word2vec")

2025-10-07 23:34:25,895 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Loading spaCy model: en_core_web_sm
2025-10-07 23:34:26,118 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - spaCy model loaded successfully.
2025-10-07 23:34:26,119 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Word2VecVectorizer initialized with parameters: {'vector_size': 150, 'window': 10, 'min_count': 2, 'workers': 4, 'sg': 1, 'epochs': 10}
2025-10-07 23:34:26,119 - preprocessing_pipeline - INFO - Initialized Word2Vec pipeline with parameters: {'vector_size': 150, 'window': 10}
2025-10-07 23:34:26,120 - preprocessing_pipeline - INFO - Starting preprocessing training (word2vec) on 5 samples.
2025-10-07 23:34:26,121 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Preprocessing column 'medical_abstract' using spaCy...
2025-10-07 23:34:26,160 - mednlpix.features.preprocessing.spacy_preprocessor - INFO - Done. Avg. token count: 41.60
2025-10-07 23:34:26,160 

In [9]:
# --- Entraînement (offline) ---

vectorizer = TFIDFVectorizerWrapper(max_features=3000)
X_train = vectorizer.fit_transform(train_df)


# Automatically detect project root (where pyproject.toml or src/ is)
ROOT = Path().resolve().parents[1]  # Adjust depending on script depth


print(f"Project root detected at: {ROOT}")

# Define a portable models directory inside your package
path_model = ROOT /"MedNLPix" / "src" / "mednlpix" / "models"
path_model.mkdir(parents=True, exist_ok=True)

# Define the output model path
model_path = path_model / "vectorizer_tfidf_vectorizer.joblib"

# Save the trained model
vectorizer.save(model_path)

print(f"Model saved at: {model_path}")


# --- En production (online) ---
#from mednlpix.features.vectorization import TFIDFVectorizerWrapper

#vectorizer = TFIDFVectorizerWrapper().load("models/tfidf_vectorizer.joblib")
#X_new = vectorizer.transform(new_data)


2025-10-07 17:30:48,790 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - TFIDFVectorizerWrapper initialized with params: {'max_features': 3000, 'ngram_range': (1, 2), 'sublinear_tf': True, 'stop_words': 'english'}
2025-10-07 17:30:48,802 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - Fitting and transforming 1600 documents with TF-IDF...
2025-10-07 17:30:48,996 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - TF-IDF fit_transform complete. Output shape: (1600, 3000)
2025-10-07 17:30:49,034 - mednlpix.features.vectorization.tfidf_vectorizer - INFO - TF-IDF vectorizer saved successfully to /Users/surelmanda/Data-Science-Projects/MedNLPix/src/mednlpix/models/vectorizer_tfidf_vectorizer.joblib


Project root detected at: /Users/surelmanda/Data-Science-Projects
Model saved at: /Users/surelmanda/Data-Science-Projects/MedNLPix/src/mednlpix/models/vectorizer_tfidf_vectorizer.joblib


<1600x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 51617 stored elements in Compressed Sparse Row format>

In [8]:
# --- Entraînement ---

vectorizer = Word2VecVectorizer(vector_size=150, window=10, min_count=3, sg=1)
X_train = vectorizer.fit_transform(train_df)

# Automatically detect project root (where pyproject.toml or src/ is)
ROOT = Path().resolve().parents[1]  # Adjust depending on script depth


print(f"Project root detected at: {ROOT}")

# Define a portable models directory inside your package
path_model = ROOT /"MedNLPix" / "src" / "mednlpix" / "models"
path_model.mkdir(parents=True, exist_ok=True)

# Define the output model path
model_path = path_model / "word2vec_vectorizer.joblib"

# Save the trained model
vectorizer.save(model_path)

print(f"Model saved at: {model_path}")


# --- En production ---
#from mednlpix.features.vectorization import Word2VecVectorizer

#vectorizer = Word2VecVectorizer().load("models/word2vec_vectorizer.model")
#X_new = vectorizer.transform(new_data)


2025-10-07 17:23:34,769 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Word2VecVectorizer initialized with parameters: {'vector_size': 150, 'window': 10, 'min_count': 3, 'workers': 4, 'sg': 1, 'epochs': 10}
2025-10-07 17:23:34,776 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Training Word2Vec model on 1600 documents...
2025-10-07 17:23:38,186 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Word2Vec model trained successfully. Vocabulary size: 4229 words.
2025-10-07 17:23:38,188 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Generating averaged embeddings for 1600 documents...
2025-10-07 17:23:38,343 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Word2Vec transformation complete. Output shape: (1600, 150)
2025-10-07 17:23:38,350 - mednlpix.features.vectorization.word2vec_vectorizer - INFO - Word2Vec model saved successfully to /Users/surelmanda/Data-Science-Projects/MedNLPix/src/mednlpix/models/word2vec_vec

Project root detected at: /Users/surelmanda/Data-Science-Projects
Model saved at: /Users/surelmanda/Data-Science-Projects/MedNLPix/src/mednlpix/models/word2vec_vectorizer.joblib
