In [None]:
!pip install sentence-transformers
!pip install pandas scikit-learn matplotlib seaborn


# ----------------------------
# SECTION 1: Imports & Setup
# ----------------------------

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import umap
import re
import joblib
from sentence_transformers import SentenceTransformer, util


# ----------------------------
# SECTION 2: Data Loading
# ----------------------------

In [None]:
# Read the CSV file into the working DataFrame
df = pd.read_csv("liste_amm.csv")

# Preview the main descriptive columns (first three rows)
preview_columns = ['Nom', 'DCI', 'Classe', 'Indications', 'Dosage']
print("Dataset preview:")
display(df[preview_columns].head(3))

# Horizontal bar chart of the ten most frequent values of 'Classe'
class_counts = df['Classe'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(10, 5))
class_counts.plot(kind='barh', ax=ax)
ax.set_title('Top 10 Therapeutic Classes')
ax.set_xlabel('Number of Drugs')
plt.show()



In [None]:
# Show the first rows of the full DataFrame (all columns) as the cell output.
df.head()

# ----------------------------
# SECTION 3: Feature Engineering
# ----------------------------

In [None]:
print("\nEngineering features...")

# --- 1. Extracting the dose ---
def extract_dose(dosage):
    """Return the first milligram amount found in a dosage string.

    The number immediately preceding "mg" (case-insensitive, optional
    whitespace in between) is extracted. Both "." and "," are accepted as the
    decimal separator, so "2.5 mg" and "2,5 mg" both yield 2.5.

    Args:
        dosage: Raw dosage value; anything that is not a ``str`` is treated
            as missing.

    Returns:
        float: The parsed amount, or ``np.nan`` when ``dosage`` is not a
        string or contains no "<number> mg" pattern.
    """
    if not isinstance(dosage, str):
        return np.nan

    match = re.search(r'(\d+(?:[.,]\d+)?)\s*mg', dosage, re.IGNORECASE)
    if match is None:
        return np.nan

    # float() only understands ".", so normalise a comma decimal separator.
    return float(match.group(1).replace(',', '.'))

df['Dose_mg'] = df['Dosage'].apply(extract_dose)

# --- 2. Detecting biological drugs ---
# The keywords are matched as word *endings* (e.g. "...ZUMAB" ends with "MAB").
# A leading \b would require the keyword to be a whole word on its own, so a
# stem at the end of a longer name would never match. The group is
# non-capturing so that str.contains does not warn about match groups.
BIOLOGIC_KEYWORDS = r'(?:MAB|ZUMA|CEPT|CPT|KIN)\b'
df['Biologic'] = (
    df['DCI']
    .fillna('')
    .astype(str)
    .str.contains(BIOLOGIC_KEYWORDS, case=False, regex=True)
)

# --- 3. Normalizing the route of administration ---
# Exact-match lookup on the upper-cased pharmaceutical form; any form that is
# not listed here (or is missing) ends up as 'Unknown'.
ADMIN_ROUTE_MAPPING = {
    'SOLUTION INJECTABLE': 'Intravenous (IV)',
    'COMPRIMÉ': 'Oral',
    'GEL': 'Topical',
    'SIROP': 'Oral',
    'POMMADE': 'Topical',
    'POUDRE': 'Other',
    'CAPSULE': 'Oral'
}
# Strip surrounding whitespace first so values like "GEL " still hit the map.
df['Admin_Route'] = (
    df['Forme']
    .str.strip()
    .str.upper()
    .map(ADMIN_ROUTE_MAPPING)
    .fillna('Unknown')
)


# ----------------------------
# SECTION 3.1: Smart NaN Imputation
# ----------------------------

In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("\nImputing missing values in low-memory mode...")

def smart_fill_low_memory(df, target_column, text_columns):
    """Fill missing values of one column from the most similar row.

    Each row is described by the space-joined text of ``text_columns``. The
    rows are vectorised with TF-IDF, and every row whose ``target_column`` is
    missing receives the value of the non-missing row with the highest cosine
    similarity. When no row has a similarity above zero (or no row has a known
    value at all), the placeholder ``"<target_column> unknown"`` is used.

    Similarities are computed one missing row at a time against the known
    rows, so the full pairwise similarity matrix is never materialised.

    Args:
        df (pd.DataFrame): Input frame; it is not modified.
        target_column (str): Column whose missing values are filled.
        text_columns (list[str]): Columns concatenated to build the text used
            for similarity.

    Returns:
        pd.DataFrame: A copy of ``df`` with ``target_column`` filled.
    """
    df_filled = df.copy()

    # Work with row *positions*: the TF-IDF matrix is positional, so index
    # labels are only valid row numbers when the index is a default RangeIndex.
    missing_mask = df[target_column].isna().to_numpy()
    missing_pos = np.flatnonzero(missing_mask)
    if missing_pos.size == 0:
        return df_filled

    valid_pos = np.flatnonzero(~missing_mask)
    placeholder = f"{target_column} unknown"
    filled_values = df[target_column].astype(object).to_numpy(copy=True)

    if valid_pos.size == 0:
        # Nothing to borrow from: every missing row gets the placeholder.
        filled_values[missing_pos] = placeholder
    else:
        # Create the text corpus (missing cells -> '', everything else -> str
        # so that non-string columns do not break ' '.join).
        corpus = (
            df[text_columns]
            .apply(lambda col: col.map(lambda v: '' if pd.isna(v) else str(v)))
            .agg(' '.join, axis=1)
        )

        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Loop-invariant: the candidate rows and their known values.
        valid_vectors = tfidf_matrix[valid_pos]
        valid_values = filled_values[valid_pos]

        for pos in missing_pos:
            # Similarity of this single row against all rows with a known value
            similarities = cosine_similarity(tfidf_matrix[pos], valid_vectors).ravel()
            best = similarities.argmax()
            if similarities[best] > 0:
                filled_values[pos] = valid_values[best]
            else:
                filled_values[pos] = placeholder

    # Assign the whole column at once; infer_objects restores a numeric dtype
    # when every filled value is numeric.
    df_filled[target_column] = pd.Series(filled_values).infer_objects().to_numpy()
    return df_filled

# Columns to use for similarity
text_cols = ['Nom', 'Forme', 'Dosage']

# Smart imputation without memory explosion: each target column is filled
# independently, using only the text columns above for similarity.
# NOTE(review): 'Biologic' was derived from 'DCI' before this imputation, so
# rows whose DCI is filled here keep their earlier flag -- confirm whether the
# flag should be recomputed afterwards.
for target_col in ('DCI', 'Classe', 'Indications'):
    df = smart_fill_low_memory(df, target_col, text_cols)

print("\nAfter smart filling (low-memory mode):")
# Cap the sample size so small datasets do not raise, and seed it so the
# preview is the same on every run.
display(
    df[['Nom', 'DCI', 'Classe', 'Indications']]
    .sample(n=min(5, len(df)), random_state=42)
)


In [None]:

# Columns produced or completed by the feature-engineering and imputation cells
engineered_columns = [
    'Nom', 'DCI', 'Classe', 'Indications',
    'Dose_mg', 'Admin_Route', 'Biologic',
]

print("\nSample of engineered data:")
display(df[engineered_columns].head())

In [None]:
# ----------------------------
# SECTION 3bis: Enhanced Visualization
# ----------------------------

import matplotlib.pyplot as plt
import seaborn as sns

print("\nVisualizing features...")

# --- 1. Distribution of doses ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Dose_mg'].dropna(), bins=30, kde=True, color="royalblue", ax=ax)
ax.set_title('Distribution of doses (mg)', fontsize=16)
ax.set_xlabel('Dose (mg)', fontsize=14)
ax.set_ylabel('Number of drugs', fontsize=14)
ax.grid(True)
plt.show()

# --- 2. Distribution of administration routes ---
# seaborn >= 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable (with the legend disabled) keeps one colour per bar without the warning.
fig, ax = plt.subplots(figsize=(8, 6))
admin_order = df['Admin_Route'].value_counts().index
sns.countplot(
    data=df, y='Admin_Route', hue='Admin_Route',
    order=admin_order, hue_order=admin_order,
    palette='Set2', legend=False, ax=ax,
)
ax.set_title('Distribution of administration routes', fontsize=16)
ax.set_xlabel('Number of drugs', fontsize=14)
ax.set_ylabel('Administration route', fontsize=14)
ax.grid(axis='x')
plt.show()

# --- 3. Top therapeutic classes ---
fig, ax = plt.subplots(figsize=(12, 6))
top_classes = df['Classe'].value_counts().head(10)
sns.barplot(
    x=top_classes.values, y=top_classes.index, hue=top_classes.index,
    palette="coolwarm", legend=False, ax=ax,
)
ax.set_title('Top 10 therapeutic classes', fontsize=16)
ax.set_xlabel('Number of drugs', fontsize=14)
ax.set_ylabel('Therapeutic class', fontsize=14)
ax.grid(True)
plt.show()


In [None]:
print("\nGenerating embeddings with Sentence-BERT...")

!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer

class DrugEncoder:
    """Thin wrapper around a SentenceTransformer model for encoding texts.

    Args:
        model_name (str): Name or path passed to ``SentenceTransformer``.
            Defaults to ``'all-MiniLM-L6-v2'``, the model this notebook
            originally hard-coded.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def encode(self, texts):
        """Encode ``texts`` with the wrapped model.

        The model is asked for NumPy output (``convert_to_numpy=True``) and
        normalized embeddings (``normalize_embeddings=True``).

        Args:
            texts: The texts to encode (this notebook passes a list of strings).

        Returns:
            Whatever ``self.model.encode`` returns for those options.
        """
        return self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

# Preparing the drug profiles
# We build a text profile by combining all the useful columns, in this order.
PROFILE_COLUMNS = [
    'DCI', 'Classe', 'Sous Classe', 'Indications', 'Dosage',
    'Biologic', 'G/P/B', 'VEIC', 'Forme',
]


def _as_profile_text(value):
    """Render one cell as text: missing values become '', others ``str(value)``."""
    return '' if pd.isna(value) else str(value)


# Convert every cell to text before joining: plain `+` concatenation raises a
# TypeError as soon as one of the columns holds non-string values.
df['Drug_Profile'] = (
    df[PROFILE_COLUMNS]
    .apply(lambda col: col.map(_as_profile_text))
    .agg(' '.join, axis=1)
)

# Encoding the drug profiles
encoder = DrugEncoder()
embeddings = encoder.encode(df['Drug_Profile'].tolist())

print(f"Embeddings generated with shape: {embeddings.shape}")



In [None]:
# Histogram of the values taken by the first embedding column
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(embeddings[:, 0], bins=50, alpha=0.7)
ax.set_title('Embedding Value Distribution')
ax.set_xlabel('Embedding Dimension 1 Values')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# ----------------------------
# SECTION 4: Embedding Visualization with UMAP
# ----------------------------

print("\nVisualisation des embeddings avec UMAP...")

import umap
import matplotlib.pyplot as plt

def plot_umap(embeddings, labels, title="UMAP of Drug Embeddings",
              n_neighbors=15, min_dist=0.1, random_state=42):
    """Project embeddings to 2-D with UMAP and show them as a scatter plot.

    Args:
        embeddings: Array passed to ``umap.UMAP.fit_transform``.
        labels: Numeric values used to colour the points (``c=`` of scatter).
        title (str): Figure title.
        n_neighbors (int): Forwarded to ``umap.UMAP``.
        min_dist (float): Forwarded to ``umap.UMAP``.
        random_state: Forwarded to ``umap.UMAP``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=random_state,
    )
    embeddings_2d = reducer.fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(12, 8))
    scatter = ax.scatter(
        embeddings_2d[:, 0], embeddings_2d[:, 1],
        c=labels, cmap='Spectral', s=10, alpha=0.8
    )
    fig.colorbar(scatter, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('UMAP-1', fontsize=14)
    ax.set_ylabel('UMAP-2', fontsize=14)
    ax.grid(True)
    plt.show()

# Example: color the points by therapeutic class (the 'Classe' column).
# Note: to get colors, the classes must first be encoded as numbers.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
classe_labels = le.fit_transform(df['Classe'].astype(str))

plot_umap(embeddings, classe_labels, title="UMAP - Drug Embeddings colored by Classe")


# ----------------------------
# SECTION 5: Drug Recommendation System
# ----------------------------

In [None]:
def generate_clinical_comment(ref_row, alt_row):
    """
    Generates a clinical comment based on the comparison of form and dose
    between the reference drug and the proposed alternative.

    The form check compares the 'Forme' values for equality. The dose check
    compares 'Dose_mg' values: a ratio alternative/reference within
    [0.8, 1.2] is reported as equivalent, anything else as lower or higher.
    A missing form, a missing dose, or a zero reference dose (for which the
    ratio is undefined) is reported as not comparable instead of raising or
    producing a misleading comment.

    Args:
        ref_row (pd.Series): Reference row (base drug); must provide
            'Forme' and 'Dose_mg'.
        alt_row (pd.Series): Alternative row; must provide 'Forme' and
            'Dose_mg'.

    Returns:
        str: Clinical comment (form remark and dose remark joined by a space)
    """
    comments = []

    # Form Check
    ref_form, alt_form = ref_row['Forme'], alt_row['Forme']
    if pd.isna(ref_form) or pd.isna(alt_form):
        # NaN != NaN, so without this guard two missing forms would be
        # reported as "different".
        comments.append("ℹ️ Missing form for comparison.")
    elif ref_form != alt_form:
        comments.append(f"⚠️ Different form: {alt_form} vs {ref_form}.")
    else:
        comments.append(f"✅ Same form: {alt_form}.")

    # Dose Check
    ref_dose, alt_dose = ref_row['Dose_mg'], alt_row['Dose_mg']
    if pd.isna(ref_dose) or pd.isna(alt_dose) or ref_dose == 0:
        # A zero reference dose would make the ratio a division by zero.
        comments.append("ℹ️ Missing dose for comparison.")
    else:
        dose_ratio = alt_dose / ref_dose
        if 0.8 <= dose_ratio <= 1.2:
            comments.append("✅ Equivalent or close dose.")
        elif dose_ratio < 0.8:
            comments.append(f"⚠️ Lower dose ({alt_dose} mg vs {ref_dose} mg), adjustment needed.")
        else:
            comments.append(f"⚠️ Higher dose ({alt_dose} mg vs {ref_dose} mg), adjustment needed.")

    # Final result
    return " ".join(comments)



In [None]:
# ---------------------------------------------------------------
# UPDATE TO THE RECOMMENDER CLASS
# ---------------------------------------------------------------

print("\nUpdating the recommendation engine...")

from sentence_transformers.util import cos_sim

class DrugRecommender:
    """Recommend alternative drugs by cosine similarity of profile embeddings.

    Row ``i`` of ``embeddings`` must describe row ``i`` (by position) of
    ``df``; all lookups are positional.

    Args:
        embeddings: 2-D array-like of shape (n_drugs, dim), convertible with
            ``np.asarray``.
        df (pd.DataFrame): Drug table with at least the columns 'Nom', 'DCI',
            'Classe', 'Dose_mg', 'Indications' plus whatever
            ``generate_clinical_comment`` reads.

    Raises:
        ValueError: If ``embeddings`` and ``df`` do not have the same number
            of rows.
    """

    def __init__(self, embeddings, df):
        if len(embeddings) != len(df):
            raise ValueError(
                f"embeddings has {len(embeddings)} rows but df has {len(df)} rows"
            )
        self.embeddings = embeddings
        self.df = df
        self.drug_names = df['Nom']

    def recommend(self, drug_name, top_n=5):
        """Return the ``top_n`` drugs most similar to ``drug_name``.

        The name is matched case-insensitively after stripping whitespace; if
        several rows match, the first one is used as the reference. The
        reference row itself is never part of the result.

        Args:
            drug_name (str): Name to look up in the 'Nom' column.
            top_n (int): Maximum number of alternatives to return.

        Returns:
            pd.DataFrame: Columns 'Nom', 'DCI', 'Classe', 'Dose_mg',
            'Indications', 'Similarity', 'Clinical Comment', ordered by
            decreasing similarity. An empty DataFrame (after printing a
            warning) when the name is not found.
        """
        # Normalize the input drug name
        drug_name_normalized = drug_name.strip().upper()

        matches = (self.drug_names.str.strip().str.upper() == drug_name_normalized).to_numpy()
        hits = np.flatnonzero(matches)

        if hits.size == 0:
            print(f"Warning: '{drug_name}' not found in the dataset.")
            return pd.DataFrame()

        idx = int(hits[0])  # first matching position

        # Cosine similarity of the reference vector against every row.
        # Zero-norm rows get a norm of 1 so they score 0 instead of NaN.
        vectors = np.asarray(self.embeddings)
        norms = np.linalg.norm(vectors, axis=1)
        norms[norms == 0] = 1.0
        similarities = (vectors @ vectors[idx]) / (norms * norms[idx])

        # Exclude the reference explicitly: with duplicate or tied profiles it
        # is not guaranteed to rank first, so slicing off position 0 could
        # return the query drug as its own alternative.
        ranked = np.argsort(-similarities, kind='stable')
        valid_indices = ranked[ranked != idx][:max(int(top_n), 0)]

        results = self.df.iloc[valid_indices].copy()
        results['Similarity'] = similarities[valid_indices]

        # Generate the clinical comment (a list works for empty results too,
        # unlike DataFrame.apply(axis=1), which returns a DataFrame when empty)
        ref_row = self.df.iloc[idx]
        results['Clinical Comment'] = [
            generate_clinical_comment(ref_row, row) for _, row in results.iterrows()
        ]

        return results[['Nom', 'DCI', 'Classe', 'Dose_mg', 'Indications', 'Similarity', 'Clinical Comment']]


In [None]:
# Build the recommender on the full dataset and query one example drug
recommender = DrugRecommender(embeddings=embeddings, df=df)
sample_recommendations = recommender.recommend(drug_name="ABEVMY", top_n=5)

# recommend() returns an empty DataFrame when the name is not found
if not sample_recommendations.empty:
    print("\nSample recommendations:")
    display(sample_recommendations)


# ----------------------------
# SECTION 6: Save the model
# ----------------------------

In [None]:
# Persist the recommender object (which holds references to the embeddings and
# the DataFrame) to disk with joblib.
# NOTE(review): joblib files are pickle-based; only load this file from a
# trusted source, and the DrugRecommender class must be importable/defined
# when loading -- confirm how the file is consumed.
print("\nSaving the recommendation model...")
joblib.dump(recommender, 'drug_recommender.pkl')
print("Model saved as 'drug_recommender.pkl'")