In [1]:
!pip install PyTDC



In [2]:
!pip install scikit-learn



In [3]:
!pip install rdkit



In [4]:
!pip install psutil pynvml



In [5]:
%env MLFLOW_TRACKING_SERVER_CERT_PATH=/home/jovyan/work/cert/ca.pem
%env MLFLOW_TRACKING_URI=https://public-tracking.mlflow-e00v0qa5n6ywddys75.backbone-e00m8trsn6mra9a7kk.msp.eu-north1.nebius.cloud
%env MLFLOW_TRACKING_USERNAME=mlops_course_admin
%env MLFLOW_TRACKING_PASSWORD=NAv2VcZafgku5D$
%env MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=true

env: MLFLOW_TRACKING_SERVER_CERT_PATH=/home/jovyan/work/cert/ca.pem
env: MLFLOW_TRACKING_URI=https://public-tracking.mlflow-e00v0qa5n6ywddys75.backbone-e00m8trsn6mra9a7kk.msp.eu-north1.nebius.cloud
env: MLFLOW_TRACKING_USERNAME=mlops_course_admin
env: MLFLOW_TRACKING_PASSWORD=NAv2VcZafgku5D$
env: MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING=true


In [6]:
from tdc.multi_pred import TrialOutcome
data = TrialOutcome(name = 'phase1') # 'phase2' / 'phase3'
split = data.get_split()

Found local copy...
Loading...
Done!


In [7]:
import time
from datetime import datetime

def get_timestamp(date_str):
    try:
        # Cas où le jour est présent (ex: "November 2, 2018")
        date_obj = datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        # Cas où seul le mois et l'année sont présents (ex: "July 2000")
        date_obj = datetime.strptime(date_str, "%B %Y")
        date_obj = date_obj.replace(day=1)  # Ajouter le 1er du mois

    return int(date_obj.timestamp())  # Convertir en timestamp (secondes)

In [8]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from rdkit import Chem
from rdkit.Chem import AllChem

def compute_duration(start_date, complete_date):
    """ Calcule la durée en gérant les erreurs """
    duration = []
    for i in range(len(complete_date)):
        try:
            duration.append(get_timestamp(complete_date[i]) - get_timestamp(start_date[i]))
        except:
            duration.append(10**20)  # Valeur extrême pour éviter les erreurs
    return np.array(duration).reshape(-1, 1)  # S'assurer que c'est bien une colonne

def smiles_to_fp(smiles):
    """ Convertit une liste de SMILES en une empreinte moléculaire moyenne """
    mols = [Chem.MolFromSmiles(s) for s in smiles.split('__') if Chem.MolFromSmiles(s)]
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) for mol in mols]
    return np.array(fps).mean(axis=0) if fps else np.zeros(2048)

# Initialisation des encodeurs
icd_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
eligibility_encoder = TfidfVectorizer()
duration_scaler = StandardScaler()

def fit_transform_data(data):
    """ Applique fit_transform sur les données d'entraînement """
    global icd_encoder, eligibility_encoder, duration_scaler  # Garder les mêmes encodeurs pour le test
    
    # Durée normalisée
    duration = compute_duration(data["start_date"], data["complete_date"])
    duration = duration_scaler.fit_transform(duration)  # Normalisation
    
    # Empreintes moléculaires RDKit
    drug_molecules = np.array([smiles_to_fp(smiles) for smiles in data["drug_molecules"]])
    
    # Encodage des autres features
    icdcodes = icd_encoder.fit_transform(np.array(data["icdcodes"]).reshape(-1, 1))
    eligibility_criteria = eligibility_encoder.fit_transform(np.array(data["eligibility_criteria"])).toarray()
    
    # Concaténation des features
    X_train = np.concatenate([duration, drug_molecules, icdcodes, eligibility_criteria], axis=1)
    return X_train

def transform_data(data):
    """ Applique transform sur les données de test (sans réentraîner les encodeurs) """
    global icd_encoder, eligibility_encoder, duration_scaler
    
    duration = compute_duration(data["start_date"], data["complete_date"])
    duration = duration_scaler.transform(duration)  # Normalisation
    
    drug_molecules = np.array([smiles_to_fp(smiles) for smiles in data["drug_molecules"]])
    icdcodes = icd_encoder.transform(np.array(data["icdcodes"]).reshape(-1, 1))
    eligibility_criteria = eligibility_encoder.transform(np.array(data["eligibility_criteria"])).toarray()
    
    X_test = np.concatenate([duration, drug_molecules, icdcodes, eligibility_criteria], axis=1)
    return X_test


In [9]:
X_train = fit_transform_data(split["train"])  # donnée d'entraînement
Y_train_string = split["train"]["Y"]
Y_train = []

for k in range(len(Y_train_string)):
    Y_train.append(int(Y_train_string[k]))

Y_train = np.array(Y_train)

X_test = transform_data(split["test"])  # Transformation des données de test sans réentraîner

Y_test_string = split["test"]["Y"]
Y_test = []
for k in range(len(Y_test_string)):
    Y_test.append(int(Y_test_string[k]))

Y_test = np.array(Y_test) # donée attendu

In [10]:
import mlflow
mlflow.autolog() # enable mlflow

2025/01/29 23:09:06 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [11]:
# Model random lineaire
from sklearn.linear_model import LogisticRegression  # Or LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Initialize the model (for logistic regression if Y is binary)
model = LogisticRegression()

model.fit(X_train, Y_train)

# Make predictions
Y_pred = model.predict(X_test)  # Assuming X_test and Y_test are available for evaluation

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

2025/01/29 23:09:07 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/01/29 23:09:07 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '899c707094484da7b3ce291293ac8d43', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


🏃 View run secretive-ox-654 at: https://public-tracking.mlflow-e00v0qa5n6ywddys75.backbone-e00m8trsn6mra9a7kk.msp.eu-north1.nebius.cloud/#/experiments/0/runs/899c707094484da7b3ce291293ac8d43
🧪 View experiment at: https://public-tracking.mlflow-e00v0qa5n6ywddys75.backbone-e00m8trsn6mra9a7kk.msp.eu-north1.nebius.cloud/#/experiments/0


2025/01/29 23:10:18 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/01/29 23:10:19 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Accuracy: 0.6
Confusion Matrix:
 [[ 85 121]
 [ 71 203]]


In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb

# Initialize the XGBoost model for binary classification
model = xgb.XGBClassifier(
    objective='binary:logistic',  # For binary classification
    colsample_bytree=0.3,         # Fraction of features to use
    learning_rate=0.1,            # Learning rate
    max_depth=10,                  # Maximum depth of trees
    alpha=10,                     # L2 regularization
    n_estimators=200              # Number of trees
)

# Fit the model to the training data
model.fit(X_train, Y_train)

# Make predictions on the test data
Y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))

2025/01/29 23:10:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/01/29 23:10:21 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
2025/01/29 23:10:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2de01b319a374e32a4decde6e288cb88', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


🏃 View run glamorous-conch-539 at: https://public-tracking.mlflow-e00v0qa5n6ywddys75.backbone-e00m8trsn6mra9a7kk.msp.eu-north1.nebius.cloud/#/experiments/0/runs/2de01b319a374e32a4decde6e288cb88
🧪 View experiment at: https://public-tracking.mlflow-e00v0qa5n6ywddys75.backbone-e00m8trsn6mra9a7kk.msp.eu-north1.nebius.cloud/#/experiments/0


2025/01/30 07:19:23 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/01/30 07:19:23 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


Accuracy: 0.6104166666666667
Confusion Matrix:
 [[ 81 125]
 [ 62 212]]
