In [1]:
!pip install PyTDC



In [2]:
!pip install scikit-learn



In [3]:
from tdc.multi_pred import TrialOutcome
data = TrialOutcome(name = 'phase1') # 'phase2' / 'phase3'
split = data.get_split()

Found local copy...
Loading...
Done!


In [4]:
split["train"]

Unnamed: 0,nctid,start_date,complete_date,drug_molecules,icdcodes,eligibility_criteria,Y
0,NCT00002790,March 1996,,CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\\C=C\\C...,D89.810__D89.811__D89.813__D89.812__C95.91__C9...,DISEASE CHARACTERISTICS: See General Eligibili...,0
1,NCT00002863,June 1996,July 2000,[H][C@@]12N(C)C3=CC(OC)=C(C=C3[C@@]11CCN3CC=C[...,C96.A__C46.9__C96.22__C46.0__C46.2__C92.31__C9...,DISEASE CHARACTERISTICS: Biopsy-proven soft ti...,0
2,NCT00003005,December 1997,March 2001,[H][C@@]1(CO)C[C@@]([H])(O)[C@@]([H])(O1)N1C=N...,C95.91__C95.92__Z80.6__Z85.6__C90.11__C90.12__...,DISEASE CHARACTERISTICS: TdT positive acute ly...,0
3,NCT00003060,March 1995,"February 22, 2001",CS(=O)(=O)OCCCCOS(C)(=O)=O__ClCCN(CCCl)P1(=O)N...,C43.51__C43.9__C43.52__D03.51__C43.8__Z85.820_...,DISEASE CHARACTERISTICS: Biopsy proven relapse...,0
4,NCT00003194,July 1997,"December 19, 2002",N[C@@H](CCCNC(N)=N)C(O)=O__ClCCN(CCCl)P1(=O)NC...,H01.009__H02.209__H02.009__H02.109__H04.209__H...,DISEASE CHARACTERISTICS: - Histologically prov...,0
...,...,...,...,...,...,...,...
1677,NCT03605212,"February 27, 2017","July 25, 2018",CC(C)COC1=C(C=C(C=C1)C1=NC(C)=C(S1)C(O)=O)C#N,E88.3,Inclusion Criteria: - male and female children...,0
1678,NCT03606707,"June 1, 2018","July 15, 2018",[H][C@@]12C[C@@H](C)[C@](O)(C(=O)CO)[C@@]1(C)C...,H53.9__O90.6__F01.50__F01.51__F03.90__F03.91__...,Inclusion Criteria: - Patients with rheumatoid...,1
1679,NCT03652233,"November 2, 2018",November 2021,[H][N]1([H])[C@@H]2CCCC[C@H]2[N]([H])([H])[Pt]...,C22.0__C4A.9__C7B.1__C4A.0__C4A.31__C4A.51__C4...,Inclusion Criteria: - Recurrent/metastatic Sq...,0
1680,NCT03672851,"April 17, 2019","July 31, 2019",COC1=CC=CC=C1OC1=C(NS(=O)(=O)C2=CC=C(C=C2)C(C)...,C91.01__C91.02__C92.01__C92.02__C92.41__C92.42...,Inclusion Criteria: 1. Research patients enrol...,0


In [5]:
import time
from datetime import datetime

def get_timestamp(date_str):
    try:
        # Cas où le jour est présent (ex: "November 2, 2018")
        date_obj = datetime.strptime(date_str, "%B %d, %Y")
    except ValueError:
        # Cas où seul le mois et l'année sont présents (ex: "July 2000")
        date_obj = datetime.strptime(date_str, "%B %Y")
        date_obj = date_obj.replace(day=1)  # Ajouter le 1er du mois

    return int(date_obj.timestamp())  # Convertir en timestamp (secondes)

In [6]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialisation des encodeurs
drug_encoder = TfidfVectorizer()
icd_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
eligibility_encoder = TfidfVectorizer()
duration_scaler = StandardScaler()

def compute_duration(start_date, complete_date):
    """ Calcule la durée en gérant les erreurs """
    duration = []
    for i in range(len(complete_date)):
        try:
            duration.append(get_timestamp(complete_date[i]) - get_timestamp(start_date[i]))
        except:
            duration.append(10**20)  # Valeur extrême pour éviter les erreurs
    return np.array(duration).reshape(-1, 1)  # S'assurer que c'est bien une colonne

def fit_transform_data(data):
    """ Applique fit_transform sur les données d'entraînement """
    global drug_encoder, icd_encoder, eligibility_encoder, duration_scaler  # On garde les mêmes encodeurs pour le test
    
    # Durée normalisée
    duration = compute_duration(data["start_date"], data["complete_date"])
    duration = duration_scaler.fit_transform(duration)  # Normalisation

    # Encodage des features
    drug_molecules = drug_encoder.fit_transform(np.array(data["drug_molecules"])).toarray()
    icdcodes = icd_encoder.fit_transform(np.array(data["icdcodes"]).reshape(-1, 1))
    eligibility_criteria = eligibility_encoder.fit_transform(np.array(data["eligibility_criteria"])).toarray()

    # Concaténation des features
    X_train = np.concatenate([duration, drug_molecules, icdcodes, eligibility_criteria], axis=1)
    return X_train

def transform_data(data):
    """ Applique transform sur les données de test (sans réentraîner les encodeurs) """
    global drug_encoder, icd_encoder, eligibility_encoder, duration_scaler

    duration = compute_duration(data["start_date"], data["complete_date"])
    duration = duration_scaler.transform(duration)  # Normalisation

    drug_molecules = drug_encoder.transform(np.array(data["drug_molecules"])).toarray()
    icdcodes = icd_encoder.transform(np.array(data["icdcodes"]).reshape(-1, 1))
    eligibility_criteria = eligibility_encoder.transform(np.array(data["eligibility_criteria"])).toarray()

    X_test = np.concatenate([duration, drug_molecules, icdcodes, eligibility_criteria], axis=1)
    return X_test

In [7]:
X_train = fit_transform_data(split["train"])  # Entraînement
Y_train_string = split["train"]["Y"]
Y_train = []

for k in range(len(Y_train_string)):
    Y_train.append(int(Y_train_string[k]))

Y_train = np.array(Y_train)

In [8]:
X_train

array([[ 9.6148034 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.10400629,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.10400629,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.10400629,  0.        ,  0.        , ...,  0.11414377,
         0.        ,  0.        ],
       [-0.10400629,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.10400629,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
X_test = transform_data(split["test"])  # Transformation des données de test sans réentraîner

Y_test_string = split["test"]["Y"]
Y_test = []
for k in range(len(Y_test_string)):
    Y_test.append(int(Y_test_string[k]))

Y_test = np.array(Y_test)

In [None]:
# Model random lineaire
from sklearn.linear_model import LogisticRegression  # Or LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Initialize the model (for logistic regression if Y is binary)
model = LogisticRegression()

model.fit(X_train, Y_train)

# Make predictions
Y_pred = model.predict(X_test)  # Assuming X_test and Y_test are available for evaluation

# Evaluate the model
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

# Paramètres à tester pour le RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialiser le modèle
rf = RandomForestClassifier(random_state=42)

# Effectuer la recherche sur les hyperparamètres avec une validation croisée
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Entraînement du modèle sur les données d'entraînement
grid_search.fit(X_train, Y_train)

# Meilleur modèle trouvé
best_model = grid_search.best_estimator_

# Faire des prédictions
Y_pred = best_model.predict(X_test)

# Évaluation du modèle
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("Confusion Matrix:\n", confusion_matrix(Y_test, Y_pred))