In [3]:
def _parse_non_negative_int(raw, default):

    """
    Convert a string typed by the user into a non-negative integer.

    :param raw: String typed by the user
    :param default: Value returned when raw is not a plain non-negative integer
    :return: The parsed integer, or default when raw cannot be parsed
    """

    raw = raw.strip()
    # isdecimal() only accepts characters that int() can convert, whereas
    # isdigit() also accepts e.g. superscript digits that make int() raise.
    return int(raw) if raw.isdecimal() else default


def get_trajectories():

    """
    Ask the user which trajectories to load, then get values, timestamps and categorical value
    for each trajectory from the CHRU dataset.

    The chosen categorical variable and trajectory dimension are also stored in the module-level
    globals cat_var and score_var.

    :return: (values, timestamps, categorical) dataframes for each trajectory
    """

    global cat_var, score_var

    print("-- PROMPT --------------------------------------------")

    # Inputs for CHRU dataset
    data_type = input("Do you want clinical or biological data ? (CLINICAL/BIOLOGICAL) : ").upper()
    cat_var = input("Which categorical variable do you want ? (SEX/GROUPEDEB...) : ").upper()
    if data_type == "BIOLOGICAL":
        score_var_details = "(CHOLESTEROL/SODIUM/CREATININE...)"
    else:
        # Any answer other than BIOLOGICAL falls back to clinical data
        score_var_details = "(ALS/WEIGHT/CVF)"
        data_type = "CLINICAL"

    # Inputs for all datasets
    score_var = input("Which trajectory dimension do you want ? " + score_var_details + " : ").upper()
    is_normalized = input("Do you want normalized values ? (YES/NO) : ").upper()
    thresold_nb_timestamp = input("How many appointments at least, trajectories have to get ? : ")
    thresold_duration = input("Which largest duration do you want (Number of days) ? : ")

    # Check input type, falling back to default thresholds on invalid answers
    thresold_nb_timestamp = _parse_non_negative_int(thresold_nb_timestamp, 5)
    thresold_duration = _parse_non_negative_int(thresold_duration, 10000)

    # The flag stays a string because get_chru_trajectories compares it with "YES";
    # any unrecognised answer means "no normalization".
    if is_normalized not in ("YES", "NO"):
        is_normalized = "NO"

    print("-- RESULTS --------------------------------------------")

    return get_chru_trajectories(data_type, is_normalized, score_var, cat_var, thresold_nb_timestamp, thresold_duration)

In [1]:
def get_chru_trajectories(data_type, is_normalized, score_var, cat_var, thresold_nb_timestamp, thresold_duration):

    """
    Get values, timestamps and categorical value for each trajectory from a CHRU dataset.

    :param data_type: String, "BIOLOGICAL" to read biological data; any other value reads the clinical table
    :param is_normalized: String, values are normalized only when it is equal to "YES"
    :param score_var: String that specify which trajectory dimension we want
    :param cat_var: String that specify which categorical value we want from trajectories
    :param thresold_nb_timestamp: Integer that set the minimum thresold appointment
    :param thresold_duration: Integer (Number of days) that set the largest thresold that trajectories do not exceed
    :return: (values, timestamps, categorical) dataframes for each trajectory
    """

    # SQL query
    if data_type == "BIOLOGICAL":
        sql = '''SELECT DISTINCT * FROM "CHRU_CLINIQUE" INNER JOIN "CHRU_BIOLOGIQUE" ON "CHRU_CLINIQUE"."ID" = "CHRU_BIOLOGIQUE"."ID" 
                    ORDER BY "CHRU_CLINIQUE"."ID" ASC, "CHRU_BIOLOGIQUE"."DATE_DE_PRELEVEMENT" ASC;'''
    else:
        sql = 'SELECT * FROM "CHRU_CLINIQUE" ;'

    # Fetch all data; rows are read while the connection is still open
    with engine.connect().execution_options(autocommit=True) as conn:
        rows = conn.execute(sql).fetchall()
    df = pd.DataFrame(rows)

    # Remove duplicated ID columns
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # Set ID column as row index
    df = df.set_index("ID")

    # Set first and last column timestamps (clinical dimensions)
    first_timestamp = "DATEXAM_DELAY"
    last_timestamp = "DATEXAM_DELAY_33" if score_var == "CVF" else "DATEXAM_DELAY_34"

    # Unknown clinical dimensions fall back to ALS
    if score_var not in ("ALS", "WEIGHT", "CVF") and data_type == "CLINICAL":
        score_var = "ALS"

    # Set first and last column values (clinical dimensions)
    first_value = score_var + "_1"
    last_value = score_var + ("_34" if score_var == "CVF" else "_35")

    print("Number of trajectories before filtering :", len(np.unique(df.index.values)))

    # Create values, timestamps and categorical dataframes for each ID patient
    if data_type == "BIOLOGICAL":
        # For biological dimensions
        df_values = df[[score_var]]
        df_timestamps = df[['DURATION_BIO']]
    else:
        # For clinical dimensions
        df_values = df.loc[:, first_value:last_value]
        df_timestamps = df.loc[:, first_timestamp:last_timestamp]

    # Unknown categorical variables fall back to GROUPEDEB
    if cat_var not in df.columns.values:
        cat_var = "GROUPEDEB"

    df_cat = pd.DataFrame(data=df[cat_var], columns=[cat_var], dtype="str")

    # Limit duration from timestamps until n days
    df_timestamps = limit_trajectory_duration(df_timestamps, thresold_duration)

    # Remove too short trajectories according to thresold_nb_timestamp
    df_values, df_timestamps, df_cat = remove_short_trajectories(df_values, df_timestamps, df_cat, thresold_nb_timestamp)

    # Remove trajectories without categorical value
    df_values, df_timestamps, df_cat = remove_trajectories_without_categorical_value(df_values, df_timestamps, df_cat)

    # Remove onset respiratory trajectory patients
    if cat_var == "GROUPEDEB":
        df_values, df_timestamps, df_cat = remove_respiratory_trajectories(df_values, df_timestamps, df_cat)

    # Remove ALS trajectories that increase too much
    if score_var not in ("CVF", "WEIGHT") and data_type == "CLINICAL":
        thresold_rise, keep_wrong = 2, False
        df_values, df_timestamps, df_cat = remove_wrong_rise_trajectories(
            df_values, df_timestamps, df_cat, thresold_rise, keep_wrong
        )

    # Normalize data values
    if is_normalized == "YES":
        df_values = normalize_trajectory_values(df_values)

    # Drop duplicate rows from categorical dataframe if any
    df_cat = df_cat[~df_cat.index.duplicated()]

    print("Number of trajectories after filtering :", len(np.unique(df_values.index.values)))

    # Print count for each distinct categorical value
    print("Count for each distinct categorical value :", df_cat.value_counts())

    # Print min and max values from trajectories.
    # The pandas reductions skip NaN, whereas the builtin max()/min() give a result
    # that depends on where NaN sits and raise on an empty dataframe.
    print("max trajectory value :", df_values.max().max())
    print("min trajectory value :", df_values.min().min())

    # Return filtered dataframes
    return df_values, df_timestamps, df_cat

In [None]:
def get_filtered_trajectory_values_timestamps(index, df_values, df_timestamps):

    """
    Extract one patient's trajectory and pass it through filter_trajectory.

    :param index: ID patient used as row label in both dataframes
    :param df_values: Dataframe that contains values from trajectories
    :param df_timestamps: Dataframe that contains timestamps from trajectories
    :return: Whatever filter_trajectory returns for the (values, timestamps) of this patient
    """

    # Select the patient's row(s) in each dataframe and flatten them to 1-D arrays
    patient_values, patient_timestamps = (
        np.array(frame.loc[index, :]).flatten()
        for frame in (df_values, df_timestamps)
    )

    return filter_trajectory(patient_values, patient_timestamps)

In [1]:
def get_diagnosis():

    """
    Get diagnosis data (that do not change over time).

    The dataset name and the wanted columns are asked to the user.

    :return: Dataframe with the requested diagnosis columns, or None when no column is requested
    :raises ValueError: If the requested dataset is not supported
    """

    dataset = input("On which dataset do you want diagnosis data ?:").strip().upper()
    list_cat = input("Which diagnosis data do you want (present list of data like this 'GROUPEDEB, SEX,...')?:").replace(" ", "").upper()

    # Only the CHRU dataset is available; fail clearly instead of hitting an unbound query
    if dataset != "CHRU":
        raise ValueError("Unsupported dataset for diagnosis data: " + repr(dataset))
    sql = 'SELECT * FROM "CHRU_CLINIQUE";'

    # Ignore empty names produced by stray or trailing commas
    wanted_columns = [name for name in list_cat.split(",") if name]

    # Check if user want some additionnal features; nothing to fetch otherwise
    if not wanted_columns:
        return None

    # Fetch all data; rows are read while the connection is still open
    with engine.connect().execution_options(autocommit=True) as conn:
        rows = conn.execute(sql).fetchall()
    df = pd.DataFrame(rows)

    # Remove duplicated ID columns
    df = df.loc[:, ~df.columns.duplicated()].copy()

    # Set ID column as row index
    df = df.set_index("ID")

    return df[wanted_columns]