In [3]:
def normalize_trajectory_values(df_values):

    """
    Normalize between 0 and 1 (Min-Max scaler) values from trajectories.

    A single global minimum and maximum, taken over every column, is used so
    that all trajectories share the same scale. Missing values (NaN) are
    ignored when computing the bounds and stay NaN in the result.

    The input dataframe is left untouched.

    :param df_values: Dataframe that contains trajectory values
    :return: New dataframe with normalized values. If every non-null value is
             identical, all non-null values are mapped to 0.
    """

    # Global bounds over all trajectories; pandas reductions skip NaN, which
    # the builtin max()/min() do not handle reliably.
    max_val = df_values.max().max()
    min_val = df_values.min().min()

    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: avoid dividing by zero, everything maps to 0.
        value_range = 1

    # Vectorised min-max scaling; builds a new dataframe.
    return (df_values - min_val) / value_range

In [4]:
def filter_trajectory(values, timestamps):

    """
    Drop the appointments of a trajectory whose value or timestamp is NaN.

    Positions are taken from 'values': position i is kept only when both
    values[i] and timestamps[i] are not NaN.

    :param values: List of values from a trajectory
    :param timestamps: List of timestamps from a trajectory
    :return: A pair (values, timestamps) of lists holding only the kept appointments
    """

    # Positions where neither the value nor the timestamp is missing
    kept_positions = [
        position
        for position in range(len(values))
        if not (np.isnan(values[position]) or np.isnan(timestamps[position]))
    ]

    kept_values = [values[position] for position in kept_positions]
    kept_timestamps = [timestamps[position] for position in kept_positions]

    return kept_values, kept_timestamps

In [5]:
def limit_trajectory_duration(df_timestamps, thresold_duration):

    """
    Limit follow-up to n days from patients.

    Every timestamp strictly greater than 'thresold_duration' is replaced by
    NaN; timestamps that are already NaN stay NaN. The input dataframe is left
    untouched.

    :param df_timestamps: Dataframe that contains duration from follow-up in days
    :param thresold_duration: Max duration authorized
    :return: New dataframe of timestamps limited to n days
    """

    # DataFrame.where keeps the entries satisfying the condition and fills the
    # others with NaN. This avoids the 'np.NaN' alias, removed in NumPy 2.0.
    return df_timestamps.where(df_timestamps <= thresold_duration)

In [1]:
def remove_short_trajectories(df_values, df_timestamps, df_cat, thresold_nb_timestamp):

    """
    Remove trajectories that contains less than 'thresold_nb_timestamp' appointments.

    The number of appointments of a trajectory is the length of the filtered
    values returned by 'get_filtered_trajectory_values_timestamps' for its index.

    :param df_values: Dataframe that contains values from trajectories
    :param df_timestamps: Dataframe that contains timestamps from trajectories
    :param df_cat: Dataframe that contains a categorical value for each trajectory
    :param thresold_nb_timestamp: Minimum number of appointments a trajectory must have to be kept
    :return: Values, timestamps and categorical Dataframes without short trajectories
    """

    # Unique trajectory identifiers, in sorted order
    unique_indexes = np.unique(df_values.index.values)

    index_list = []

    for index in unique_indexes:

        values_filter, timestamps_filter = get_filtered_trajectory_values_timestamps(index, df_values, df_timestamps)

        if len(values_filter) >= thresold_nb_timestamp:
            index_list.append(index)

    count_wrong = len(unique_indexes) - len(index_list)

    # Report the threshold actually used rather than a hard-coded "two"
    print("Number of removed too short trajectory with less than", thresold_nb_timestamp, "timestamps :", count_wrong)

    return df_values.loc[index_list], df_timestamps.loc[index_list], df_cat.loc[index_list]

In [None]:
def remove_low_ped_len_trajectories(df_values, df_timestamps, df_cat, thresold_nb_timestamp, thresold_ped):

    """
    Remove too short trajectories with low periodicity

    A trajectory is rejected when it has at least 2 but fewer than
    'thresold_nb_timestamp' filtered appointments AND the value returned by
    'get_periodicity' for its filtered timestamps is below 'thresold_ped'.
    Trajectories with fewer than 2 filtered appointments are not rejected here.

    The kept trajectories are returned in sorted index order.

    :param df_values: Dataframe that contains values from trajectories
    :param df_timestamps: Dataframe that contains timestamps from trajectories
    :param df_cat: Dataframe that contains a categorical value for each trajectory
    :param thresold_nb_timestamp: Numerical value that set the minimum thresold appointment
    :param thresold_ped: Numerical value that set the minimum thresold periodicity
    :return: Values, timestamps and categorical Dataframes without short trajectories
    """

    # Unique trajectory identifiers, in sorted order
    unique_indexes = np.unique(df_values.index.values)
    rejected_indexes = set()

    for index in unique_indexes:

        values_filter, timestamps_filter = get_filtered_trajectory_values_timestamps(index, df_values, df_timestamps)

        # Store trajectories that do not respect conditions
        if 2 <= len(values_filter) < thresold_nb_timestamp:
            if get_periodicity(timestamps_filter) < thresold_ped:
                rejected_indexes.add(index)

    # Filter while preserving the sorted order; a set difference would yield
    # an arbitrary, run-dependent row order.
    index_list = [index for index in unique_indexes if index not in rejected_indexes]

    count_wrong = len(unique_indexes) - len(index_list)

    print("Number of removed trajectories with low periodicity and low timestamp frequency :", count_wrong)

    return df_values.loc[index_list], df_timestamps.loc[index_list], df_cat.loc[index_list]

In [7]:
def remove_trajectories_without_categorical_value(df_values, df_timestamps, df_cat):

    """
    Drop every trajectory whose category is the string "None".

    The category is read from the first column of 'df_cat'. The matching
    index labels are dropped from the three dataframes.

    :param df_values: Dataframe that contains values from trajectories
    :param df_timestamps: Dataframe that contains timestamps from trajectories
    :param df_cat: Dataframe that contains a categorical value for each trajectory
    :return: Values, timestamps and categorical Dataframes without the dropped trajectories
    """

    category_column = df_cat.columns[0]
    is_missing = df_cat[category_column] == "None"

    # Index labels of the trajectories to drop
    wrong_indexes = np.unique(df_cat[is_missing].index.values)

    print("Number of removed trajectory without categorical value :", len(wrong_indexes))

    return tuple(frame.drop(wrong_indexes) for frame in (df_values, df_timestamps, df_cat))

In [8]:
def remove_wrong_rise_trajectories(df_values, df_timestamps, df_cat, thresold_rise, keep_wrong):

    """
    Select trajectories according to the rises between consecutive values.

    A trajectory is "wrong" when at least one pair of consecutive filtered
    values increases by more than 'thresold_rise'. With 'keep_wrong' set to
    False only the trajectories that are not wrong are returned; with a truthy
    'keep_wrong' only the wrong ones are returned.

    :param df_values: Dataframe that contains values from trajectories
    :param df_timestamps: Dataframe that contains timestamps from trajectories
    :param df_cat: Dataframe that contains a categorical value for each trajectory
    :param thresold_rise: Integer that set the thresold rise to not exceed
    :param keep_wrong: Specify if we return just wrong trajectories (Contains greater rises than the thresold)
    :return: Values, timestamps and categorical Dataframes restricted to the selected trajectories
    """

    unique_indexes = np.unique(df_values.index.values)
    selected = []

    for index in unique_indexes:

        values_filter, timestamps_filter = get_filtered_trajectory_values_timestamps(index, df_values, df_timestamps)

        # True when some consecutive pair rises by more than the threshold
        has_big_rise = any(
            values_filter[position + 1] - values_filter[position] > thresold_rise
            for position in range(len(values_filter) - 1)
        )

        if (not has_big_rise and keep_wrong == False) or (has_big_rise and keep_wrong):
            selected.append(index)

    index_list = np.unique(selected)
    count = len(unique_indexes) - len(index_list)

    message = (
        "Number of removed trajectory that respect thresold :"
        if keep_wrong
        else "Number of removed trajectory that do not respect thresold rise :"
    )
    print(message, count)

    return df_values.loc[index_list], df_timestamps.loc[index_list], df_cat.loc[index_list]

In [None]:
def remove_respiratory_trajectories(df_values, df_timestamps, df_cat):

    """
    Drop every trajectory whose category is 'Respiratoire'.

    The category is read from the first column of 'df_cat'. The matching
    index labels are dropped from the three dataframes.

    :param df_values: Dataframe that contains values from trajectories
    :param df_timestamps: Dataframe that contains timestamps from trajectories
    :param df_cat: Dataframe that contains a categorical value for each trajectory
    :return: Values, timestamps and categorical Dataframes without the 'Respiratoire' trajectories
    """

    first_column = df_cat.columns[0]
    respiratory_rows = df_cat[df_cat[first_column] == "Respiratoire"]

    # Index labels of the trajectories to drop
    wrong_indexes = np.unique(respiratory_rows.index.values)
    count_wrong = len(wrong_indexes)

    print("Number of removed trajectory with that contains 'Respiratoire' as categorical value :", count_wrong)

    values_kept = df_values.drop(wrong_indexes)
    timestamps_kept = df_timestamps.drop(wrong_indexes)
    cat_kept = df_cat.drop(wrong_indexes)

    return values_kept, timestamps_kept, cat_kept

In [None]:
def sample_dataframe(df, cat_var):

    """
    Sample dataframe from categorical variable groups. Every group has the same amount of data point.

    Each group is randomly down-sampled (without replacement) to the size of
    the smallest group. The input dataframe is not modified.

    :param df: Dataframe to sample
    :param cat_var: Categorical variable name
    :return: Dataframe sampled (empty, with the same columns, when 'df' has no rows)
    """

    group_values = np.unique(df[cat_var].values)

    if len(group_values) == 0:
        # Nothing to balance: return an empty frame with the same columns
        print("number of sample for each categorical group :", 0)
        return df.iloc[0:0].copy()

    # Rows of 'df' belonging to each categorical group
    groups = [df[df[cat_var] == v] for v in group_values]

    # Get number from the smallest categorical group
    nb_min_samples = min(len(group) for group in groups)

    # Sample Dataframe from categorical value (Balance each categorical group)
    df_sampled = pd.concat(
        [group.sample(n=nb_min_samples) for group in groups],
        axis=0,
    )

    print("number of sample for each categorical group :", nb_min_samples)

    return df_sampled