In [1]:
import pickle
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt


In [2]:
def load_all_pickles(folder_path):
    '''Load every ``.pkl`` file stored directly inside a folder.

    Only regular files whose name ends in ``.pkl`` (case-insensitive) are read.
    Any other entry (sub-directories, hidden files such as ``.DS_Store``, ...)
    is skipped instead of making the load fail. Files are visited in sorted
    name order because ``os.listdir`` returns entries in arbitrary order; a
    fixed order keeps the resulting dictionary (and any seeded shuffle applied
    to it later) reproducible across machines.

    Parameters
    ----------
    folder_path : str
        String with the path where the pkl files are saved i.e.,
        '/Users/luisescobar/Documents/Thesis/DataSets/Dictionary/02_Clotting_Labelling'

    Returns
    -------
    pickles : dict
        Maps each file name (extension included) to the object unpickled from
        that file. The dictionary is empty when the folder holds no ``.pkl`` file.

    Raises
    ------
    OSError
        If ``folder_path`` does not exist or cannot be listed/read.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code, so this function must only be
    pointed at folders whose content is trusted.
    '''
    pickles = {}
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Skip anything that is not a regular .pkl file (directories, OS metadata, ...)
        if not file_name.lower().endswith('.pkl') or not os.path.isfile(file_path):
            continue
        with open(file_path, 'rb') as f:
            pickles[file_name] = pickle.load(f)

    return pickles

# Load dictionaries 
# NOTE(review): absolute, machine-specific path; the notebook only runs where this folder exists.
folder_path = '/Users/luisescobar/Documents/Thesis/DataSets/Dictionary/02_Clotting_Labelling'  
# Result maps each file name found in the folder to the object unpickled from it.
loaded_pickles = load_all_pickles(folder_path)


In [3]:
# Split the loaded pickles into the two classes using the label embedded in each file name.
no_clotting_dict = {}
clotting_dict = {}

for key, value in loaded_pickles.items():
    # File names that mention neither label belong to no class and are left out.
    if "clotting" not in key:
        continue
    # "no_clotting" contains "clotting" as a substring, so the negative label
    # has to be checked explicitly to route the entry to the right class.
    target = no_clotting_dict if "no_clotting" in key else clotting_dict
    target[key] = value


Now I want to run the quality control. I need to iterate through each .pkl file. Each file is composed of multiple time series; I want to discard those whose length is below the minimum (`min_length = 70` in the code below). I would then like to print the number of time series left in each dictionary after running this filter.

In [4]:
# Function to remove DataFrames shorter than a given minimum length
def remove_small_dfs(outer_dict, min_length):
    '''Drop, in place, every Data Frame that has fewer than ``min_length`` rows.

    Parameters
    ----------
    outer_dict : dict
        Dictionary of dictionaries. See the structure below
            data_dict = {
                        'inner_dict_1': {
                            'df_trt_1': df1,
                            'df_trt_2': df2
                        },
                        'inner_dict_2': {
                            'df_trt_3': df3
                        }
                    }
        Each one of the inner dictionaries contains a set of Data Frames, each Data Frame corresponds to a single treatment.
        Every inner dictionary is replaced by a new dictionary holding only the
        Data Frames that pass the filter; the Data Frames themselves are not copied.

    min_length : int
        Minimum length (``len(df)``) a Data Frame must have to be kept. Data Frames
        whose length is exactly ``min_length`` are kept.

    Returns
    -------
    None
        ``outer_dict`` is modified in place; nothing is returned.
    '''
    for group_name, frames in outer_dict.items():
        long_enough = {}
        for df_name, df in frames.items():
            if len(df) < min_length:
                continue
            long_enough[df_name] = df
        # Re-assigning an existing key does not change the dict size, so it is safe while iterating
        outer_dict[group_name] = long_enough


def remove_undesired_columns(outer_dict, columns):
    '''Replace, in place, every Data Frame by a copy without the given columns.

    Parameters
    ----------
    outer_dict : dict
        Dictionary of dictionaries. See the structure below
            data_dict = {
                        'inner_dict_1': {
                            'df_trt_1': df1,
                            'df_trt_2': df2
                        },
                        'inner_dict_2': {
                            'df_trt_3': df3
                        }
                    }
        Each one of the inner dictionaries contains a set of Data Frames, each Data Frame corresponds to a single treatment.
        Every inner dictionary is replaced by a new dictionary holding the reduced Data Frames.

    columns : list
        List with the columns we want to remove. Columns that are not present in a
        Data Frame are silently ignored (``errors='ignore'``).

    Returns
    -------
    None
        ``outer_dict`` is modified in place; nothing is returned.
    '''
    for group_name, frames in outer_dict.items():
        reduced = {}
        for df_name, df in frames.items():
            # DataFrame.drop returns a new frame; the original object is left untouched
            reduced[df_name] = df.drop(columns=columns, errors='ignore')
        outer_dict[group_name] = reduced


def remove_remaining_data(outer_dict):
    '''Truncate, in place, every Data Frame after its second run of 'Clotting_2' values.

    Consecutive rows with the same 'Clotting_2' value form a run. Only the rows of
    the first two runs are kept, i.e. everything from the third run onwards is cut.
    NOTE(review): this presumably keeps the data up to the end of the first
    blocking/clotting event, which assumes each series starts in the non-clotting
    state; confirm against the labelling notebook.

    Parameters
    ----------
    outer_dict : dict
        Dictionary of dictionaries. See the structure below
            data_dict = {
                        'inner_dict_1': {
                            'df_trt_1': df1,
                            'df_trt_2': df2
                        },
                        'inner_dict_2': {
                            'df_trt_3': df3
                        }
                    }
        Each one of the inner dictionaries contains a set of Data Frames, each Data Frame corresponds to a single treatment.
        Every Data Frame must have a 'Clotting_2' column.

    Returns
    -------
    None
        ``outer_dict`` is modified in place; nothing is returned.
    '''
    for group_name, frames in outer_dict.items():
        truncated = {}
        for df_name, df in frames.items():
            labels = df['Clotting_2']
            # A row starts a new run when its label differs from the previous row's label;
            # the cumulative sum of those change points numbers the runs 1, 2, 3, ...
            run_number = labels.ne(labels.shift()).cumsum()
            truncated[df_name] = df[run_number <= 2]
        outer_dict[group_name] = truncated

def length_total(dict_primal):
    '''Print, for every secondary dictionary embedded in a primal dictionary, its key and its number of entries.

    One line is printed per secondary dictionary, formatted as ``<key> <number of entries>``.

    Parameters
    ----------
    dict_primal : dict
        Dictionary of dictionaries. See the structure below
            data_dict = {
                        'inner_dict_1': {
                            'df_trt_1': df1,
                            'df_trt_2': df2
                        },
                        'inner_dict_2': {
                            'df_trt_3': df3
                        }
                    }
        Each one of the inner dictionaries contains a set of Data Frames, each Data Frame corresponds to a single treatment

    Returns
    -------
    None
        The counts are only printed.
    '''
    # Snapshot the keys first so the report is unaffected by changes made while printing
    for group_name in list(dict_primal):
        inner_dict = dict_primal[group_name]
        entry_count = len(inner_dict.items())
        print(f'{group_name} {entry_count}')

    
def combined_items(dict_primal):
    '''Flatten a dictionary of dictionaries into a single dictionary of Data Frames.

    The secondary dictionaries are merged in the order in which they appear in
    ``dict_primal``. If the same Data Frame name occurs in more than one secondary
    dictionary, the entry merged last silently replaces the earlier one.

    Parameters
    ----------
    dict_primal : dict
        Dictionary of dictionaries. See the structure below
            dict_primal = {
                        'inner_dict_1': {
                            'df_trt_1': df1,
                            'df_trt_2': df2
                        },
                        'inner_dict_2': {
                            'df_trt_3': df3
                        }
                    }
        Each one of the inner dictionaries contains a set of Data Frames, each Data Frame corresponds to a single treatment.
        The input is not modified.

    Returns
    -------
    combined_dict : dict
        New combined dictionary built from dict_primal. See the structure below
            combined_dict = {
                            'df_trt_1': df1,
                            'df_trt_2': df2,
                            'df_trt_3': df3
                        }
    '''
    flattened = {}
    for inner_dict in dict_primal.values():
        flattened.update(inner_dict.items())
    return flattened



In [5]:
# Columns that are dropped from every Data Frame
columns = [
    'Date__Heure', 'trt', 'Patient_weight__Kg_', 'Set',
    'Condition_1', 'Condition_2', 'Delta_P_ref', 'TMP_ref',
    'Clotting_1', 'group',
]
# Minimum number of rows a Data Frame must have to be kept
min_length = 70

# Both classes go through the same two filters: first discard the small
# Data Frames (length), then remove the undesired columns.
for class_dict in (no_clotting_dict, clotting_dict):
    remove_small_dfs(class_dict, min_length)
    remove_undesired_columns(class_dict, columns)

# Only for the blocking/clotting data: cut the elements after the blocking event.
# NOTE(review): the length filter runs before this truncation, so truncated
# clotting Data Frames can end up shorter than min_length; confirm this is intended.
remove_remaining_data(clotting_dict)

Let's see how many DataFrames we have in each class.

In [8]:
# Print, per clotting pickle file, how many Data Frames it currently holds
length_total(clotting_dict)

completo1007_(edit)_clotting.pkl 18
completo600_(edit)_clotting.pkl 15
completo400_(edit)_clotting.pkl 19
completo_800_output_file_clotting.pkl 25
completo200_(edit)_clotting.pkl 14


In [9]:
# Print, per no-clotting pickle file, how many Data Frames it currently holds
length_total(no_clotting_dict)

completo400_(edit)_no_clotting.pkl 121
completo600_(edit)_no_clotting.pkl 155
completo1007_(edit)_no_clotting.pkl 157
completo200_(edit)_no_clotting.pkl 141
completo_800_output_file_no_clotting.pkl 131


In [13]:
# Combine both classes into a single dictionary and shuffle its entries reproducibly
random.seed(42)

# Start from the flattened clotting entries and add the no-clotting ones;
# on a name clash the no-clotting entry replaces the clotting one.
global_dict = combined_items(clotting_dict)
global_dict.update(combined_items(no_clotting_dict))

# Dictionaries cannot be shuffled directly: shuffle the (name, Data Frame) pairs
# and rebuild the dictionary in the shuffled order.
items = list(global_dict.items())
random.shuffle(items)
global_dict = dict(items)

In [14]:
# Total number of Data Frames once both classes are merged
len(global_dict)

796

In [16]:
# Create a single DataFrame ready to train: collect the Data Frames in their shuffled order
dataframes_list = [*global_dict.values()]

# Stack all DataFrames vertically; ignore_index=True renumbers the rows 0..N-1.
# NOTE(review): after this step no column or index marks where one time series
# ends and the next begins; confirm the training code does not need those boundaries.
combined_df = pd.concat(dataframes_list, axis=0, ignore_index=True)

In [17]:
# Summarise the combined frame: columns, non-null counts, dtypes and memory usage
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586575 entries, 0 to 1586574
Data columns (total 13 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   P_Access                 1586575 non-null  int64  
 1   P_Filter                 1586575 non-null  int64  
 2   P_Effluent               1586575 non-null  int64  
 3   P_Return                 1586575 non-null  int64  
 4   Q_Blood_Pump             1586575 non-null  int64  
 5   Q_Replacement            1586575 non-null  int64  
 6   Q_Dialysate              1586575 non-null  int64  
 7   Q_PBP                    1586575 non-null  int64  
 8   Q_Patient_Fluid_Removal  1586575 non-null  int64  
 9   DeltaP                   1586575 non-null  int64  
 10  TMP                      1586575 non-null  float64
 11  TMPa                     1586575 non-null  int64  
 12  Clotting_2               1586575 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 157