In [1]:
#Import the corresponding libraries
import pandas as pd
import pickle
import os

In [2]:
# Read the Apache Parquet dataset.
#
# Each of the files for this project contains 2 tabs. For instance,
# the file completo800_(edit).xlsx contains the tabs:
#
#   * totale
#   * Foglio 1
#
# For performance reasons the original .xlsx files were converted to
# Apache Parquet files. After this transformation the only remaining tab is
#
#   * totale
#
# which contains all the data concerning the research.

# DataFrame used for deployment:
# '/Users/luisescobar/Documents/Thesis/DataSets/Apache_parquet/completo_800_output_file.parquet'

# Folder to read the Parquet file from, and folder where the final dictionary is saved
path_to_read = '/Users/luisescobar/Documents/Thesis/DataSets/Apache_parquet'
path_to_save = '/Users/luisescobar/Documents/Thesis/DataSets/Dictionary'
file = 'completo1007_(edit).parquet'
name_read = '/'.join((path_to_read, file))
df = pd.read_parquet(name_read, engine='fastparquet')

In [3]:
df.head()

Unnamed: 0,Patient,Date__Heure,P_Access,P_Filter,P_Effluent,P_Return,FIFTH_PRESSURE,RUN_TIME,Post_Replacement,Pre_Replacement,...,Citric_Acid_Concentration__mmol_,Citrate_Dose__mmol_L_,Calcium_Compensation____,Calcium_syringe_Concentration__m,Treatment_Duration__h_,effluent2,delta_eff,max_line,reverse,sum
0,,2011-07-28 22:58:00,54,-25,-26,-38,-37,73854,31168,0,...,_,_,_,_,35.179443,109341,-23,3710,2119,-146001
1,,2011-07-28 22:59:00,7,57,-13,33,-1,73878,31170,0,...,_,_,_,_,35.179443,109423,-82,3710,2118,-145978
2,,2011-07-28 23:00:00,7,94,-1,60,-2,73938,31192,0,...,_,_,_,_,35.179443,109497,-74,3710,2117,-145896
3,,2011-07-28 23:01:00,27,121,28,95,-3,73998,31218,0,...,_,_,_,_,35.179443,109570,-73,3710,2116,-145822
4,,2011-07-28 23:02:00,6,112,19,72,-3,74058,31238,0,...,_,_,_,_,35.179443,109647,-77,3710,2115,-145749


In [4]:
#Info of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289492 entries, 0 to 289491
Data columns (total 94 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   Patient                           0 non-null       float64       
 1   Date__Heure                       289492 non-null  datetime64[ns]
 2   P_Access                          289492 non-null  int64         
 3   P_Filter                          289492 non-null  int64         
 4   P_Effluent                        289492 non-null  int64         
 5   P_Return                          289492 non-null  int64         
 6   FIFTH_PRESSURE                    289492 non-null  int64         
 7   RUN_TIME                          289492 non-null  int64         
 8   Post_Replacement                  289492 non-null  int64         
 9   Pre_Replacement                   289492 non-null  int64         
 10  Dialysate                       

In [5]:
def dict_trt(df_spec_col):
    '''Split a DataFrame into one sub-DataFrame per treatment ('trt') value.

    Parameters
    ----------
    df_spec_col : pandas.core.frame.DataFrame
        DataFrame restricted to the columns of interest. It must contain
        a 'trt' column.

    Returns
    -------
    df_dict : dict
        Dictionary mapping the key 'df_<trt>' to the rows of df_spec_col whose
        'trt' equals that value. dict.keys() looks like
            dict_keys(['df_601', 'df_602', 'df_603', 'df_604', 'df_605', ...
        Keys follow the order in which the values first appear in 'trt'.
    '''
    trt_column = df_spec_col["trt"]
    # One boolean-mask selection per distinct treatment value
    return {
        f"df_{trt_value}": df_spec_col[trt_column == trt_value]
        for trt_value in trt_column.unique()
    }


In [6]:
def trt_weight_vis(df_dict):
    '''Build a table pairing every treatment ('trt') with its patient weight.

    Parameters
    ----------
    df_dict : dict
        Dictionary of DataFrames organized by treatment 'trt' value. Every
        DataFrame must contain the columns 'trt' and 'Patient_weight__Kg_'.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        DataFrame with 'weight' set as index and 'trt' as the only column,
        sorted ascending by weight and then by trt:

        weight(index)----trt
        7.0--------------702
        7.0--------------706
        45.0-------------613
        45.0-------------616
        46.0-------------618

        There is one row per distinct (trt, weight) pair, so a treatment that
        was recorded with more than one weight contributes one row per weight.
    '''
    records = []
    for frame in df_dict.values():
        # Collect (trt, weight) as pairs. Extending two independent lists with
        # the unique values of each column goes out of step as soon as one
        # treatment has several distinct weights.
        pairs = frame[["trt", "Patient_weight__Kg_"]].drop_duplicates()
        records.extend(pairs.itertuples(index=False, name=None))

    df = pd.DataFrame(records, columns=['trt', 'weight'])
    df = df.sort_values(by=['weight', 'trt']).reset_index(drop=True)
    return df.set_index("weight")

<div class="alert alert-block alert-warning">
  <p>
    <b>To Do</b>
  </p>

  <p>
    I would like to have a function in which I can arbitrarily choose the maximum difference between trt values and, according to this value, merge the DataFrames.
  </p>
    
  <p>
  </p>
</div>

In [7]:
def trt_to_merge(df, tolerance):
    '''Flag consecutive treatments that may belong to the same patient.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame indexed by 'weight' with a 'trt' column, sorted by weight and
        then by trt. Output of the trt_weight_vis function. It is not modified.

    tolerance : int
        Maximum difference allowed between two consecutive treatment values for
        them to be considered the same patient,
        i.e., a tolerance of 1 produces the following DataFrame

            weight   trt    diff    tolerance   merge
        [0] 7.0      702    4.0     False       False
        [1] 7.0      706    93.0    False       False
        [2] 45.0     613    3.0     False       False
        [3] 45.0     616    2.0     False       False
        [4] 46.0     618    1.0     True        True
        [5] 46.0     619    NaN     False       False

        Row [4] is the only True one: it is the only row in which the
        difference between trt[i] and trt[i+1] is 1 and both rows share the
        same weight.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        A new DataFrame with the columns
        ['weight'   'trt'    'diff'    'tolerance'   'merge']

        weight(index): weight value of the different patients
        trt: treatment value
        diff: absolute difference between trt[i] and trt[i+1]
              (NaN on the last row, which has no successor)
        tolerance: True if diff <= tolerance, False otherwise
        merge: True if tolerance is True and the weight of row [i+1] equals
               the weight of row [i], False otherwise
    '''
    # Work on a copy so the caller's DataFrame does not silently gain columns
    result = df.copy()

    # Absolute difference between each treatment and the one on the next row
    next_trt = result['trt'].shift(-1)
    result['diff'] = (result['trt'] - next_trt).abs()

    # Rows whose distance to the next treatment is within the tolerance
    result['tolerance'] = result['diff'] <= tolerance

    # A pair is only mergeable when the next row has the same weight (index).
    # The comparison is positional, hence the plain arrays.
    next_weight = result.index.to_series().shift(-1)
    same_weight = next_weight.to_numpy() == result.index.to_numpy()
    result['merge'] = result['tolerance'] & same_weight

    return result

In [8]:
def patient_recognition(df, columns, tolerance):
    '''Run the whole matching pipeline: dict_trt -> trt_weight_vis -> trt_to_merge.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame with all the patients' information
    columns : list
        Columns of interest i.e.,
        columns=["Date__Heure","P_Access","P_Filter","P_Effluent","P_Return", ...
    tolerance : int
        Maximum difference between consecutive treatment values of patients
        with the same weight; passed unchanged to trt_to_merge.

    Returns
    -------
    df : pandas.core.frame.DataFrame
        Result of trt_to_merge, with the columns
        ['weight'   'trt'    'diff'    'tolerance'   'merge']
    df_dict : dict
        Result of dict_trt applied to df[columns]: one DataFrame per
        treatment value.
    '''
    # One DataFrame per treatment, restricted to the requested columns
    treatment_frames = dict_trt(df[columns])
    # Weight/trt table, then flag the treatments that should be merged
    merge_table = trt_to_merge(trt_weight_vis(treatment_frames), tolerance)

    return merge_table, treatment_frames

<div class="alert alert-block alert-info"> <b>Part One</b> 
    <p>
        <ul>
        <li>DataFrame ['df_patients'] with information on the possible trt values to merge. This decision depends on the diff value compared against the tolerance parameter.</li>
        <li>Dictionary ['df_dict_initial'] in which each key corresponds to the DataFrame of the respective trt number. The dictionary is a collection of DataFrames.</li>
        </ul>
             </p> </div>

In [9]:
# Columns to consider from the original data
columns = [
    "Date__Heure",
    "P_Access",
    "P_Filter",
    "P_Effluent",
    "P_Return",
    "Q_Blood_Pump",
    "Q_Replacement",
    "Q_Dialysate",
    "Q_PBP",
    "DeltaP",
    "TMP",
    "TMPa",
    "trt",
    "Patient_weight__Kg_",
    "Set",
]
# tolerance=1: only consecutive treatment numbers are candidates for merging
df_patients, df_dict_initial = patient_recognition(df, columns, tolerance=1)

In [10]:
df_patients.head(10)

Unnamed: 0_level_0,trt,diff,tolerance,merge
weight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
50.0,833,56.0,False,False
50.0,889,16.0,False,False
50.0,905,10.0,False,False
50.0,915,4.0,False,False
55.0,911,1.0,True,True
55.0,912,1.0,True,True
55.0,913,110.0,False,False
56.0,803,50.0,False,False
56.0,853,100.0,False,False
56.0,953,32.0,False,False


<div class="alert alert-block alert-info"> <b>Part Two</b> 
    <p>
        Dictionary ['dictionary_merge'] with the information of the trt values to be merged. From the previous DataFrame ['df_patients'], if merge == True, we take the current trt [i] and the next trt [i+1] as treatments to be merged.
    </p> 
</div>

In [11]:
def dictionary_merge(df_patients):
    '''Return the pairs of treatments to merge.

    Parameters
    ----------
    df_patients : pandas.core.frame.DataFrame
        DataFrame with the columns
         weight   trt    diff    tolerance   merge
        where merge == True on row [i] means "merge trt[i] with trt[i+1]".

    Returns
    -------
    trt_merge_dict : dict
        Dictionary keyed by 'index_<i>', where <i> is the row position of the
        flagged row. Each value is {'index': i, 'trt': [low, high]} with the
        two treatment values to merge in ascending order. Entries are ordered
        by the lower treatment value, e.g.

        {'index_30': {'index': 30, 'trt': [804, 805]},
         'index_121': {'index': 121, 'trt': [817, 818]}, ...

        A flagged row without a following row is ignored, since there is
        nothing to pair it with.
    '''
    df = df_patients.reset_index()
    trt_values = df['trt']
    n_rows = len(df)
    # After reset_index the labels are 0..n-1, so label == row position
    flagged_rows = df.index[df['merge'] == True].tolist()

    trt_merge_dict = {}
    for index_val in flagged_rows:
        if index_val + 1 >= n_rows:
            continue
        # The partner is the treatment on the NEXT row. Assuming trt + 1 is
        # only right when the tolerance is 1.
        pair = sorted([trt_values.iloc[index_val], trt_values.iloc[index_val + 1]])
        trt_merge_dict[f"index_{index_val}"] = {'index': index_val, 'trt': pair}

    # Sort according to the lower 'trt' of each pair. A comprehension is used
    # rather than dict(...) so this works even if the name `dict` has been
    # rebound in the notebook.
    ordered_items = sorted(trt_merge_dict.items(), key=lambda item: item[1]['trt'][0])
    return {name: entry for name, entry in ordered_items}
        

In [12]:
trt_merge_dict = dictionary_merge(df_patients)
trt_merge_dict

{'index_30': {'index': 30, 'trt': [804, 805]},
 'index_121': {'index': 121, 'trt': [817, 818]},
 'index_52': {'index': 52, 'trt': [837, 838]},
 'index_55': {'index': 55, 'trt': [842, 843]},
 'index_56': {'index': 56, 'trt': [843, 844]},
 'index_171': {'index': 171, 'trt': [847, 848]},
 'index_59': {'index': 59, 'trt': [864, 865]},
 'index_60': {'index': 60, 'trt': [865, 866]},
 'index_61': {'index': 61, 'trt': [866, 867]},
 'index_124': {'index': 124, 'trt': [874, 875]},
 'index_125': {'index': 125, 'trt': [875, 876]},
 'index_126': {'index': 126, 'trt': [876, 877]},
 'index_65': {'index': 65, 'trt': [894, 895]},
 'index_67': {'index': 67, 'trt': [899, 900]},
 'index_4': {'index': 4, 'trt': [911, 912]},
 'index_5': {'index': 5, 'trt': [912, 913]},
 'index_43': {'index': 43, 'trt': [916, 917]},
 'index_74': {'index': 74, 'trt': [935, 936]},
 'index_75': {'index': 75, 'trt': [936, 937]},
 'index_96': {'index': 96, 'trt': [946, 947]},
 'index_97': {'index': 97, 'trt': [947, 948]},
 'index

In [13]:
# Turn the merge dictionary into a table, one row per pair of treatments:
#   * the dictionary keys ('index_<i>') become the rows and are moved into a
#     regular column by reset_index (named 'level_0', since 'index' is taken)
#   * the 'index' field of every entry is renamed to 'original_index'
df_dict = (
    pd.DataFrame.from_dict(trt_merge_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'original_index'})
)
df_dict.head(15)

Unnamed: 0,level_0,original_index,trt
0,index_30,30,"[804, 805]"
1,index_121,121,"[817, 818]"
2,index_52,52,"[837, 838]"
3,index_55,55,"[842, 843]"
4,index_56,56,"[843, 844]"
5,index_171,171,"[847, 848]"
6,index_59,59,"[864, 865]"
7,index_60,60,"[865, 866]"
8,index_61,61,"[866, 867]"
9,index_124,124,"[874, 875]"


<div class="alert alert-block alert-warning"> 
    At this point we found the problem that in some cases we have to merge more than 2 treatments. Consider, as an example, the iloc indexes [8] and [9]:
    <li>[8] corresponds to the treatments [684, 685], which belong to the same patient because of the weight value</li>
    <li>[9] corresponds to the treatments [685, 686], which belong to the same patient because of the weight value</li>
    The two pairs intersect at the value 685, which means that treatments 684, 685 and 686 potentially correspond to the same patient. 
</div>

In [14]:
def trt_compar(df, i):
    '''Check whether the trt pairs of two consecutive rows, [i] and [i+1], intersect.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        DataFrame with the columns

        level_0	    original_index	    trt
    [0]	index_48	     48	         [608, 609]
    [1]	index_4	          4	         [618, 619]
    [2]	index_16	     16	         [624, 625]

    i : int
        Position of the first of the two rows to compare; row i+1 must exist.

    Returns
    -------
    True / False : boolean
        True when the second trt of row [i] equals the first trt of row [i+1],
        False otherwise.
    '''
    # Read from the DataFrame that was passed in, not from a notebook global
    last_of_current = list(df['trt'].iloc[i])[1]
    first_of_next = list(df['trt'].iloc[i + 1])[0]
    return bool(last_of_current == first_of_next)

In [15]:
def dictionary_multiple_merge(df_dict):
    '''Group the pairs of treatments into chains of treatments to merge.

    Two consecutive rows belong to the same chain when the second trt of the
    first row equals the first trt of the next one, e.g. [684, 685] and
    [685, 686].

    Parameters
    ----------
    df_dict : pandas.core.frame.DataFrame
        DataFrame with the columns

        level_0	    original_index	    trt
    [0]	index_48	     48	         [608, 609]
    [1]	index_4	          4	         [618, 619]
    [2]	index_16	     16	         [624, 625]

        sorted so that intersecting pairs sit on consecutive rows.

    Returns
    -------
    dict : dict
        Dictionary with the potential treatments to merge. Every row of the
        input ends up in exactly one entry:

        * an isolated pair gives {'values': [a, b]}
        * a chain gives {'values': [(pair_0, pair_1), [pair_2], [pair_3], ...]}

        The key is 'index_<original_index>' of the LAST row of the entry:

        {'index_30': {'values': [804, 805]},
         'index_56': {'values': [([842, 843], [843, 844])]},
         'index_61': {'values': [([864, 865], [865, 866]), [[866, 867]]]}, ...
    '''
    merged = {}
    trt_col = df_dict['trt']
    index_col = df_dict['original_index']
    n_rows = len(df_dict)

    start = 0
    # Walk over every row, including the last one
    while start < n_rows:
        # Extend the chain while the pair on row `end` intersects the next one
        end = start
        while end + 1 < n_rows and list(trt_col.iloc[end])[1] == list(trt_col.iloc[end + 1])[0]:
            end += 1

        if end == start:
            # Isolated pair: keep it as a flat list
            values = list(trt_col.iloc[start])
        else:
            # Chain: the first two pairs as a tuple, every further pair wrapped
            # in its own list
            values = [(trt_col.iloc[start], trt_col.iloc[start + 1])]
            values.extend([trt_col.iloc[k]] for k in range(start + 2, end + 1))

        merged[f"index_{index_col.iloc[end]}"] = {'values': values}
        # Resume after the chain so that no row is emitted twice
        start = end + 1

    return merged

In [16]:
# Groups of treatments to merge, keyed by 'index_<original_index>'.
# NOTE(review): the name `dict` rebinds the Python builtin for the rest of the
# kernel session; any later call to dict(...) will raise TypeError until the
# kernel is restarted. Consider renaming it together with the cells that read it.
dict = dictionary_multiple_merge(df_dict)

In [17]:
dict

{'index_30': {'values': [804, 805]},
 'index_121': {'values': [817, 818]},
 'index_52': {'values': [837, 838]},
 'index_56': {'values': [([842, 843], [843, 844])]},
 'index_171': {'values': [847, 848]},
 'index_61': {'values': [([864, 865], [865, 866]), [[866, 867]]]},
 'index_126': {'values': [([874, 875], [875, 876]), [[876, 877]]]},
 'index_65': {'values': [894, 895]},
 'index_67': {'values': [899, 900]},
 'index_5': {'values': [([911, 912], [912, 913])]},
 'index_43': {'values': [916, 917]},
 'index_75': {'values': [([935, 936], [936, 937])]},
 'index_97': {'values': [([946, 947], [947, 948])]},
 'index_99': {'values': [950, 951]},
 'index_102': {'values': [958, 959]},
 'index_140': {'values': [964, 965]},
 'index_93': {'values': [970, 971]},
 'index_143': {'values': [972, 973]},
 'index_149': {'values': [([988, 989], [989, 990])]},
 'index_155': {'values': [([999, 1000], [1000, 1001])]},
 'index_110': {'values': [([1005, 1006], [1006, 1007])]}}

In [18]:
dict.items()

dict_items([('index_30', {'values': [804, 805]}), ('index_121', {'values': [817, 818]}), ('index_52', {'values': [837, 838]}), ('index_56', {'values': [([842, 843], [843, 844])]}), ('index_171', {'values': [847, 848]}), ('index_61', {'values': [([864, 865], [865, 866]), [[866, 867]]]}), ('index_126', {'values': [([874, 875], [875, 876]), [[876, 877]]]}), ('index_65', {'values': [894, 895]}), ('index_67', {'values': [899, 900]}), ('index_5', {'values': [([911, 912], [912, 913])]}), ('index_43', {'values': [916, 917]}), ('index_75', {'values': [([935, 936], [936, 937])]}), ('index_97', {'values': [([946, 947], [947, 948])]}), ('index_99', {'values': [950, 951]}), ('index_102', {'values': [958, 959]}), ('index_140', {'values': [964, 965]}), ('index_93', {'values': [970, 971]}), ('index_143', {'values': [972, 973]}), ('index_149', {'values': [([988, 989], [989, 990])]}), ('index_155', {'values': [([999, 1000], [1000, 1001])]}), ('index_110', {'values': [([1005, 1006], [1006, 1007])]})])

In [19]:
def flatten_list(nested_list):
    """Flatten arbitrarily nested lists/tuples into one flat list, keeping the order."""
    flat_list = []
    # Stack of iterators: the top one is the (sub)sequence currently being walked
    pending = [iter(nested_list)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, (list, tuple)):
                # Descend into the nested sequence; the parent iterator resumes later
                pending.append(iter(item))
                break
            flat_list.append(item)
        else:
            # Current sequence exhausted: go back to its parent
            pending.pop()
    return flat_list

def remove_duplicates(input_list):
    """Return the items of input_list without repetitions, first occurrence first."""
    seen = set()
    # seen.add() returns None, so the condition is True exactly once per value:
    # the first time it is met, which is also when it gets recorded.
    return [item for item in input_list if not (item in seen or seen.add(item))]


In [20]:
# Build the final dictionary: for every group of treatments flagged as a
# potential merge, concatenate the corresponding DataFrames and store the
# result next to the individual ones under '<df_a>-<df_b>-...-concat'.
for value in dict.values():
    # flatten_list copes with any nesting of lists and tuples, so no special
    # case is needed for the chained groups; remove_duplicates drops the
    # treatment shared by two consecutive pairs.
    values = remove_duplicates(flatten_list(value['values']))
    # Add prefix "df_" to each element so they match the keys of df_dict_initial
    values = [f"df_{v}" for v in values]
    # Merge the DataFrames, in the order of the treatment list
    concat_df = pd.concat([df_dict_initial[frame_name] for frame_name in values], axis=0)
    values_str = '-'.join(values)
    name = f'{values_str}-concat'
    df_dict_initial[name] = concat_df


In [21]:
df_dict_initial.keys()

dict_keys(['df_801', 'df_802', 'df_803', 'df_804', 'df_805', 'df_806', 'df_807', 'df_808', 'df_809', 'df_810', 'df_811', 'df_812', 'df_813', 'df_814', 'df_815', 'df_816', 'df_817', 'df_818', 'df_819', 'df_820', 'df_821', 'df_822', 'df_823', 'df_824', 'df_825', 'df_826', 'df_827', 'df_828', 'df_829', 'df_830', 'df_831', 'df_832', 'df_833', 'df_834', 'df_835', 'df_836', 'df_837', 'df_838', 'df_839', 'df_840', 'df_841', 'df_842', 'df_843', 'df_844', 'df_845', 'df_846', 'df_847', 'df_848', 'df_849', 'df_850', 'df_851', 'df_853', 'df_854', 'df_855', 'df_856', 'df_857', 'df_858', 'df_859', 'df_860', 'df_861', 'df_862', 'df_863', 'df_864', 'df_865', 'df_866', 'df_867', 'df_868', 'df_869', 'df_870', 'df_871', 'df_872', 'df_873', 'df_874', 'df_875', 'df_876', 'df_877', 'df_878', 'df_879', 'df_880', 'df_881', 'df_882', 'df_883', 'df_884', 'df_885', 'df_886', 'df_887', 'df_888', 'df_889', 'df_891', 'df_892', 'df_893', 'df_894', 'df_895', 'df_896', 'df_897', 'df_898', 'df_899', 'df_900', 'df_901',

<div class="alert alert-block alert-warning">
  <p>
    <b>Evaluation of the results</b>
  </p>

  <p>
    

  </p>
    
  <p>
  </p>
</div>

<div class="alert alert-block alert-warning">
  <p>
    <b>Remark</b>
  </p>

  <p>
    The trt values do not necessarily correspond to a new series of data. In some cases many of the values are repeated.
    Lines of action:
      <ul>
            <li>Clean repeated information</li>
      </ul>

  </p>

</div>

<div class="alert alert-block alert-warning">

 <p>
    The following function performs a comparison only when we are considering 2 DataFrames. When we are considering more than 2 DataFrames, it just merges them. Given the nature of the data, it is very unlikely that 3 or more DataFrames flagged as potential merges do not correspond to the same patient. In any case, after the merge we can compare the difference between the first timestamp and the last one. 
  </p>
</div>

In [22]:
# Check whether the DataFrames that were concatenated are close enough,
# according to the timestamp, to be kept merged. `days` is the maximum
# difference allowed between the first and the last timestamp of a merged
# DataFrame.
days = 5
delete_elements = []

for dframe in df_dict_initial.keys():
    if 'concat' in dframe:
        df_test = df_dict_initial[dframe]
        # First and last row of the concatenated DataFrame.
        # NOTE(review): this assumes the rows are in chronological order - confirm.
        initial = df_test["Date__Heure"].iloc[0]
        final = df_test["Date__Heure"].iloc[-1]
        decision = abs(initial - final) <= pd.Timedelta(days=days)
        if decision:
            # The merge is kept: delete the individual DataFrames it was built from
            base_part = dframe.replace('-concat', '')
            delete_elements.extend(base_part.split('-'))
        else:
            # The merge is rejected: delete the merged DataFrame
            delete_elements.append(dframe)

for element in delete_elements:
    # pop(..., None) instead of del: the same name may be listed twice, and the
    # individual DataFrames are already gone when this cell is run again.
    df_dict_initial.pop(element, None)

In [23]:
'''At this point df_dict_initial contains DataFrames corresponding to a single treatment in 
case the previous merged conditions were not satisfied or a merged DataFrame in the opposite case.
If the DataFrames were merged, the original ones were deleted'''
df_dict_initial.keys()

dict_keys(['df_801', 'df_802', 'df_803', 'df_806', 'df_807', 'df_808', 'df_809', 'df_810', 'df_811', 'df_812', 'df_813', 'df_814', 'df_815', 'df_816', 'df_817', 'df_818', 'df_819', 'df_820', 'df_821', 'df_822', 'df_823', 'df_824', 'df_825', 'df_826', 'df_827', 'df_828', 'df_829', 'df_830', 'df_831', 'df_832', 'df_833', 'df_834', 'df_835', 'df_836', 'df_839', 'df_840', 'df_841', 'df_842', 'df_843', 'df_844', 'df_845', 'df_846', 'df_849', 'df_850', 'df_851', 'df_853', 'df_854', 'df_855', 'df_856', 'df_857', 'df_858', 'df_859', 'df_860', 'df_861', 'df_862', 'df_863', 'df_868', 'df_869', 'df_870', 'df_871', 'df_872', 'df_873', 'df_874', 'df_875', 'df_876', 'df_877', 'df_878', 'df_879', 'df_880', 'df_881', 'df_882', 'df_883', 'df_884', 'df_885', 'df_886', 'df_887', 'df_888', 'df_889', 'df_891', 'df_892', 'df_893', 'df_894', 'df_895', 'df_896', 'df_897', 'df_898', 'df_899', 'df_900', 'df_901', 'df_902', 'df_903', 'df_904', 'df_905', 'df_906', 'df_907', 'df_908', 'df_909', 'df_910', 'df_914',

<div class="alert alert-block alert-warning">
  <p>
    <b>How to save these DataFrames?</b>
  </p>

  <p>
    <li>Will depend on how we are going to use them for the model deployment</li> 
    

  </p>

</div>

In [24]:
#For quality control, we can verify the first and the last row of the concat DataFrames and see if the difference is not bigger than 3 days

In [25]:
def quality_control(df_dict,days):
    '''Quality control check: report the DataFrames whose first and last
    timestamps are further apart than the given tolerance.
    We expect a difference of a few days because of the nature of the data.

    Parameters
    ----------
    df_dict : dict
        dictionary in which each value is a DataFrame with a 'Date__Heure' column
    days : int
        tolerance in days

    Returns
    -------
    None
        the function only prints. It prints 'Error + DataFrame name' i.e.,
        'Error df_601' for every DataFrame that does not meet the given
        tolerance, followed by a closing message that is always displayed.
    '''
    for frame_name, frame in df_dict.items():
        timestamps = frame["Date__Heure"]
        # Time span between the first and the last row of the DataFrame
        span = abs(timestamps.iloc[0] - timestamps.iloc[-1])
        within_tolerance = span <= pd.Timedelta(days=days)
        if not within_tolerance:
            print(f'Error {frame_name}')
    print('If no message was displayed before this, the quality control was passed')

In [26]:
df_test = df_dict_initial['df_801']
df_test.head(10)


Unnamed: 0,Date__Heure,P_Access,P_Filter,P_Effluent,P_Return,Q_Blood_Pump,Q_Replacement,Q_Dialysate,Q_PBP,DeltaP,TMP,TMPa,trt,Patient_weight__Kg_,Set
0,2011-07-28 22:58:00,54,-25,-26,-38,120,1200,2000,1200,-12,-23.5,-29,801,170.0,ST150
1,2011-07-28 22:59:00,7,57,-13,33,120,1200,2000,1200,-1,40.0,40,801,170.0,ST150
2,2011-07-28 23:00:00,7,94,-1,60,120,1200,2000,1200,9,60.0,65,801,170.0,ST150
3,2011-07-28 23:01:00,27,121,28,95,120,1200,2000,1200,1,62.0,63,801,170.0,ST150
4,2011-07-28 23:02:00,6,112,19,72,120,1200,2000,1200,15,55.0,63,801,170.0,ST150
5,2011-07-28 23:03:00,3,109,11,68,120,1200,2000,1200,16,59.5,68,801,170.0,ST150
6,2011-07-28 23:04:00,6,115,14,75,120,1200,2000,1200,15,63.0,71,801,170.0,ST150
7,2011-07-28 23:05:00,3,112,17,72,120,1200,2000,1200,15,57.0,65,801,170.0,ST150
8,2011-07-28 23:06:00,2,113,16,74,120,1200,2000,1200,14,59.5,67,801,170.0,ST150
9,2011-07-28 23:07:00,0,106,14,55,120,1200,2000,1200,26,48.5,62,801,170.0,ST150


In [27]:
df_test.tail(5)


Unnamed: 0,Date__Heure,P_Access,P_Filter,P_Effluent,P_Return,Q_Blood_Pump,Q_Replacement,Q_Dialysate,Q_PBP,DeltaP,TMP,TMPa,trt,Patient_weight__Kg_,Set
2106,2011-07-30 10:04:00,-18,174,-304,53,120,1200,2000,1200,96,399.5,448,801,170.0,ST150
2107,2011-07-30 10:05:00,-11,175,-312,70,120,1200,2000,1200,80,416.5,457,801,170.0,ST150
2108,2011-07-30 10:06:00,-10,196,-108,98,120,1200,2000,1200,73,237.0,274,801,170.0,ST150
2109,2011-07-30 10:07:00,-13,180,-278,78,120,1200,2000,1200,77,389.0,428,801,170.0,ST150
2110,2011-07-30 10:08:00,-17,171,-288,71,120,1200,2000,1200,75,391.0,429,801,170.0,ST150


In [28]:
#Let's try to save the final dictionary and see if we can open it in another notebook

In [29]:
# Save the final dictionary next to the other dictionaries, reusing the name
# of the Parquet file with a .pkl extension.
file_no_ext = os.path.splitext(file)[0]
name_save = f'{path_to_save}/{file_no_ext}.pkl' 
# The handle gets its own name: binding it to `file` would replace the file
# name defined at the top of the notebook and break this cell when it is re-run.
with open(name_save, 'wb') as pkl_handle:
    pickle.dump(df_dict_initial, pkl_handle)

In [30]:
name_save

'/Users/luisescobar/Documents/Thesis/DataSets/Dictionary/completo1007_(edit).pkl'