In [867]:
# --<Environment Libs>--
import sys, os, subprocess  
import pip

# --<DataScience>--
import seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


# --<Feature Engineering>--
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from scipy.interpolate import interp1d
from scipy.interpolate import BSpline, splrep



from dataclasses import dataclass, field
# dataclasses (imported above) requires Python 3.7+, and the annotated
# assignment below is 3.6+ syntax, so 3.7 is the real minimum version.
assert sys.version_info >= (3, 7), "This notebook requires Python 3.7 or newer"

# Global switch: when True, the loading cell standardises every dataset.
bScale: bool = False


<img src="./hyst_loop.jpg" alt="Hysteresis P/V Loop" style="opacity: 0.8;" />


In [868]:
# Global plotting setup: dark theme plus reusable font keyword dictionaries.
plt.style.use("dark_background")
csfont = dict(fontname="Comic Sans MS")
hfont = dict(fontname="Helvetica")

In [869]:
def Scale_Dataset(DataSet: pd.DataFrame):
    """Standardise every column of ``DataSet`` with a freshly fitted StandardScaler.

    Args:
        DataSet: Frame whose columns are all passed through the scaler.

    Returns:
        A new DataFrame carrying the original column labels and the scaled
        values. It is rebuilt from the scaler's array output, so it has a
        default RangeIndex rather than the index of the input frame.
    """
    standardised = StandardScaler().fit_transform(DataSet)
    return pd.DataFrame(standardised, columns=DataSet.columns)


In [870]:
def _prepare_split(csv_path: str) -> pd.DataFrame:
    """Read one CSV split and apply the clean-up shared by all three datasets.

    Steps, in order:
      1. rename the ``u_in`` / ``u_out`` columns to ``TV-i`` / ``TV-e``;
      2. drop rows that have no ``pressure`` value;
      3. discard the first two columns by position.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        The cleaned DataFrame.
    """
    frame = pd.read_csv(csv_path)

    # Rename the source's naming conventions to the notebook's terminology.
    frame.columns = (
        frame.columns
        .str.replace("u_in", "TV-i")
        .str.replace("u_out", "TV-e")
    )

    # Only rows without a target value are removed. A bare `frame.dropna()`
    # whose result is not assigned would leave the data untouched.
    frame = frame.dropna(subset=["pressure"])

    # The two leading columns are dropped by position.
    # NOTE(review): presumably a saved index and a row id - confirm against the CSVs.
    return frame.drop(columns=frame.columns[[0, 1]])


Training_Set = _prepare_split("TrainingSet.csv")
MultiCalss_validation = _prepare_split("MultiValid.csv")
CrossValidation = _prepare_split("CrossValid.csv")

# Optional scaling
# NOTE(review): Scale_Dataset fits a separate scaler per call and scales every
# column (breath_id and pressure included) - confirm this is intended before
# enabling bScale.
if bScale:
    Training_Set = Scale_Dataset(Training_Set)
    MultiCalss_validation = Scale_Dataset(MultiCalss_validation)
    CrossValidation = Scale_Dataset(CrossValidation)


print(Training_Set.describe())

          breath_id             R             C     time_step          TV-i  \
count  3.923322e+06  3.923322e+06  3.923322e+06  3.923322e+06  3.923322e+06   
mean   6.283926e+04  2.703409e+01  2.608208e+01  1.307071e+00  7.321213e+00   
std    3.633486e+04  1.959690e+01  1.715298e+01  7.659629e-01  1.343079e+01   
min    1.000000e+00  5.000000e+00  1.000000e+01  0.000000e+00  0.000000e+00   
25%    3.136800e+04  5.000000e+00  1.000000e+01  6.429992e-01  3.946657e-01   
50%    6.276700e+04  2.000000e+01  2.000000e+01  1.308026e+00  4.385773e+00   
75%    9.429000e+04  5.000000e+01  5.000000e+01  1.965321e+00  4.983905e+00   
max    1.257490e+05  5.000000e+01  5.000000e+01  2.937238e+00  1.000000e+02   

               TV-e      pressure  
count  3.923322e+06  3.923322e+06  
mean   6.204359e-01  1.121985e+01  
std    4.852785e-01  8.109754e+00  
min    0.000000e+00 -1.895744e+00  
25%    0.000000e+00  6.329607e+00  
50%    1.000000e+00  7.032628e+00  
75%    1.000000e+00  1.364103e+01  


In [871]:
# Distribution of every column except the first one (breath_id).
Training_Set.iloc[:, 1:].hist(bins=50, figsize=(20, 15))

array([[<Axes: title={'center': 'R'}>, <Axes: title={'center': 'C'}>],
       [<Axes: title={'center': 'time_step'}>,
        <Axes: title={'center': 'TV-i'}>],
       [<Axes: title={'center': 'TV-e'}>,
        <Axes: title={'center': 'pressure'}>]], dtype=object)

In [872]:
# Warning: plotting takes a long time once the dataset reaches ~100 MB.
set_attributes: list = [
    "pressure",
    "TV-e",
    "TV-i",
    "R",
    "C",
    "time_step",
]
scatter_matrix(Training_Set[set_attributes], figsize=(12, 8))

<img src="./Scatter_Matrix.png" alt="Scatter Matrix" style="opacity: 1;" />

## >> |Open Questions
- Q: What is the sampling frequency within an individual breath?
- A: Frequency ≈ 16.67 Hz
- Q: Does the time step between individual readings remain **consistent**?
- A: Not consistent; it is volatile, ranging between **59 and 64 milliseconds**

## Conclusions:
- 16.67 Hz × 60 ms ≈ 1, i.e. one reading roughly every 60 milliseconds.
- Avg. breath cycle is 2.7 sec = **~ (50 * 7)** datapoints per breath

## >> |Feature Engineering
- Feature **A)** Reconstruction of synthetic datapoints: **Increase frequency to ~32 Hz.**
- Feature **B)** Create a custom **Hysteresis Cycle** for each breath id.

In [873]:
# Number of extra synthetic samples to add per breath: Reconstruct_Cycle adds
# each cycle's own row count to this value to size its resampled time grid.
Interloper: int = 0

<img src="./boosted.png" alt="Scatter Matrix" style="opacity: 1;" />

## Interpolation of each Dataset
- Training Set
- MultiClass_validation Set
- Cross-Validation Set

In [874]:
# Keep roughly 0.1% of the breaths (whole breath_id groups) as a small working sample.
gss = GroupShuffleSplit(test_size=0.999, random_state=42)

# split() yields several shuffles (n_splits defaults to 5). Exactly like a loop
# that overwrites its result on each pass, only the final split is kept.
*_, (train_idx, _) = gss.split(Training_Set, groups=Training_Set['breath_id'])
training_mini = Training_Set.iloc[train_idx]

print(training_mini.head())

      breath_id   R   C  time_step      TV-i  TV-e   pressure
179       27147  20  10   1.287536  0.000000     1   6.751420
342       26151  50  10   0.689145  0.000000     0  11.672570
1687      62189  20  20   2.111571  4.796304     1   5.696887
1834     100201   5  10   0.515675  8.730425     0  14.133145
2059      79227  50  50   0.825188  2.074942     0   8.298067


In [883]:
import numpy as np
import pandas as pd
from scipy.interpolate import BSpline, splrep

def Reconstruct_Cycle(DataSet: pd.DataFrame, Single_Cycle: pd.DataFrame, Interloper: int):
    """Resample one breath onto an evenly spaced time grid and append it to ``DataSet``.

    Each signal column (``R``, ``C``, ``TV-i``, ``TV-e``, ``pressure``) is fitted
    with an interpolating B-spline over ``time_step`` and evaluated at
    ``len(Single_Cycle) + Interloper`` evenly spaced instants between the
    cycle's smallest and largest ``time_step``.

    Args:
        DataSet: Frame the resampled rows are appended to; it is not modified
            in place.
        Single_Cycle: Rows of one breath, sorted by ``time_step``, with the
            columns ``breath_id``, ``time_step``, ``R``, ``C``, ``TV-i``,
            ``TV-e`` and ``pressure``.
        Interloper: Number of samples to generate on top of the cycle's own
            row count.

    Returns:
        ``DataSet`` followed by the resampled rows, re-indexed from zero.
        A cycle with fewer than two rows cannot be interpolated; in that case
        ``DataSet`` is returned unchanged.
    """
    sample_count = Single_Cycle.shape[0]
    if sample_count < 2:
        # Nothing to interpolate between; also avoids indexing an empty cycle.
        return DataSet

    time_steps = Single_Cycle['time_step'].values
    Synthetic_Timestep = np.linspace(time_steps.min(), time_steps.max(), sample_count + Interloper)

    # splrep needs more data points than the spline degree. Cycles shorter than
    # four rows fall back to the highest degree their length allows.
    degree = min(3, sample_count - 1)

    def reconstruct_variable(target_column):
        """Interpolate one column of the cycle onto the synthetic time grid."""
        tck = splrep(time_steps, Single_Cycle[target_column].values, k=degree, s=0)
        return BSpline(*tck)(Synthetic_Timestep)

    # NOTE(review): TV-e appears to be a 0/1 signal in the data summary; a smooth
    # spline can overshoot around its switch point - confirm this is acceptable.
    Interpolated_Data = pd.DataFrame({
        'breath_id': np.full(Synthetic_Timestep.shape, Single_Cycle['breath_id'].values[0]),
        # 'R' must be part of the frame, otherwise the concat below fills the
        # synthetic rows with NaN for that column.
        'R': reconstruct_variable('R'),
        'C': reconstruct_variable('C'),
        'time_step': Synthetic_Timestep,
        'TV-i': reconstruct_variable('TV-i'),
        'TV-e': reconstruct_variable('TV-e'),
        'pressure': reconstruct_variable('pressure'),
    })

    return pd.concat([DataSet, Interpolated_Data], ignore_index=True)


In [876]:
def Reconstruct_Dataset(DataSet: pd.DataFrame, dataset_name: str, Interloper:int)-> pd.DataFrame:
    """Resample every breath in ``DataSet`` and save the enlarged frame as CSV.

    Args:
        DataSet: Frame containing a ``breath_id`` and a ``time_step`` column.
        dataset_name: Path the resulting frame is written to with ``to_csv``.
        Interloper: Forwarded unchanged to ``Reconstruct_Cycle`` for each breath.

    Returns:
        The frame returned by the last ``Reconstruct_Cycle`` call, i.e. the
        input rows plus whatever that function appended for each breath.
    """
    # The groupby is built from the frame that was passed in, so rebinding
    # DataSet inside the loop does not change which breaths are visited.
    for _, breath_rows in DataSet.groupby('breath_id'):
        ordered_cycle = breath_rows.sort_values(by='time_step').copy()
        DataSet = Reconstruct_Cycle(DataSet, ordered_cycle, Interloper)

    DataSet.to_csv(dataset_name)
    return DataSet

In [877]:
import time,warnings
# Silence pandas performance warnings; the blanket filter that follows also
# hides every other warning category from this point on.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

# perf_counter is monotonic and high resolution, which makes it the appropriate
# clock for elapsed-time measurements (time.time can jump with clock changes).
start = time.perf_counter()

# NOTE: this rebinds training_mini, so re-running the cell resamples a frame
# that already contains interpolated rows.
training_mini = Reconstruct_Dataset(training_mini, "mini_training.csv", Interloper)
# Disabled: the subsets below are not built in this notebook. The calls now
# carry the Interloper argument that Reconstruct_Dataset requires.
#Reconstruct_Dataset(multiclass_mini, "mini_valid.csv", Interloper)
#Reconstruct_Dataset(validation_mini, "mini_multiclass.csv", Interloper)

end = time.perf_counter()
print("Total time for interpolations is :",
      (end-start) * 10**3, "ms")

Total time for interpolations is : 188.42482566833496 ms


## Feature Engineering
- **Pressure/Volume Loop** 

In [879]:
# Select a single breath from the interpolated sample and order it in time.
# NOTE(review): 80001 is hard-coded and must be a breath id present in training_mini.
Random_id = (
    training_mini[training_mini['breath_id'] == 80001]
    .copy()
    .sort_values(by="time_step")
)

target_columns = ["TV-i", "TV-e","pressure", "R", "C", "time_step"]

# Column name -> list of values for the selected breath. This is a plain dict
# built directly from the frame; dataclasses.field() is only meaningful inside a
# @dataclass body, so it is not used as a placeholder here.
Hysteresis_Dynamics: dict = Random_id[target_columns].to_dict(orient='list')


# Default Datapoints (Left) - 16 Hz
# Interpolation (Right) - 32 Hz

<img src="./17Hz.png" alt="Scatter Matrix" style="opacity: 1;" />
<img src="./32Hz.png" alt="Scatter Matrix" style="opacity: 1;" />

In [None]:
# Pull the signals of the selected breath out of the dict as NumPy arrays.
TV_i, TV_e, pressure = (
    np.array(Hysteresis_Dynamics[key]) for key in ('TV-i', 'TV-e', 'pressure')
)

TV_total = TV_i + TV_e

# Interactive mode so that the figure can be redrawn while the loop runs.
plt.ion()
fig, ax = plt.subplots()
(line,) = ax.plot(
    [], [],
    marker='o', linestyle='-', color='b',
    label='Hysteresis (TV-total vs Pressure)',
)

ax.set_title('Hysteresis Loop: TV-total/Pressure - 32Hz')
ax.set_xlabel('TV-total (TV-i + TV-e)')
ax.set_ylabel('Pressure')
ax.legend()


def update_plot(x_data, y_data):
    """Put the given points on the line, rescale the axes and redraw at once."""
    line.set_data(x_data, y_data)
    ax.relim()
    ax.autoscale_view()
    canvas = fig.canvas
    canvas.draw()
    canvas.flush_events()


# Reveal the loop one sample at a time.
for i, _ in enumerate(TV_total):
    update_plot(TV_total[:i + 1], pressure[:i + 1])

plt.ioff()
plt.show()

In [881]:
# Shape (rows, columns) of the single breath selected above.
Random_id.shape

(110, 7)