# **Feature Engineering**

## Key Features:
- __Mechanical Ventilation Data Analysis__
- __Organic Data Refactoring__
- __Feature Engineering__
- __Refactoring DataSets for Unsuperviced Learning Models__
---

## Maintainer

- **GitHub**: [Molderon](https://github.com/Molderon)
- **Email**: [Molderon@proton.me](mailto:Molderon@proton.me)
- **ML Model at** [ICU: Mechanical Ventilation: Asyncronies Classification](https://github.com/Molderon/ICU-Mechanical-Ventilation-Asyncronies)


> # Normal Respiratory hysteresis on **mechanical ventilation**
- Mechanical Ventilator **<->** Patient

<img src="./normalBreath2.png" alt="Scatter Matrix" style="opacity: 1;" />
<img src="./ExampleBr.png" alt="Scatter Matrix" style="opacity: 1;" />


# **Asyncronous Respiratory Hysteresis** Examples:
- Mechanical Ventilation **<->** Patient

<img src="./undefined_async1.png" alt="Scatter Matrix" style="opacity: 1;" />
<img src="./doubleTrigger3.png" alt="Scatter Matrix" style="opacity: 1;" />
<img src="./failedtrigger.png" alt="Scatter Matrix" style="opacity: 1;" />
<img src="./failed_trigger1.png" alt="Scatter Matrix" style="opacity: 1;" />

In [1]:
# --<Environment Libs>--
from dataclasses import dataclass, field
import sys, os, subprocess  
import pip, time

# --<DataScience>--
import seaborn 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.spatial import ConvexHull
from scipy.fft import fft

from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from scipy.interpolate import PchipInterpolator
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


# --<Feature Engineering>--
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from scipy.interpolate import interp1d
from scipy.interpolate import BSpline, splrep


assert sys.version_info >= (3, 5)
bScale: bool = False
SingleSet: bool = True


<img src="./hyst_loop.jpg" alt="Hysteresis P/V Loop" style="opacity: 0.8;" />


In [2]:
#setups
plt.style.use('dark_background')
csfont = {'fontname':'Comic Sans MS'}
hfont = {'fontname':'Helvetica'}

In [3]:
def Load_LowFeq_sets():
    os.chdir("/home/molderon/Main/WorkSpace/Diploma Project/Feature Engineering/")
    Scale_Dataset = StandardScaler()

    try:
        if(SingleSet == False):
            Training_Set = pd.DataFrame()
            MultiCalss_validation = pd.DataFrame()
            CrossValidation = pd.DataFrame()

            Training_Set = pd.read_csv("TrainingSet.csv")
            MultiCalss_validation = pd.read_csv("MultiValid.csv")
            CrossValidation = pd.read_csv("CrossValid.csv")

            # Renaming bad naming concentions
            Training_Set.columns = Training_Set.columns.str.replace("u_in", "TV-i")
            Training_Set.columns = Training_Set.columns.str.replace("u_out", "TV-e")
            MultiCalss_validation.columns = MultiCalss_validation.columns.str.replace("u_in", "TV-i")
            MultiCalss_validation.columns = MultiCalss_validation.columns.str.replace("u_out", "TV-e")
            CrossValidation.columns = CrossValidation.columns.str.replace("u_in", "TV-i")
            CrossValidation.columns = CrossValidation.columns.str.replace("u_out", "TV-e")

            #Droping useless weight
            Training_Set.dropna()
            MultiCalss_validation.dropna()
            CrossValidation.dropna()

            Training_Set = Training_Set.dropna(subset=['pressure'])
            MultiCalss_validation = MultiCalss_validation.dropna(subset=['pressure'])
            CrossValidation = CrossValidation.dropna(subset=['pressure'])

            Training_Set.drop(Training_Set.columns[[0,1]], axis =1, inplace=True)
            CrossValidation.drop(CrossValidation.columns[[0,1]], axis =1, inplace=True)
            MultiCalss_validation.drop(MultiCalss_validation.columns[[0,1]], axis =1, inplace=True)

            # Optional Scaling

            if bScale == True:
                Training_Set = Scale_Dataset(Training_Set)
                MultiCalss_validation = Scale_Dataset(MultiCalss_validation)
                CrossValidation = Scale_Dataset(CrossValidation)

            print(Training_Set.describe())
            return Training_Set, MultiCalss_validation, CrossValidation

        else:
            Full_Dataset = pd.DataFrame()
            Full_Dataset = pd.read_csv("Default_MV_Data.csv")

            Full_Dataset.columns = Full_Dataset.columns.str.replace("u_in", "TV-i")
            Full_Dataset.columns = Full_Dataset.columns.str.replace("u_out", "TV-e")
            Full_Dataset.drop(Full_Dataset.columns[[0,1]], axis =1, inplace=True)
            Full_Dataset.dropna()

            if bScale == True:
                Full_Dataset = Scale_Dataset(Full_Dataset)

            return Full_Dataset
        
    except Exception:
        print(Exception.with_traceback())
        return None

In [59]:
Training_Set.iloc[:,1:].hist(bins = 50, figsize=(20,15))

array([[<Axes: title={'center': 'R'}>, <Axes: title={'center': 'C'}>],
       [<Axes: title={'center': 'time_step'}>,
        <Axes: title={'center': 'TV-i'}>],
       [<Axes: title={'center': 'TV-e'}>,
        <Axes: title={'center': 'pressure'}>]], dtype=object)

># ⚠️ **Conditional Warning: Time-Intensive Operation**
> **Condition: Datasets larger than: 300 Mbytes**

In [None]:
def Draw_ScatterMatrix(Dataset: pd.DataFrame):    
    set_attributes: list = ["pressure", "TV-e", "TV-i","R", "C", "time_step"]
    scatter_matrix(Dataset[set_attributes], figsize=(12,8))

Main_Data = Load_LowFeq_sets()
Draw_ScatterMatrix(Main_Data)

<img src="./air_plot.png" alt="Scatter Matrix" style="opacity: 1;" />

# Scatter Matrix of organic data

<img src="./Scatter_Matrix.png" alt="Scatter Matrix" style="opacity: 1;" />

## >> |Open Questions
- Q: What is the data frequency of an individual Breath ?
- A: Frequency ​≈ 26.67 Hz
- Q: Does the timestep for individual readings remain **consistant** ?
- A: Not consistant, it's voletile between the ranges **(32 - 40 miliseconds)**

## Conclusions:
- Avg. breath cycle is 2.7 sec = **~ (80 * 7)** datapoints per breath

## >> |Feature Engineering
- Feature **A)** Reconstruction of sythetic datapoints: **Increase frequiency to ~ (32)Hz/sec.**
- Feature **B)** Create a custom **Hysteresis Cycle Metric** for each breath id.

>## Interpolation of each Dataset
- Training Set
- MultiClass_validation Set
- Corss-Validation Set

In [None]:
def Example_Dataset(DataSet: pd.DataFrame):
   
    ExampleSet = pd.DataFrame()
    gss = GroupShuffleSplit(test_size=0.995, random_state=42) # loads 30% of Given Dataset
    for idx, _ in gss.split(DataSet, groups=DataSet['breath_id']):
        ExampleSet = DataSet.iloc[idx]
    return ExampleSet

In [None]:
Interloper: int = 5

In [None]:
def Reconstruct_Cycle(DataSet: pd.DataFrame, Single_Cycle: pd.DataFrame, Interloper: int):

    X_features = Single_Cycle[['time_step', 'R', 'C', 'TV-i', 'TV-e']].values
    Y_target = Single_Cycle['pressure'].values

    Interloper = min(Interloper + Single_Cycle.shape[0], 25) 
    Synthetic_Timestep = np.linspace(Single_Cycle['time_step'].min(), Single_Cycle['time_step'].max(), Interloper)


    try:
        Symetra = splrep(Single_Cycle['time_step'], Y_target, s=10)
        Pressure_Prediction = BSpline(*Symetra)(Synthetic_Timestep)
    except Exception as e:
        print(f"Error in pressure interpolation: {e}")
        return DataSet


    def reconstruct_variable(target_column, smooth_factor=1):
        Y_target = Single_Cycle[target_column].values
        try:
            tck = splrep(Single_Cycle['time_step'], Y_target, s=smooth_factor)
            return BSpline(*tck)(Synthetic_Timestep)
        except Exception as e:
            print(f"Error in {target_column} interpolation: {e}")
            return np.full(Synthetic_Timestep.shape, np.nan)

    TV_i_pred = reconstruct_variable('TV-i')
    TV_e_pred = reconstruct_variable('TV-e')
    R_pred = reconstruct_variable('R')
    C_pred = reconstruct_variable('C')


    breath_id = Single_Cycle['breath_id'].iloc[0]
    Interpolated_Data = pd.DataFrame({
        'breath_id': np.full(Synthetic_Timestep.shape, breath_id),
        'R': R_pred,
        'C': C_pred,
        'time_step': Synthetic_Timestep,
        'TV-i': TV_i_pred,
        'TV-e': TV_e_pred,
        'pressure': Pressure_Prediction
    })

    DataSet = pd.concat([DataSet, Interpolated_Data], ignore_index=True)
    return DataSet


In [7]:
def Reconstruct_Dataset(DataSet: pd.DataFrame, dataset_name: str, Interloper: int) -> pd.DataFrame:
    for breath_id, group in DataSet.groupby('breath_id'):
        Single_Cycle = group.sort_values(by='time_step').copy()
        
        DataSet = Reconstruct_Cycle(DataSet, Single_Cycle, Interloper)

    DataSet.to_csv(dataset_name)
    return DataSet


# Data Interpolation
 **Default Datapoints (Left) - 16 Hz/sec**   |   **Interpolation (Right) - 32 Hz/sec**

In [None]:
def Interpolation():
      import time,warnings

      warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
      warnings.filterwarnings("ignore")
      Boosted_Dataset = pd.DataFrame()
      start = time.time()
      pd.options.mode.chained_assignment = None
      
      DataSet = Reconstruct_Dataset(Load_LowFeq_sets(),"Boosted_MV.csv", Interloper)
      
      
      ''' //Example for splitted sets
            Boosted_TrainingSet = Reconstruct_Dataset(Training_Set,"Boosted_TrainingSet.csv.csv", Interloper)
            Boosted_MultiClass =  Reconstruct_Dataset(CrossValidation,"Boosted_MultiClass.csv", Interloper)
            Boosted_CrossValidation = Reconstruct_Dataset(MultiCalss_validation,"Boosted_CrossValidation.csv", Interloper)
      '''
      
      end = time.time()
      print("Total time for interpolations is :",
            (end-start)/60, "Min")

># ⚠️ **Warning: Time-Intensive Operation**
>
> The interpolation process you're about to run is computationally expensive and may take **several hours** to complete, the author advices to use small sliced datasets.
># **Beware**
>
> - Ensure that your system has sufficient resources (CPU/RAM) to handle the process.
> - Avoid interrupting the process once it has started.
> - If possible, run this operation on a machine that will not be needed for other tasks during execution.


In [13]:
Interpolation()

Total time for interpolations is : 15.2435875137647 Min


### Interpolation Description:


1. **Input Features**: 
   $$
   X = [\text{time\_step}, R, C, TV_{i}, TV_{e}] \in \mathbb{R}^{n \times m}
   $$
   where \(n\) is the number of samples and \(m\) is the number of features.

   - \(Y\) is defined as the target variable:
   $$
   Y = \text{pressure} \in \mathbb{R}^{n}
   $$

2. **Synthetic Time Steps**: 
   - The number of interpolated points  :
   $$
   n_{\text{interloper}} = n + I
   $$
   where \(I\) is the additional points specified by the user.

   - A synthetic time step vector is created:
   $$
   t_{\text{synthetic}} = \text{linspace}(t_{\text{min}}, t_{\text{max}}, n_{\text{interloper}})
   $$

3. **Pressure Prediction Using B-Splines**: 
   - The B-spline representation of the pressure is generated using the `splrep` function:
   $$
   tck_{\text{pressure}} = \text{splrep}(S[\text{time\_step}], Y, s=0)
   $$
   - The predicted pressure values at synthetic time steps are calculated as:
   $$
   P_{\text{pred}} = BSpline(tck_{\text{pressure}})(t_{\text{synthetic}})
   $$

4. **Variable Reconstruction**:
   - A nested function is defined to reconstruct other variables in the dataset:
   $$
   V_{\text{pred}} = BSpline(tck_{\text{variable}})(t_{\text{synthetic}})
   $$

   - This is computed as:
   $$
   tck_{\text{variable}} = \text{splrep}(S[\text{time\_step}], S[\text{variable}], s=3)
   $$

5. **Constructing the Output DataFrame**: 
   - Finally, an interpolated dataset \(I\) is created:
   $$
   I = \begin{bmatrix}
   \text{breath\_id} & C_{\text{pred}} & t_{\text{synthetic}} & TV_{i_{\text{pred}}} & TV_{e_{\text{pred}}} & P_{\text{pred}} \\
   \end{bmatrix}
   $$

   - The final reconstructed dataset is obtained by concatenating the original dataset \(D\) with the interpolated data \(I\):
   $$
   D_{\text{new}} = D \cup I
   $$

In [3]:
def Plot_Hysteresis(Hysteresis_Dynamics: pd.DataFrame):
    TV_i = np.array(Hysteresis_Dynamics['TV-i'])
    TV_e = np.array(Hysteresis_Dynamics['TV-e'])
    pressure = np.array(Hysteresis_Dynamics['pressure'])

    TV_total = TV_i + TV_e

    plt.ion() 
    fig, ax = plt.subplots()
    line, = ax.plot([], [], marker='o', linestyle='-', color='b', label='Hysteresis (TV-total / Pressure)')

    ax.set_title('Hysteresis Loop: TV-total/Pressure - 35.0 Hz')
    ax.set_xlabel('TV-total (TV-i + TV-e)')
    ax.set_ylabel('Pressure')
    ax.legend()

    def update_plot(x_data, y_data):
        line.set_data(x_data, y_data)
        ax.relim()          
        ax.autoscale_view() 
        fig.canvas.draw()   
        fig.canvas.flush_events() 

    for i in range(len(TV_total)):
        update_plot(TV_total[:i+1], pressure[:i+1])

    plt.ioff()
    plt.show()

In [4]:
def Draw_Reconstructed_Cycles(breath_id: int, DataSet: pd.DataFrame):
    Random_ids = pd.DataFrame()
    Random_ids = DataSet[DataSet['breath_id'] == breath_id].copy()
    #Random_ids = Random_ids.sort_values(by="time_step")

    Hysteresis_Dynamics: dict = field(default_factory=dict)
    target_columns = ["TV-i", "TV-e","pressure", ]

    Hysteresis_Dynamics = Random_ids[target_columns].to_dict(orient ='list')
    Plot_Hysteresis(Hysteresis_Dynamics)

# Frequency Comparison
>  - Left(26Hz) - Default

> - Right(32Hz) - Interpolated

<img src="./17Hz.png" alt="Scatter Matrix" style="opacity: 1;" />
<img src="./32Hz.png" alt="Scatter Matrix" style="opacity: 1;" />

># Polynomial Hysteresis Area 

In [14]:
def Hysteresis_Area_Metric(DataSet: pd.DataFrame):
    os.chdir("/home/molderon/Main/WorkSpace/Diploma Project/Classification Algorithms/")
    start_time = time.time()

    if(SingleSet == False):
        Boosted_TrainingSet = Feature_Engineering(Boosted_TrainingSet)
        Boosted_TrainingSet.to_csv("Cluster_TrainingSet.csv")

        Boosted_MultiClass = Feature_Engineering(Boosted_MultiClass)
        Boosted_MultiClass.to_csv("Cluster_MultiClass.csv")

        Boosted_CrossValidation= Feature_Engineering(Boosted_CrossValidation)
        Boosted_CrossValidation.to_csv("Cluster_CrossValid.csv")
    else:
        DataSet = Feature_Engineering(DataSet)
        return DataSet

    end_time = time.time()

    print("Execution time:", (end_time - start_time)/60, "::mins")

>## Feature Engineering
- **Pressure/Volume Loop** 

# |Creating P/V cycles as a feature
- **Working with the Interloped Datasets _~(32Hz)~_**

- **The area of the polygon can be calculated using the formula:**

$$
A = \frac{1}{2} \left| \sum_{i=1}^{n} \left( x_i \cdot y_{i+1} - y_i \cdot x_{i+1} \right) \right|
$$

Where:
- \(x_i\) and \(y_i\) are the coordinates of the polygon's vertices.
- The indices wrap around, i.e., \(y_{n+1} = y_1\) and \(x_{n+1} = x_1\).
> __________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________

### Area Calculation Using the Trapezoidal Rule

The area under a curve defined by discrete points can be calculated using the **trapezoidal rule**, which approximates the integral of the function. Given a set of points \((x_i, y_i)\), the area \(A\) under the curve from \(x_1\) to \(x_n\) can be represented mathematically as:

$$
A = \int_{a}^{b} f(x) \, dx \approx \sum_{i=1}^{n-1} \frac{(y_i + y_{i+1})}{2} (x_{i+1} - x_i)
$$

Where:
- \( A \) is the approximate area under the curve.
- \( n \) is the number of discrete points.
- \( x_i \) are the \( x \)-coordinates of the points.
- \( y_i \) are the \( y \)-coordinates of the points.
- The term \(\frac{(y_i + y_{i+1})}{2}\) represents the average height of the function between \(x_i\) and \(x_{i+1}\).
- The difference \((x_{i+1} - x_i)\) represents the width of the interval.

In the context of the provided code:
**Area Calculation**:
   The area is calculated as follows:

$$
\text{area} = |A| = \left| \int_{x_{\text{closed}}} y_{\text{closed}} \, dx \right| \approx \text{np.abs}\left(\text{np.trapz}(y_{\text{closed}}, x_{\text{closed}})\right)
$$


In [37]:
os.chdir("/home/molderon/Main/WorkSpace/Diploma Project/Classification Algorithms/")
DataSet = pd.DataFrame()
DataSet = pd.read_csv("ClusterSet_26Hz.csv")
DataSet.drop(DataSet.columns[[0,2,6]], axis =1, inplace=True)

DataSet = Feature_Engineering(DataSet)
DataSet.head()


  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)
  curvature = np.abs(dx * ddy - dy * ddx) / np.powe

Unnamed: 0,breath_id,TV-i,TV-e,pressure,Polynomial_Area,Hull_Area,ft_Tidal_Volume_1,ft_Tidal_Volume_2,ft_Tidal_Volume_3,ft_Tidal_Volume_4,...,ft_Pressure_2,ft_Pressure_3,ft_Pressure_4,ft_Pressure_5,Perimeter,Width,Height,Centroid_X,Centroid_Y,Curvature
0,1.0,0.083334,0.0,5.837492,272.354053,333.60197,861.680524,451.834374,251.375479,36.596856,...,248.568905,117.650352,17.603779,53.889511,105.280344,28.229702,13.708918,10.771007,10.841247,9.310586
1,1.0,18.383041,0.0,5.907794,,,,,,,...,,,,,,,,,,
2,1.0,22.509278,0.0,7.876254,,,,,,,...,,,,,,,,,,
3,1.0,22.808822,0.0,11.742872,,,,,,,...,,,,,,,,,,
4,1.0,25.35585,0.0,12.234987,,,,,,,...,,,,,,,,,,


In [36]:

def Terraform_DataSet(Hysteresis_Sample: pd.DataFrame):
    # Calculate Tidal Volume and Apparent Pressure
    Tidal_Volume = Hysteresis_Sample['TV-i'].values + Hysteresis_Sample['TV-e'].values
    Apparent_Pressure = Hysteresis_Sample["pressure"].values

    # Compute the area under the curve (Polynomial Area) using trapezoidal integration
    Polynomial_Area = np.abs(np.trapz(Apparent_Pressure, Tidal_Volume))

    # Compute Convex Hull and its area
    Hull_Data = np.column_stack((Tidal_Volume, Apparent_Pressure))
    Hull = ConvexHull(Hull_Data)
    Hull_Area = Hull.volume

    # Fourier Transform for Tidal Volume and Pressure
    ft_Tidal_Volume = np.abs(fft(Tidal_Volume))[:5]  # Magnitudes only
    ft_Pressure = np.abs(fft(Apparent_Pressure))[:5]  # Magnitudes only

    # Perimeter, width, height, and centroid of the hysteresis loop
    perimeter = np.sum(np.sqrt(np.diff(Tidal_Volume)**2 + np.diff(Apparent_Pressure)**2))
    width = np.max(Tidal_Volume) - np.min(Tidal_Volume)
    height = np.max(Apparent_Pressure) - np.min(Apparent_Pressure)
    centroid_x = np.mean(Tidal_Volume)
    centroid_y = np.mean(Apparent_Pressure)

    # Curvature calculation
    dx = np.gradient(Tidal_Volume)
    dy = np.gradient(Apparent_Pressure)
    ddx = np.gradient(dx)
    ddy = np.gradient(dy)
    curvature = np.abs(dx * ddy - dy * ddx) / np.power(dx**2 + dy**2, 1.5)



    # Construct features dictionary and expand arrays into separate fields
    features = {
        "Polynomial_Area": Polynomial_Area,
        "Hull_Area": Hull_Area,
        **{f"ft_Tidal_Volume_{i+1}": coef for i, coef in enumerate(ft_Tidal_Volume)},
        **{f"ft_Pressure_{i+1}": coef for i, coef in enumerate(ft_Pressure)},
        "Perimeter": perimeter,
        "Width": width,
        "Height": height,
        "Centroid_X": centroid_x,
        "Centroid_Y": centroid_y,
        "Curvature": np.mean(curvature),  # or another aggregate like max or std if preferred
    }
    
    # Convert features into a DataFrame and concatenate with original Hysteresis_Sample
    features_df = pd.DataFrame(features, index=[0])
    enhanced_Hysteresis_Sample = pd.concat([Hysteresis_Sample.reset_index(drop=True), features_df], axis=1)

    return enhanced_Hysteresis_Sample


def Feature_Engineering(DataSet: pd.DataFrame):
    #DataSet = DataSet.sort_values(by=['breath_id', 'time_step']).copy()
    Terraformed_Data = []

    for breath_id, group in DataSet.groupby('breath_id'):
        Hysteresis_Sample = Terraform_DataSet(group)  
        Terraformed_Data.append(Hysteresis_Sample)

    DataSet = pd.concat(Terraformed_Data, ignore_index=True)

    return DataSet


def Clean_DataSet(DataSet: pd.DataFrame):
    DataSet.dropna(subset=['pressure', 'Polynomial_Area'], inplace=True)
    DataSet = DataSet[DataSet['Polynomial_Area'] <= 417]
    DataSet = DataSet[~((DataSet['Polynomial_Area'] > 73) & (DataSet['Polynomial_Area'] < 150))]
    DataSet.drop(DataSet.columns[[0,2,3]], axis =1, inplace=True)
    # Within These ranges The individual Breath cycles do not connect or are incomplete
    return DataSet

<img src="./HeatMap.png" alt="Scatter Matrix" style="opacity: 1;" />

# |Cleaning The DataSet

> # Machine Learning Starts Soon :)

> # Converging data for **Unsupervised Learning**
-  a.k.a **Theoretical Rocket Surgery**

In [None]:
DataSet = Feature_Engineering(DataSet)

In [44]:
TF_ClusterSet.head(100)

Unnamed: 0,breath_id,TV-i,TV-e,pressure,Polynomial_Area,Hull_Area,ft_Tidal_Volume_1,ft_Tidal_Volume_2,ft_Tidal_Volume_3,ft_Tidal_Volume_4,...,ft_Pressure_2,ft_Pressure_3,ft_Pressure_4,ft_Pressure_5,Perimeter,Width,Height,Centroid_X,Centroid_Y,Curvature
0,1.0,0.083334,0.0,5.837492,272.354053,333.601970,861.680524,451.834374,251.375479,36.596856,...,248.568905,117.650352,17.603779,53.889511,105.280344,28.229702,13.708918,10.771007,10.841247,9.310586
1,3.0,0.000000,0.0,5.064168,282.157479,350.167422,484.607933,183.391093,68.468914,56.159057,...,409.824319,164.998616,67.671789,116.198749,122.759376,18.353980,22.777895,6.057599,13.534698,11.190253
8,21.0,6.979638,0.0,5.556283,285.462213,284.261625,477.471900,174.175168,90.445702,26.363827,...,497.497105,252.938132,86.892208,93.935865,82.735227,12.333462,29.526901,5.968399,13.539971,11.201373
9,22.0,14.156295,0.0,6.048398,311.541505,363.457081,774.512804,395.511758,237.230950,106.858771,...,201.371493,68.527681,36.813551,27.466856,136.055303,49.286837,12.654386,9.681410,8.917604,2.046271
11,26.0,4.268687,0.0,6.189002,416.084625,443.828258,406.942851,178.520409,58.984276,58.751781,...,552.568813,197.157508,93.959423,138.033832,140.442385,24.620311,25.660283,5.086786,15.595429,3.317151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224,657.0,0.114779,0.0,5.134470,29.550652,70.252772,475.599652,123.752437,130.405328,47.394848,...,106.278825,56.295609,18.507232,12.673343,52.903910,11.887792,7.803538,5.944996,7.219808,38.036295
225,659.0,8.002394,0.0,6.540513,369.729321,423.587893,644.297052,354.213383,269.526140,164.717221,...,303.602454,143.049412,30.740182,23.312994,110.784419,42.178824,15.958587,8.053713,11.404543,95.165645
226,661.0,6.643202,0.0,6.399909,29.325423,39.774227,352.666308,112.946452,33.360350,20.677945,...,72.367612,31.630268,5.507230,16.221833,208.179672,13.692702,4.499337,4.408329,7.574833,112.213230
231,684.0,0.000000,0.0,6.470211,25.667443,107.953187,312.124408,78.018470,55.859572,25.761736,...,232.777249,145.198573,47.885830,30.558781,68.583482,8.916445,15.466472,3.901555,10.029257,2.528210


In [39]:
TF_ClusterSet = DataSet.drop_duplicates(subset='breath_id', keep='first').reset_index(drop=True)

In [43]:
#TF_ClusterSet.drop(TF_ClusterSet.columns[[1,2,3,4]], axis =1, inplace=True)
#TF_ClusterSet.head()
TF_ClusterSet.dropna(subset='Curvature', inplace=True)
TF_ClusterSet.to_csv("TEST_Cluster26Hz.csv")

In [8]:
os.chdir("/home/molderon/Main/WorkSpace/Diploma Project/Classification Algorithms/")
DataSet = pd.DataFrame()
DataSet = pd.read_csv("ClusterSet_26Hz.csv")