In [None]:
import altair as alt
import numpy as np
import pandas as pd

## Wet-up removed (-30, 40) --> Normalized

In [None]:
# Time series
# Normalized readings with the wet-up period removed (window from -30 to 40).
# The moving average is applied to these frames further down.
ecd_norm, ecd_syn_norm, ecd_con_norm = (
    pd.read_csv(path)
    for path in (
        "../../../data/TimeSeriesData/Normalized/ecd_norm_window.csv",
        "../../../data/TimeSeriesData/Normalized/syn_norm_window.csv",
        "../../../data/TimeSeriesData/Normalized/cont_norm_window.csv",
    )
)
un_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/un_norm_window.csv")

# The three ECD-type files stacked row-wise into one frame
ecd_tot_norm = pd.concat([ecd_norm, ecd_syn_norm, ecd_con_norm], axis = 0)


# Predictors (same files as in the other loading cells)
ecd_pred, syn_pred, con_pred = (
    pd.read_csv(path)
    for path in (
        '../../../data/RawDataPredictors/New/ECDContact.csv',
        '../../../data/RawDataPredictors/New/SyntheticPC.csv',
        '../../../data/RawDataPredictors/New/PCAggContaminated.csv',
    )
)
un_pred = pd.read_csv('../../../data/RawDataPredictors/New/Unsuccessful.csv')

ecd_pred_tot = pd.concat([ecd_pred, syn_pred, con_pred])

## Normalized 

In [None]:
# Time series
# Normalized readings loaded from the files WITHOUT the `_window` suffix.
# NOTE(review): presumably these still contain the wet-up period — confirm.
# This cell rebinds the same names as the "wet-up removed" loading cell, so
# whichever of the two ran last decides what the moving average is computed on.
ecd_norm, ecd_syn_norm, ecd_con_norm = (
    pd.read_csv(path)
    for path in (
        "../../../data/TimeSeriesData/Normalized/ecd_norm.csv",
        "../../../data/TimeSeriesData/Normalized/syn_norm.csv",
        "../../../data/TimeSeriesData/Normalized/cont_norm.csv",
    )
)
un_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/un_norm.csv")

# The three ECD-type files stacked row-wise into one frame
ecd_tot_norm = pd.concat([ecd_norm, ecd_syn_norm, ecd_con_norm], axis = 0)


# Predictors (same files as in the other loading cells)
ecd_pred, syn_pred, con_pred = (
    pd.read_csv(path)
    for path in (
        '../../../data/RawDataPredictors/New/ECDContact.csv',
        '../../../data/RawDataPredictors/New/SyntheticPC.csv',
        '../../../data/RawDataPredictors/New/PCAggContaminated.csv',
    )
)
un_pred = pd.read_csv('../../../data/RawDataPredictors/New/Unsuccessful.csv')

ecd_pred_tot = pd.concat([ecd_pred, syn_pred, con_pred])

## Wet-up removed

In [None]:
# Time series
# Readings from the "Windowed" folder (wet-up period removed).
# NOTE(review): these are not read from the "Normalized" folder, so they are
# presumably the un-normalized waveforms — confirm.
ECD, ecd_syn, ecd_con = (
    pd.read_csv(path)
    for path in (
        "../../../data/TimeSeriesData/Windowed/ECD.csv",
        "../../../data/TimeSeriesData/Windowed/syn.csv",
        "../../../data/TimeSeriesData/Windowed/cont.csv",
    )
)
un = pd.read_csv("../../../data/TimeSeriesData/Windowed/un.csv")

# The three ECD-type files stacked row-wise into one frame
ecd_tot = pd.concat([ECD, ecd_syn, ecd_con], axis = 0)


# Predictors (same files as in the other loading cells)
ecd_pred, syn_pred, con_pred = (
    pd.read_csv(path)
    for path in (
        '../../../data/RawDataPredictors/New/ECDContact.csv',
        '../../../data/RawDataPredictors/New/SyntheticPC.csv',
        '../../../data/RawDataPredictors/New/PCAggContaminated.csv',
    )
)
un_pred = pd.read_csv('../../../data/RawDataPredictors/New/Unsuccessful.csv')

ecd_pred_tot = pd.concat([ecd_pred, syn_pred, con_pred])

## Creating Moving Average

In [None]:
def creating_moving_average_data(data, window, center = True):
    """Smooths every reading (row) with a moving average taken along the time columns.

    Args:
        data (pandas data frame): One reading per row, with a `TestId` column and one numeric column per time step.
        window (int): Number of consecutive time steps that are averaged together.
        center (bool, default = True): Whether the average is labelled at the center of the window (True) or at its right edge (False).

    Returns:
        A new pandas data frame with the smoothed values followed by the `TestId` column. Time columns
        for which no reading has a full window are removed, and so are readings that are too short to
        produce a single smoothed value.
    """
    signal = data.drop(columns = 'TestId')

    # `DataFrame.rolling(..., axis = 1)` is deprecated in recent pandas releases, so the window is rolled
    # down the rows of the transposed frame and the result is transposed back.
    smoothed = signal.T.rolling(window, center = center).mean().T

    # Drop the time steps at the edges where no reading has enough points for a full window
    rolling_data = smoothed.dropna(axis = 1, how = 'all').reset_index(drop = True)
    rolling_data['TestId'] = data['TestId'].reset_index(drop = True)

    # For series that are smaller than window: only the TestId is left, so fewer than 2 non-missing values
    rolling_data = rolling_data.dropna(axis = 0, thresh = 2).reset_index(drop = True)
    return rolling_data

In [None]:
# Centered 31-point moving average of the normalized series (ECD-type, then unsuccessful readings)
ecd_MA = creating_moving_average_data(data = ecd_tot_norm, window = 31, center = True)
un_MA = creating_moving_average_data(data = un_norm, window = 31, center = True)

In [None]:
# Centered 31-point moving average of the series loaded from the "Windowed" folder
ecd_MA_w = creating_moving_average_data(data = ecd_tot, window = 31, center = True)
un_MA_w = creating_moving_average_data(data = un, window = 31, center = True)

## Creating Windows

In [None]:
def window(start, end, ts, pred, window, center = True):
    """Windows the data output by the `creating_moving_average_data` function.

    Every reading is cut to the columns between `start` and `end` seconds relative to its own sample
    detect time. The column positions are computed assuming the columns of `ts` are 0.2 seconds apart
    and that the unsmoothed series started at time 0.

    Args:
        start (int): Start of window with respect to sample detect time (i.e -15 represents 15 seconds before sample detect). Rounded to the nearest multiple of 0.2.
        end (int): End of window (exclusive) with respect to sample detect time (i.e 30 represents 30 seconds after sample detect). Rounded to the nearest multiple of 0.2.
        ts (pandas data frame): Output from the `creating_moving_average_data` function (a dataframe containing the moving averages and a `TestId` column).
        pred (pandas data frame): Dataframe containing the predictors, with a `TestID` column and a `SampleDetectTime` column (i.e when windowing the unsuccessful readings, use the predictor file for the unsuccessful readings).
        window (int): The window that was used when calculating the moving average for the ts dataframe (needs to be an odd number if centered).
        center (bool, default = True): Whether or not the moving average that was calculated for the ts dataframe was centered (True) or not (False).

    Returns:
        A new pandas data frame with one row per reading: the `TestId` followed by the values inside the
        window, in columns labelled with the time relative to sample detect (`start`, `start + 0.2`, ...).
        Readings with a sample detect time of 0 (or missing), readings whose window does not fit entirely
        inside the available columns, and readings with missing values inside the window are left out.
        The values are returned as stored in `ts`; no additional normalization is applied.

    Raises:
        ValueError: If `end` is not greater than `start`.
    """
    sampling_period = 0.2  # seconds between two consecutive columns of `ts`

    # Offsets (in columns) of the window edges relative to the sample detect column.
    # `round` instead of `int` so that float error (e.g. 0.6 / 0.2 == 2.9999999999999996) cannot shift an edge.
    first_offset = int(round(start / sampling_period))
    last_offset = int(round(end / sampling_period))
    n_steps = last_offset - first_offset
    if n_steps <= 0:
        raise ValueError("`end` must be greater than `start`")

    # Removing the readings with a sample detect time of 0 (or without one)
    detected = pred.loc[pred['SampleDetectTime'] != 0, ['TestID', 'SampleDetectTime']].dropna()
    detected.columns = ['TestId', '_detect_time']

    # Keep only the readings that have a usable sample detect time, in the order they appear in `ts`
    ts = ts.merge(detected, how = 'inner', on = 'TestId')
    detect_time = ts.pop('_detect_time').to_numpy(dtype = float)
    test_ids = ts['TestId']
    signal = ts.drop(columns = 'TestId').to_numpy(dtype = float)

    # Retrieving the index corresponding to the sample detect time.
    # We subtract the number of leading columns that were removed during the calculation of the moving
    # average (i.e when the centered window is of size 31, the first 15 columns of the moving average are
    # NA and were dropped by `creating_moving_average_data`).
    lost_columns = window // 2 if center else window - 1
    # The small epsilon keeps an on-grid time such as 30.6 from being floored one column too low
    detect_index = np.floor(detect_time / sampling_period + 1e-9).astype(int) - lost_columns

    # Column positions of the start (inclusive) and the end (exclusive) of the window for every reading
    first = detect_index + first_offset
    last = detect_index + last_offset

    # A window is only usable when it lies entirely inside the signal columns. Negative positions would
    # otherwise wrap around to the end of the row, and positions past the last signal column would pick
    # up non-signal columns such as the TestId.
    rows = np.flatnonzero((first >= 0) & (last <= signal.shape[1]))

    # Select the window; readings without a usable window stay NaN and are dropped below
    values = np.full((len(ts), n_steps), np.nan)
    columns = first[rows][:, None] + np.arange(n_steps)
    values[rows] = signal[rows[:, None], columns]

    # `+ 0.0` turns a rounded -0.0 into 0.0 so that the label reads '0.0'
    labels = [str(round(start + step * sampling_period, 1) + 0.0) for step in range(n_steps)]
    subsets = pd.DataFrame(values, columns = labels)

    # Join the TestId to the windows
    windowed_data = pd.concat([test_ids.to_frame(), subsets], axis = 1)
    windowed_data = windowed_data.dropna()
    return windowed_data

In [None]:
# # Define three windows for now (use the normalized waveforms). 
# un_cal = window(start = -15, end = -3, ts = un_MA, pred = un_pred, window = 31)
# un_post = window(start = 12, end = 16, ts = un_MA, pred = un_pred, window = 31)
# un_sample = window(start = 32, end = 35, ts = un_MA, pred = un_pred, window = 31)

# ecd_cal = window(start = -15, end =  -3, ts = ecd_MA, pred = ecd_pred_tot, window = 31)
# ecd_post = window(start = 12, end =  16, ts = ecd_MA, pred = ecd_pred_tot, window = 31)
# ecd_sample = window(start = 32, end = 35, ts = ecd_MA, pred = ecd_pred_tot, window = 31)

In [None]:
def window_smoothed(start, end, data):
    """Selects the columns labelled `start` through `end` (both included) together with the TestId.

    Args:
        start (str): Label of the first column to keep. Must include a trailing 0 for integers, i.e use '-15.0' rather than '-15'.
        end (str): Label of the last column to keep, written the same way as `start`.
        data (pandas data frame): Dataframe with a `TestId` column and the labelled time columns.

    Returns:
        A pandas data frame with the `TestId` column first, followed by the selected columns.
    """
    pieces = [data['TestId'], data.loc[:, start:end]]
    return pd.concat(pieces, axis = 1)

In [None]:
# Cal (-15 to -3), post (12 to 16) and sample (32 to 35) windows of the smoothed normalized series.
# Unsuccessful readings
un_cal = window_smoothed(start = '-15.0', end = '-3.0', data = un_MA)
un_post = window_smoothed(start = '12.0', end = '16.0', data = un_MA)
un_sample = window_smoothed(start = '32.0', end = '35.0', data = un_MA)

# ECD-type readings
ecd_cal = window_smoothed(start = '-15.0', end = '-3.0', data = ecd_MA)
ecd_post = window_smoothed(start = '12.0', end = '16.0', data = ecd_MA)
ecd_sample = window_smoothed(start = '32.0', end = '35.0', data = ecd_MA)

In [None]:
# Same three windows, taken from the smoothed "Windowed" series instead.
# This rebinds the names assigned from `un_MA` / `ecd_MA` in the previous cell.
# Unsuccessful readings
un_cal = window_smoothed(start = '-15.0', end = '-3.0', data = un_MA_w)
un_post = window_smoothed(start = '12.0', end = '16.0', data = un_MA_w)
un_sample = window_smoothed(start = '32.0', end = '35.0', data = un_MA_w)

# ECD-type readings
ecd_cal = window_smoothed(start = '-15.0', end = '-3.0', data = ecd_MA_w)
ecd_post = window_smoothed(start = '12.0', end = '16.0', data = ecd_MA_w)
ecd_sample = window_smoothed(start = '32.0', end = '35.0', data = ecd_MA_w)

## Keeping only the TestIDs that are common in all 3 windows 

We want to compute metrics that compare the behavior across the windows, for example the difference between the mean in the cal window and the mean in the post window. This is only possible for readings that are present in all three windows, which excludes the shorter waveforms that never reach the post/sample windows. We therefore only consider the TestIds that are in the sample window (a reading that makes it to the sample window must also be in the post and cal windows).

Let's create, for each window, a dataframe that stacks the unsuccessful readings and the ECD readings. We add a label (True if ECD, False if not) to make it easier to tell the two groups apart later, once we have clustered.

In [None]:
# `window_smoothed` keeps every reading, including those that end before the sample window and
# therefore only hold missing values there. Drop them first, otherwise every TestId counts as
# "common" and the filtering of the cal/post windows does nothing.
un_sample = un_sample.dropna().reset_index(drop = True)
common_un_ids = un_sample['TestId'].reset_index(drop = True)

In [None]:
# Restrict the cal and post windows of the unsuccessful readings to the TestIds kept for the sample window
un_cal = un_cal.loc[un_cal['TestId'].isin(common_un_ids)]
un_post = un_post.loc[un_post['TestId'].isin(common_un_ids)]

In [None]:
# `window_smoothed` keeps every reading, including those that end before the sample window and
# therefore only hold missing values there. Drop them first, otherwise every TestId counts as
# "common" and the filtering below does nothing.
ecd_sample = ecd_sample.dropna().reset_index(drop = True)
common_ecd_ids = ecd_sample['TestId'].reset_index(drop = True)

# Restrict the cal and post windows to the readings that reach the sample window
ecd_cal = ecd_cal[ecd_cal['TestId'].isin(common_ecd_ids)]
ecd_post = ecd_post[ecd_post['TestId'].isin(common_ecd_ids)]

In [None]:
# Adding label to differentiate ECDs from unsuccessful.
# `.assign` returns a new frame, which avoids writing a column into the boolean-filtered
# cal/post frames (that in-place write triggers pandas' SettingWithCopyWarning).
un_cal = un_cal.assign(Label = False)
ecd_cal = ecd_cal.assign(Label = True)

un_post = un_post.assign(Label = False)
ecd_post = ecd_post.assign(Label = True)

un_sample = un_sample.assign(Label = False)
ecd_sample = ecd_sample.assign(Label = True)

# Concatenating the ECD readings with the unsuccessful readings
conv_cal = pd.concat([un_cal, ecd_cal], axis = 0).reset_index(drop = True)
conv_post = pd.concat([un_post, ecd_post], axis = 0).reset_index(drop = True)
conv_sample = pd.concat([un_sample, ecd_sample], axis = 0).reset_index(drop = True)

In [None]:
# Mean of each reading over the time columns of its window (TestId and Label are left out of the average)
conv_cal = conv_cal.assign(mean = conv_cal.drop(columns = ['TestId', 'Label']).mean(axis = 1))
conv_post = conv_post.assign(mean = conv_post.drop(columns = ['TestId', 'Label']).mean(axis = 1))
conv_sample = conv_sample.assign(mean = conv_sample.drop(columns = ['TestId', 'Label']).mean(axis = 1))

In [None]:
# Keep only the identifier, the label and the window mean for each window
cal_feat = conv_cal.loc[:, ['TestId', 'Label', 'mean']]
post_feat = conv_post.loc[:, ['TestId', 'Label', 'mean']]
sample_feat = conv_sample.loc[:, ['TestId', 'Label', 'mean']]

In [None]:
# Display the cal-window feature table (TestId, Label, mean) for inspection
cal_feat

In [None]:
# One row per reading with the mean of each window side by side, matched on TestId.
# The post means are attached with a left join; the sample means with a right join,
# so the result follows the readings of the sample window.
feat = (
    cal_feat
    .rename(columns = {'mean': 'mean_cal'})
    .join(post_feat.set_index('TestId')['mean'].rename('mean_post'), on = 'TestId')
    .join(sample_feat.set_index('TestId')['mean'].rename('mean_sample'), on = 'TestId', how = 'right')
)

In [None]:
# Absolute difference between the window means, for each pair of windows
feat['cal-post'] = (feat['mean_cal'] - feat['mean_post']).abs()
feat['cal-sample'] = (feat['mean_cal'] - feat['mean_sample']).abs()
feat['sample-post'] = (feat['mean_sample'] - feat['mean_post']).abs()

In [None]:
# Display the per-reading feature table, including the absolute differences between the window means
feat

In [None]:
# Leave out two specific readings by TestId.
# NOTE(review): the reason for excluding them is not recorded in this notebook — confirm and document.
feat = feat.loc[~feat['TestId'].isin([9610647, 9610462])]

In [None]:
# Lift Altair's row limit before building the first chart. The notebook previously did this only
# in a later cell, so on a fresh kernel this chart ran without it (`alt` itself comes from the
# import cell at the top of the notebook).
alt.data_transformers.enable('default', max_rows=None)

# Density of the three absolute differences between window means, one stacked area per measurement
alt.Chart(feat).transform_fold(
    ['cal-post',
     'cal-sample',
     'sample-post'],
    as_ = ['Measurement_type', 'value']
).transform_density(
    density='value',
    bandwidth=0.3,
    groupby=['Measurement_type'],
    extent= [0, 1],
    counts = True,
    steps=200
).mark_area().encode(
    alt.X('value:Q'),
    alt.Y('density:Q', stack='zero'),
    alt.Color('Measurement_type:N')
).properties(width=400, height=100)

In [None]:
# Lift Altair's row limit (`alt` itself comes from the import cell at the top of the notebook)
alt.data_transformers.enable('default', max_rows=None)

# Density of the cal-post difference, estimated separately for each label.
# `groupby` is required: the density transform only outputs the value, the density and the
# groupby fields, so without it the `Label` field used for the color would not exist.
alt.Chart(feat).mark_area().transform_density(
    'cal-post',
    groupby=['Label'],
    as_=['cal-post', 'density']).encode(
    alt.X('cal-post:Q'),
    alt.Y('density:Q'),
    alt.Color('Label:N'))

In [None]:
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Box plot of the cal-post difference, one box per label
(
    alt.Chart(feat)
    .properties(width=300)
    .mark_boxplot(size=50)
    .encode(x='Label', y='cal-post', color='Label')
)

In [None]:
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Box plot of the cal-sample difference, one box per label
(
    alt.Chart(feat)
    .properties(width=300)
    .mark_boxplot(size=50)
    .encode(x='Label', y='cal-sample', color='Label')
)

In [None]:
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

# Box plot of the sample-post difference, one box per label
(
    alt.Chart(feat)
    .properties(width=300)
    .mark_boxplot(size=50)
    .encode(x='Label', y='sample-post', color='Label')
)