# **Description**
This notebook was used in the first iteration to separate the files into windows relative to sample detect time. It wasn't great, though, because the indexing was highly dependent on the amount of wet-up removed. A much better function for this (`window_after_zeroed()`) is included in the final preprocessing notebook. 

In [None]:
# Split the time series into segments based on sample detect time. 
# Will turn into a script later. 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Root of the project data folder. Every input file below is resolved against it,
# so running the notebook on another machine only requires changing this one line.
DATA_DIR = '/Users/saral/OneDrive - UBC/MDS/Capstone/Code + Data/Data'
TS_DIR = DATA_DIR + '/Time Series'
PRED_DIR = DATA_DIR + '/Raw Data Predictors'

# Time series tables: one row per reading (rows are keyed by 'TestId' further down).
ecd_ts = pd.read_csv(TS_DIR + '/ECDTS/ECD_TS_STAND.csv')
un_ts = pd.read_csv(TS_DIR + '/unsuccesful time series/US_TS_STAND.csv')
sy_ts = pd.read_csv(TS_DIR + '/ECDTS/ECD_TS_SYNTH_STAND.csv')
con_ts = pd.read_csv(TS_DIR + '/ECDTS/ECD_TS_Contaminated.csv')

# Predictor tables: one row per reading (keyed by 'TestID', with 'SampleDetectTime').
un_pred = pd.read_csv(PRED_DIR + '/Unsuccessful.csv')
ecd_pred = pd.read_csv(PRED_DIR + '/ECD.csv')
sy_pred = pd.read_csv(PRED_DIR + '/SyntheticPC.csv')
con_pred = pd.read_csv(PRED_DIR + '/PCAggContaminated.csv')

In [None]:
# Keep only the time series whose predictor row reports a non-zero Sample Detect Time.
un_ts = un_ts[
    un_ts['TestId'].isin(un_pred.loc[un_pred['SampleDetectTime'].ne(0), 'TestID'])
]
sy_ts = sy_ts[
    sy_ts['TestId'].isin(sy_pred.loc[sy_pred['SampleDetectTime'].ne(0), 'TestID'])
]

In [None]:
# Histogram of reading lengths (count of non-null cells in each row) for the unsuccessful readings.
junk = plt.hist(un_ts.notna().sum(axis=1), bins=30)

In [None]:
# Histogram of reading lengths (count of non-null cells in each row) for the ECD readings.
junk = plt.hist(ecd_ts.notna().sum(axis=1), bins=30)

In [None]:
# Histogram of reading lengths (count of non-null cells in each row) for the synthetic ECD readings.
junk = plt.hist(sy_ts.notna().sum(axis=1), bins=30)

In [None]:
# Column-wise median of the unsuccessful readings, plotted against a shared axis `t`.
# `t` runs from 150 in steps of 0.2 and is reused by the plotting cells that follow.
t = np.arange(150, 300.2, 0.2)
# Skip the first entry of the median, which belongs to the id column rather than a sample.
plt.plot(t, un_ts.median().iloc[1:])

In [None]:
# Column-wise median of the 'wild' ECD readings (first entry skipped: it is the id column),
# followed by the mean sample detect time, shown as the cell output.
plt.plot(t, ecd_ts.median().iloc[1:])
ecd_pred['SampleDetectTime'].mean()

In [None]:
# Column-wise median of the synthetic ECD readings (first entry skipped: it is the id column),
# followed by the mean sample detect time, shown as the cell output.
plt.plot(t, sy_ts.median().iloc[1:])
sy_pred['SampleDetectTime'].mean()

In [None]:
def window(start, end, ts, pred, sample_period=0.2, wetup_samples=750):
    """Cut a fixed-length window out of every time series, relative to its sample detect time.

    Parameters
    ----------
    start : float
        Seconds from sample detect at which the window opens (e.g. -5 starts
        5 seconds before sample detect, 5 starts 5 seconds after).
    end : float
        Seconds from sample detect at which the window closes (exclusive).
    ts : pandas.DataFrame
        One time series per row: a 'TestId' column plus one column per sample.
    pred : pandas.DataFrame
        One reading per row, with 'TestID' and 'SampleDetectTime' (seconds) columns.
    sample_period : float, default 0.2
        Seconds between consecutive samples.
    wetup_samples : int, default 750
        Number of leading samples that were already removed from `ts` for wet-up.

    Returns
    -------
    pandas.DataFrame
        A 'TestId' column followed by integer-labelled columns 0..width-1, one
        row per row of `ts` (in the same order). Any sample that falls outside
        the recorded series, and every sample of a reading whose id is missing
        from `pred` or whose detect time is NaN, is returned as NaN so that a
        later `dropna()` discards incomplete windows.

    Notes
    -----
    Seconds are converted to sample counts by rounding to the nearest sample.
    Truncating instead is off by one whenever the division lands just below an
    integer, e.g. 1.2 / 0.2 == 5.999999999999999.
    """
    # Window bounds as sample offsets from sample detect (rounded, not truncated).
    start_offset = int(round(start / sample_period))
    end_offset = int(round(end / sample_period))
    width = max(end_offset - start_offset, 0)

    # Sample detect expressed as a 0-based position within the sample columns
    # (i.e. after the wet-up samples have been removed). Kept as float so that
    # missing detect times stay NaN instead of failing an integer cast.
    detect = np.rint(pred['SampleDetectTime'].to_numpy(dtype=float) / sample_period) - wetup_samples

    # Attach each time series row to its detect position by id. A left merge
    # keeps the order of `ts`; '_row' remembers which row of `ts` to read from.
    keys = pd.DataFrame({'TestId': ts['TestId'].to_numpy(), '_row': np.arange(len(ts))})
    lookup = pd.DataFrame({'TestId': pred['TestID'].to_numpy(), '_detect': detect})
    merged = keys.merge(lookup, how='left', on='TestId')

    # Read samples from the sample columns only, so helper/id columns can never
    # leak into a window that runs past the end of the series.
    samples = ts.drop(columns='TestId').to_numpy(dtype=float)
    positions = (merged['_detect'].to_numpy(dtype=float)[:, None]
                 + start_offset + np.arange(width)[None, :])

    # Only positions inside the recorded series are read; negative positions are
    # treated as missing rather than wrapping around to the end of the row.
    valid = np.isfinite(positions) & (positions >= 0) & (positions < samples.shape[1])
    windowed = np.full(positions.shape, np.nan)
    rows, cols = np.nonzero(valid)
    source_rows = merged['_row'].to_numpy()
    windowed[rows, cols] = samples[source_rows[rows], positions[rows, cols].astype(int)]

    # Return a data frame with the test ids reattached as the first column.
    result = pd.DataFrame(windowed)
    result.insert(0, 'TestId', merged['TestId'].to_numpy())
    return result

In [None]:
# Three windows for now, each cut from the unsuccessful, ECD and synthetic sets in turn:
#   *_cal    : 15 s to 3 s before sample detect
#   *_post   : 12 s to 16 s after sample detect
#   *_sample : 32 s to 35 s after sample detect
un_cal, ecd_cal, syn_cal = (
    window(-15, -3, series, predictors)
    for series, predictors in ((un_ts, un_pred), (ecd_ts, ecd_pred), (sy_ts, sy_pred))
)

un_post, ecd_post, syn_post = (
    window(12, 16, series, predictors)
    for series, predictors in ((un_ts, un_pred), (ecd_ts, ecd_pred), (sy_ts, sy_pred))
)

un_sample, ecd_sample, syn_sample = (
    window(32, 35, series, predictors)
    for series, predictors in ((un_ts, un_pred), (ecd_ts, ecd_pred), (sy_ts, sy_pred))
)

# Drop rows with NAs. Might want to come back later and play with adjusting the wet-up period to be variable depending on sample detect time, but for now just leave it. 



In [None]:
# Folder that receives one CSV per windowed set; change this one line to redirect the output.
WINDOWED_DIR = '/Users/saral/OneDrive - UBC/MDS/Capstone/Code + Data/Data/Windowed Time Series'

# File stem -> windowed data frame, in the order the files are written.
windowed_sets = {
    'un_cal': un_cal,
    'un_post': un_post,
    'un_sample': un_sample,
    'ecd_cal': ecd_cal,
    'ecd_post': ecd_post,
    'ecd_sample': ecd_sample,
    'syn_cal': syn_cal,
    'syn_post': syn_post,
    'syn_sample': syn_sample,
}

for name, frame in windowed_sets.items():
    # Rows containing any NaN are dropped before saving; the row index is not written.
    frame.dropna().to_csv(WINDOWED_DIR + '/' + name + '.csv', index=False)