In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal

In [None]:
# Predictors: one CSV per reading category
un_pred = pd.read_csv('../../../data/RawDataPredictors/New/Unsuccessful.csv')
ecd_pred = pd.read_csv('../../../data/RawDataPredictors/New/ecdContact.csv')
syn_pred = pd.read_csv('../../../data/RawDataPredictors/New/SyntheticECD.csv')
con_pred = pd.read_csv('../../../data/RawDataPredictors/New/ECDAggContaminated.csv')

# Predictors of the ecd contact, synthetic and contaminated readings stacked row-wise
ecd_pred_tot = pd.concat((ecd_pred, syn_pred, con_pred), axis = 0)

# Timeseries: one CSV per reading category
analyte_b = pd.read_csv("../../../data/TimeSeriesData/ecdContactTS/ECD_A2_all.csv")
ecd = pd.read_csv("../../../data/TimeSeriesData/ecdContactTS/ECD_TS.csv")
ecd_syn = pd.read_csv("../../../data/TimeSeriesData/ecdContactTS/ECD_TS_Synthetic.csv")
ecd_con = pd.read_csv("../../../data/TimeSeriesData/ecdContactTS/ECD_TS_Contaminated.csv")
un = pd.read_csv("../../../data/TimeSeriesData/UnsuccessfulReadingsTS/US_TS.csv")

# Time series of the ecd contact, synthetic and contaminated readings stacked row-wise
ecd_tot = pd.concat((ecd, ecd_syn, ecd_con), axis = 0)

In [None]:
analyte_b_1 = analyte_b[analyte_b['TestId'] == 8105859].drop(columns = ['TestId', ' Channel'], axis = 1).iloc[0].dropna()
analyte_a_1 = ecd[ecd['TestId'] == 8105859].drop(columns = ['TestId'], axis = 1).iloc[0].dropna()

In [None]:
# Sample-index axes, one position per row of `un` / per sample of each trace
x_un = np.arange(len(un))
x_b = np.arange(len(analyte_b_1))
x_a = np.arange(len(analyte_a_1))

# Zoom on samples 1200 to 1399 of both traces, side by side
zoom = slice(1200, 1400)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
for ax, xs, trace in zip(axes, (x_b, x_a), (analyte_b_1, analyte_a_1)):
    ax.plot(xs[zoom], trace[zoom])
fig.tight_layout()

## Creating Moving Average

In [None]:
# We use the normalized data, and then apply the moving average
ecd_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/ecd_norm_window.csv")
ecd_syn_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/syn_norm_window.csv")
ecd_con_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/cont_norm_window.csv")
un_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/un_norm_window.csv")

ecd_tot_norm = pd.concat([ecd_norm, ecd_syn_norm, ecd_con_norm], axis = 0)

In [None]:
def creating_moving_average_data(data, window, center = True):
    """Computes the moving average of every time series (row) of a dataframe.

    Args:
        data (pandas data frame): One time series per row. Must contain a `TestId` column; every other column is treated as one sample of the series.
        window (int): Number of consecutive samples averaged together.
        center (bool, default = True): Whether the averaging window is centered on each sample (True) or ends at it (False).

    Returns:
        A new pandas data frame holding the moving averages followed by a `TestId` column. Columns for which no row has a value (the positions where the window does not fit) are removed, and so are rows without a single moving-average value (series shorter than the window). The index is reset to 0..n-1.
    """
    samples = data.drop(columns = 'TestId').reset_index(drop = True)

    # `DataFrame.rolling(axis = 1)` is deprecated since pandas 2.1 and removed in 3.0,
    # so the window is rolled down the rows of the transposed frame and transposed back.
    rolling_data = samples.T.rolling(window, center = center).mean().T
    rolling_data = rolling_data.dropna(axis = 1, how = 'all')

    rolling_data = rolling_data.assign(TestId = data['TestId'].reset_index(drop = True))

    # thresh = 2 keeps the rows holding the TestId plus at least one average (drops series smaller than window)
    return rolling_data.dropna(axis = 0, thresh = 2).reset_index(drop = True)

In [None]:
ecd_rolling_norm = creating_moving_average_data(ecd_tot_norm, 31)
un_rolling_norm = creating_moving_average_data(un_norm, 31)

In [None]:
def moving_average_plot(rolling_data, raw_data, num_plots, testid = 8883093):
    """Plots a raw trace together with its moving average.

    Args:
        rolling_data (pandas data frame): Moving averages, one reading per row, with a `TestId` column.
        raw_data (pandas data frame): Traces the moving averages were computed from, one reading per row, with a `TestId` column.
        num_plots (int): Number of figures to draw.
        testid (int or None, default = 8883093): Reading drawn in every figure (the default reproduces the previously hard-coded reading). Pass None to draw the first `num_plots` rows of `rolling_data` instead, one reading per figure.

    Returns:
        None. One matplotlib figure is created per iteration.
    """
    if testid is None:
        # Never ask for more rows than `rolling_data` holds
        num_plots = min(num_plots, len(rolling_data))

    for i in range(num_plots):
        current_id = int(rolling_data.iloc[i]['TestId']) if testid is None else testid

        # First row matching the id, as a series indexed by the sample column labels
        MA = rolling_data[rolling_data['TestId'] == current_id].drop(columns = 'TestId').iloc[0].dropna().rename('MA')
        raw = raw_data[raw_data['TestId'] == current_id].drop(columns = 'TestId').iloc[0].dropna().rename('raw')

        # Align both series on the sample labels and keep the samples where both exist
        raw_MA_combined = raw.to_frame().join(MA.to_frame()).dropna(axis = 0)

        raw_MA_combined[['raw', 'MA']].plot(figsize=(10, 5), title = f'Trace for TestId : {current_id}')

In [None]:
#ecd_sara = window(start = -30, end =  40, ts = ecd_rolling_norm, pred = ecd_pred_tot, window = 31)
moving_average_plot(ecd_rolling_norm, ecd_tot_norm, 1)

In [None]:
sara = ecd_rolling_norm[ecd_rolling_norm['TestId'] == 8883093]
sara.to_csv('MA_test.csv')

## Visualizations of Moving Average

#### Remove Wet-up --> Normalized --> Applied moving average

In [None]:
loc = 83
ide = ecd['TestId'][loc]

In [None]:
# Times are converted to sample positions by dividing by 0.2 (seconds per sample).
# A bare int() truncates float division error (0.6 / 0.2 == 2.9999999999999996 -> 2), which shifts
# the index by one sample; the small epsilon absorbs that error while still flooring.
sample_start = int(np.floor(ecd_pred[ecd_pred['TestID'] == ide]['SampleDetectTime'].item() / 0.2 + 1e-9))
window_start = int(round(-30 / 0.2))
window_end = int(round(40 / 0.2))

In [None]:
# Removing the wetup periods: keep only the columns between window_start and window_end
# (both relative to the sample detection index)
first = sample_start + window_start
last = sample_start + window_end

analyte_b_row = analyte_b[analyte_b['TestId'] == ide]
analyte_a_row = ecd[ecd['TestId'] == ide]

# NOTE(review): the +2 / +1 shifts presumably skip the leading non-sample columns
# (`TestId` and ' Channel' for analyte_b, `TestId` for ecd) - confirm against the CSV layout
analyte_b_window = analyte_b_row.iloc[:, first + 2:last + 2]
analyte_a_window = analyte_a_row.iloc[:, first + 1:last + 1]

# 15 extra samples on each side: a centered rolling mean with a window of 31 uses 15 points on each side
moving_window = analyte_a_row.iloc[:, first - 15 + 1:last + 15 + 1]

In [None]:
def _min_max_single_row(frame):
    """Rescales a one-row frame with (value - min) / (max - min); `.item()` raises if the frame has more than one row."""
    lowest = frame.min(axis = 1).item()
    highest = frame.max(axis = 1).item()
    return (frame - lowest) / (highest - lowest)


analyte_a_norm = _min_max_single_row(analyte_a_window)
analyte_b_norm = _min_max_single_row(analyte_b_window)
moving_norm = _min_max_single_row(moving_window)

In [None]:
# `DataFrame.rolling(axis = 1)` is deprecated since pandas 2.1 and removed in 3.0,
# so the centered 31-sample mean is rolled over the transposed frame and transposed back.
rolling_norm = moving_norm.T.rolling(window = 31, center = True).mean().T.dropna(axis = 1, how = 'all')

In [None]:
x = np.arange(window_start, window_end)
plt.plot(x, analyte_b_norm.transpose(), c = 'b', label = 'analyte_b', alpha = 0.3)
plt.plot(x, analyte_a_norm.transpose(), c = 'r', label = 'analyte_a', alpha = 0.3)
plt.plot(x, rolling_norm.transpose(), c = 'r', label = 'Rolling average for analyte_a')
plt.xlabel("Index w.r.t sample detection")
plt.ylabel("Normalized Signal")
plt.title("Comparision of normalized signals")
plt.legend()

#### Pulling out smaller window

In [None]:
small_win_start = int(-15/0.2)
small_win_end = int(-3/0.2)

In [None]:
# Column 0 of the windowed frames is `window_start` samples away from sample detection, so offsets
# relative to detection must be shifted by `window_start` before being used as positions.
# Using the negative offsets directly makes iloc count from the END of the window, which selects
# samples well after detection instead of the calibration period.
# NOTE(review): assumes rolling_norm has the same number of columns as the two windowed frames - confirm.
cal_start = small_win_start - window_start
cal_end = small_win_end - window_start

analyte_b_cal = analyte_b_norm.iloc[:, cal_start:cal_end]
analyte_a_cal = analyte_a_norm.iloc[:, cal_start:cal_end]
ma_cal = rolling_norm.iloc[:, cal_start:cal_end]

In [None]:
x = np.arange(small_win_start,small_win_end)
plt.plot(x, analyte_b_cal.transpose(), c = 'b', label = 'analyte_b', alpha = 0.3)
plt.plot(x, analyte_a_cal.transpose(), c = 'r', label = 'analyte_a', alpha = 0.3)
plt.plot(x, ma_cal.transpose(), c = 'r', label = 'Rolling average for analyte_a')
plt.xlabel("Index w.r.t sample detection")
plt.ylabel("Normalized Signal")
plt.title("Comparision of normalized signals for calibration window")
plt.legend()

#### Look at the power spectrum

In [None]:
# Periodogram (sampling frequency 5) of each single-row window pulled out above
(f, ps), (f1, ps_1), (f2, ps_2) = (
    signal.periodogram(trace, fs = 5) for trace in (analyte_a_cal, ma_cal, analyte_b_cal)
)

# (frequencies, power, colour, legend label, opacity) for every curve; None keeps matplotlib's default opacity
spectra = [
    (f, ps, 'r', 'analyte_a, no smoothing', 0.3),
    (f1, ps_1, 'r', 'Moving average', None),
    (f2, ps_2, 'b', 'analyte_b', 0.3),
]

plt.figure(figsize=(10,8))
with plt.style.context(('ggplot')):
    for freqs, power, colour, name, opacity in spectra:
        # periodogram keeps the leading row axis of the frame, hence power[0]
        plt.plot(freqs, power[0], colour, label = name, alpha = opacity)
    plt.legend()
    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Power')
    plt.title('Overlayed power spectral density estimations')

In [None]:
# Calculate the power spectra in a featureless region
f, ps = signal.periodogram(analyte_a_cal, fs = 5)
f1, ps_1 = signal.periodogram(ma_cal, fs = 5)
f2, ps_2 = signal.periodogram(analyte_b_cal, fs = 5)

 
 
plt.figure(figsize=(10,8))
with plt.style.context(('ggplot')):
    plt.plot(f1, ps_1[0], 'r', label = 'Moving average')
    plt.plot(f2, ps_2[0], 'b', label = 'analyte_b', alpha = 0.3)
    plt.legend()
    plt.xlabel('Frequency (Hz)')
    plt.ylabel('Power')
    plt.title('Overlayed power spectral density estimations')

## Windowing Moving Average

In [None]:
# Timeseries
# We use the normalized data, and then apply the moving average
ecd_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/ecd_norm.csv")
ecd_syn_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/syn_norm.csv")
ecd_con_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/cont_norm.csv")
un_norm = pd.read_csv("../../../data/TimeSeriesData/Normalized/un_norm.csv")

ecd_tot_norm = pd.concat([ecd_norm, ecd_syn_norm, ecd_con_norm], axis = 0)

# Predictors
un_pred = pd.read_csv('../../../data/RawDataPredictors/New/Unsuccessful.csv')
ecd_pred = pd.read_csv('../../../data/RawDataPredictors/New/ecdContact.csv')
syn_pred = pd.read_csv('../../../data/RawDataPredictors/New/SyntheticECD.csv')
con_pred = pd.read_csv('../../../data/RawDataPredictors/New/ECDAggContaminated.csv')

ecd_pred_tot = pd.concat([ecd_pred, syn_pred, con_pred])

In [None]:
ecd_rolling_norm = creating_moving_average_data(ecd_tot_norm, 31)
un_rolling_norm = creating_moving_average_data(un_norm, 31)

In [None]:
def window(start, end, ts, pred, window, center = True):
    """Windows the data output by the `creating_moving_average_data` function.

    Args:
        start (int): Start of window with respect to sample detect time (i.e -15 represents 15 seconds before sample detect)
        end (int): End of window with respect to sample detect time (i.e 30 represents 30 seconds after sample detect)
        ts (pandas data frame): Output from the `creating_moving_average_data` function (a dataframe containing the moving averages and a `TestId` column).
        pred (pandas data frame): Predictor file with a `TestID` and a `SampleDetectTime` column (i.e when windowing the unsuccessful readings, use the predictor file for the unsuccessful readings).
        window (int): The window that was used when calculating the moving average for the ts dataframe (needs to be an odd number if centered).
        center (bool, default = True): Whether or not the moving average that was calculated for the ts dataframe was centered (True) or not (False).

    Returns:
        A new pandas data frame with a `TestId` column followed by one column per 0.2 second step of the window, labelled with the time relative to sample detect. Readings with a sample detect time of 0, and readings whose series does not cover the whole window, are left out. The values are copied as they are; no normalization is applied here.
    """
    step = 0.2   # seconds between two consecutive samples
    eps = 1e-9   # absorbs float division error (0.6 / 0.2 == 2.9999999999999996) before flooring

    # Removing the readings with a sample detect time of 0
    detected = pred[pred['SampleDetectTime'] != 0]
    ids = detected['TestID'].reset_index(drop = True)
    sample_detect_time = detected['SampleDetectTime'].reset_index(drop = True)

    ts = ts[ts['TestId'].isin(ids)]

    # Number of leading columns removed when the moving average was created (i.e when the centered
    # window is of size 31, the first 15 columns were NA and dropped), which shifts every position.
    removed_columns = window // 2 if center else window - 1
    sample_detect_index = np.floor(sample_detect_time / step + eps).astype(int) - removed_columns

    # Window bounds as a number of samples relative to sample detect
    start_offset = int(round(start / step))
    end_offset = int(round(end / step))
    n_cols = max(end_offset - start_offset, 0)

    # Positions of the start and the end of the desired window for every reading
    index = pd.DataFrame({"TestId": ids,
                          "Sample detect time": sample_detect_time,
                          "Sample detect index": sample_detect_index,
                          "Start": sample_detect_index + start_offset,
                          "End": sample_detect_index + end_offset})

    # Only the moving-average columns may be sliced: slicing the merged frame positionally would let
    # a window that runs past the series pick up `TestId` and the columns merged in below.
    value_columns = [column for column in ts.columns if column != 'TestId']
    ts = ts.merge(index, how = 'left', on = 'TestId')
    values = ts[value_columns].to_numpy(dtype = float)
    starts = ts['Start'].to_numpy()

    # A window is usable only when it lies entirely inside the series. A negative start would
    # otherwise wrap around and silently read samples from the end of the row.
    in_range = (starts >= 0) & (starts + n_cols <= values.shape[1])
    subsets = np.full((len(ts), n_cols), np.nan)
    if n_cols > 0 and in_range.any():
        rows = np.flatnonzero(in_range)
        columns = starts[rows].astype(int)[:, None] + np.arange(n_cols)
        subsets[rows] = values[rows[:, None], columns]

    labels = [str(round(m, 1)) for m in np.arange(start, end, step)]
    if len(labels) != n_cols:
        # np.arange with a float step can be one element off; rebuild the labels from the sample count
        labels = [str(round(start + k * step, 1)) for k in range(n_cols)]
    subsets = pd.DataFrame(subsets, columns = labels)

    # Join the TestId to the windows and drop the readings whose window was not covered
    windowed_data = pd.concat([ts[['TestId']].reset_index(drop = True), subsets], axis = 1)
    return windowed_data.dropna()

In [None]:
# Define three windows for now (use the normalized waveforms). 
un_cal = window(start = -15, end = -3, ts = un_rolling_norm, pred = un_pred, window = 31)
un_post = window(start = 12, end = 16, ts = un_rolling_norm, pred = un_pred, window = 31)
un_sample = window(start = 32, end = 35, ts = un_rolling_norm, pred = un_pred, window = 31)

ecd_cal = window(start = -15, end =  -3, ts = ecd_rolling_norm, pred = ecd_pred_tot, window = 31)
ecd_post = window(start = 12, end =  16, ts = ecd_rolling_norm, pred = ecd_pred_tot, window = 31)
ecd_sample = window(start = 32, end = 35, ts = ecd_rolling_norm, pred = ecd_pred_tot, window = 31)

We want to find metrics that compare the behavior in each of the windows, for example by subtracting the mean in cal from the mean in post. We can only do this for readings that are present in all of the windows (this excludes the shorter waveforms that don't make it to the post/sample window). This is why we will only consider the TestIds that are in the sample window (if they make it to sample, they have to also be in post and cal).

Let's create a dataframe that combines the ecd contacts in the sample window with the unsuccessful readings in the sample window. We will add a label (True if ecd, False otherwise) to make it easier to tell them apart later, once we have clustered.

In [None]:
# Keeping only the TestId that are in all 3 windows (meaning only the ids that remain in the sample window)
common_ids = un_sample['TestId'].reset_index(drop = True)
common_ecd_ids = ecd_sample['TestId'].reset_index(drop = True)

# Adding label to differentiate ecds from unsuccessful.
# `.assign` returns a new frame, so the label is never written into a filtered slice
# (which raises SettingWithCopyWarning) and the cell can be re-run safely.
un_cal = un_cal[un_cal['TestId'].isin(common_ids)].assign(Label = False)
un_post = un_post[un_post['TestId'].isin(common_ids)].assign(Label = False)
un_sample = un_sample.assign(Label = False)

ecd_cal = ecd_cal[ecd_cal['TestId'].isin(common_ecd_ids)].assign(Label = True)
ecd_post = ecd_post[ecd_post['TestId'].isin(common_ecd_ids)].assign(Label = True)
ecd_sample = ecd_sample.assign(Label = True)

# Concatenating the ecd contact readings with the unsuccessful readings
MA_cal = pd.concat([un_cal, ecd_cal], axis = 0).reset_index(drop = True)
MA_post = pd.concat([un_post, ecd_post], axis = 0).reset_index(drop = True)
MA_sample = pd.concat([un_sample, ecd_sample], axis = 0).reset_index(drop = True)

In [None]:
MA_cal['mean'] = MA_cal.drop(columns = ['TestId', 'Label']).mean(axis = 1)
MA_post['mean'] = MA_post.drop(columns = ['TestId', 'Label']).mean(axis = 1)
MA_sample['mean'] = MA_sample.drop(columns = ['TestId', 'Label']).mean(axis = 1)

In [None]:
import matplotlib.pyplot as plt
# Calibration period for unsuccessful
# The rows are selected once, outside the loop; `head` caps the number of lines at the rows
# available, so fewer than `number_of_lines` readings no longer raise an IndexError.
cal_un = MA_cal[MA_cal['Label'] == False].drop(columns = ['mean', 'Label', 'TestId'])
x = [float(label) for label in cal_un.columns]
number_of_lines = 250
for _, y in cal_un.head(number_of_lines).iterrows():
    plt.plot(x, y)

In [None]:
# Calibration period for ecd
# The rows are selected once, outside the loop; `head` caps the number of lines at the rows
# available, so fewer than `number_of_lines` readings no longer raise an IndexError.
cal_ecd = MA_cal[MA_cal['Label'] == True].drop(columns = ['mean', 'Label', 'TestId'])
x = [float(label) for label in cal_ecd.columns]
number_of_lines = 250
for _, y in cal_ecd.head(number_of_lines).iterrows():
    plt.plot(x, y)

In [None]:
# Post period for unsuccessful
# The rows are selected once, outside the loop; `head` caps the number of lines at the rows
# available, so fewer than `number_of_lines` readings no longer raise an IndexError.
post_un = MA_post[MA_post['Label'] == False].drop(columns = ['mean', 'Label', 'TestId'])
x = [float(label) for label in post_un.columns]
number_of_lines = 250
for _, y in post_un.head(number_of_lines).iterrows():
    plt.plot(x, y)

In [None]:
# Post period for ecds
# The rows are selected once, outside the loop; `head` caps the number of lines at the rows
# available, so fewer than `number_of_lines` readings no longer raise an IndexError.
post_ecd = MA_post[MA_post['Label'] == True].drop(columns = ['mean', 'Label', 'TestId'])
x = [float(label) for label in post_ecd.columns]
number_of_lines = 250
for _, y in post_ecd.head(number_of_lines).iterrows():
    plt.plot(x, y)

In [None]:
# Sample period for unsuccessful
# The rows are selected once, outside the loop; `head` caps the number of lines at the rows
# available, so fewer than `number_of_lines` readings no longer raise an IndexError.
sample_un = MA_sample[MA_sample['Label'] == False].drop(columns = ['mean', 'Label', 'TestId'])
x = [float(label) for label in sample_un.columns]
number_of_lines = 250
for _, y in sample_un.head(number_of_lines).iterrows():
    plt.plot(x, y)

In [None]:
# Sample period for ecds
# The rows are selected once, outside the loop; `head` caps the number of lines at the rows
# available, so fewer than `number_of_lines` readings no longer raise an IndexError.
sample_ecd = MA_sample[MA_sample['Label'] == True].drop(columns = ['mean', 'Label', 'TestId'])
x = [float(label) for label in sample_ecd.columns]
number_of_lines = 250
for _, y in sample_ecd.head(number_of_lines).iterrows():
    plt.plot(x, y)

In [None]:
# Complete waveform for unsuccessful
# NOTE(review): the original cell read `un_rolling`, which is never defined in this notebook
# (NameError on a fresh kernel). `un_rolling_norm` is assumed to be the intended frame - confirm.
un_waveforms = un_rolling_norm.drop(columns = ['TestId'])
number_of_lines = 250
# `head` caps the number of lines at the rows available
for _, waveform in un_waveforms.head(number_of_lines).iterrows():
    y = waveform.dropna()
    x = [float(label) for label in y.index]
    plt.plot(x, y)


In [None]:
# Complete waveform for ecds
# NOTE(review): the original cell read `ecd_rolling`, which is never defined in this notebook
# (NameError on a fresh kernel). `ecd_rolling_norm` is assumed to be the intended frame - confirm.
ecd_waveforms = ecd_rolling_norm.drop(columns = ['TestId'])
number_of_lines = 250
# `head` caps the number of lines at the rows available
for _, waveform in ecd_waveforms.head(number_of_lines).iterrows():
    y = waveform.dropna()
    x = [float(label) for label in y.index]
    plt.plot(x, y)

In [None]:
cal_feat = MA_cal[['TestId', 'Label', 'mean']]
post_feat = MA_post[['TestId', 'Label', 'mean']]
sample_feat = MA_sample[['TestId', 'Label', 'mean']]

In [None]:
# One row per reading with the mean of each window (cal, post, sample).
# Inner merges keep only the readings present in all three windows. The previous left/right joins
# kept sample-only readings with a missing `Label` and `mean_cal`, which the stratified split and
# the clustering further down cannot handle.
# NOTE(review): a TestId appearing more than once in a window frame still multiplies rows here - confirm ids are unique.
feat = (
    cal_feat.rename(columns = {'mean': 'mean_cal'})
    .merge(post_feat[['TestId', 'mean']].rename(columns = {'mean': 'mean_post'}), on = 'TestId', how = 'inner')
    .merge(sample_feat[['TestId', 'mean']].rename(columns = {'mean': 'mean_sample'}), on = 'TestId', how = 'inner')
    .reset_index(drop = True)
)

In [None]:
# Absolute difference between the window means, one column per pair of windows
feat['cal-post'] = (feat['mean_cal'] - feat['mean_post']).abs()
feat['cal-sample'] = (feat['mean_cal'] - feat['mean_sample']).abs()
feat['sample-post'] = (feat['mean_sample'] - feat['mean_post']).abs()

In [None]:
feat

In [None]:
import altair as alt

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(feat).mark_boxplot(size=50).encode(
    x='Label',
    y=alt.Y('cal-post'),
    color=alt.Color('Label')
).properties(width=300)

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(feat).mark_boxplot(size=50).encode(
    x='Label',
    y=alt.Y('cal-sample'),
    color=alt.Color('Label')
).properties(width=300)

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(feat).mark_boxplot(size=50).encode(
    x='Label',
    y=alt.Y('sample-post'),
    color=alt.Color('Label')
).properties(width=300)

In [None]:
feat

## RANDOM FOREST (FOR CURIOSITY)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV

In [None]:
x = feat[['mean_cal', 'mean_post', 'mean_sample', 'cal-post', 'cal-sample', 'sample-post']]
y = feat[['Label']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, stratify = y, random_state = 2022)

In [None]:
# Scorers handed to `cross_validate`. The dict keeps the name `metrics` because the cells below
# pass it as `scoring = metrics`. Binding the sklearn module to its own alias keeps this cell
# re-runnable: once `metrics` is the dict, `metrics.make_scorer` no longer exists.
from sklearn import metrics as sk_metrics

metrics = {'balanced_accuracy': sk_metrics.make_scorer(sk_metrics.balanced_accuracy_score),
           'precision': sk_metrics.make_scorer(sk_metrics.precision_score),
           'recall' : sk_metrics.make_scorer(sk_metrics.recall_score),
           'f1' : sk_metrics.make_scorer(sk_metrics.f1_score),
           # NOTE(review): this scorer is fed hard `predict` labels, not probabilities, so the value
           # is not a usual log loss - consider the built-in 'neg_log_loss' scorer instead.
           'log-loss' : sk_metrics.make_scorer(sk_metrics.log_loss)}

In [None]:
print(f"% of ecds in training set: {np.round(np.mean(y_train['Label'])*100,3)}")
print(f"% of ecds in testing set: {np.round(np.mean(y_test['Label'])*100,3)}")

In [None]:
# Lets fit a rf with default parameters
rf_default = RandomForestClassifier(random_state = 2022)

# Define evaluation procedure
cv_rf_default = cross_validate(rf_default, X = X_train, y = y_train,scoring = metrics, n_jobs = -1, cv = 5, verbose = 1)

pd.DataFrame.from_dict(cv_rf_default).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0)

In [None]:
# Average of all folds
metrics_rf_default = pd.DataFrame.from_dict(cv_rf_default).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0).mean(axis = 0)
metrics_rf_default = pd.DataFrame(metrics_rf_default).set_axis(['Avg Fit Time', 'Avg Score Time', '5-Fold Balanced Accuracy', '5-Fold Precision', '5-Fold Recall', '5-Fold F1', '5-Fold Log Loss'], axis = 0).T.set_axis(['Random Forest'])
metrics_rf_default

## With the returncode

In [None]:
# Concatenating return code
# The combined `pred` frame is only built further down the notebook, so reading it here fails on a
# fresh kernel. The lookup is built from the predictor frames that are already loaded instead;
# the rename is a no-op when the id column is already called `TestId`.
return_codes = pd.concat(
    [frame.rename(columns = {'TestID': 'TestId'}) for frame in (un_pred, ecd_pred_tot)]
)[['TestId', 'ReturnCode']]

# Merge once and split into predictors and target
feat_with_code = feat.merge(return_codes, on = 'TestId', how = 'left')
x = feat_with_code[['mean_cal', 'mean_post', 'mean_sample', 'cal-post', 'cal-sample', 'sample-post', 'ReturnCode']]
# The target keeps the `Label_x` column name that the cells below read
y = feat_with_code[['Label']].rename(columns = {'Label': 'Label_x'})

In [None]:
# Only the categorical column gets the placeholder: a frame-wide replace would also turn any
# missing numeric feature into the string 'noreturncode'.
x = x.assign(ReturnCode = x['ReturnCode'].fillna('noreturncode'))

In [None]:
from sklearn.preprocessing import OneHotEncoder

#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

#perform one-hot encoding on 'ReturnCode' column
# Cast to str so that codes and the 'noreturncode' placeholder are one uniform type for the encoder
encoded = encoder.fit_transform(x[['ReturnCode']].astype(str)).toarray()

# String column names (ReturnCode_<code>) and the index of `x`: recent scikit-learn versions reject
# frames whose column names mix strings and integers, and the join below aligns on the index.
encoder_df = pd.DataFrame(encoded, columns = encoder.get_feature_names_out(['ReturnCode']), index = x.index)

#merge one-hot encoded columns back with original DataFrame
final_x = x.join(encoder_df)

In [None]:
x = final_x.drop(columns = 'ReturnCode')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, stratify = y, random_state = 2022)
print(f"% of ecds in training set: {np.round(np.mean(y_train['Label_x'])*100,3)}")
print(f"% of ecds in testing set: {np.round(np.mean(y_test['Label_x'])*100,3)}")

In [None]:
X_train

In [None]:
# Lets fit a rf with default parameters
rf_default = RandomForestClassifier(random_state = 2022)

# Define evaluation procedure
cv_rf_default = cross_validate(rf_default, X = X_train, y = y_train,scoring = metrics, n_jobs = -1, cv = 5, verbose = 1)

pd.DataFrame.from_dict(cv_rf_default).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0)

In [None]:
# Average of all folds
metrics_rf_default = pd.DataFrame.from_dict(cv_rf_default).set_axis(['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_5'], axis=0).mean(axis = 0)
metrics_rf_default = pd.DataFrame(metrics_rf_default).set_axis(['Avg Fit Time', 'Avg Score Time', '5-Fold Balanced Accuracy', '5-Fold Precision', '5-Fold Recall', '5-Fold F1', '5-Fold Log Loss'], axis = 0).T.set_axis(['Random Forest'])
metrics_rf_default

### Evaluating on testing data


In [None]:
# Fit the default random forest on the whole training set and predict the held-out readings
rf_default = RandomForestClassifier(random_state = 2022)
# y_train is a one-column frame; scikit-learn expects a 1d target and warns otherwise
rf_default.fit(X_train, y_train.to_numpy().ravel())
predictions_rf_tuned = rf_default.predict(X_test)

In [None]:
# Re-import the module: the name `metrics` was rebound to the scoring dict further up
from sklearn import metrics

# Log loss is defined on predicted probabilities, so it is computed from `predict_proba`
# rather than from the hard labels used by the other metrics
test_probabilities = rf_default.predict_proba(X_test)

test_metrics_rf = [metrics.balanced_accuracy_score(y_test, predictions_rf_tuned),
                    metrics.precision_score(y_test, predictions_rf_tuned),
                    metrics.recall_score(y_test, predictions_rf_tuned),
                    metrics.f1_score(y_test, predictions_rf_tuned),
                    metrics.log_loss(y_test, test_probabilities, labels = rf_default.classes_)]
overall_performance = pd.DataFrame(test_metrics_rf).T
overall_performance.set_axis(['Balanced Accuracy', 'Precision', 'Recall', 'F1 Score', 'Log Loss'], axis = 1)

## CLUSTERING

In [None]:
import scipy.cluster.hierarchy as shc
plt.figure(figsize=(5, 5))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(feat[['cal-post', 'cal-sample', 'sample-post']], method='ward'))

In [None]:
plt.figure(figsize=(5, 5))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(feat[['cal-post', 'cal-sample', 'sample-post']], method='single'))

In [None]:
plt.figure(figsize=(5, 5))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(feat[['cal-post', 'cal-sample', 'sample-post']], method='complete'))

In [None]:
plt.figure(figsize=(5, 5))  
plt.title("Dendrograms")  
dend = shc.dendrogram(shc.linkage(feat[['cal-post', 'cal-sample', 'sample-post']], method='average'))

In [None]:
from sklearn.cluster import AgglomerativeClustering

# clust_ward_2 = AgglomerativeClustering(n_clusters = 2, affinity='euclidean', linkage='ward')
# clust_ward_2.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# clust_ward_3 = AgglomerativeClustering(n_clusters = 3, affinity='euclidean', linkage='ward')
# clust_ward_3.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# clust_ward_4 = AgglomerativeClustering(n_clusters = 4, affinity='euclidean', linkage='ward')
# clust_ward_4.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# clust_ward_30 = AgglomerativeClustering(n_clusters = 30, affinity='euclidean', linkage='ward')
# clust_ward_30.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# Ward linkage only works with Euclidean distances, which is the default. The `affinity` keyword
# was deprecated in scikit-learn 1.2 and removed in 1.4, so it is no longer passed.
clust_ward_40 = AgglomerativeClustering(n_clusters = 40, linkage='ward')
clust_ward_40.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

clust_ward_100 = AgglomerativeClustering(n_clusters = 100, linkage='ward')
clust_ward_100.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# clust_single = AgglomerativeClustering(n_clusters = 2, affinity='euclidean', linkage='single')
# clust_single.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# clust_complete = AgglomerativeClustering(n_clusters = 2, affinity='euclidean', linkage='complete')
# clust_complete.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])

# clust_average = AgglomerativeClustering(n_clusters = 2, affinity='euclidean', linkage='average')
# clust_average.fit_predict(feat[['cal-post', 'cal-sample', 'sample-post']])


In [None]:
# One `clust_ward_<k>` column per number of clusters. The bar charts below read the columns for
# 2, 3, 4, 30, 40 and 100 clusters, so all of them are created here; previously only 40 and 100
# existed and the other charts referenced missing columns.
# NOTE(review): each k is a separate Ward fit, which costs time on large frames - drop the k values
# (and their charts) that are no longer needed.
cluster_features = feat[['cal-post', 'cal-sample', 'sample-post']]
for n_clusters in (2, 3, 4, 30, 40, 100):
    feat[f'clust_ward_{n_clusters}'] = AgglomerativeClustering(n_clusters = n_clusters, linkage='ward').fit_predict(cluster_features)

In [None]:
feat

In [None]:
alt.Chart(feat).mark_bar().encode(
    alt.X('clust_ward_2'),
    alt.Y('count()'),
    alt.Color('Label'))

In [None]:
alt.Chart(feat).mark_bar().encode(
    alt.X('clust_ward_3'),
    alt.Y('count()'),
    alt.Color('Label'))

In [None]:
alt.Chart(feat).mark_bar().encode(
    alt.X('clust_ward_4'),
    alt.Y('count()'),
    alt.Color('Label'))

In [None]:
alt.Chart(feat).mark_bar().encode(
    alt.X('clust_ward_30'),
    alt.Y('count()'),
    alt.Color('Label'))

In [None]:
alt.Chart(feat).mark_bar().encode(
    alt.X('clust_ward_40'),
    alt.Y('count()'),
    alt.Color('Label'))

In [None]:
alt.Chart(feat).mark_bar().encode(
    alt.X('clust_ward_100'),
    alt.Y('count()'),
    alt.Color('Label'))

## Visualizations of clusters formed

In [None]:
# Predictors
un_pred = pd.read_csv('../../../data/RawDataPredictors/New/Unsuccessful.csv')
ecd_pred = pd.read_csv('../../../data/RawDataPredictors/New/ecdContact.csv')
syn_pred = pd.read_csv('../../../data/RawDataPredictors/New/SyntheticECD.csv')
con_pred = pd.read_csv('../../../data/RawDataPredictors/New/ECDAggContaminated.csv')

In [None]:
ecd_pred['Label'] = 'wild'
syn_pred['Label'] = 'syn'
con_pred['Label'] = 'con'
un_pred['Label'] = 'un'

In [None]:
ecd_pred_tot = pd.concat([ecd_pred, syn_pred, con_pred])
ecd_pred_tot = ecd_pred_tot.rename({'TestID':'TestId'}, axis = 1)

un_pred = un_pred.rename({'TestID' : 'TestId'}, axis = 1)

pred = pd.concat([un_pred, ecd_pred_tot])

In [None]:
# Time series (normalized and from -30 to 40)
ecd = pd.read_csv("../../../data/TimeSeriesData/Normalized/ecd_norm_window.csv")
ecd_syn = pd.read_csv("../../../data/TimeSeriesData/Normalized/syn_norm_window.csv")
ecd_con = pd.read_csv("../../../data/TimeSeriesData/Normalized/cont_norm_window.csv")
un = pd.read_csv("../../../data/TimeSeriesData/Normalized/un_norm_window.csv")

In [None]:
ecd_ts = pd.concat([ecd, ecd_syn, ecd_con])

In [None]:
ts = pd.concat([ecd_ts, un])

In [None]:
ts_pred = ts.merge(pred, on = 'TestId', how = 'left')

In [None]:
ts_pred.head()

In [None]:
# Now join the cluster
ts_pred_cluster = ts_pred.merge(feat.drop(columns = 'Label', axis = 1), on = 'TestId', how = 'left')

In [None]:
ts_pred_cluster.ReturnCode = ts_pred_cluster.ReturnCode.astype(str)
ts_pred_cluster.FluidType = ts_pred_cluster.FluidType.astype(str)
ts_pred_cluster.Label = ts_pred_cluster.Label.astype(str)

In [None]:
ts_pred_cluster.iloc[:, 1:351]

In [None]:
method = 'clust_ward_40'

# Loop over the cluster labels that actually occur. Readings without a cluster (NaN after the left
# merge) used to be counted as one more cluster, which produced an empty subset and `bins = 0`.
cluster_labels = sorted(ts_pred_cluster[method].dropna().unique())

for cluster in cluster_labels:
    # Readings of this cluster, selected once for the four panels
    members = ts_pred_cluster[ts_pred_cluster[method] == cluster]

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize = (20,5))

    # Panels 1-3: counts per category, one bin per distinct value
    for ax, column in zip((ax1, ax2, ax3), ('ReturnCode', 'FluidType', 'Label')):
        ax.hist(members[column], bins = len(members[column].unique()))
        ax.tick_params(labelrotation=90)

    # Panel 4: every trace of the cluster (columns 1 to 350 of the merged frame)
    ax4.plot(members.iloc[:, 1:351].transpose())

    plt.show()
    # Close the figure so that looping over many clusters does not keep every figure in memory
    plt.close(fig)